In [8]:
import pandas as pd
import numpy as np
import nltk 
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import joblib
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
# English stop-word list from NLTK; consulted by stop_word_filtering() below.
stop_words = stopwords.words('english')

# Read the labelled tweets and keep an independent copy of the two columns the
# rest of the notebook works with: the raw tweet text and its 0/1 label.
file = 'data/EscortDataLabelAll.csv'
df = pd.read_csv(file)
data = df.loc[:, ['tweet', 'SUSPICIOUS']].copy()
data.head()

Unnamed: 0,tweet,SUSPICIOUS
0,rt @_just__v: available for romantic appointme...,1
1,requiring a #mature dinner companion in #orlan...,1
2,wanting to plan a sensual escape in #orlando? ...,1
3,"Ã°ÂŸÂ“Â£well, now am i not only open for inqui...",1
4,#orlando #florida &amp; surrounding cities. \n...,0


In [9]:
#functions
def clean_text(df):
    """Add a ``clean_text`` column holding a regex-scrubbed copy of ``tweet``.

    Every match of the pattern is deleted from each tweet: ``@`` followed by
    ASCII letters/digits, any character that is not an ASCII letter, digit,
    space or tab, ``scheme://...`` URLs, a leading ``rt``, and ``http`` plus
    the single character that follows it.

    The frame is modified in place and the same object is returned.
    """
    # NOTE(review): presumably this must stay identical to the cleaning used
    # when models/vectorizer.pkl was fitted - confirm before editing the pattern.
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")

    def scrub(text):
        return noise.sub("", text)

    df['clean_text'] = df['tweet'].apply(scrub)
    return df
def tokenize(row):
    """Split the row's ``clean_text`` string into tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(row['clean_text'])
def stop_word_filtering(row):
    """Return the row's tokens with stop words removed.

    Reads the token list from ``row['token']`` and keeps, in their original
    order, the tokens that do not appear in the module-level ``stop_words``
    collection. The comparison is an exact, case-sensitive match.
    """
    # A set gives O(1) membership tests; scanning the stop-word list for every
    # token costs O(len(stop_words)) each time.
    blocked = frozenset(stop_words)
    return [word for word in row['token'] if word not in blocked]
def lemmatizer(row):
    """Lemmatize every token in ``row['token']`` with NLTK's WordNetLemmatizer.

    Only the word itself is passed to ``lemmatize``, so NLTK's default
    part-of-speech setting applies. Returns a new list with one lemma per
    input token, in the same order.
    """
    # Build the lemmatizer once per call rather than once per token.
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(word=word) for word in row['token']]
def rejoin(row):
    """Concatenate the tokens in ``row['lement']`` into a single string.

    Each token is followed by one space, so a non-empty result ends with a
    trailing space and an empty token list yields ``''``.
    """
    return ''.join(token + ' ' for token in row['lement'])

In [10]:
# Load the persisted vectorizer and the fitted search object used for
# prediction below. joblib.load unpickles the files, so only point these
# paths at model files from a trusted source.
vectorizer = joblib.load('models/vectorizer.pkl')
grid = joblib.load('models/model.pkl')

In [11]:
# Preprocess the tweets. Order matters: every stage reads the column written
# by the stage before it, and 'token' is overwritten once stop words are dropped.
data = clean_text(data)
for column, stage in (
    ('token', tokenize),
    ('token', stop_word_filtering),
    ('lement', lemmatizer),
    ('final', rejoin),
):
    data[column] = data.apply(stage, axis=1)

# Vectorise with the already-fitted vectorizer (transform only, no re-fit).
X = vectorizer.transform(data['final'])
# Dense copy of each row's feature vector, stored next to its text.
data['tfidf'] = X.toarray().tolist()
y = list(data['SUSPICIOUS'])

In [23]:
# 80/20 split with a fixed seed so the same rows land in the test set each run.
# NOTE(review): the model is loaded rather than fitted in this notebook - confirm
# this split matches the one used at training time, otherwise X_test may
# contain rows the model was trained on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Show the selected hyper-parameters and the corresponding best estimator,
# one per line.
print(grid.best_params_, grid.best_estimator_, sep='\n')

{'C': 10, 'gamma': 1, 'kernel': 'linear'}
SVC(C=10, gamma=1, kernel='linear')


In [25]:
# Predict the test rows with the loaded model and print per-class
# precision / recall / F1 against the true labels.
grid_pred = grid.predict(X_test)
print(classification_report(y_true=y_test, y_pred=grid_pred))

              precision    recall  f1-score   support

           0       0.98      0.73      0.84        64
           1       0.96      1.00      0.98       381

    accuracy                           0.96       445
   macro avg       0.97      0.87      0.91       445
weighted avg       0.96      0.96      0.96       445

