In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import os

from glob import glob
from time import time
from collections import Counter

In [2]:
# Input locations, all resolved relative to the parent of the working directory.
PATH = '../'
PROCESSED_DATA_PATH = os.path.join(PATH, 'data/processed/')

# Both pickles live side by side in the processed-data folder.
DF_FAKE_PATH, DF_LEGIT_PATH = (
    os.path.join(PROCESSED_DATA_PATH, file_name)
    for file_name in ('df_fake_clean.pkl', 'df_legit_clean.pkl')
)

In [3]:
# Load the pickled fake / legit frames from the processed-data directory.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# The `with` blocks close each file handle as soon as its frame is read
# (a bare `open(...)` would leave the handles open until garbage collection).
with open(DF_FAKE_PATH, 'rb') as fake_file:
    df_fake = pkl.load(fake_file)

with open(DF_LEGIT_PATH, 'rb') as legit_file:
    df_legit = pkl.load(legit_file)

In [4]:
# Stack the fake and legit frames row-wise; original row labels are kept.
df = pd.concat([df_fake, df_legit])

In [5]:
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline

In [6]:
# Target vector: the 'FAKE' column of the combined frame.
# NOTE(review): presumably 1 = fake, 0 = legit -- confirm against the cleaning step.
y_all = df['FAKE']

In [7]:
# TF-IDF over unigrams and bigrams. Terms appearing in fewer than 5 documents,
# or in more than 90% of documents, are excluded from the vocabulary.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
)

In [8]:
# Learn the vocabulary and IDF weights from every document in `df`.
# NOTE(review): this fit sees all rows, including those that later end up in
# held-out cross-validation folds, so the CV scores further down may be
# optimistic. Consider fitting the vectorizer per fold (e.g. inside a Pipeline).
vectorizer.fit(df['TEXT_CLEAN'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=5, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [9]:
# Encode the same text column with the fitted vectorizer; the result is a
# sparse matrix (it is densified with .toarray() later when a model needs it).
X_all = vectorizer.transform(df['TEXT_CLEAN'])

In [10]:
# Random under-sampler from imblearn; the fixed random_state makes the drawn
# subset identical on every run.
rus = RandomUnderSampler(random_state=42)

In [11]:
# Resample features and labels together; the cross-validation below runs on
# these resampled arrays rather than on X_all / y_all.
X_resampled, y_resampled = rus.fit_resample(X_all, y_all)

In [12]:
# Candidate classifiers for the cross-validation benchmark.
# The estimators whose fitting is randomised (random forest, LinearSVC) are
# pinned to the same seed as the under-sampler (42) so that a fresh
# "Restart & Run All" reproduces the reported scores.
model_list = [
    LogisticRegression(),
    RandomForestClassifier(n_estimators=1000, random_state=42),
    GaussianNB(),
    LinearSVC(random_state=42),
    SVC(),
    XGBClassifier(n_estimators=100)
]

In [13]:
# Benchmark every model in `model_list` with k-fold cross-validation on the
# resampled features and collect the per-fold scores into one results table.
n_folds = 10
n_jobs = 3

# Column order of the results table. Fold labels are 0-based so that they
# match the keys written inside the loop (fold_0 .. fold_{n_folds - 1}).
columns = ['model', 'mean'] + ['fold_{}'.format(i) for i in range(n_folds)]

# One dict per model; the frame is built once at the end instead of being
# grown with DataFrame.append (removed in pandas 2.0).
records = []

for model in model_list:
    model_name = type(model).__name__
    print('Currently running {}'.format(model_name))
    time_start = time()
    try:
        cv = cross_val_score(model, X_resampled, y_resampled, n_jobs=n_jobs, cv=n_folds, verbose=0)
    except Exception:
        # Fallback for estimators that cannot be fitted on the sparse matrix
        # (GaussianNB is the likely case here): retry on a dense copy.
        # `except Exception` rather than a bare `except` so that a
        # KeyboardInterrupt stops the run instead of starting the dense retry.
        # NOTE(review): n_jobs=1 presumably avoids shipping the large dense
        # array to several worker processes -- confirm.
        cv = cross_val_score(model, X_resampled.toarray(), y_resampled, n_jobs=1, cv=n_folds, verbose=0)

    record = {'model': model_name, 'mean': cv.mean()}
    record.update(('fold_{}'.format(j), score) for j, score in enumerate(cv))
    records.append(record)

    print('Finished running {}'.format(model_name))
    print('Time elapsed {}s'.format(time() - time_start))
    print()

df_results = pd.DataFrame(records, columns=columns)
df_results

Currently running LogisticRegression
Finished running LogisticRegression
Time elapsed 3.279806613922119s

Currently running RandomForestClassifier
Finished running RandomForestClassifier
Time elapsed 3.488658905029297s

Currently running GaussianNB
Finished running GaussianNB
Time elapsed 179.81365060806274s

Currently running LinearSVC
Finished running LinearSVC
Time elapsed 3.540477991104126s

Currently running SVC
Finished running SVC
Time elapsed 64.0390317440033s

Currently running XGBClassifier




Finished running XGBClassifier
Time elapsed 475.71717405319214s



Unnamed: 0,model,mean,fold_0,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9
0,LogisticRegression,0.974416,0.980159,0.976,0.988,0.972,0.968,0.952,0.94,0.992,0.98,0.996
1,RandomForestClassifier,0.952035,0.956349,0.952,0.94,0.936,0.964,0.94,0.92,0.98,0.96,0.972
2,GaussianNB,0.654324,0.595238,0.584,0.636,0.608,0.656,0.568,0.66,0.844,0.756,0.636
3,LinearSVC,0.980416,0.980159,0.98,0.988,0.968,0.984,0.972,0.944,1.0,0.988,1.0
4,SVC,0.733924,0.595238,0.656,0.688,0.716,0.844,0.56,0.752,0.924,0.864,0.74
5,XGBClassifier,0.962841,0.948413,0.976,0.976,0.948,0.964,0.96,0.936,0.968,0.984,0.968


In [15]:
# Rank the models from lowest to highest mean cross-validation score.
df_results.sort_values('mean')

Unnamed: 0,model,mean,fold_0,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9
2,GaussianNB,0.654324,0.595238,0.584,0.636,0.608,0.656,0.568,0.66,0.844,0.756,0.636
4,SVC,0.733924,0.595238,0.656,0.688,0.716,0.844,0.56,0.752,0.924,0.864,0.74
1,RandomForestClassifier,0.952035,0.956349,0.952,0.94,0.936,0.964,0.94,0.92,0.98,0.96,0.972
5,XGBClassifier,0.962841,0.948413,0.976,0.976,0.948,0.964,0.96,0.936,0.968,0.984,0.968
0,LogisticRegression,0.974416,0.980159,0.976,0.988,0.972,0.968,0.952,0.94,0.992,0.98,0.996
3,LinearSVC,0.980416,0.980159,0.98,0.988,0.968,0.984,0.972,0.944,1.0,0.988,1.0
