In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import os

from glob import glob
from collections import Counter

In [2]:
# Project root, relative to the directory the notebook is run from.
PATH = '../'

# Folder the pickled inputs are read from.
PROCESSED_DATA_PATH = os.path.join(PATH, 'data/processed/')

# Full paths of the two pickles loaded further down, built in one pass so
# both are guaranteed to share the same parent folder.
DF_FAKE_PATH, DF_LEGIT_PATH = [
    os.path.join(PROCESSED_DATA_PATH, pickle_name)
    for pickle_name in ('df_fake_clean.pkl', 'df_legit_clean.pkl')
]

In [3]:
# Load the two pickled inputs. Each file is opened in a `with` block so the
# handle is closed as soon as unpickling finishes (the bare open() calls used
# previously left both files open for the life of the kernel).
#
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that were produced by this project / a trusted source.
with open(DF_FAKE_PATH, 'rb') as fake_file:
    df_fake = pkl.load(fake_file)

with open(DF_LEGIT_PATH, 'rb') as legit_file:
    df_legit = pkl.load(legit_file)

In [4]:
# Stack the fake rows on top of the legit rows (row-wise concat is the
# pandas default). No ignore_index is passed, so each source frame keeps its
# own row labels and the combined index may contain duplicates.
df = pd.concat([df_fake, df_legit])

In [5]:
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline

In [6]:
# Target labels for every row of the combined frame: the 'FAKE' column.
# presumably a truthy value marks a fake article — verify the encoding.
y_all = df['FAKE']

In [7]:
# TF-IDF over unigrams and bigrams. A term enters the vocabulary only if it
# occurs in at least 5 documents (min_df) and in no more than 90% of all
# documents (max_df).
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.9,
    ngram_range=(1, 2),
)

In [8]:
# Learn the n-gram vocabulary and IDF weights from every row of
# df['TEXT_CLEAN'].
# NOTE(review): this fit runs before any cross-validation or train/test split
# in this notebook, so documents that later end up in held-out folds also
# contribute to the vocabulary and IDF statistics. Consider fitting the
# vectorizer per fold inside a Pipeline — confirm whether this is intended.
vectorizer.fit(df['TEXT_CLEAN'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=5,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [9]:
# Encode the same TEXT_CLEAN column with the fitted vectorizer; the result is
# the feature matrix that is resampled and fed to every model below.
X_all = vectorizer.transform(df['TEXT_CLEAN'])

In [10]:
# Random undersampler, seeded (random_state=42) so the sampling is repeatable.
rus = RandomUnderSampler(random_state=42)

In [11]:
# Resample features and labels together; the models below are evaluated on
# these outputs rather than on X_all / y_all.
# presumably this drops majority-class rows until the FAKE classes are
# balanced — verify the resulting class counts.
X_resampled, y_resampled = rus.fit_resample(X_all, y_all)

In [12]:
# Logistic regression scored with 5-fold cross-validation on the resampled
# features; folds are evaluated in parallel on all available cores.
clf = LogisticRegression(n_jobs=-1)
cv_logistic_regression = cross_val_score(
    estimator=clf,
    X=X_resampled,
    y=y_resampled,
    cv=5,
    n_jobs=-1,
    verbose=0,
)
# Per-fold scores on the first line, their mean on the second.
print(cv_logistic_regression, cv_logistic_regression.mean(), sep='\n')

[0.97808765 0.98       0.958      0.966      0.992     ]
0.9748175298804782


In [13]:
# Random forest (1000 trees) scored with 5-fold cross-validation on the
# resampled features.
# random_state is pinned to 42 — the same seed already used for the
# undersampler and the train/test split — so the fold scores are repeatable
# from run to run. Without it every execution grows a different forest and
# prints slightly different numbers (scores recorded from an earlier unseeded
# run will therefore not match exactly).
clf = RandomForestClassifier(n_jobs=-1, n_estimators=1000, random_state=42)
cv_random_forest = cross_val_score(clf, X_resampled, y_resampled, n_jobs=-1, cv=5, verbose=0)
# Per-fold scores, then their mean.
print(cv_random_forest)
print(cv_random_forest.mean())

[0.97609562 0.97       0.986      0.974      0.982     ]
0.9776191235059761


In [14]:
# Gaussian naive Bayes scored with 5-fold cross-validation. The resampled
# matrix is converted with .toarray() before being handed to the estimator,
# and the folds run in a single process (n_jobs=1).
clf = GaussianNB()
cv_gaussian_naive_bayes = cross_val_score(
    clf,
    X_resampled.toarray(),
    y_resampled,
    cv=5,
    n_jobs=1,
    verbose=0,
)
# Per-fold scores, then their mean.
print(cv_gaussian_naive_bayes)
print(np.mean(cv_gaussian_naive_bayes))

[0.57569721 0.602      0.616      0.736      0.694     ]
0.6447394422310756


In [15]:
# Support vector classifier with library-default hyperparameters, scored
# with 5-fold cross-validation on the resampled features.
clf = SVC()
cv_svc = cross_val_score(
    estimator=clf,
    X=X_resampled,
    y=y_resampled,
    cv=5,
    n_jobs=-1,
    verbose=0,
)
# Per-fold scores on the first line, their mean on the second.
print(cv_svc, cv_svc.mean(), sep='\n')

[0.56374502 0.652      0.752      0.85       0.818     ]
0.7271490039840638


In [16]:
# Gradient-boosted trees (100 estimators) scored with 5-fold
# cross-validation. verbose=5 makes the parallel backend log fold progress,
# which is useful here because this is the slowest model in the comparison.
clf = XGBClassifier(
    n_estimators=100,
    n_jobs=-1,
)
cv_xgboost = cross_val_score(
    clf,
    X_resampled,
    y_resampled,
    cv=5,
    n_jobs=-1,
    verbose=5,
)
# Per-fold scores, then their mean.
print(cv_xgboost)
print(np.mean(cv_xgboost))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.0min remaining:  3.0min


[0.96015936 0.962      0.96       0.956      0.972     ]
0.9620318725099601


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.8min finished


In [17]:
# Hold out 30% of the resampled rows as a test set; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    random_state=42,
    test_size=0.3,
)