# Classification tests with Word2Vec embedding

In [1]:
import pandas as pd
import re
from collections import defaultdict
import spacy
from glob import glob
import unidecode
import os
import string
import numpy as np
import pickle as pkl
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import pickle as pkl

import warnings
warnings.filterwarnings("ignore")

## Loading variables

In [2]:
# Base directory that the data paths below are resolved against.
PATH = '../'

PROCESSED_DATA_PATH = os.path.join(PATH, 'data/processed/')

DF_FAKE_PATH = os.path.join(PROCESSED_DATA_PATH, 'df_fake_clean.pkl')
DF_LEGIT_PATH = os.path.join(PROCESSED_DATA_PATH, 'df_legit_clean.pkl')

# Open the pickles through context managers so the file handles are closed
# as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(DF_FAKE_PATH, 'rb') as fake_file:
    df_fake = pkl.load(fake_file)

with open(DF_LEGIT_PATH, 'rb') as legit_file:
    df_legit = pkl.load(legit_file)

# Stack both frames row-wise and expose the cleaned text / label columns under
# the short names used by the rest of the notebook ('X' and 'y').
df = pd.concat((df_fake, df_legit), axis=0).rename(
    columns={'TEXT_CLEAN': 'X', 'FAKE': 'y'}
)

df.head()

Unnamed: 0,DATE,y,X,TITLE_CLEAN,TEXT_LEN_CHAR,TEXT_LEN_TOKEN,TITLE_LEN_CHAR,TITLE_LEN_TOKEN
0,03/08/2019,1,vamos assinar peticao cassacao mandato bolsona...,peticao impeachment bolsonaro precisa milhoes ...,328,36,81,9
1,04/08/2019,1,lula vitima golpe politico merece estar preso ...,peticao lula livre contribui liberdade preside...,242,30,54,7
2,05/08/2019,1,professor contou dilma matou mario kozel filho...,mario kozel filho assassinado dilma tiros,170,24,41,6
3,03/08/2019,1,vergonha presidente oab mentiu pai morto milit...,felipe santa cruz presidente oab mentiu sobre ...,158,25,82,13
4,06/08/2019,1,partido diabolico bandidos abrindo buracos est...,esquerda abrindo buracos estradas nordeste con...,155,19,59,7


In [3]:
# (rows, columns) of the combined fake + legit frame.
df.shape

(16950, 8)

# Training Word2Vec model

In [None]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

## Joining terms that are commonly found together

e.g.: belo_horizonte

In [None]:
# Whitespace-tokenize every document, then let gensim collect co-occurrence
# statistics over the token lists to detect frequent word pairs.
sent = [doc.split() for doc in df['X']]
phrases = Phrases(
    sent,
    min_count=30,
    progress_per=10000,
)

In [None]:
# Run every tokenized document through the trained phrase model and store the
# result as a new column (one token list per row, in the same order as `sent`).
df['X_phrased'] = [phrases[s] for s in sent]

### Training Word2Vec

In [None]:
# Train Word2Vec on the phrased token lists.
# NOTE(review): `size` and `iter` look like the gensim 3.x keyword names
# (gensim 4 uses `vector_size` and `epochs`) — confirm the installed version.
# NOTE(review): no `seed` is passed, so the embeddings are presumably not
# reproducible between runs — confirm whether that matters here.
w2v_model = Word2Vec(df['X_phrased'],
                    min_count=5,
                    window=9,
                    size=100,
                    workers=3,
                    iter=50)

In [None]:
# Sanity check: list the nearest neighbours of a probe word in the embedding.
wrd = 'bolsonaro'
print("Palavra: {}".format(wrd))
print("Palavras semelhantes:")
# Test membership on the keyed vectors (`.wv`), the same object that is queried
# on the next line, instead of on the model object itself.
if wrd in w2v_model.wv:
    for similar_word, _similarity in w2v_model.wv.most_similar(wrd):
        print('- {}'.format(similar_word))

In [None]:
from tqdm import tqdm_notebook

<span style="color: #ff3333; font-weight: bold">ATTENTION!</span> The cell below takes a long time to process!

If the file is already on disk, you can skip it and just load the file in the following cell.

In [None]:
# Disabled code kept for reference: it builds `all_df` with one row per
# document, each row being the column-wise mean of the Word2Vec vectors of the
# document's words (words whose lookup raises are skipped).
# NOTE(review) before re-enabling:
#   - `doc_vec` is assigned inside the inner loop, so a document with no words
#     would reuse the previous document's vector (or fail on the first one);
#   - the bare `except` hides every error, not only missing-vocabulary lookups;
#   - both frames are grown with `append` one row at a time.
# all_df = pd.DataFrame()

# for text in tqdm_notebook(df['X_phrased']):
#     aux = pd.DataFrame()
#     for word in text:
#         try:
#             word_vec = w2v_model[word]
#             aux = aux.append(pd.Series(word_vec), ignore_index=True)
#         except:
#             pass
#         doc_vec = aux.mean()
#     all_df = all_df.append(doc_vec, ignore_index=True)

In [5]:
# To refresh the cache after recomputing `all_df`:
# with open('../data/processed/text_w2v.pkl', 'wb') as w2v_file:
#     pkl.dump(all_df, w2v_file)

# Load the cached document vectors; the context manager closes the file handle
# once unpickling is done.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('../data/processed/text_w2v.pkl', 'rb') as w2v_file:
    all_df = pkl.load(w2v_file)

## Training model

In [4]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler

In [6]:
# Randomly undersample with a fixed seed, using the cached document vectors as
# features and the 'y' column as labels.
# assumes the rows of `all_df` are in the same order as `df` — TODO confirm
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(all_df, df['y'])

In [7]:
# 5-fold cross-validation of a 1000-tree random forest on the resampled data.
# NOTE(review): the forest has no `random_state`, so the fold scores can change
# from run to run.
# NOTE(review): both the forest and cross_val_score request n_jobs=-1 — confirm
# the nested parallelism is intended.
cv = cross_val_score(RandomForestClassifier(n_estimators=1000, n_jobs=-1), X_resampled, y_resampled, n_jobs=-1, cv=5, verbose=10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   22.6s remaining:   33.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   22.8s remaining:   15.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   28.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   28.3s finished


In [8]:
# Per-fold scores from the cross-validation run.
cv

array([0.94223108, 0.928     , 0.926     , 0.916     , 0.926     ])

In [14]:
# Average score over the folds.
cv.mean()

0.9276462151394422

In [9]:
# Hold out 20% of the resampled data for testing; the split is seeded so it is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=1,
)

In [10]:
# Shapes of the train/test features and labels produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2001, 100), (501, 100), (2001,), (501,))

In [11]:
# Fit a 1000-tree random forest on the training split and predict the held-out
# test split.
# NOTE(review): no `random_state` is set, so the fitted model (and the metrics
# reported from `y_pred`) can differ slightly between runs.
model = RandomForestClassifier(n_estimators=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [12]:
# Per-class precision / recall / F1 of the test-split predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       252
           1       0.92      0.92      0.92       249

   micro avg       0.92      0.92      0.92       501
   macro avg       0.92      0.92      0.92       501
weighted avg       0.92      0.92      0.92       501



In [13]:
# Confusion matrix of true labels (y_test) against predictions (y_pred).
print(confusion_matrix(y_test, y_pred))

[[233  19]
 [ 21 228]]


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.decomposition import PCA
import time
# Reduce the resampled document vectors to two principal components and report
# how long the fit took.
time_start = time.time()

pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_resampled)

elapsed = time.time() - time_start
print('PCA done! Time elapsed: {} seconds'.format(elapsed))

In [None]:
# Put the two PCA coordinates into a frame, one named column per component.
df_pca = pd.DataFrame({
    'pca1': pca_result[:, 0],
    'pca2': pca_result[:, 1],
})

explained = pca.explained_variance_ratio_
print('Variance explained per principal component: {}'.format(explained))

In [None]:
def fashion_scatter(x, colors):
    """Scatter-plot 2-D points coloured by class and label each class.

    Parameters
    ----------
    x : array-like of shape (n_samples, 2)
        Point coordinates; column 0 is plotted on the x axis, column 1 on the
        y axis.
    colors : array-like of shape (n_samples,)
        Class label of each point. Labels are cast to ``int`` and used to index
        the palette, so they are expected to be ``0 .. num_classes - 1``.

    Returns
    -------
    tuple
        ``(figure, axes, scatter collection, list of class-label texts)``.
    """
    # Accept DataFrame/Series inputs as well as plain arrays.
    x = np.asarray(x)
    colors = np.asarray(colors)

    # choose a color palette with seaborn.
    num_classes = len(np.unique(colors))
    palette = np.array(sns.color_palette(["#c52f33", "#2A8DC7"], num_classes))

    # create a scatter plot.
    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    # Cast with the builtin `int`: the `np.int` alias no longer exists in
    # current NumPy releases.
    sc = ax.scatter(x[:,0], x[:,1], lw=0, s=40, c=palette[colors.astype(int)])
    plt.xlim(-25, 25)
    plt.ylim(-25, 25)
    ax.axis('off')
    # NOTE(review): 'tight' presumably re-fits the limits to the data, which
    # would override the fixed limits set above — confirm which one is wanted.
    ax.axis('tight')

    # add a text label for each class
    txts = []

    for i in range(num_classes):

        members = x[colors == i, :]
        if members.size == 0:
            # No point carries this label; there is nowhere to anchor the text.
            continue

        # Position of each label at median of data points.

        xtext, ytext = np.median(members, axis=0)
        txt = ax.text(xtext, ytext, str(i), fontsize=24, color="#222222", bbox=dict(facecolor='#eeeeee', alpha=0.8))
#         txt.set_path_effects([
#             PathEffects.Stroke(linewidth=5, foreground="w"),
#             PathEffects.Normal()])
        txts.append(txt)

    return f, ax, sc, txts

In [None]:
# Plot the two PCA components, coloured by the resampled class labels.
fashion_scatter(df_pca.values, y_resampled)

In [None]:
from sklearn.manifold import TSNE
import time
# Embed the resampled document vectors with a seeded t-SNE and report how long
# it took.
time_start = time.time()

tsne = TSNE(random_state=42)
fashion_tsne = tsne.fit_transform(X_resampled)

elapsed = time.time() - time_start
print('t-SNE done! Time elapsed: {} seconds'.format(elapsed))

In [None]:
# Plot the t-SNE embedding, coloured by the resampled class labels.
fashion_scatter(fashion_tsne, y_resampled)