# word2vec Vectorization

In [1]:
import pandas as pd
from collections import Counter
import re
import numpy as np
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score, confusion_matrix
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

In [2]:
alldata = pd.read_csv('news_dataset.csv')

In [3]:
alldata.head()

Unnamed: 0.1,Unnamed: 0,title,content,publication,label
0,0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,100percentfedup,fake
1,1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,100percentfedup,fake
2,2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,100percentfedup,fake
3,3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,100percentfedup,fake
4,4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,100percentfedup,fake


In [4]:
alldata['label'].replace({'fake':1, 'real':0}, inplace=True)
alldata.isnull().values.any()
alldata.dropna(inplace=True)

In [177]:
X_body_text = alldata['content']
X_headline_text = alldata['title']
X_combined_text = 10*(alldata['title'] + " ") + alldata['content']
y = alldata['label']

In [6]:
import gensim
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [7]:
titles_list = [title for title in X_headline_text]
big_title_string = ' '.join(titles_list)
tokens = word_tokenize(big_title_string)
words = [word.lower() for word in tokens if word.isalpha()]
stop_words = set(stopwords.words('english'))
words = [word for word in words if not word in stop_words]

In [8]:
print (words[:20])

['muslims', 'busted', 'stole', 'millions', 'gov', 'benefits', 'attorney', 'general', 'loretta', 'lynch', 'plead', 'fifth', 'breaking', 'weiner', 'cooperating', 'fbi', 'hillary', 'email', 'investigation', 'pin']


In [9]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True) 

In [10]:
model.vector_size

300

In [16]:
model['news']

array([-0.13867188,  0.04370117, -0.13085938, -0.16796875, -0.06054688,
       -0.07080078,  0.00854492, -0.09960938,  0.19628906,  0.14648438,
       -0.23046875, -0.09619141, -0.140625  ,  0.25585938, -0.16699219,
        0.14257812, -0.16992188,  0.06884766,  0.23925781, -0.28515625,
        0.17089844,  0.17089844,  0.05615234, -0.06445312,  0.13867188,
        0.32226562,  0.05078125,  0.27148438, -0.00564575,  0.02587891,
       -0.05639648,  0.03125   ,  0.171875  , -0.18261719,  0.05029297,
       -0.01062012, -0.02600098,  0.20019531, -0.07568359,  0.125     ,
        0.01300049, -0.17089844, -0.09667969,  0.21972656,  0.06494141,
        0.24804688,  0.11083984,  0.0267334 , -0.10986328,  0.17578125,
        0.328125  , -0.11865234,  0.03222656,  0.30273438,  0.09179688,
       -0.05224609, -0.27539062, -0.046875  ,  0.20996094, -0.20410156,
        0.10595703, -0.08398438, -0.09765625, -0.16015625, -0.0045166 ,
        0.09228516,  0.05737305, -0.04418945,  0.06445312,  0.16

In [18]:
X_headline_text[:5]

0    Muslims BUSTED: They Stole Millions In Gov’t B...
1    Re: Why Did Attorney General Loretta Lynch Ple...
2    BREAKING: Weiner Cooperating With FBI On Hilla...
3    PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...
4    FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...
Name: title, dtype: object

In [19]:
words[:5]

['muslims', 'busted', 'stole', 'millions', 'gov']

In [None]:
np.mean([model[w] for w in words if w in word2vec] or [np.zeros(dim)], axis=0)

In [40]:
np.mean(np.array([model[w] for w in [word.lower() for word in word_tokenize("PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe") if word.isalpha()] if w in words ]), axis=0)

array([ 0.0388916 , -0.12163086,  0.01960144,  0.05614014,  0.00795898,
       -0.00070801, -0.05495606, -0.13168946,  0.18251953,  0.04238281,
        0.1772461 , -0.12548828, -0.07553711, -0.00820312, -0.08354492,
       -0.0024353 ,  0.04921875,  0.1685547 , -0.05024719, -0.1395996 ,
       -0.01674805,  0.19472656,  0.1328125 ,  0.05839844,  0.07675781,
       -0.16594544, -0.11185303,  0.02973633,  0.1584961 , -0.10786133,
       -0.03115234, -0.09890137, -0.14863281, -0.00673828,  0.00585938,
        0.0361084 ,  0.1244873 ,  0.05317383,  0.02167969,  0.03893433,
        0.14355469, -0.08088379,  0.17337647, -0.00735474,  0.0154541 ,
        0.04521179, -0.0480957 ,  0.02337646, -0.01516113, -0.08320312,
        0.03275909,  0.02709961,  0.06071777, -0.01948242,  0.20391235,
       -0.11320801, -0.146875  , -0.08613281, -0.01938782, -0.08071289,
        0.07125549,  0.0300293 ,  0.025     ,  0.00892029,  0.11816406,
       -0.06083984, -0.08557129,  0.00537109,  0.14208984,  0.16

In [55]:
'loretta' in words

True

In [134]:
# Converting a string to tokens, calculating mean of word2vec for every word in a sentence

def vectCalc(text):
    a = [word.lower() for word in word_tokenize(text) if word.isalpha()]
    b = np.array([model[w] for w in a if w in model and w in words])
    if len(b)>0:
        return np.mean(b, axis=0)
    else:
        return np.zeros(300)

In [135]:
vectCalc("vvsandomsample rfake")

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [136]:
new_ser = pd.DataFrame(X_headline_text)['title'].apply(vectCalc)

In [139]:
np.unique(new_ser.apply(np.size).values)

array([300])

In [149]:
X_headline_vec = pd.DataFrame(new_ser.values.tolist())

## Saving Vectorized dataset for future use

In [163]:
X_headline_vec.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.075134,0.095032,-0.018885,0.180705,-0.102905,-0.075216,-0.176351,0.073771,0.0966,0.183268,...,0.090658,-0.074249,-0.071838,0.033,-0.082937,0.046611,-0.05678,-0.076497,-0.079346,-0.049487
1,0.051987,0.013672,0.053021,-0.163184,-0.148584,-0.017378,0.009473,-0.025668,0.185156,0.015948,...,0.059033,0.02207,0.054199,-0.01062,-0.01106,0.019336,-0.096582,-0.219296,-0.019531,-0.03457
2,-0.053218,-0.006348,0.029297,-0.024763,-0.101214,0.095721,0.017421,0.055664,0.077637,0.046125,...,0.025997,0.05406,0.016811,0.03901,-0.062762,-0.095503,0.080261,-0.145647,-0.119036,0.022374
3,0.004852,-0.001902,0.050012,0.075821,-0.047064,-0.048322,-0.006976,-0.144267,0.094218,0.108246,...,0.041168,-0.010461,-0.087931,0.035131,0.014521,-0.154073,-0.074272,-0.091553,0.031362,0.067342
4,0.041016,0.048256,0.016785,0.097397,-0.040054,-0.036987,0.063026,-0.016579,0.097771,-0.001984,...,-0.027811,-0.087311,-0.083374,-0.000671,-0.103638,-0.021179,0.025879,-0.113892,0.025085,0.084534


In [164]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64

In [165]:
headline_vec = X_headline_vec.copy()

In [167]:
headline_vec["Fake"] = y

In [173]:
headline_vec.to_csv("Headline Vector.csv", index=False)

In [181]:
X_body_text.head()

0    Print They should pay all the back all the mon...
1    Why Did Attorney General Loretta Lynch Plead T...
2    Red State : \nFox News Sunday reported this mo...
3    Email Kayla Mueller was a prisoner and torture...
4    Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...
Name: content, dtype: object

In [183]:
body_ser = pd.DataFrame(X_body_text)['content'].apply(vectCalc)

In [184]:
np.unique(body_ser.apply(np.size).values)

array([300])

In [185]:
X_body_vec = pd.DataFrame(body_ser.values.tolist())

In [186]:
X_body_vec.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.050662,0.031227,-0.030101,0.094391,-0.005058,-0.045496,0.008541,-0.025954,0.10329,0.047973,...,-0.058377,0.055381,-0.074973,0.028826,0.023275,0.031859,-0.017179,-0.050521,0.052581,-0.06699
1,0.009187,0.034447,0.044124,0.011204,-0.079713,-0.041712,-0.007093,-0.044684,0.11837,0.050185,...,-0.04906,0.001031,-0.014836,0.001227,-0.007322,0.006936,-0.039997,-0.085951,-0.020036,-0.007204
2,-0.00791,0.002882,0.005868,0.047244,-0.055505,0.019863,0.053245,-0.041645,0.109752,0.058014,...,-0.033843,0.015184,-0.013253,0.021607,-0.017107,-0.031132,0.006545,-0.079502,-0.001678,-0.012361
3,0.043716,-0.007644,0.054707,0.102776,0.002611,-0.017399,0.096088,-0.036231,0.099773,0.103034,...,-0.012712,0.004585,-0.109193,-0.035175,-0.038788,-0.099942,-0.001761,-0.086343,-0.017817,0.046936
4,0.001495,0.016298,0.006372,0.112088,-0.049997,-0.070807,0.067386,-0.013888,0.112655,0.003094,...,-0.076776,0.029751,-0.045796,0.004718,-0.031584,0.07721,0.007886,-0.023598,0.035955,-0.041712


In [187]:
body_vec = X_body_vec.copy()
body_vec["Fake"] = y

In [188]:
body_vec.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,Fake
0,0.050662,0.031227,-0.030101,0.094391,-0.005058,-0.045496,0.008541,-0.025954,0.10329,0.047973,...,0.055381,-0.074973,0.028826,0.023275,0.031859,-0.017179,-0.050521,0.052581,-0.06699,1.0
1,0.009187,0.034447,0.044124,0.011204,-0.079713,-0.041712,-0.007093,-0.044684,0.11837,0.050185,...,0.001031,-0.014836,0.001227,-0.007322,0.006936,-0.039997,-0.085951,-0.020036,-0.007204,1.0
2,-0.00791,0.002882,0.005868,0.047244,-0.055505,0.019863,0.053245,-0.041645,0.109752,0.058014,...,0.015184,-0.013253,0.021607,-0.017107,-0.031132,0.006545,-0.079502,-0.001678,-0.012361,1.0
3,0.043716,-0.007644,0.054707,0.102776,0.002611,-0.017399,0.096088,-0.036231,0.099773,0.103034,...,0.004585,-0.109193,-0.035175,-0.038788,-0.099942,-0.001761,-0.086343,-0.017817,0.046936,1.0
4,0.001495,0.016298,0.006372,0.112088,-0.049997,-0.070807,0.067386,-0.013888,0.112655,0.003094,...,0.029751,-0.045796,0.004718,-0.031584,0.07721,0.007886,-0.023598,0.035955,-0.041712,1.0


In [189]:
body_vec.to_csv("Body Vector.csv", index=False)