# Import necessary dependencies and settings

In [33]:
import pandas as pd
import numpy as np
import string
import logging
import re
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

from sklearn.linear_model import  LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper

import nltk
from nltk.corpus import stopwords

#import python scripts to use user_defined functions
import sys
import os
from pyprojroot import here
sys.path.append(os.path.join(here(), 'src'))

from cleaning import process_text, process_text2
from training import extract_features, train_model, extract_final_features, train_final_model  
from prediction import get_predictions

# Loading the Data

In [34]:
# Read the pre-split datasets. The helper column 'len' is removed from the
# train and test frames right after loading; the validation frame is used as-is.
train_news = pd.read_csv('../data/processed/train.csv').drop(columns='len')
val_news = pd.read_csv('../data/processed/val.csv')
test_news = pd.read_csv('../data/processed/test.csv').drop(columns='len')

In [35]:
train_news.head()

Unnamed: 0,label,statement
0,False,Says the Annies List political group supports ...
1,True,When did the decline of coal start? It started...
2,True,"Hillary Clinton agrees with John McCain ""by vo..."
3,False,Health care reform legislation is likely to ma...
4,True,The economic turnaround started at the end of ...


In [36]:
# Show all three splits in one cell. display() returns None, so the tuple
# built by this expression is itself echoed as the cell result: (None, None, None).
display(train_news), display(val_news), display(test_news)

Unnamed: 0,label,statement
0,False,Says the Annies List political group supports ...
1,True,When did the decline of coal start? It started...
2,True,"Hillary Clinton agrees with John McCain ""by vo..."
3,False,Health care reform legislation is likely to ma...
4,True,The economic turnaround started at the end of ...
...,...,...
10231,True,There are a larger number of shark attacks in ...
10232,True,Democrats have now become the party of the [At...
10233,True,Says an alternative to Social Security that op...
10234,False,On lifting the U.S. Cuban embargo and allowing...


Unnamed: 0,label,statement
0,False,We have less Americans working now than in the...
1,False,"When Obama was sworn into office, he DID NOT u..."
2,False,Says Having organizations parading as being so...
3,True,Says nearly half of Oregons children are poor.
4,True,On attacks by Republicans that various program...
...,...,...
1279,True,"For the first time in more than a decade, impo..."
1280,True,Says Donald Trump has bankrupted his companies...
1281,True,"John McCain and George Bush have ""absolutely n..."
1282,False,A new poll shows 62 percent support the presid...


Unnamed: 0,label,statement
0,True,Building a wall on the U.S.-Mexico border will...
1,False,Wisconsin is on pace to double the number of l...
2,False,Says John McCain has done nothing to help the ...
3,True,Suzanne Bonamici supports a plan that will cut...
4,False,When asked by a reporter whether hes at the ce...
...,...,...
1260,True,Says his budget provides the highest state fun...
1261,False,Ive been here almost every day.
1262,False,"In the early 1980s, Sen. Edward Kennedy secret..."
1263,False,Says an EPA permit languished under Strickland...


(None, None, None)

## Merging train & val data for K-Fold

In [37]:
"""
Merge the training and validation data so that k-fold cross validation can be
performed on the combined set. Shuffling (to reduce ordering bias) is left to the later split / K-fold steps.
"""
labelEncoder = LabelEncoder()
frames = [train_news, val_news]
# concat keeps each frame's own row index (ignore_index is not set), so index
# values from train_news and val_news repeat inside train_val.
train_val = pd.concat(frames)
# NOTE(review): the result of value_counts() is neither stored nor displayed here.
train_val['label'].value_counts()
# Replace the original labels with the integer codes produced by LabelEncoder.
train_val['label'] = labelEncoder.fit_transform(train_val['label'])

In [38]:
train_val

Unnamed: 0,label,statement
0,0,Says the Annies List political group supports ...
1,1,When did the decline of coal start? It started...
2,1,"Hillary Clinton agrees with John McCain ""by vo..."
3,0,Health care reform legislation is likely to ma...
4,1,The economic turnaround started at the end of ...
...,...,...
1279,1,"For the first time in more than a decade, impo..."
1280,1,Says Donald Trump has bankrupted his companies...
1281,1,"John McCain and George Bush have ""absolutely n..."
1282,0,A new poll shows 62 percent support the presid...


## Feature Weighting

Not all words are equally important to a particular document / category. For example, while words like ‘murder’, ‘knife’ and ‘abduction’ are important to a crime related document, words like ‘news’ and ‘reporter’ may not be quite as important. 

### Binary Weighting
The most basic form of feature weighting is binary weighting: if a word is present in a document its weight is ‘1’, and if the word is absent its weight is ‘0’.

### CountVectorizer

It converts a collection of text documents into a matrix of token counts.


### Tfidf Weighting 

TF-IDF weighting gives words that are unique to a particular document higher weights than words that are used commonly across documents.

1. TF (Term Frequency): The number of times a word appears in a document divided by the total number of words in the document. Every document has its own term frequency.

2. IDF (Inverse Document Frequency): The log of the number of documents divided by the number of documents that contain the word w. Inverse document frequency determines the weight of rare words across all documents in the corpus.

3. Lastly, the TF-IDF is simply the TF multiplied by IDF.

## Metric

I need to minimize false positives (fake news predicted as real news), as they can be very misleading. For class 0, i.e. 'fake', recall should be high, and so should precision, because we want the model to perform well on both classes (real and fake). In short, we need to maximize the F1-score.

### Cases I considered to choose the right metric

**1. Maximizing recall of class 0 (fake) or minimizing false positives(FP)?**
In the extreme case, suppose the model labels every news item as 'fake'. Recall for class 0 is still 1, but the overall model is really bad, i.e. it can never predict class 1 ('real').

Ex=> TN = 553, FP = 0, TP = 0, FN = 714

Class0-Recall = TN / (TN + FP) = 553 / 553 = 1
Class0-Precision = TN / (TN + FN) = 553 / 1267 ≈ 0.44

F1-Score = 2 * Class0-Recall * Class0-Precision/(Class0-Recall + Class0-Precision) ≈ 0.61

Recall, Precision and F1-score for class 1 will be 0.

**2. The opposite extreme: every news item is classified as True (even fake news is predicted as True).**

Ex=>  TN = 0, FP = 553, TP = 714, FN = 0
In that case TN is 0, which gives recall 0 for class 0 ('fake'); its precision is undefined (0/0, conventionally reported as 0), so its F1 is 0.

For class 1, Class1-Recall = TP / (TP + FN) = 1
Class1-Precision = TP / (TP + FP) = 0.56

## Word2Vec

Word2vec is a type of mapping that allows words with similar meaning to have similar vector representation.

The idea behind Word2vec is rather simple: we want to use the surrounding words to represent the target words with a Neural Network whose hidden layer encodes the word representation.

The idea behind Word2vec is rather simple: we want to use the surrounding words to represent the target words with a Neural Network whose hidden layer encodes the word representation.

First we load a word2vec model. It has been pre-trained by Google on a 100 billion word Google News corpus.

In [39]:
# First cleaning pass over the raw statement text.
train_val['clean_statement'] = train_val['statement'].apply(process_text)

In [40]:
# Second cleaning pass, applied to the output of the first one.
train_val['clean_statement1'] = train_val['clean_statement'].apply(process_text2)

In [41]:
train_val

Unnamed: 0,label,statement,clean_statement,clean_statement1
0,0,Says the Annies List political group supports ...,"[Says, Annies, List, political, group, support...",Says Annies List political group support abort...
1,1,When did the decline of coal start? It started...,"[decline, coal, start, started, natural, gas, ...",decline coal start started natural gas took st...
2,1,"Hillary Clinton agrees with John McCain ""by vo...","[Hillary, Clinton, agrees, John, McCain, votin...",Hillary Clinton agrees John McCain voting give...
3,0,Health care reform legislation is likely to ma...,"[Health, care, reform, legislation, likely, ma...",Health care reform legislation likely mandate ...
4,1,The economic turnaround started at the end of ...,"[economic, turnaround, started, end, term]",economic turnaround started end term
...,...,...,...,...
1279,1,"For the first time in more than a decade, impo...","[first, time, decade, import, accounted, le, h...",first time decade import accounted le half oil...
1280,1,Says Donald Trump has bankrupted his companies...,"[Says, Donald, Trump, bankrupted, company, twi...",Says Donald Trump bankrupted company twice fou...
1281,1,"John McCain and George Bush have ""absolutely n...","[John, McCain, George, Bush, absolutely, plan,...",John McCain George Bush absolutely plan univer...
1282,0,A new poll shows 62 percent support the presid...,"[new, poll, show, percent, support, president,...",new poll show percent support president plan r...


In [10]:
import gensim

# Dimensionality of the word vectors. Only referenced by the commented-out
# training call below; the pretrained matrix loaded here is also 300-wide.
EMBEDDING_DIM = 300

# Alternative kept for reference: train Word2Vec on our own corpus (takes time...)
# wv = gensim.models.Word2Vec(sentences=train_val['clean_statement'], size=EMBEDDING_DIM, window=5, min_count=1)

# Load the pretrained GoogleNews vectors (binary word2vec format, gzip-compressed).
wv = gensim.models.KeyedVectors.load_word2vec_format("../models/GoogleNews-vectors-negative300.bin.gz", binary=True)
# Precompute L2-normalised vectors; replace=True keeps only the normalised copy.
# NOTE(review): init_sims is deprecated in gensim >= 4.0 — confirm the pinned gensim version.
wv.init_sims(replace=True)

2020-12-16 22:09:30,129 : INFO : loading projection weights from ../models/GoogleNews-vectors-negative300.bin.gz
2020-12-16 22:10:12,554 : INFO : loaded (3000000, 300) matrix from ../models/GoogleNews-vectors-negative300.bin.gz
2020-12-16 22:10:12,555 : INFO : precomputing L2-norms of word weight vectors


In [11]:
from itertools import islice
list(islice(wv.vocab, 13030, 13050))

['Memorial_Hospital',
 'Seniors',
 'memorandum',
 'elephant',
 'Trump',
 'Census',
 'pilgrims',
 'De',
 'Dogs',
 '###-####_ext',
 'chaotic',
 'forgive',
 'scholar',
 'Lottery',
 'decreasing',
 'Supervisor',
 'fundamentally',
 'Fitness',
 'abundance',
 'Hold']

In [12]:
# Vocabulary size. `wv` is already a KeyedVectors object, so read `.vocab`
# directly instead of going through the deprecated `wv.wv` alias, which only
# returns the same object and emits a DeprecationWarning.
len(wv.vocab)

  len(wv.wv.vocab)


3000000

In [13]:
#printing similarity index
print(wv.most_similar('Hillary'))

[('Hillary_Clinton', 0.7051242589950562), ('Clinton', 0.6970474123954773), ('Clintons', 0.659970760345459), ('HIllary', 0.6327946782112122), ('Barack', 0.6296452283859253), ('Sen._Hillary_Clinton', 0.6141951084136963), ('Barack_Obama', 0.6014484167098999), ('Hil_lary', 0.5962479114532471), ('Billary', 0.5903605222702026), ('Mcain', 0.5828679203987122)]


Bag-of-words (BOW) based approaches to combining word vectors include averaging, summation and weighted addition. The most common way is to average the word vectors of a text.

In [14]:
#Word averaging
def word_averaging(wv, words):
    """Return the unit-length mean of the word vectors for ``words``.

    Args:
        wv: Loaded word-vector model supporting ``word in wv``, ``wv[word]``
            and ``wv.vector_size`` (the notebook passes gensim KeyedVectors).
        words: Iterable of tokens. An item that is already a ``np.ndarray``
            is used as a vector as-is; tokens missing from the vocabulary
            are skipped.

    Returns:
        1-D ``float32`` array: the L2-normalised mean vector, or all zeros
        (length ``wv.vector_size``) when no token could be resolved.
    """
    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        # Membership test and lookup go through the public KeyedVectors API
        # rather than the deprecated `wv.wv.vocab` / `wv.wv.syn0` attributes.
        elif word in wv:
            vectors.append(wv[word])

    if not vectors:
        # Same dtype as the normal path so stacked results stay float32.
        return np.zeros(wv.vector_size, dtype=np.float32)

    mean = np.array(vectors).mean(axis=0)
    return gensim.matutils.unitvec(mean).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack one averaged word vector per tokenized text into a 2-D array.

    Each element of ``text_list`` is handed to ``word_averaging`` together
    with ``wv``; the resulting vectors become the rows of the returned array.
    """
    doc_vectors = []
    for tokens in text_list:
        doc_vectors.append(word_averaging(wv, tokens))
    return np.vstack(doc_vectors)

In [15]:
# We will tokenize the text and apply the tokenization to “clean_statement1” column, and apply word vector averaging to tokenized text.
# Tokenization
def w2v_tokenize_text(text):
    """Split ``text`` into word tokens, sentence by sentence.

    Tokens shorter than two characters are dropped; the remaining tokens
    are returned as a flat list in their original order.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]

In [16]:
%%time
# Hold out 20% of the merged data, then tokenize the cleaned text of each split.
train, test = train_test_split(train_val, test_size=0.2, shuffle=True)
test_tokenized = test['clean_statement1'].apply(w2v_tokenize_text).values
train_tokenized = train['clean_statement1'].apply(w2v_tokenize_text).values

CPU times: user 2.47 s, sys: 39.4 ms, total: 2.51 s
Wall time: 2.56 s


In [17]:
train_tokenized

array([list(['gay', 'pride', 'vote', 'nothing']),
       list(['Romney', 'outsourced', 'call', 'center', 'job', 'India']),
       list(['million', 'Americans', 'work']), ...,
       list(['Greg', 'Abbott', 'benefitted', 'payday', 'lender', 'given', 'received', 'ruling', 'operate', 'loophole', 'law', 'allows', 'charge', 'unlimited', 'rate', 'fee']),
       list(['Says', 'Barack', 'Obama', 'put', 'place', 'board', 'tell', 'people', 'ultimately', 'treatment', 'theyre', 'going', 'receive']),
       list(['last', 'six', 'month', 'President', 'Barack', 'Obama', 'golfed', 'time', 'held', 'fundraiser', 'job', 'council', 'never', 'met'])],
      dtype=object)

In [18]:
test_tokenized

array([list(['Kasichs', 'budget', 'increase', 'state', 'spending', 'billion', 'billion', 'second', 'largest', 'twoyear', 'spending', 'increase', 'Ohio', 'history']),
       list(['Declaration', 'Independence', 'written', 'paper', 'made', 'hemp']),
       list(['Seventy', 'percent', 'Missourians', 'support', 'voter', 'photo', 'legislation']),
       ..., list(['Romneys', 'record', 'raised', 'tax', 'million']),
       list(['public', 'funding', 'abortion', 'legislation']),
       list(['Says', 'foreman', 'jury', 'convicted', 'Greenpeace', 'activist'])],
      dtype=object)

In [19]:
%%time
# Turn every token list into one averaged word vector, stacked row-wise
# (one row per statement) for the train and test splits.
X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)

  elif word in wv.wv.vocab:
  mean.append(wv.wv.syn0[wv.wv.vocab[word].index])
  mean.append(wv.wv.syn0[wv.wv.vocab[word].index])
  all_words.add(wv.wv.vocab[word].index)


CPU times: user 1.38 s, sys: 63.5 ms, total: 1.45 s
Wall time: 1.54 s


In [20]:
X_test_word_average

array([[ 0.04901608,  0.0035761 ,  0.04076599, ...,  0.02151042,
         0.01582975, -0.07761028],
       [-0.0173192 , -0.02627543,  0.05079072, ...,  0.011369  ,
         0.01060263, -0.02818567],
       [ 0.03008744, -0.00174473, -0.0415839 , ..., -0.0143545 ,
         0.04348959,  0.01208543],
       ...,
       [ 0.08057824,  0.01254539, -0.00408424, ..., -0.04121935,
         0.01107035, -0.11191422],
       [-0.00772392, -0.00792232,  0.05083323, ..., -0.03165524,
         0.04060698,  0.00132584],
       [ 0.06577844, -0.01399096,  0.03097949, ..., -0.0552184 ,
         0.11687041,  0.04108818]], dtype=float32)

## Model Training

## Text Classification Algorithms

1. Naive Bayes (NB)
2. Logistic Regression
3. SVM
4. Random Forest

## Naive Bayes

When the assumption of independence holds, a Naive Bayes classifier performs better than other models such as logistic regression. A further advantage of Naive Bayes is that it requires only a small amount of training data to estimate the parameters necessary for classification.

Bayes’ Theorem provides a way that we can calculate the probability of a piece of data belonging to a given class, given our prior knowledge. Bayes’ Theorem is stated as:

P(class|data) = (P(data|class) * P(class)) / P(data)

Where P(class|data) is the probability of class given the provided data.

Naive Bayes is a classification algorithm for binary (two-class) and multiclass classification problems. It is called Naive Bayes or idiot Bayes because the calculations of the probabilities for each class are simplified to make their calculations tractable.

Rather than attempting to calculate the probabilities of each attribute value, they are assumed to be conditionally independent given the class value.

This is a very strong assumption that is most unlikely in real data, i.e. that the attributes do not interact. Nevertheless, the approach performs surprisingly well on data where this assumption does not hold.

### Multinomial NB

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work

### Train Models with Different Types of Features

In [42]:
# Fit Multinomial NB once per bag-of-words weighting and keep the score
# returned by train_model for each run.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
nb_results = []
nb_clf = MultinomialNB()
for feature_rep in feature_reps:
    print(f'Model - {feature_rep} features with statement')
    nb_model, transformer, score = train_model(
        nb_clf, train_val, field=field, feature_rep=feature_rep
    )
    nb_results.append([field, feature_rep, score])

2020-12-16 22:44:36,661 : INFO : Starting model training...
2020-12-16 22:44:36,672 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement


2020-12-16 22:45:18,075 : INFO : Training a Classification Model...
2020-12-16 22:45:18,086 : INFO : Starting evaluation...
2020-12-16 22:45:18,096 : INFO : Done training and evaluation.
2020-12-16 22:45:18,098 : INFO : Starting model training...
2020-12-16 22:45:18,103 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.56      0.48      0.52      1248
           1       0.64      0.71      0.68      1632

    accuracy                           0.61      2880
   macro avg       0.60      0.60      0.60      2880
weighted avg       0.61      0.61      0.61      2880

[[ 603  645]
 [ 471 1161]]
Model - counts features with statement


2020-12-16 22:46:00,700 : INFO : Training a Classification Model...
2020-12-16 22:46:00,704 : INFO : Starting evaluation...
2020-12-16 22:46:00,714 : INFO : Done training and evaluation.
2020-12-16 22:46:00,716 : INFO : Starting model training...
2020-12-16 22:46:00,721 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.56      0.49      0.52      1248
           1       0.64      0.71      0.67      1632

    accuracy                           0.61      2880
   macro avg       0.60      0.60      0.60      2880
weighted avg       0.61      0.61      0.61      2880

[[ 610  638]
 [ 479 1153]]
Model - tfidf features with statement


2020-12-16 22:46:49,286 : INFO : Training a Classification Model...
2020-12-16 22:46:49,296 : INFO : Starting evaluation...
2020-12-16 22:46:49,307 : INFO : Done training and evaluation.


              precision    recall  f1-score   support

           0       0.58      0.32      0.41      1248
           1       0.61      0.83      0.70      1632

    accuracy                           0.61      2880
   macro avg       0.60      0.57      0.56      2880
weighted avg       0.60      0.61      0.58      2880

[[ 398  850]
 [ 284 1348]]


### Naive Bayes Results of Various Models

In [43]:
# Tabulate the Naive Bayes scores, best feature representation first.
nb_df_results = pd.DataFrame(
    nb_results,
    columns=['text_fields', 'feature_representation', 'f1-score'],
)
nb_df_results.sort_values(by='f1-score', ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.703916
0,statement,binary,0.675393
1,statement,counts,0.673678


### Naive Bayes with word2vec 

In [44]:
# nb_clf = MultinomialNB()
# nb_clf.fit(X_train_word_average, train['label'])

In [45]:
# y_pred_test_nb=nb_clf.predict(X_test_word_average)
# score = f1_score(test.label,y_pred_test_nb)
# print(classification_report(test.label,y_pred_test_nb))
# print(confusion_matrix(test.label,y_pred_test_nb))

## logistic regression

The underlying algorithm is also fairly easy to understand. More importantly, in the NLP world, it’s generally accepted that Logistic Regression is a great starter algorithm for text related classification (https://web.stanford.edu/~jurafsky/slp3/5.pdf). 

**How hypothesis makes prediction in logistics regression?**

This algorithm uses the sigmoid function g(z) to predict whether y=1 or y=0.
If the estimated probability of y=1 is h(x) >= 0.5, the output is more likely to be "y=1",
but if h(x) < 0.5, the output is more likely to be "y=0".

### Train Models with Different Types of Features

In [46]:
# Fit the same L2-regularised logistic regression once per bag-of-words
# weighting and keep the score returned by train_model for each run.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
lr_results = []
LogR_clf = LogisticRegression(
    verbose=1,
    solver='liblinear',
    random_state=0,
    C=0.1,
    penalty='l2',
    max_iter=500,
)

for feature_rep in feature_reps:
    print(f'Model - {feature_rep} features with statement')
    lr_model, transformer, score = train_model(
        LogR_clf, train_val, field=field, feature_rep=feature_rep
    )
    lr_results.append([field, feature_rep, score])

2020-12-16 22:46:49,356 : INFO : Starting model training...
2020-12-16 22:46:49,362 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement


2020-12-16 22:47:36,424 : INFO : Training a Classification Model...
2020-12-16 22:47:36,454 : INFO : Starting evaluation...
2020-12-16 22:47:36,475 : INFO : Done training and evaluation.
2020-12-16 22:47:36,487 : INFO : Starting model training...
2020-12-16 22:47:36,495 : INFO : Extracting features and creating vocabulary...


[LibLinear]              precision    recall  f1-score   support

           0       0.56      0.47      0.51      1248
           1       0.64      0.72      0.68      1632

    accuracy                           0.61      2880
   macro avg       0.60      0.60      0.59      2880
weighted avg       0.61      0.61      0.61      2880

[[ 585  663]
 [ 454 1178]]
Model - counts features with statement


2020-12-16 22:48:16,134 : INFO : Training a Classification Model...
2020-12-16 22:48:16,167 : INFO : Starting evaluation...
2020-12-16 22:48:16,190 : INFO : Done training and evaluation.
2020-12-16 22:48:16,198 : INFO : Starting model training...
2020-12-16 22:48:16,208 : INFO : Extracting features and creating vocabulary...


[LibLinear]              precision    recall  f1-score   support

           0       0.56      0.47      0.51      1248
           1       0.64      0.72      0.68      1632

    accuracy                           0.61      2880
   macro avg       0.60      0.59      0.59      2880
weighted avg       0.61      0.61      0.61      2880

[[ 585  663]
 [ 456 1176]]
Model - tfidf features with statement


2020-12-16 22:48:59,544 : INFO : Training a Classification Model...
2020-12-16 22:48:59,561 : INFO : Starting evaluation...
2020-12-16 22:48:59,574 : INFO : Done training and evaluation.


[LibLinear]              precision    recall  f1-score   support

           0       0.61      0.20      0.30      1248
           1       0.60      0.90      0.72      1632

    accuracy                           0.60      2880
   macro avg       0.60      0.55      0.51      2880
weighted avg       0.60      0.60      0.54      2880

[[ 248 1000]
 [ 157 1475]]


### Logistic Regression Results of Various Models

In [47]:
# Tabulate the logistic regression scores, best feature representation first.
lr_df_results = pd.DataFrame(
    lr_results,
    columns=['text_fields', 'feature_representation', 'f1-score'],
)
lr_df_results.sort_values(by='f1-score', ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.718286
0,statement,binary,0.678376
1,statement,counts,0.677615


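Here tf-idf gives the logistic model its highest class-1 F1 score (0.72 versus 0.68 for counts and binary weighting). Note, however, that class-0 recall drops to 0.20 with tf-idf, so the macro-averaged F1 is actually lower (0.51 versus 0.59).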

### Logistic Regression with word2vec

In [48]:
# Rebind LogR_clf to a fresh model with the same settings as the bag-of-words
# runs, then fit it on the averaged word2vec vectors of the training split.
LogR_clf = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=0.1, penalty='l2',max_iter=500)
LogR_clf.fit(X_train_word_average, train['label']) 

[LibLinear]

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='liblinear', tol=0.0001, verbose=1,
                   warm_start=False)

In [49]:
y_pred_test_log=LogR_clf.predict(X_test_word_average)
# y_pred_train_log=LogR_clf.predict(X_train_word_average)

In [50]:
# f1_score with default arguments reports F1 for the positive class (label 1)
# only; the per-class figures come from classification_report below.
score = f1_score(test.label,y_pred_test_log)
print(classification_report(test.label,y_pred_test_log))
print(confusion_matrix(test.label,y_pred_test_log))

              precision    recall  f1-score   support

           0       0.58      0.42      0.49      1036
           1       0.62      0.75      0.68      1268

    accuracy                           0.61      2304
   macro avg       0.60      0.59      0.58      2304
weighted avg       0.60      0.61      0.59      2304

[[437 599]
 [311 957]]


In [51]:
# score = f1_score(test.label,y_pred_test_log)
# print(classification_report(test.label,y_pred_test_log))
# print(confusion_matrix(test.label,y_pred_test_log))

## SVM

Support vector machines is an algorithm that determines the best decision boundary between vectors that belong to a given group (or category) and vectors that do not belong to it. That’s it. It can be applied to any kind of vectors which encode any kind of data. This means that in order to leverage the power of svm text classification, texts have to be transformed into vectors.

So, when SVM determines the decision boundary we mentioned above, SVM decides where to draw the best “line” (or the best hyperplane) that divides the space into two subspaces: one for the vectors which belong to the given category and one for the vectors which do not belong to it.

### Train Models with Different Types of Features

In [52]:
# Fit a default LinearSVC once per bag-of-words weighting and keep the score
# returned by train_model for each run.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
svm_results = []
svm_clf = svm.LinearSVC()

for feature_rep in feature_reps:
    print(f'SVM Model - {feature_rep} features with statement')
    svm_model, transformer, score = train_model(
        svm_clf, train_val, field=field, feature_rep=feature_rep
    )
    svm_results.append([field, feature_rep, score])

2020-12-16 22:48:59,876 : INFO : Starting model training...
2020-12-16 22:48:59,881 : INFO : Extracting features and creating vocabulary...


SVM Model - binary features with statement


2020-12-16 22:49:45,573 : INFO : Training a Classification Model...
2020-12-16 22:49:45,758 : INFO : Starting evaluation...
2020-12-16 22:49:45,775 : INFO : Done training and evaluation.
2020-12-16 22:49:45,777 : INFO : Starting model training...
2020-12-16 22:49:45,782 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.50      0.51      0.50      1248
           1       0.62      0.62      0.62      1632

    accuracy                           0.57      2880
   macro avg       0.56      0.56      0.56      2880
weighted avg       0.57      0.57      0.57      2880

[[ 633  615]
 [ 626 1006]]
SVM Model - counts features with statement


2020-12-16 22:50:28,633 : INFO : Training a Classification Model...
2020-12-16 22:50:28,823 : INFO : Starting evaluation...
2020-12-16 22:50:28,833 : INFO : Done training and evaluation.
2020-12-16 22:50:28,835 : INFO : Starting model training...
2020-12-16 22:50:28,840 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.50      0.51      0.50      1248
           1       0.62      0.61      0.61      1632

    accuracy                           0.56      2880
   macro avg       0.56      0.56      0.56      2880
weighted avg       0.57      0.56      0.56      2880

[[631 617]
 [637 995]]
SVM Model - tfidf features with statement


2020-12-16 22:51:11,670 : INFO : Training a Classification Model...
2020-12-16 22:51:11,703 : INFO : Starting evaluation...
2020-12-16 22:51:11,713 : INFO : Done training and evaluation.


              precision    recall  f1-score   support

           0       0.52      0.50      0.51      1248
           1       0.63      0.65      0.64      1632

    accuracy                           0.59      2880
   macro avg       0.58      0.58      0.58      2880
weighted avg       0.58      0.59      0.59      2880

[[ 627  621]
 [ 570 1062]]


### SVM Results of Various Models

In [53]:
# Tabulate the SVM scores, best feature representation first.
svm_df_results = pd.DataFrame(
    svm_results,
    columns=['text_fields', 'feature_representation', 'f1-score'],
)
svm_df_results.sort_values(by='f1-score', ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.640724
0,statement,binary,0.618506
1,statement,counts,0.61344


### SVM with word2vec

In [54]:
# Rebind svm_clf to a fresh default LinearSVC and fit it on the averaged
# word2vec vectors of the training split.
# NOTE(review): no random_state is passed, so results may differ between runs.
svm_clf = svm.LinearSVC()
svm_clf.fit(X_train_word_average, train['label'])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [55]:
y_pred_test_svm=svm_clf.predict(X_test_word_average)

In [56]:
# f1_score with default arguments reports F1 for the positive class (label 1)
# only; the per-class figures come from classification_report below.
score = f1_score(test.label,y_pred_test_svm)
print(classification_report(test.label,y_pred_test_svm))
print(confusion_matrix(test.label,y_pred_test_svm))

              precision    recall  f1-score   support

           0       0.57      0.46      0.51      1036
           1       0.62      0.72      0.67      1268

    accuracy                           0.60      2304
   macro avg       0.60      0.59      0.59      2304
weighted avg       0.60      0.60      0.60      2304

[[479 557]
 [356 912]]


## Random Forest

Given the nature of random forests (a bagging decision tree), it is true that you may come up with a rather weak classifier, especially if only a couple of features are truly significant to determine the outcome.

However, keep in mind that in the case of text classification, a preprocessing phase is required to get either your TF or TF-IDF matrix, through which you have already made a selection of pertinent features. Potentially, all features are relevant in this matrix, so the random forest may be performant when you predict your outcome. (source: https://stats.stackexchange.com/questions/343954/random-forest-short-text-classification)

### Train Models with Different Types of Features

In [57]:
# Fit a 500-tree random forest once per bag-of-words weighting and keep the
# score returned by train_model for each run.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
rf_results = []
rf_clf = RandomForestClassifier(n_estimators=500)

for feature_rep in feature_reps:
    rf_model, transformer, score = train_model(
        rf_clf, train_val, field=field, feature_rep=feature_rep
    )
    rf_results.append([field, feature_rep, score])

2020-12-16 22:51:12,202 : INFO : Starting model training...
2020-12-16 22:51:12,208 : INFO : Extracting features and creating vocabulary...
2020-12-16 22:51:54,457 : INFO : Training a Classification Model...
2020-12-16 22:52:31,833 : INFO : Starting evaluation...
2020-12-16 22:52:31,847 : INFO : Done training and evaluation.
2020-12-16 22:52:31,849 : INFO : Starting model training...
2020-12-16 22:52:31,854 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.55      0.47      0.51      1248
           1       0.64      0.71      0.67      1632

    accuracy                           0.61      2880
   macro avg       0.60      0.59      0.59      2880
weighted avg       0.60      0.61      0.60      2880

[[ 591  657]
 [ 476 1156]]


2020-12-16 22:53:13,223 : INFO : Training a Classification Model...
2020-12-16 22:53:50,287 : INFO : Starting evaluation...
2020-12-16 22:53:50,297 : INFO : Done training and evaluation.
2020-12-16 22:53:50,299 : INFO : Starting model training...
2020-12-16 22:53:50,303 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.55      0.48      0.51      1248
           1       0.64      0.70      0.67      1632

    accuracy                           0.60      2880
   macro avg       0.59      0.59      0.59      2880
weighted avg       0.60      0.60      0.60      2880

[[ 602  646]
 [ 497 1135]]


2020-12-16 22:54:35,259 : INFO : Training a Classification Model...
2020-12-16 22:55:08,902 : INFO : Starting evaluation...
2020-12-16 22:55:08,919 : INFO : Done training and evaluation.


              precision    recall  f1-score   support

           0       0.56      0.49      0.52      1248
           1       0.64      0.71      0.67      1632

    accuracy                           0.61      2880
   macro avg       0.60      0.60      0.60      2880
weighted avg       0.61      0.61      0.61      2880

[[ 606  642]
 [ 481 1151]]


### RF Results of Various Models

In [58]:
# Tabulate the random forest scores, best feature representation first.
rf_df_results = pd.DataFrame(
    rf_results,
    columns=['text_fields', 'feature_representation', 'f1-score'],
)
rf_df_results.sort_values(by='f1-score', ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.672117
0,statement,binary,0.671118
1,statement,counts,0.665104


### RF with word2vec

In [59]:
# Rebind rf_clf to a fresh forest and fit it on the averaged word2vec vectors.
# NOTE(review): this uses 700 trees whereas the bag-of-words runs use 500, and
# no random_state is passed, so results may differ between runs.
rf_clf = RandomForestClassifier(n_estimators=700)
rf_clf.fit(X_train_word_average, train['label'])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=700,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [60]:
y_pred_test_rf=rf_clf.predict(X_test_word_average)

In [61]:
# f1_score with default arguments reports F1 for the positive class (label 1)
# only; the per-class figures come from classification_report below.
score = f1_score(test.label,y_pred_test_rf)
print(classification_report(test.label,y_pred_test_rf))
print(confusion_matrix(test.label,y_pred_test_rf))

              precision    recall  f1-score   support

           0       0.60      0.40      0.48      1036
           1       0.62      0.79      0.69      1268

    accuracy                           0.61      2304
   macro avg       0.61      0.59      0.59      2304
weighted avg       0.61      0.61      0.60      2304

[[416 620]
 [272 996]]


## K-fold cross validation

With K-fold cross validation, you are testing how well your model can be trained on some data and then predict data it hasn't seen. We use cross validation for this because if you train using all the data you have, you have none left for testing. You could do this once, say by using 80% of the data to train and 20% to test, but what if the 20% you happened to pick for testing contains a bunch of points that are particularly easy (or particularly hard) to predict? Then we will not have come up with the best possible estimate of the model's ability to learn and predict.

In [30]:
# User-defined function for K-fold cross validation
def apply_kfold(classifier, train_val, field, feature_rep, n_splits=5, random_state=None):
    """
    Run shuffled K-fold cross validation of `classifier` on `train_val`.

    For every fold, features are extracted with `extract_features` from the
    training and validation parts of the fold, `classifier.fit` is called on
    the training part, and the predictions for the validation part are scored
    with `f1_score` and added to a running confusion matrix. A summary is
    printed once all folds are done.

    Parameters
    ----------
    classifier : object with `fit(X, y)` and `predict(X)`
        `fit` is called once per fold on the same object.
    train_val : pandas.DataFrame
        Must contain the text column named by `field` and a 'label' column.
    field : str
        Name of the text column; also forwarded to `extract_features`.
    feature_rep : str
        Forwarded to `extract_features` as `type` (the notebook passes
        'binary', 'counts' and 'tfidf').
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default None
        Seed for the fold shuffling. None keeps the previous behaviour of a
        different split on every call.

    Returns
    -------
    tuple
        `(mean_f1, confusion)`: the mean of the per-fold F1 scores and the
        confusion matrix summed over all folds. (Previously a tuple of the
        `None` results of the `print` calls was returned.)
    """
    texts = train_val[field]
    labels = train_val['label']

    # Fix the label order up front so every per-fold confusion matrix has the
    # same shape, even if one fold happens to miss a class.
    classes = np.unique(labels)
    confusion = np.zeros((len(classes), len(classes)), dtype=int)
    scores = []

    k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for fold_n, (train_index, valid_index) in enumerate(k_fold.split(texts, labels)):
        print(fold_n, len(train_index), len(valid_index))
        train_x, train_y = texts.iloc[train_index], labels.iloc[train_index]
        valid_x, valid_y = texts.iloc[valid_index], labels.iloc[valid_index]

        # GET FEATURES (the returned transformer is not needed here)
        train_features, val_features, _ = extract_features(
            field, train_x, valid_x, type=feature_rep
        )

        # TRAIN ON THIS FOLD AND PREDICT THE HELD-OUT PART
        logging.info("Training a Classification Model...")
        classifier.fit(train_features, train_y)
        predictions = classifier.predict(val_features)

        confusion += confusion_matrix(valid_y, predictions, labels=classes)
        scores.append(f1_score(valid_y, predictions))

    mean_score = sum(scores) / len(scores)
    print('Total statements classified:', len(texts))
    print('Score:', mean_score)
    print('score length', len(scores))
    print('Confusion matrix:')
    print(confusion)
    return mean_score, confusion

## Grid Search Hyperparameters with K-Fold Cross Validation

## Logistic Regression

In [14]:
# Grid for the logistic-regression search: C on a log scale from 1e-3 to 1e3,
# L2 (ridge) penalty only, and two iteration budgets.
grid={"C":np.logspace(-3,3,7), "penalty":["l2"], "max_iter":[500, 1000]}
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=5)

# GET FEATURES
# Only the 'binary' representation is used for this search. The column name is
# passed explicitly (as in the SVC and random-forest cells) so the cell does
# not depend on a `field` variable set by some other cell.
feature_reps=['binary','counts','tfidf']
train_features,feature_transformer=extract_final_features('statement',train_val['statement'],type='binary')
logreg_cv.fit(train_features,train_val['label'])

2020-12-04 14:27:12,473 : INFO : Extracting features and creating vocabulary...
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/prep

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'max_iter': [500, 1000], 'penalty': ['l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [15]:
# Report the winning parameter combination and its mean cross-validated score
best_logreg_params = logreg_cv.best_params_
best_logreg_score = logreg_cv.best_score_
print("tuned hpyerparameters :(best parameters) ", best_logreg_params)
print("accuracy :", best_logreg_score)

tuned hpyerparameters :(best parameters)  {'C': 0.1, 'max_iter': 500, 'penalty': 'l2'}
accuracy : 0.6151041666666667


## SVC

In [16]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV

field='statement'
feature_reps=['binary','counts','tfidf']
# GET FEATURES (only the 'binary' representation is searched here)
train_features,feature_transformer=extract_final_features(field,train_val[field],type='binary')

# define search space: continuous ranges are sampled log-uniformly,
# degree is an integer range and kernel a categorical choice
params = {
    'C': (1e-6, 100.0, 'log-uniform'),
    'gamma': (1e-6, 100.0, 'log-uniform'),
    'degree': (1, 5),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}
# define evaluation
# random_state only has an effect when shuffle=True; without it the seed is
# ignored and newer scikit-learn releases reject the combination outright.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# define the search
search = BayesSearchCV(estimator=SVC(), search_spaces=params, n_jobs=-1, cv=cv)
# perform the search
search.fit(train_features, train_val['label'])
# report the best result
print(search.best_score_)
print(search.best_params_)

2020-12-04 14:38:28,037 : INFO : Extracting features and creating vocabulary...


0.6172743055555555
OrderedDict([('C', 0.00020805609531827864), ('degree', 1), ('gamma', 100.0), ('kernel', 'poly')])


## Random Forest

In [20]:
# Random-forest grid search: two forest sizes x two max_features settings,
# each evaluated with 5-fold cross validation.
param_grid = {'n_estimators': [200, 700], 'max_features': ['sqrt', 'log2']}
rfc = RandomForestClassifier(oob_score=True, n_jobs=-1)
CV_rfc = GridSearchCV(rfc, param_grid, cv=5)

# Features come from the 'statement' column, extracted with type='binary'
train_features, feature_transformer = extract_final_features(
    'statement', train_val['statement'], type='binary'
)
CV_rfc.fit(train_features, train_val['label'])
print(CV_rfc.best_params_)

2020-12-04 15:44:02,202 : INFO : Extracting features and creating vocabulary...


{'max_features': 'log2', 'n_estimators': 700}


In [25]:
# Best mean cross-validated score found by the random-forest grid search
print(CV_rfc.best_score_)

0.6276041666666666


## Naive Bayes with K-fold cross validation

In [32]:
# Cross-validate one MultinomialNB instance once per feature representation;
# apply_kfold prints the per-fold sizes and the summary for each run.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
nb_results = []
nb_clf = MultinomialNB()
for feature_rep in feature_reps:
    print(f'Model - {feature_rep} features with statement')
    apply_kfold(nb_clf, train_val, field, feature_rep)

2020-12-03 20:54:40,953 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement
0 9216 2304


2020-12-03 20:55:23,804 : INFO : Training a Classification Model...
2020-12-03 20:55:23,814 : INFO : Extracting features and creating vocabulary...


1 9216 2304


2020-12-03 20:56:03,322 : INFO : Training a Classification Model...
2020-12-03 20:56:03,331 : INFO : Extracting features and creating vocabulary...


2 9216 2304


2020-12-03 20:56:42,538 : INFO : Training a Classification Model...
2020-12-03 20:56:42,547 : INFO : Extracting features and creating vocabulary...


3 9216 2304


2020-12-03 20:57:21,634 : INFO : Training a Classification Model...
2020-12-03 20:57:21,642 : INFO : Extracting features and creating vocabulary...


4 9216 2304


2020-12-03 20:58:00,847 : INFO : Training a Classification Model...
2020-12-03 20:58:00,859 : INFO : Extracting features and creating vocabulary...


Total statements classified: 11520
Score: 0.6719650990437549
score length 5
Confusion matrix:
[[2330 2772]
 [1767 4651]]
Model - counts features with statement
0 9216 2304


2020-12-03 20:58:39,800 : INFO : Training a Classification Model...
2020-12-03 20:58:39,809 : INFO : Extracting features and creating vocabulary...


1 9216 2304


2020-12-03 20:59:18,933 : INFO : Training a Classification Model...
2020-12-03 20:59:18,941 : INFO : Extracting features and creating vocabulary...


2 9216 2304


2020-12-03 20:59:58,121 : INFO : Training a Classification Model...
2020-12-03 20:59:58,131 : INFO : Extracting features and creating vocabulary...


3 9216 2304


2020-12-03 21:00:37,065 : INFO : Training a Classification Model...
2020-12-03 21:00:37,074 : INFO : Extracting features and creating vocabulary...


4 9216 2304


2020-12-03 21:01:16,297 : INFO : Training a Classification Model...
2020-12-03 21:01:16,310 : INFO : Extracting features and creating vocabulary...


Total statements classified: 11520
Score: 0.6706466952721573
score length 5
Confusion matrix:
[[2337 2765]
 [1785 4633]]
Model - tfidf features with statement
0 9216 2304


2020-12-03 21:01:55,724 : INFO : Training a Classification Model...
2020-12-03 21:01:55,733 : INFO : Extracting features and creating vocabulary...


1 9216 2304


2020-12-03 21:02:35,105 : INFO : Training a Classification Model...
2020-12-03 21:02:35,114 : INFO : Extracting features and creating vocabulary...


2 9216 2304


2020-12-03 21:03:14,334 : INFO : Training a Classification Model...
2020-12-03 21:03:14,343 : INFO : Extracting features and creating vocabulary...


3 9216 2304


2020-12-03 21:03:53,362 : INFO : Training a Classification Model...
2020-12-03 21:03:53,370 : INFO : Extracting features and creating vocabulary...


4 9216 2304


2020-12-03 21:04:32,660 : INFO : Training a Classification Model...


Total statements classified: 11520
Score: 0.7032826381843794
score length 5
Confusion matrix:
[[1477 3625]
 [ 971 5447]]


## Logistic Regression with K-fold cross Validation

In [17]:
# field='statement'
# feature_reps=['binary','counts','tfidf']
# LogR_clf = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)

# for feature_rep in feature_reps:
#         print(f'Model - {feature_rep} features with statement')
#         apply_kfold(LogR_clf,train_val,field=field,feature_rep=feature_rep)

## SVM with K-fold cross Validation

In [18]:
# field='statement'
# feature_reps=['binary','counts','tfidf']
# svm_clf = svm.LinearSVC()

# for feature_rep in feature_reps:
#         print(f'Model - {feature_rep} features with statement')
#         apply_kfold(svm_clf,train_val,field=field,feature_rep=feature_rep)

## RF with K-fold cross Validation

In [19]:
# field='statement'
# feature_reps=['binary','counts','tfidf']
# rf_clf = RandomForestClassifier(n_estimators=1000)

# for feature_rep in feature_reps:
#         print(f'Model - {feature_rep} features with statement')
#         apply_kfold(rf_clf,train_val,field=field,feature_rep=feature_rep)

## Best Model Selection

"""
Out of all the models fitted, we take the 2 best-performing models and call them candidate models.
From the confusion matrices, we can see that logistic regression and SVM (with either binary or tfidf features) perform better
in terms of precision and recall (look at the false positive and false negative counts, which appear
to be low compared to the rest of the models).

Using k-fold cross validation, we see the performance of the models on the entire dataset, and the models aren't performing well. We can add other features to improve the performance, and grid search can also help us find the best parameters to improve the performance.
"""

In [23]:
field = 'statement'
# Final logistic-regression candidate, handed to train_final_model together
# with the full train_val frame and the 'counts' feature representation.
LogR_clf_final = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)
lr_final_model, transformer = train_final_model(
    LogR_clf_final, train_val, field=field, feature_rep='counts'
)

2020-12-07 11:52:19,852 : INFO : Starting model training...
2020-12-07 11:52:19,854 : INFO : Extracting features and creating vocabulary...
2020-12-07 11:53:03,648 : INFO : Training a Final Model...
2020-12-07 11:53:03,775 : INFO : Done training.


[LibLinear]

In [47]:
# train_val['statement'].values.values

## Saving Model

In [10]:
# Relative paths of the pickle files used below for the trained model and
# the fitted feature transformer
model_path="../models/lr_final_model.pkl"
transformer_path="../models/transformer.pkl"

In [43]:
# We need to save both the transformer (to encode a document) and the model
# itself (to make predictions based on the weight vectors).
# `with` closes each file as soon as the dump finishes, so the pickles are
# fully flushed to disk before anything tries to read them back.
with open(model_path, 'wb') as model_file:
    pickle.dump(lr_final_model, model_file)
with open(transformer_path, 'wb') as transformer_file:
    pickle.dump(transformer, transformer_file)

## Loading model 

In [11]:
# load the model and feature transformer with pickle
# NOTE: unpickling can execute arbitrary code, so only load files that were
# written by this notebook (or come from an equally trusted source).
with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open(transformer_path, 'rb') as transformer_file:
    loaded_transformer = pickle.load(transformer_file)

In [24]:
# Show the class of the estimator object that was passed to train_final_model
type(LogR_clf_final)

sklearn.linear_model._logistic.LogisticRegression

## Check predictions on unseen data

In [17]:
# Run one raw statement through the same steps as the other prediction cells:
# process_text, join its result with spaces, vectorize with the loaded
# transformer, then predict with the loaded model.
raw_statement = "When asked by a reporter whether hes at the center of a criminal scheme to violate campaign laws, Gov. Scott Walker nodded yes."
input_text = process_text(raw_statement)
document = " ".join(input_text)
test_features = loaded_transformer.transform([document])
output = get_predictions(loaded_model, test_features)
output[0]  # correctly predicted

0

In [18]:
# Clean the statement, vectorize it with the loaded transformer and predict.
# The result variable is spelled `output`, matching the first prediction cell.
input_text = process_text("Says John McCain has done nothing to help the vets.")
test_features = loaded_transformer.transform([" ".join(input_text)])
output = get_predictions(loaded_model, test_features)
output[0]  # false news predicted as false. #correct prediction

0

In [19]:
# Clean the statement, vectorize it with the loaded transformer and predict.
# The result variable is spelled `output`, matching the first prediction cell.
input_text = process_text("Says that Tennessee law requires that schools receive half of proceeds -- $31 million per year -- from a half-cent increase in the Shelby County sales tax.")
test_features = loaded_transformer.transform([" ".join(input_text)])
output = get_predictions(loaded_model, test_features)
# Expected: true news predicted as true.
# NOTE(review): the recorded output for this cell is 0, the same value the
# previous cell reports for a false statement -- confirm whether this
# prediction is actually correct.
output[0]

0