# Import necessary dependencies and settings

In [12]:
import pandas as pd
import numpy as np
import string
import logging
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import  LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support

from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper

import nltk
import nltk.corpus 
from nltk.corpus import stopwords

# Loading the Data

In [3]:
# Read the pre-processed train / validation / test splits from disk.
train_news, val_news, test_news = (
    pd.read_csv('../data/processed/train.csv'),
    pd.read_csv('../data/processed/val.csv'),
    pd.read_csv('../data/processed/test.csv'),
)

In [4]:
# Preview the first rows of the training split.
train_news.head(n=15)

Unnamed: 0,label,statement,len
0,False,Says the Annies List political group supports ...,82
1,True,When did the decline of coal start? It started...,141
2,True,"Hillary Clinton agrees with John McCain ""by vo...",105
3,False,Health care reform legislation is likely to ma...,78
4,True,The economic turnaround started at the end of ...,54
5,True,The Chicago Bears have had more starting quart...,155
6,False,Jim Dunnam has not lived in the district he re...,69
7,True,I'm the only person on this stage who has work...,159
8,True,"However, it took $19.5 million in Oregon Lotte...",144
9,True,Says GOP primary opponents Glenn Grothman and ...,132


In [3]:
# Render each split on its own. Calling display() as a statement (rather than
# building a tuple out of the three calls) stops the cell from echoing a
# meaningless "(None, None, None)" result after the tables.
for news_frame in (train_news, test_news, val_news):
    display(news_frame)

Unnamed: 0,label,statement
0,False,Says the Annies List political group supports ...
1,True,When did the decline of coal start? It started...
2,True,"Hillary Clinton agrees with John McCain ""by vo..."
3,False,Health care reform legislation is likely to ma...
4,True,The economic turnaround started at the end of ...
...,...,...
10235,True,There are a larger number of shark attacks in ...
10236,True,Democrats have now become the party of the [At...
10237,True,Says an alternative to Social Security that op...
10238,False,On lifting the U.S. Cuban embargo and allowing...


Unnamed: 0,label,statement
0,True,Building a wall on the U.S.-Mexico border will...
1,False,Wisconsin is on pace to double the number of l...
2,False,Says John McCain has done nothing to help the ...
3,True,Suzanne Bonamici supports a plan that will cut...
4,False,When asked by a reporter whether hes at the ce...
...,...,...
1262,True,Says his budget provides the highest state fun...
1263,False,Ive been here almost every day.
1264,False,"In the early 1980s, Sen. Edward Kennedy secret..."
1265,False,Says an EPA permit languished under Strickland...


Unnamed: 0,label,statement
0,False,We have less Americans working now than in the...
1,False,"When Obama was sworn into office, he DID NOT u..."
2,False,Says Having organizations parading as being so...
3,True,Says nearly half of Oregons children are poor.
4,True,On attacks by Republicans that various program...
...,...,...
1279,True,"For the first time in more than a decade, impo..."
1280,True,Says Donald Trump has bankrupted his companies...
1281,True,"John McCain and George Bush have ""absolutely n..."
1282,False,A new poll shows 62 percent support the presid...


(None, None, None)

## Merging train & val data for K-Fold

In [5]:
"""
Merging the training and validation data together, so that I can perform k-fold cross validation 
and shuffle the data to reduce bias.
"""
# Stack the two splits row-wise; each frame keeps its original row labels.
frames = [train_news, val_news]
train_val = pd.concat(frames)

# Replace the boolean labels with integer codes.
labelEncoder = LabelEncoder()
train_val['label'] = labelEncoder.fit_transform(train_val['label'])

In [6]:
# Display the merged train + validation frame to inspect the encoded labels.
train_val

Unnamed: 0,label,statement,len
0,0,Says the Annies List political group supports ...,82.0
1,1,When did the decline of coal start? It started...,141.0
2,1,"Hillary Clinton agrees with John McCain ""by vo...",105.0
3,0,Health care reform legislation is likely to ma...,78.0
4,1,The economic turnaround started at the end of ...,54.0
...,...,...,...
1279,1,"For the first time in more than a decade, impo...",
1280,1,Says Donald Trump has bankrupted his companies...,
1281,1,"John McCain and George Bush have ""absolutely n...",
1282,0,A new poll shows 62 percent support the presid...,


In [8]:
# Stop-word sets keyed by language, filled on first use so that
# stopwords.words() is evaluated once instead of once per token.
_STOPWORDS_CACHE = {}


def _stopword_set(language='english'):
    """Return the NLTK stop words for *language* as a cached frozenset."""
    if language not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_CACHE[language]


def process_text(text):
    '''
    Tokenise a statement into a list of cleaned words.

    Steps:
    1. Remove every character that is not an ASCII letter or whitespace
       (this strips digits, punctuation and other special characters).
    2. Split the remaining text on whitespace.
    3. Drop English stop words (compared case-insensitively).

    The surviving words keep their original casing.

    Parameters
    ----------
    text : str
        Raw statement text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    '''
    # The original call passed `re.I | re.A` as the fourth positional argument
    # of re.sub(), which is `count`, not `flags`: the flags were never applied
    # and the number of replacements was silently capped. The pattern already
    # lists both letter cases, so no flag is required; leaving the flags out
    # keeps the result unchanged for every input that stayed under the cap.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)

    # No separate punctuation pass is needed: every character in
    # string.punctuation is already removed by the substitution above.
    stop_words = _stopword_set()
    return [word for word in letters_only.split() if word.lower() not in stop_words]

## Feature Weighting

Not all words are equally important to a particular document / category. For example, while words like ‘murder’, ‘knife’ and ‘abduction’ are important to a crime related document, words like ‘news’ and ‘reporter’ may not be quite as important. 

### Binary Weighting
The most basic form of feature weighting is binary weighting: if a word is present in a document its weight is ‘1’, and if the word is absent its weight is ‘0’. 

### CountVectorizer

It converts a collection of text documents to a matrix of token counts.


### Tfidf Weighting 

TF-IDF weighting gives words that are unique to a particular document higher weights than words that are used commonly across documents. 

1. TF (Term Frequency): The number of times a word appears in a document divided by the total number of words in the document. Every document has its own term frequency.

2. IDF (Inverse Document Frequency): The log of the number of documents divided by the number of documents that contain the word w. Inverse document frequency determines the weight of rare words across all documents in the corpus.

3. Lastly, the TF-IDF is simply the TF multiplied by IDF.

In [9]:
# Emit timestamped INFO-level progress messages from the functions in this notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

def extract_features(field,training_data,testing_data,type):
    """
    Vectorise training and testing text with the requested feature weighting.

    The vectoriser is fitted on the training text only and then applied to the
    testing text, so the vocabulary never sees held-out data.

    Parameters
    ----------
    field : str
        Name of the text column. Not used inside this function; kept so
        existing callers keep working.
    training_data, testing_data
        Objects exposing ``.values`` that yields the raw documents
        (presumably pandas Series of statements -- confirm against callers).
    type : str
        Feature representation. A value containing ``"binary"`` selects
        presence/absence features, one containing ``"counts"`` selects raw
        token counts, and anything else selects TF-IDF. The name shadows the
        builtin but is part of the keyword interface callers rely on.

    Returns
    -------
    tuple
        ``(train_feature_set, test_feature_set, fitted_vectorizer)``.
    """
    logging.info("Extracting features and creating vocabulary...")

    # All variants tokenise with process_text and use max_df=0.95.
    if "binary" in type:
        # BINARY FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=True, max_df=0.95, analyzer=process_text)
    elif "counts" in type:
        # COUNT BASED FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=False, max_df=0.95, analyzer=process_text)
    else:
        # TF-IDF BASED FEATURE REPRESENTATION
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95, analyzer=process_text)

    # fit_transform() already returns the training matrix; reusing it avoids
    # tokenising the whole training set a second time with transform().
    train_feature_set = vectorizer.fit_transform(training_data.values)
    test_feature_set = vectorizer.transform(testing_data.values)

    return train_feature_set, test_feature_set, vectorizer

In [10]:
def train_model(classifier, train_val, field="statement",feature_rep="binary"):
    """
    Fit a classifier on one train/test split and report how it performs.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit`` and ``predict``.
    train_val : pandas.DataFrame
        Labelled data containing the text column named by ``field`` and a
        ``'label'`` column.
    field : str, default "statement"
        Name of the text column to build features from.
    feature_rep : str, default "binary"
        Feature representation forwarded to ``extract_features``.

    Returns
    -------
    tuple
        ``(model, feature_transformer, score)``: the object returned by
        ``classifier.fit``, the fitted vectoriser, and the F1 score on the
        held-out part of the split.

    Side effects
    ------------
    Prints the classification report and confusion matrix for the held-out
    data and logs progress messages.
    """
    logging.info("Starting model training...")

    # GET A TRAIN TEST SPLIT (fixed seed so every classifier and feature
    # representation is evaluated on the same rows)
    training_data, testing_data = train_test_split(train_val, random_state=2000)

    # FEATURES: honour the `field` argument instead of a hard-coded column
    X_train = training_data[field]
    X_test = testing_data[field]

    # GET LABELS
    Y_train = training_data['label'].values
    Y_test = testing_data['label'].values

    # GET FEATURES
    train_features, test_features, feature_transformer = extract_features(
        field, X_train, X_test, type=feature_rep
    )

    # FIT THE SUPPLIED CLASSIFIER
    logging.info("Training a Classification Model...")
    model = classifier.fit(train_features, Y_train)

    # GET PREDICTIONS
    predictions = model.predict(test_features)

    # GET EVALUATION NUMBERS ON TEST SET -- HOW DID WE DO?
    logging.info("Starting evaluation...")
    score = f1_score(Y_test, predictions)
    print(classification_report(Y_test, predictions))
    print(confusion_matrix(Y_test, predictions))
    logging.info("Done training and evaluation.")

    return model, feature_transformer, score


## Metric

I need to minimize false positives (fake news predicted as real news), as they can be very misleading. For class 0, i.e. 'fake', both recall and precision should be high, because we want our model to perform well on both classes (real & fake). In short, we need to maximize the F1-score.

### Cases I considered to choose the right metric

**1. Maximizing recall of class 0 (fake) or minimizing false positives (FP)?**
In the extreme case, what if the model labels every news item as 'fake'? Recall for class 0 will be 1, but the overall model is really bad, i.e. it is never able to predict class 1 ('real'). 

Ex=> TN = 553, FP = 0, TP = 0, FN = 714

Class0-Recall = TN / (TN + FP) = 1
Class0-Precision = TN / (TN + FN) = 553 / 1267 ≈ 0.44

F1-Score = 2 * Class0-Recall * Class0-Precision/(Class0-Recall + Class0-Precision) ≈ 0.61

Recall, Precision and F1-score for class 1 will be 0.

**2. The opposite extreme: every news item is classified as True (even fake news is predicted as True).**

Ex=>  TN = 0, FP = 553, TP = 714, FN = 0
In that case TN is 0, which gives Recall = 0 and F1 = 0 for class 0 ('fake'); Precision for class 0 is 0/0 (undefined) and is conventionally reported as 0.

For class 1, Class1-Recall = TP / (TP + FN) = 1
Class1-Precision = TP / (TP + FP) = 0.56

## Model Training

## Text Classification Algorithms

1. Naive Bayes (NB)
2. Logistic Regression
3. SVM
4. Random Forest

## Naive Bayes

When the assumption of independence holds, a Naive Bayes classifier can perform better than other models such as logistic regression while needing less training data. An advantage of Naive Bayes is that it only requires a small amount of training data to estimate the parameters necessary for classification. 

Bayes’ Theorem provides a way that we can calculate the probability of a piece of data belonging to a given class, given our prior knowledge. Bayes’ Theorem is stated as:

P(class|data) = (P(data|class) * P(class)) / P(data)

Where P(class|data) is the probability of class given the provided data.

Naive Bayes is a classification algorithm for binary (two-class) and multiclass classification problems. It is called Naive Bayes or idiot Bayes because the calculations of the probabilities for each class are simplified to make their calculations tractable.

Rather than attempting to calculate the probabilities of each attribute value, they are assumed to be conditionally independent given the class value.

This is a very strong assumption that is most unlikely in real data, i.e. that the attributes do not interact. Nevertheless, the approach performs surprisingly well on data where this assumption does not hold.

### Multinomial NB

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

### Train Models with Different Types of Features

In [13]:
# Fit Multinomial Naive Bayes once per feature representation and record the
# F1 score of each run.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
nb_clf = MultinomialNB()
nb_results = []
for feature_rep in feature_reps:
    print(f'Model - {feature_rep} features with statement')
    nb_model, transformer, score = train_model(
        nb_clf, train_val, field=field, feature_rep=feature_rep
    )
    nb_results += [[field, feature_rep, score]]

2020-11-29 14:35:17,885 : INFO : Starting model training...
2020-11-29 14:35:17,891 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement


2020-11-29 14:35:56,161 : INFO : Training a Classification Model...
2020-11-29 14:35:56,171 : INFO : Starting evaluation...
2020-11-29 14:35:56,182 : INFO : Done training and evaluation.
2020-11-29 14:35:56,183 : INFO : Starting model training...
2020-11-29 14:35:56,188 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.55      0.48      0.51      1248
           1       0.64      0.70      0.67      1632

    accuracy                           0.60      2880
   macro avg       0.59      0.59      0.59      2880
weighted avg       0.60      0.60      0.60      2880

[[ 596  652]
 [ 488 1144]]
Model - counts features with statement


2020-11-29 14:36:33,983 : INFO : Training a Classification Model...
2020-11-29 14:36:33,987 : INFO : Starting evaluation...
2020-11-29 14:36:33,997 : INFO : Done training and evaluation.
2020-11-29 14:36:33,999 : INFO : Starting model training...
2020-11-29 14:36:34,003 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.55      0.48      0.52      1248
           1       0.64      0.70      0.67      1632

    accuracy                           0.61      2880
   macro avg       0.60      0.59      0.59      2880
weighted avg       0.60      0.61      0.60      2880

[[ 602  646]
 [ 485 1147]]
Model - tfidf features with statement


2020-11-29 14:37:13,014 : INFO : Training a Classification Model...
2020-11-29 14:37:13,018 : INFO : Starting evaluation...
2020-11-29 14:37:13,030 : INFO : Done training and evaluation.


              precision    recall  f1-score   support

           0       0.58      0.31      0.41      1248
           1       0.61      0.83      0.70      1632

    accuracy                           0.60      2880
   macro avg       0.60      0.57      0.56      2880
weighted avg       0.60      0.60      0.58      2880

[[ 391  857]
 [ 281 1351]]


### Naive Bayes Results of Various Models

In [14]:
# Tabulate the Naive Bayes runs, best F1 score first.
nb_df_results = pd.DataFrame(
    data=nb_results,
    columns=['text_fields', 'feature_representation', 'f1-score'],
)
nb_df_results.sort_values('f1-score', ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.703646
1,statement,counts,0.669781
0,statement,binary,0.667445


In [41]:
# nb_clf_pipeline = Pipeline([('vect', count_vect),
#                       ('tfidf', tfidf_transformer),
#                       ('nb_clf', MultinomialNB()),
#  ])
# nb_clf_pipeline.fit(train_news['statement'], train_label)
# predicted = nb_clf_pipeline.predict(test_news['statement'])
# print(np.mean(predicted == test_label))
# print(classification_report(test_label,predicted))
# print(confusion_matrix(test_label,predicted))

## Logistic Regression

The underlying algorithm is also fairly easy to understand. More importantly, in the NLP world, it’s generally accepted that Logistic Regression is a great starter algorithm for text related classification (https://web.stanford.edu/~jurafsky/slp3/5.pdf). 

**How does the hypothesis make a prediction in logistic regression?**

This algorithm uses the sigmoid function g(z) to predict whether y=1 or y=0.
If the estimated probability of y=1 is h(x) >= 0.5, then the output is more likely to be "y=1",
but if h(x) < 0.5, the output is more likely to be "y=0".

### Train Models with Different Types of Features

In [15]:
# Fit logistic regression once per feature representation and record the
# F1 score of each run.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
LogR_clf = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)
lr_results = []

for feature_rep in feature_reps:
    print(f'Model - {feature_rep} features with statement')
    lr_model, transformer, score = train_model(
        LogR_clf, train_val, field=field, feature_rep=feature_rep
    )
    lr_results += [[field, feature_rep, score]]

2020-11-29 14:38:35,963 : INFO : Starting model training...
2020-11-29 14:38:35,969 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement


2020-11-29 14:39:14,635 : INFO : Training a Classification Model...
2020-11-29 14:39:14,727 : INFO : Starting evaluation...
2020-11-29 14:39:14,739 : INFO : Done training and evaluation.
2020-11-29 14:39:14,742 : INFO : Starting model training...
2020-11-29 14:39:14,746 : INFO : Extracting features and creating vocabulary...


[LibLinear]              precision    recall  f1-score   support

           0       0.51      0.51      0.51      1248
           1       0.62      0.62      0.62      1632

    accuracy                           0.57      2880
   macro avg       0.57      0.57      0.57      2880
weighted avg       0.57      0.57      0.57      2880

[[ 636  612]
 [ 617 1015]]
Model - counts features with statement


2020-11-29 14:39:52,703 : INFO : Training a Classification Model...
2020-11-29 14:39:52,799 : INFO : Starting evaluation...
2020-11-29 14:39:52,811 : INFO : Done training and evaluation.
2020-11-29 14:39:52,814 : INFO : Starting model training...
2020-11-29 14:39:52,818 : INFO : Extracting features and creating vocabulary...


[LibLinear]              precision    recall  f1-score   support

           0       0.50      0.50      0.50      1248
           1       0.62      0.62      0.62      1632

    accuracy                           0.57      2880
   macro avg       0.56      0.56      0.56      2880
weighted avg       0.57      0.57      0.57      2880

[[ 620  628]
 [ 612 1020]]
Model - tfidf features with statement


2020-11-29 14:40:30,692 : INFO : Training a Classification Model...
2020-11-29 14:40:30,730 : INFO : Starting evaluation...
2020-11-29 14:40:30,748 : INFO : Done training and evaluation.


[LibLinear]              precision    recall  f1-score   support

           0       0.53      0.49      0.51      1248
           1       0.63      0.66      0.65      1632

    accuracy                           0.59      2880
   macro avg       0.58      0.58      0.58      2880
weighted avg       0.59      0.59      0.59      2880

[[ 615  633]
 [ 552 1080]]


### Logistic Regression Results of Various Models

In [16]:
# Tabulate the logistic regression runs, best F1 score first.
lr_df_results = pd.DataFrame(
    data=lr_results,
    columns=['text_fields', 'feature_representation', 'f1-score'],
)
lr_df_results.sort_values('f1-score', ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.64574
0,statement,binary,0.62289
1,statement,counts,0.621951


Here you can see how the performance of the logistic regression model improves when using tf-idf weighting instead of counts or binary weighting.

## SVM

Support vector machines is an algorithm that determines the best decision boundary between vectors that belong to a given group (or category) and vectors that do not belong to it. That’s it. It can be applied to any kind of vectors which encode any kind of data. This means that in order to leverage the power of svm text classification, texts have to be transformed into vectors.

So, when SVM determines the decision boundary we mentioned above, SVM decides where to draw the best “line” (or the best hyperplane) that divides the space into two subspaces: one for the vectors which belong to the given category and one for the vectors which do not belong to it.

### Train Models with Different Types of Features

In [17]:
# Fit a linear SVM once per feature representation and record the F1 score
# of each run.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
svm_clf = svm.LinearSVC()
svm_results = []

for feature_rep in feature_reps:
    print(f'SVM Model - {feature_rep} features with statement')
    svm_model, transformer, score = train_model(
        svm_clf, train_val, field=field, feature_rep=feature_rep
    )
    svm_results += [[field, feature_rep, score]]

2020-11-29 14:45:56,188 : INFO : Starting model training...
2020-11-29 14:45:56,194 : INFO : Extracting features and creating vocabulary...


SVM Model - binary features with statement


2020-11-29 14:46:37,039 : INFO : Training a Classification Model...
2020-11-29 14:46:37,188 : INFO : Starting evaluation...
2020-11-29 14:46:37,198 : INFO : Done training and evaluation.
2020-11-29 14:46:37,200 : INFO : Starting model training...
2020-11-29 14:46:37,204 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.50      0.50      0.50      1248
           1       0.61      0.61      0.61      1632

    accuracy                           0.56      2880
   macro avg       0.55      0.55      0.55      2880
weighted avg       0.56      0.56      0.56      2880

[[622 626]
 [634 998]]
SVM Model - counts features with statement


2020-11-29 14:47:15,152 : INFO : Training a Classification Model...
2020-11-29 14:47:15,394 : INFO : Starting evaluation...
2020-11-29 14:47:15,404 : INFO : Done training and evaluation.
2020-11-29 14:47:15,407 : INFO : Starting model training...
2020-11-29 14:47:15,411 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.49      0.50      0.50      1248
           1       0.61      0.61      0.61      1632

    accuracy                           0.56      2880
   macro avg       0.55      0.55      0.55      2880
weighted avg       0.56      0.56      0.56      2880

[[622 626]
 [635 997]]
SVM Model - tfidf features with statement


2020-11-29 14:47:53,168 : INFO : Training a Classification Model...
2020-11-29 14:47:53,212 : INFO : Starting evaluation...
2020-11-29 14:47:53,224 : INFO : Done training and evaluation.


              precision    recall  f1-score   support

           0       0.52      0.50      0.51      1248
           1       0.63      0.65      0.64      1632

    accuracy                           0.58      2880
   macro avg       0.57      0.57      0.57      2880
weighted avg       0.58      0.58      0.58      2880

[[ 622  626]
 [ 572 1060]]


### SVM Results of Various Models

In [18]:
# Tabulate the SVM runs, best F1 score first.
svm_df_results = pd.DataFrame(
    data=svm_results,
    columns=['text_fields', 'feature_representation', 'f1-score'],
)
svm_df_results.sort_values('f1-score', ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.638939
0,statement,binary,0.613022
1,statement,counts,0.612596


## Random Forest

Given the nature of random forests (a bagging decision tree), it is true that you may come up with a rather weak classifier, especially if only a couple of features are truly significant to determine the outcome.

However, keep in mind that in the case of text classification, a preprocessing phase is required to get either your TF or TF-IDF matrix, through which you have already made a selection of pertinent features. Potentially, all features are relevant in this matrix, so the random forest may be performant when you predict your outcome. (source: https://stats.stackexchange.com/questions/343954/random-forest-short-text-classification)

### Train Models with Different Types of Features

In [19]:
# Fit a random forest once per feature representation and record the F1 score
# of each run.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
rf_results = []
# Seed the forest so that repeated runs build the same trees and report the
# same scores.
rf_clf = RandomForestClassifier(n_estimators=1000, random_state=0)

for feature_rep in feature_reps:
    # Label each run so the three reports in the output can be told apart.
    print(f'RF Model - {feature_rep} features with statement')
    rf_model, transformer, score = train_model(
        rf_clf, train_val, field=field, feature_rep=feature_rep
    )
    rf_results.append([field, feature_rep, score])

2020-11-29 14:47:53,261 : INFO : Starting model training...
2020-11-29 14:47:53,270 : INFO : Extracting features and creating vocabulary...
2020-11-29 14:48:30,987 : INFO : Training a Classification Model...
2020-11-29 14:49:44,378 : INFO : Starting evaluation...
2020-11-29 14:49:44,388 : INFO : Done training and evaluation.
2020-11-29 14:49:44,390 : INFO : Starting model training...
2020-11-29 14:49:44,394 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.57      0.45      0.50      1248
           1       0.64      0.74      0.68      1632

    accuracy                           0.61      2880
   macro avg       0.60      0.59      0.59      2880
weighted avg       0.61      0.61      0.60      2880

[[ 561  687]
 [ 428 1204]]


2020-11-29 14:50:22,211 : INFO : Training a Classification Model...
2020-11-29 14:51:35,059 : INFO : Starting evaluation...
2020-11-29 14:51:35,070 : INFO : Done training and evaluation.
2020-11-29 14:51:35,072 : INFO : Starting model training...
2020-11-29 14:51:35,076 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.55      0.44      0.49      1248
           1       0.63      0.73      0.68      1632

    accuracy                           0.60      2880
   macro avg       0.59      0.59      0.58      2880
weighted avg       0.60      0.60      0.60      2880

[[ 554  694]
 [ 446 1186]]


2020-11-29 14:52:13,093 : INFO : Training a Classification Model...
2020-11-29 14:53:19,751 : INFO : Starting evaluation...
2020-11-29 14:53:19,762 : INFO : Done training and evaluation.


              precision    recall  f1-score   support

           0       0.56      0.44      0.49      1248
           1       0.63      0.74      0.68      1632

    accuracy                           0.61      2880
   macro avg       0.59      0.59      0.58      2880
weighted avg       0.60      0.61      0.60      2880

[[ 546  702]
 [ 432 1200]]


### RF Results of Various Models

In [20]:
# Tabulate the random forest runs, best F1 score first.
rf_df_results = pd.DataFrame(
    data=rf_results,
    columns=['text_fields', 'feature_representation', 'f1-score'],
)
rf_df_results.sort_values('f1-score', ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
0,statement,binary,0.683508
2,statement,tfidf,0.679117
1,statement,counts,0.675399


## K-fold cross validation

With K-fold cross validation, you are testing how well your model is able to get trained by some data and then predict data it hasn't seen. We use cross validation for this because if you train using all the data you have, you have none left for testing. You could do this once, say by using 80% of the data to train and 20% to test, but what if the 20% you happened to pick to test happens to contain a bunch of points that are particularly easy (or particularly hard) to predict? In that case we will not have come up with the best possible estimate of the model's ability to learn and predict.

In [22]:
# User-defined function for K-Fold cross validation
def apply_kfold(classifier,train_val,field,feature_rep):
    """
    Run 5-fold cross validation and print the aggregated results.

    For every fold the vectoriser is fitted on the training part only, the
    classifier is refitted, and the fold's F1 score and confusion matrix are
    accumulated.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit`` and ``predict``; it is refitted on each fold.
    train_val : pandas.DataFrame
        Labelled data containing the text column named by ``field`` and a
        ``'label'`` column. The 2x2 confusion accumulator assumes two classes.
    field : str
        Name of the text column to build features from.
    feature_rep : str
        Feature representation forwarded to ``extract_features``.

    Returns
    -------
    tuple
        Five ``None`` values: the summary lines are printed while the return
        tuple is being built, and ``print`` returns ``None``.
        NOTE(review): kept as-is so existing callers are unaffected; returning
        the mean score and confusion matrix would be more useful once callers
        are confirmed.
    """
    # Seeded shuffle so the fold assignment is the same on every run.
    k_fold = KFold(n_splits=5, shuffle=True, random_state=2000)

    # Honour the `field` argument instead of a hard-coded column name.
    texts = train_val[field]
    labels = train_val['label']

    scores = []
    confusion = np.array([[0, 0], [0, 0]])

    for fold_n, (train_index, valid_index) in enumerate(k_fold.split(texts, labels)):
        print(fold_n, len(train_index), len(valid_index))

        # KFold yields positional indices, hence .iloc rather than .loc.
        train_x, train_y = texts.iloc[train_index], labels.iloc[train_index]
        valid_x, valid_y = texts.iloc[valid_index], labels.iloc[valid_index]

        # GET FEATURES (the fitted vectoriser itself is not needed here)
        train_features, val_features, _ = extract_features(
            field, train_x, valid_x, type=feature_rep
        )

        # FIT AND PREDICT FOR THIS FOLD
        logging.info("Training a Classification Model...")
        classifier.fit(train_features, train_y)
        predictions = classifier.predict(val_features)

        confusion += confusion_matrix(valid_y, predictions)
        scores.append(f1_score(valid_y, predictions))

    return (print('Total statements classified:', len(texts)),
    print('Score:', sum(scores)/len(scores)),
    print('score length', len(scores)),
    print('Confusion matrix:'),
    print(confusion))

## Grid Search Hyperparameters

In [21]:
# from sklearn.svm import SVC
# from sklearn.model_selection import StratifiedKFold
# from skopt import BayesSearchCV

# field='statement'
# feature_reps=['binary','counts','tfidf']
# # GET FEATURES
# train_features,feature_transformer=extract_final_features('statement',train_val['statement'],type='binary')
    
# # define search space
# params = dict()
# params['C'] = (1e-6, 100.0, 'log-uniform')
# params['gamma'] = (1e-6, 100.0, 'log-uniform')
# params['degree'] = (1,5)
# params['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']
# # define evaluation
# cv = StratifiedKFold(n_splits=5, random_state=1)
# # define the search
# search = BayesSearchCV(estimator=SVC(), search_spaces=params, n_jobs=-1, cv=cv)
# # perform the search
# search.fit(train_features, train_val['label'])
# # report the best result
# print(search.best_score_)
# print(search.best_params_)

## Naive Bayes with K-fold cross validation

In [23]:
# Cross-validate a multinomial Naive Bayes classifier on each feature representation.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
nb_results = []  # not filled in by this cell
nb_clf = MultinomialNB()
for feature_rep in feature_reps:
    print(f'Model - {feature_rep} features with statement')
    apply_kfold(nb_clf, train_val, field=field, feature_rep=feature_rep)

2020-11-29 15:08:18,004 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement
0 9216 2304


2020-11-29 15:09:00,091 : INFO : Training a Classification Model...
2020-11-29 15:09:00,100 : INFO : Extracting features and creating vocabulary...


1 9216 2304


2020-11-29 15:09:39,146 : INFO : Training a Classification Model...
2020-11-29 15:09:39,155 : INFO : Extracting features and creating vocabulary...


2 9216 2304


2020-11-29 15:10:17,909 : INFO : Training a Classification Model...
2020-11-29 15:10:17,918 : INFO : Extracting features and creating vocabulary...


3 9216 2304


2020-11-29 15:10:56,728 : INFO : Training a Classification Model...
2020-11-29 15:10:56,737 : INFO : Extracting features and creating vocabulary...


4 9216 2304


2020-11-29 15:11:35,651 : INFO : Training a Classification Model...
2020-11-29 15:11:35,663 : INFO : Extracting features and creating vocabulary...


Total statements classified: 11520
Score: 0.6696528205765316
score length 5
Confusion matrix:
[[2319 2783]
 [1785 4633]]
Model - counts features with statement
0 9216 2304


2020-11-29 15:12:14,459 : INFO : Training a Classification Model...
2020-11-29 15:12:14,468 : INFO : Extracting features and creating vocabulary...


1 9216 2304


2020-11-29 15:12:53,117 : INFO : Training a Classification Model...
2020-11-29 15:12:53,126 : INFO : Extracting features and creating vocabulary...


2 9216 2304


2020-11-29 15:13:31,972 : INFO : Training a Classification Model...
2020-11-29 15:13:31,980 : INFO : Extracting features and creating vocabulary...


3 9216 2304


2020-11-29 15:14:10,714 : INFO : Training a Classification Model...
2020-11-29 15:14:10,723 : INFO : Extracting features and creating vocabulary...


4 9216 2304


2020-11-29 15:14:49,468 : INFO : Training a Classification Model...
2020-11-29 15:14:49,480 : INFO : Extracting features and creating vocabulary...


Total statements classified: 11520
Score: 0.6708390874339865
score length 5
Confusion matrix:
[[2357 2745]
 [1793 4625]]
Model - tfidf features with statement
0 9216 2304


2020-11-29 15:15:28,212 : INFO : Training a Classification Model...
2020-11-29 15:15:28,221 : INFO : Extracting features and creating vocabulary...


1 9216 2304


2020-11-29 15:16:06,870 : INFO : Training a Classification Model...
2020-11-29 15:16:06,878 : INFO : Extracting features and creating vocabulary...


2 9216 2304


2020-11-29 15:16:45,479 : INFO : Training a Classification Model...
2020-11-29 15:16:45,489 : INFO : Extracting features and creating vocabulary...


3 9216 2304


2020-11-29 15:17:24,137 : INFO : Training a Classification Model...
2020-11-29 15:17:24,146 : INFO : Extracting features and creating vocabulary...


4 9216 2304


2020-11-29 15:18:02,991 : INFO : Training a Classification Model...


Total statements classified: 11520
Score: 0.6991211110570669
score length 5
Confusion matrix:
[[1443 3659]
 [1002 5416]]


## Logistic Regression with K-fold Cross Validation

In [24]:
# Cross-validate an L2-regularised logistic regression on each feature representation.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
LogR_clf = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)

for feature_rep in feature_reps:
    print(f'Model - {feature_rep} features with statement')
    apply_kfold(LogR_clf, train_val, field=field, feature_rep=feature_rep)

2020-11-29 15:18:03,011 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement
0 9216 2304


2020-11-29 15:18:41,941 : INFO : Training a Classification Model...
2020-11-29 15:18:42,040 : INFO : Extracting features and creating vocabulary...


[LibLinear]1 9216 2304


2020-11-29 15:19:20,926 : INFO : Training a Classification Model...
2020-11-29 15:19:21,024 : INFO : Extracting features and creating vocabulary...


[LibLinear]2 9216 2304


2020-11-29 15:35:33,689 : INFO : Training a Classification Model...
2020-11-29 15:35:33,870 : INFO : Extracting features and creating vocabulary...


[LibLinear]3 9216 2304


2020-11-29 15:36:17,741 : INFO : Training a Classification Model...
2020-11-29 15:36:17,826 : INFO : Extracting features and creating vocabulary...


[LibLinear]4 9216 2304


2020-11-29 15:36:59,725 : INFO : Training a Classification Model...
2020-11-29 15:36:59,810 : INFO : Extracting features and creating vocabulary...


[LibLinear]Total statements classified: 11520
Score: 0.6318176784203094
score length 5
Confusion matrix:
[[2532 2570]
 [2267 4151]]
Model - counts features with statement
0 9216 2304


2020-11-29 15:37:42,385 : INFO : Training a Classification Model...
2020-11-29 15:37:42,466 : INFO : Extracting features and creating vocabulary...


[LibLinear]1 9216 2304


2020-11-29 15:38:23,893 : INFO : Training a Classification Model...
2020-11-29 15:38:24,056 : INFO : Extracting features and creating vocabulary...


[LibLinear]2 9216 2304


2020-11-29 15:39:04,150 : INFO : Training a Classification Model...
2020-11-29 15:39:04,250 : INFO : Extracting features and creating vocabulary...


[LibLinear]3 9216 2304


2020-11-29 15:39:49,488 : INFO : Training a Classification Model...
2020-11-29 15:39:49,577 : INFO : Extracting features and creating vocabulary...


[LibLinear]4 9216 2304


2020-11-29 15:40:36,070 : INFO : Training a Classification Model...
2020-11-29 15:40:36,152 : INFO : Extracting features and creating vocabulary...


[LibLinear]Total statements classified: 11520
Score: 0.6258242722163977
score length 5
Confusion matrix:
[[2524 2578]
 [2321 4097]]
Model - tfidf features with statement
0 9216 2304


2020-11-29 15:41:19,921 : INFO : Training a Classification Model...
2020-11-29 15:41:19,966 : INFO : Extracting features and creating vocabulary...


[LibLinear]1 9216 2304


2020-11-29 15:42:09,994 : INFO : Training a Classification Model...
2020-11-29 15:42:10,030 : INFO : Extracting features and creating vocabulary...


[LibLinear]2 9216 2304


2020-11-29 15:42:57,193 : INFO : Training a Classification Model...
2020-11-29 15:42:57,226 : INFO : Extracting features and creating vocabulary...


[LibLinear]3 9216 2304


2020-11-29 15:43:37,580 : INFO : Training a Classification Model...
2020-11-29 15:43:37,616 : INFO : Extracting features and creating vocabulary...


[LibLinear]4 9216 2304


2020-11-29 15:44:17,537 : INFO : Training a Classification Model...


[LibLinear]Total statements classified: 11520
Score: 0.6552547115771776
score length 5
Confusion matrix:
[[2470 2632]
 [2008 4410]]


## SVM with K-fold cross Validation

In [25]:
# Cross-validate a linear SVM (default settings) on each feature representation.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
svm_clf = svm.LinearSVC()

for feature_rep in feature_reps:
    print(f'Model - {feature_rep} features with statement')
    apply_kfold(svm_clf, train_val, field=field, feature_rep=feature_rep)

2020-11-29 15:44:17,579 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement
0 9216 2304


2020-11-29 15:44:57,514 : INFO : Training a Classification Model...
2020-11-29 15:44:57,657 : INFO : Extracting features and creating vocabulary...


1 9216 2304


2020-11-29 15:45:37,463 : INFO : Training a Classification Model...
2020-11-29 15:45:37,601 : INFO : Extracting features and creating vocabulary...


2 9216 2304


2020-11-29 15:46:17,435 : INFO : Training a Classification Model...
2020-11-29 15:46:17,587 : INFO : Extracting features and creating vocabulary...


3 9216 2304


2020-11-29 15:46:57,610 : INFO : Training a Classification Model...
2020-11-29 15:46:57,753 : INFO : Extracting features and creating vocabulary...


4 9216 2304


2020-11-29 15:47:37,807 : INFO : Training a Classification Model...
2020-11-29 15:47:37,951 : INFO : Extracting features and creating vocabulary...


Total statements classified: 11520
Score: 0.616949159904504
score length 5
Confusion matrix:
[[2574 2528]
 [2427 3991]]
Model - counts features with statement
0 9216 2304


2020-11-29 15:48:17,714 : INFO : Training a Classification Model...
2020-11-29 15:48:17,965 : INFO : Extracting features and creating vocabulary...


1 9216 2304


2020-11-29 15:48:57,695 : INFO : Training a Classification Model...
2020-11-29 15:48:57,948 : INFO : Extracting features and creating vocabulary...


2 9216 2304


2020-11-29 15:49:37,824 : INFO : Training a Classification Model...
2020-11-29 15:49:38,081 : INFO : Extracting features and creating vocabulary...


3 9216 2304


2020-11-29 15:50:17,988 : INFO : Training a Classification Model...
2020-11-29 15:50:18,177 : INFO : Extracting features and creating vocabulary...


4 9216 2304


2020-11-29 15:50:59,512 : INFO : Training a Classification Model...
2020-11-29 15:50:59,779 : INFO : Extracting features and creating vocabulary...


Total statements classified: 11520
Score: 0.615255046480374
score length 5
Confusion matrix:
[[2579 2523]
 [2445 3973]]
Model - tfidf features with statement
0 9216 2304


2020-11-29 15:51:39,692 : INFO : Training a Classification Model...
2020-11-29 15:51:39,731 : INFO : Extracting features and creating vocabulary...


1 9216 2304


2020-11-29 15:52:19,542 : INFO : Training a Classification Model...
2020-11-29 15:52:19,581 : INFO : Extracting features and creating vocabulary...


2 9216 2304


2020-11-29 15:52:59,442 : INFO : Training a Classification Model...
2020-11-29 15:52:59,482 : INFO : Extracting features and creating vocabulary...


3 9216 2304


2020-11-29 16:01:01,463 : INFO : Training a Classification Model...
2020-11-29 16:01:01,502 : INFO : Extracting features and creating vocabulary...


4 9216 2304


2020-11-29 16:01:42,182 : INFO : Training a Classification Model...


Total statements classified: 11520
Score: 0.6480762074477264
score length 5
Confusion matrix:
[[2541 2561]
 [2113 4305]]


## RF with K-fold cross Validation

In [None]:
# Cross-validate a 1000-tree random forest on each feature representation.
field = 'statement'
feature_reps = ['binary', 'counts', 'tfidf']
# n_jobs=-1 builds the trees on all available cores (this cell fits the forest
# 15 times); random_state makes the run repeatable, matching the seeded
# logistic-regression cells.
rf_clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0)

for feature_rep in feature_reps:
    print(f'Model - {feature_rep} features with statement')
    apply_kfold(rf_clf, train_val, field=field, feature_rep=feature_rep)

2020-11-29 16:01:42,245 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement
0 9216 2304


2020-11-29 16:02:23,331 : INFO : Training a Classification Model...
2020-11-29 16:03:47,938 : INFO : Extracting features and creating vocabulary...


1 9216 2304


2020-11-29 16:04:40,416 : INFO : Training a Classification Model...
2020-11-29 16:06:24,841 : INFO : Extracting features and creating vocabulary...


2 9216 2304


2020-11-29 16:07:24,645 : INFO : Training a Classification Model...
2020-11-29 16:08:53,518 : INFO : Extracting features and creating vocabulary...


3 9216 2304


2020-11-29 16:09:39,748 : INFO : Training a Classification Model...
2020-11-29 16:11:18,948 : INFO : Extracting features and creating vocabulary...


4 9216 2304


2020-11-29 16:12:01,949 : INFO : Training a Classification Model...
2020-11-29 16:13:36,395 : INFO : Extracting features and creating vocabulary...


Total statements classified: 11520
Score: 0.6816749066807989
score length 5
Confusion matrix:
[[2155 2947]
 [1575 4843]]
Model - counts features with statement
0 9216 2304


2020-11-29 16:14:25,004 : INFO : Training a Classification Model...
2020-11-29 16:15:58,377 : INFO : Extracting features and creating vocabulary...


1 9216 2304


2020-11-29 16:16:45,730 : INFO : Training a Classification Model...
2020-11-29 16:18:13,986 : INFO : Extracting features and creating vocabulary...


2 9216 2304


2020-11-29 16:19:03,080 : INFO : Training a Classification Model...
2020-11-29 16:20:34,457 : INFO : Extracting features and creating vocabulary...


3 9216 2304


2020-11-29 16:21:18,368 : INFO : Training a Classification Model...


## Best Model Selection

"""
Out of all the models fitted, we take the two best-performing models and call them candidate models.
From the confusion matrices, we can see that logistic regression and SVM (with either binary or tfidf features) perform better
in terms of precision and recall (take a look at the false positive and true negative counts, which appear
to be low compared to the rest of the models).

Using k-fold cross validation, we see the performance of the models on the entire dataset, and the models aren't performing well. We can add other features to improve the performance, and grid search can also help us find the best parameters to improve the performance.
"""

## Train the best Model on entire dataset

In [None]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def extract_final_features(field,training_data,type):
    """
    Fit a text vectorizer on `training_data` and return the encoded features.

    Parameters
    ----------
    field : unused by this function; kept so existing callers keep working.
    training_data : object whose `.values` yields the raw documents
        (e.g. a pandas Series of strings).
    type : feature representation, matched by substring: a value containing
        "binary" gives binary term presence, one containing "counts" gives raw
        term counts, and anything else falls through to TF-IDF.

    Returns
    -------
    (train_feature_set, vectorizer) : the feature matrix for `training_data`
        and the fitted vectorizer, which is needed to encode new documents.
    """
    logging.info("Extracting features and creating vocabulary...")

    # All representations ignore terms found in more than 95% of the documents.
    if "binary" in type:
        # BINARY FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=True, max_df=0.95)
    elif "counts" in type:
        # COUNT BASED FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=False, max_df=0.95)
    else:
        # TF-IDF BASED FEATURE REPRESENTATION
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    # fit_transform already returns the encoded training set, so one pass is
    # enough; the earlier fit_transform + transform encoded the corpus twice.
    train_feature_set = vectorizer.fit_transform(training_data.values)

    return train_feature_set, vectorizer

In [None]:
def train_final_model(classifier, train_val, field="statement",feature_rep="binary"):
    """
    Train the chosen classifier on the entire dataset for the given features.

    Parameters
    ----------
    classifier : unfitted estimator exposing `fit`.
    train_val : DataFrame holding the text column `field` and a `label` column.
    field : name of the text column to train on (default "statement").
    feature_rep : feature representation, forwarded to
        `extract_final_features` as `type`.

    Returns
    -------
    (model, feature_transformer) : whatever `classifier.fit` returns, and the
        fitted vectorizer needed to encode new documents for prediction.
    """

    logging.info("Starting model training...")

    # GET TEXT (from the requested column rather than a hard-coded one)
    train_x = train_val[field]

    # GET LABELS
    target = train_val['label'].values

    # GET FEATURES
    features, feature_transformer = extract_final_features(field, train_x, type=feature_rep)

    # FIT THE SUPPLIED CLASSIFIER
    logging.info("Training a Final Model...")
    model = classifier.fit(features, target)

    logging.info("Done training.")

    return model, feature_transformer

In [None]:
def get_predictions(model,X_test):
    """Return whatever `model.predict` yields for the features in `X_test`."""
    return model.predict(X_test)

In [None]:
# Fit the final logistic-regression model on all of train_val.
field = 'statement'
LogR_clf_final = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)
# NOTE(review): in the recorded K-fold output, 'counts' had the lowest F1 of the
# three logistic-regression runs and 'tfidf' the highest — confirm this choice.
lr_final_model, transformer = train_final_model(
    LogR_clf_final, train_val, field=field, feature_rep='counts'
)

## Check predictions on unseen data

In [None]:
# https://www.snopes.com/fact-check/alaska-town-60-days-without-sun/
# NOTE(review): the URL above does not obviously relate to the statement below — confirm the source.
# Encode one unseen statement with the fitted transformer, then classify it.
test_features = transformer.transform(
    ["Building a wall on the U.S.-Mexico border will take literally years."]
)
ouput = get_predictions(lr_final_model, test_features)

In [None]:
# Show the predicted label for the single statement encoded in the previous cell.
ouput[0]

In [None]:
# https://www.politifact.com/factchecks/2020/nov/20/viral-image/no-passage-about-defeat-isnt-donald-trumps-art-dea/
# NOTE(review): the URL above does not obviously relate to the statement below — confirm the source.
# Encode one unseen statement with the fitted transformer, then classify it.
test_features = transformer.transform(
    ["Wisconsin is on pace to double the number of layoffs this year."]
)
ouput = get_predictions(lr_final_model, test_features)

In [None]:
ouput[0] # author's note: this statement is predicted as true; however, it should be false

## Save Model for Future Use

In [70]:
import pickle

model_path="../models/lr_final_model.pkl"
transformer_path="../models/transformer.pkl"

# Both artifacts are needed later: the transformer encodes a document and the
# model makes predictions from the encoded features.
# The `with` blocks close each file as soon as it is written; the previous bare
# open() calls left the handles open.
with open(model_path, 'wb') as model_file:
    pickle.dump(lr_final_model, model_file)
with open(transformer_path, 'wb') as transformer_file:
    pickle.dump(transformer, transformer_file)