In [12]:
import pandas as pd
import numpy as np
import string
import logging

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import  LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support

import nltk
import nltk.corpus 
from nltk.corpus import stopwords

## Read Data

In [13]:
train_news = pd.read_csv('../data/processed/train.csv')
val_news = pd.read_csv('../data/processed/val.csv')
test_news = pd.read_csv('../data/processed/test.csv')

In [14]:
display(train_news), display(test_news), display(val_news)

Unnamed: 0,label,statement
0,False,Says the Annies List political group supports ...
1,True,When did the decline of coal start? It started...
2,True,"Hillary Clinton agrees with John McCain ""by vo..."
3,False,Health care reform legislation is likely to ma...
4,True,The economic turnaround started at the end of ...
...,...,...
10235,True,There are a larger number of shark attacks in ...
10236,True,Democrats have now become the party of the [At...
10237,True,Says an alternative to Social Security that op...
10238,False,On lifting the U.S. Cuban embargo and allowing...


Unnamed: 0,label,statement
0,True,Building a wall on the U.S.-Mexico border will...
1,False,Wisconsin is on pace to double the number of l...
2,False,Says John McCain has done nothing to help the ...
3,True,Suzanne Bonamici supports a plan that will cut...
4,False,When asked by a reporter whether hes at the ce...
...,...,...
1262,True,Says his budget provides the highest state fun...
1263,False,Ive been here almost every day.
1264,False,"In the early 1980s, Sen. Edward Kennedy secret..."
1265,False,Says an EPA permit languished under Strickland...


Unnamed: 0,label,statement
0,False,We have less Americans working now than in the...
1,False,"When Obama was sworn into office, he DID NOT u..."
2,False,Says Having organizations parading as being so...
3,True,Says nearly half of Oregons children are poor.
4,True,On attacks by Republicans that various program...
...,...,...
1279,True,"For the first time in more than a decade, impo..."
1280,True,Says Donald Trump has bankrupted his companies...
1281,True,"John McCain and George Bush have ""absolutely n..."
1282,False,A new poll shows 62 percent support the presid...


(None, None, None)

## Merging train & val data for K-Fold

In [15]:
# Merging the training and validation data together, so that I can peroform k-fold cross validation 
#and shuffle the data to reduce the bias
labelEncoder = LabelEncoder()
frames = [train_news, val_news]
train_val = pd.concat(frames)
train_val['label'].value_counts()
train_val['label'] = labelEncoder.fit_transform(train_val['label'])

In [16]:
train_val

Unnamed: 0,label,statement
0,0,Says the Annies List political group supports ...
1,1,When did the decline of coal start? It started...
2,1,"Hillary Clinton agrees with John McCain ""by vo..."
3,0,Health care reform legislation is likely to ma...
4,1,The economic turnaround started at the end of ...
...,...,...
1279,1,"For the first time in more than a decade, impo..."
1280,1,Says Donald Trump has bankrupted his companies...
1281,1,"John McCain and George Bush have ""absolutely n..."
1282,0,A new poll shows 62 percent support the presid...


In [17]:
def process_text(text):
    '''
    What will be covered:
    1. Remove punctuation
    2. Remove stopwords
    3. Return list of clean text words
    '''
    
    #1
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    #2
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    #3
    return clean_words

In [18]:
# count_vect = CountVectorizer(analyzer=process_text)
# tfidf_transformer = TfidfTransformer()

## Feature Weighting

Not all words are equally important to a particular document / category. For example, while words like ‘murder’, ‘knife’ and ‘abduction’ are important to a crime related document, words like ‘news’ and ‘reporter’ may not be quite as important. 

### Binary Weighting
The most basic form of feature weighting, is binary weighting. Where if a word is present in a document, the weight is ‘1’ and if the word is absent the weight is ‘0’. 

### CountVectorizer

It Convert a collection of text documents to a matrix of token counts.


### Tfidf Weighting 

TF-IDF weighting where words that are unique to a particular document would have higher weights compared to words that are used commonly across documents. 

1. TF (Term Frequency): The number of times a word appears in a document divded by the total number of words in the document. Every document has its own term frequency.

2. IDF (Inverse Data Frequency): The log of the number of documents divided by the number of documents that contain the word w. Inverse data frequency determines the weight of rare words across all documents in the corpus.

3. Lastly, the TF-IDF is simply the TF multiplied by IDF.

In [50]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def extract_features(field,training_data,testing_data,type):
    """Extract features using different methods"""
    
    logging.info("Extracting features and creating vocabulary...")
    
    if "binary" in type:
        
        # BINARY FEATURE REPRESENTATION
        cv= CountVectorizer(binary=True, max_df=0.95)
        cv.fit_transform(training_data.values)
        
        train_feature_set=cv.transform(training_data.values)
        test_feature_set=cv.transform(testing_data.values)
        
        return train_feature_set,test_feature_set,cv
  
    elif "counts" in type:
        
        # COUNT BASED FEATURE REPRESENTATION
        cv= CountVectorizer(binary=False, max_df=0.95)
        cv.fit_transform(training_data.values)
        
        train_feature_set=cv.transform(training_data.values)
        test_feature_set=cv.transform(testing_data.values)
        
        return train_feature_set,test_feature_set,cv
    
    else:    
        
        # TF-IDF BASED FEATURE REPRESENTATION
        tfidf_vectorizer=TfidfVectorizer(use_idf=True, max_df=0.95)
        tfidf_vectorizer.fit_transform(training_data.values)
        
        train_feature_set=tfidf_vectorizer.transform(training_data.values)
        test_feature_set=tfidf_vectorizer.transform(testing_data.values)
        
        return train_feature_set,test_feature_set,tfidf_vectorizer

In [51]:
def train_model(classifier, train_val, field="statement",feature_rep="binary",top_k=3):
    """
    Training the classifier for the provided features.
    """
    
    logging.info("Starting model training...")
    
    scores = []
    confusion = np.array([[0,0],[0,0]])
    
    # GET A TRAIN TEST SPLIT (set seed for consistent results)
    training_data, testing_data = train_test_split(train_val,random_state = 2000,)

    # features
    X_train=training_data['statement']
    X_test=testing_data['statement']
    
    # GET LABELS
    Y_train=training_data['label'].values
    Y_test=testing_data['label'].values
     
    # GET FEATURES
    train_features,test_features,feature_transformer=extract_features(field,X_train,X_test,type=feature_rep)

    # INIT LOGISTIC REGRESSION CLASSIFIER
    logging.info("Training a Classification Model...")
#     scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
    model=classifier.fit(train_features,Y_train)

    # GET PREDICTIONS
    predictions = model.predict(test_features)
    
    # GET EVALUATION NUMBERS ON TEST SET -- HOW DID WE DO?
    logging.info("Starting evaluation...")
    score = f1_score(Y_test,predictions)
    print(classification_report(Y_test,predictions))
    print(confusion_matrix(Y_test,predictions))
    logging.info("Done training and evaluation.")
    
    return model,feature_transformer,score

## Metric

I need to minimize false positives (number of fake news predicted as real news) as it can be very misleadling . For class 0 i.e. 'fake', recall should be high as well as precision. Because we want our model to perform well on both classes (real & fake). In short, we need to maximize f1-score.

### Cases I considered to choose the right metric

**1. Maximizing recall of class 0 (fake) or minimizing false positives(FP)?**
Well, in extreme case, what if all the news predicted by model are labelled as 'fake'. Recall will still be 1, but overall model is really bad i.e. not able to predict class 1 ('real'). 

Ex=> TN = 553, FP = 0, TP = 0, FN = 714

Class0-Recall = TN / (TN + FP) = 1
Class0-Precision = TN / (TN + FN) = 0.43

F1-Score = 2 * Class0-Recall * Class0-Precision/(Class0-Recall + Class0-Precision) = 0.60

Recall, Precision and F1-score for class 1 will be 0.

**2. Considering an extreme case, if all the news classified as True (Even, fake news are predicted as True).**

Ex=>  TN = 0, FP = 553, TP = 714, FN =0
In that case, TN will be 0, which led to Precision 0, Recall 0 and F1 = 0 for class 0 ('fake').

For class 1, Class1-Recall = TP / (TP + FN) = 1
Class1-Precision = TP / (TP + FP) = 0.56

## Model Training

## Text Classification Algorithms

1. Naive Bayes (NB)
2. Logistics Regression
3. SVM
4. Random Forest

## Naive Bayes

Well, when assumption of independence holds, a Naive Bayes classifier performs better compare to other models like logistic regression and you need less training data. An advantage of naive Bayes is that it only requires a small number of training data to estimate the parameters necessary for classification. 

Bayes’ Theorem provides a way that we can calculate the probability of a piece of data belonging to a given class, given our prior knowledge. Bayes’ Theorem is stated as:

P(class|data) = (P(data|class) * P(class)) / P(data)

Where P(class|data) is the probability of class given the provided data.

Naive Bayes is a classification algorithm for binary (two-class) and multiclass classification problems. It is called Naive Bayes or idiot Bayes because the calculations of the probabilities for each class are simplified to make their calculations tractable.

Rather than attempting to calculate the probabilities of each attribute value, they are assumed to be conditionally independent given the class value.

This is a very strong assumption that is most unlikely in real data, i.e. that the attributes do not interact. Nevertheless, the approach performs surprisingly well on data where this assumption does not hold.

### Multinomial NB

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work

### Train Models with Different Types of Features

In [40]:
# model,transformer,score,confusion,report=train_model(nb_clf, train_val,field=field,feature_rep=feature_rep)
# print("\nF1-score={0}; confusion={1}; classification_report={2}".format(score,confusion,report))
field='statement'
feature_reps=['binary','counts','tfidf']
nb_results=[]
nb_clf = MultinomialNB()
for feature_rep in feature_reps:
        print(f'Model - {feature_rep} features with statement')
        model,transformer,score=train_model(nb_clf,train_val,field=field,feature_rep=feature_rep)
        nb_results.append([field,feature_rep,score])

2020-11-19 15:09:18,373 : INFO : Starting model training...
2020-11-19 15:09:18,379 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement


2020-11-19 15:09:18,750 : INFO : Training a Classification Model...
2020-11-19 15:09:18,755 : INFO : Starting evaluation...
2020-11-19 15:09:18,766 : INFO : Done training and evaluation.
2020-11-19 15:09:18,771 : INFO : Starting model training...
2020-11-19 15:09:18,778 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.58      0.48      0.52      1273
           1       0.64      0.72      0.68      1608

    accuracy                           0.61      2881
   macro avg       0.61      0.60      0.60      2881
weighted avg       0.61      0.61      0.61      2881

[[ 611  662]
 [ 449 1159]]
Model - counts features with statement


2020-11-19 15:09:19,107 : INFO : Training a Classification Model...
2020-11-19 15:09:19,111 : INFO : Starting evaluation...
2020-11-19 15:09:19,122 : INFO : Done training and evaluation.
2020-11-19 15:09:19,125 : INFO : Starting model training...
2020-11-19 15:09:19,131 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.58      0.49      0.53      1273
           1       0.64      0.72      0.68      1608

    accuracy                           0.62      2881
   macro avg       0.61      0.61      0.61      2881
weighted avg       0.61      0.62      0.61      2881

[[ 630  643]
 [ 455 1153]]
Model - tfidf features with statement


2020-11-19 15:09:19,512 : INFO : Training a Classification Model...
2020-11-19 15:09:19,517 : INFO : Starting evaluation...
2020-11-19 15:09:19,541 : INFO : Done training and evaluation.


              precision    recall  f1-score   support

           0       0.64      0.27      0.38      1273
           1       0.60      0.88      0.71      1608

    accuracy                           0.61      2881
   macro avg       0.62      0.57      0.55      2881
weighted avg       0.62      0.61      0.57      2881

[[ 345  928]
 [ 198 1410]]


### Naive Bayes Results of Various Models

In [41]:
nb_df_results=pd.DataFrame(nb_results,columns=['text_fields','feature_representation','f1-score'])
nb_df_results.sort_values(by=['f1-score'],ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.714648
1,statement,counts,0.677438
0,statement,binary,0.675999


In [36]:
# nb_clf_pipeline = Pipeline([('vect', count_vect),
#                       ('tfidf', tfidf_transformer),
#                       ('nb_clf', MultinomialNB()),
#  ])
# nb_clf_pipeline.fit(train_news['statement'], train_label)
# predicted = nb_clf_pipeline.predict(test_news['statement'])
# print(np.mean(predicted == test_label))
# print(classification_report(test_label,predicted))
# print(confusion_matrix(test_label,predicted))

## logistic regression

The underlying algorithm is also fairly easy to understand. More importantly, in the NLP world, it’s generally accepted that Logistic Regression is a great starter algorithm for text related classification (https://web.stanford.edu/~jurafsky/slp3/5.pdf). 

**How hypothesis makes prediction in logistics regression?**

This algorithm uses sigmoid function(g(z)). If we want to predict y=1 or y=0.
If estimated probability of y=1 is h(x)>=0.5 then the ouput is more likely to be "y=1" 
but if  h(x) < 0.5, the output is more likely to be is "y=0".

### Train Models with Different Types of Features¶

In [43]:
field='statement'
feature_reps=['binary','counts','tfidf']
lr_results=[]
LogR_clf = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)

for feature_rep in feature_reps:
        print(f'Model - {feature_rep} features with statement')
        model,transformer,score=train_model(LogR_clf,train_val,field=field,feature_rep=feature_rep)
        lr_results.append([field,feature_rep,score])

2020-11-19 15:09:55,972 : INFO : Starting model training...
2020-11-19 15:09:55,980 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement


2020-11-19 15:09:56,323 : INFO : Training a Classification Model...
2020-11-19 15:09:56,447 : INFO : Starting evaluation...
2020-11-19 15:09:56,461 : INFO : Done training and evaluation.
2020-11-19 15:09:56,465 : INFO : Starting model training...
2020-11-19 15:09:56,472 : INFO : Extracting features and creating vocabulary...


[LibLinear]              precision    recall  f1-score   support

           0       0.53      0.50      0.52      1273
           1       0.62      0.65      0.64      1608

    accuracy                           0.59      2881
   macro avg       0.58      0.58      0.58      2881
weighted avg       0.58      0.59      0.59      2881

[[ 641  632]
 [ 559 1049]]
Model - counts features with statement


2020-11-19 15:09:56,824 : INFO : Training a Classification Model...


[LibLinear]

2020-11-19 15:09:57,034 : INFO : Starting evaluation...
2020-11-19 15:09:57,061 : INFO : Done training and evaluation.
2020-11-19 15:09:57,064 : INFO : Starting model training...
2020-11-19 15:09:57,078 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.54      0.51      0.52      1273
           1       0.63      0.66      0.64      1608

    accuracy                           0.59      2881
   macro avg       0.58      0.58      0.58      2881
weighted avg       0.59      0.59      0.59      2881

[[ 643  630]
 [ 549 1059]]
Model - tfidf features with statement


2020-11-19 15:09:57,426 : INFO : Training a Classification Model...
2020-11-19 15:09:57,478 : INFO : Starting evaluation...
2020-11-19 15:09:57,493 : INFO : Done training and evaluation.


[LibLinear]              precision    recall  f1-score   support

           0       0.55      0.48      0.51      1273
           1       0.63      0.69      0.66      1608

    accuracy                           0.60      2881
   macro avg       0.59      0.59      0.59      2881
weighted avg       0.60      0.60      0.60      2881

[[ 611  662]
 [ 491 1117]]


### Logistics Regression Results of Various Models

In [44]:
lr_df_results=pd.DataFrame(lr_results,columns=['text_fields','feature_representation','f1-score'])
lr_df_results.sort_values(by=['f1-score'],ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.659581
1,statement,counts,0.642402
0,statement,binary,0.637884


Here you see how the performance of logistics model is improved using tfidf over counts and binary weightning.

## SVM

Support vector machines is an algorithm that determines the best decision boundary between vectors that belong to a given group (or category) and vectors that do not belong to it. That’s it. It can be applied to any kind of vectors which encode any kind of data. This means that in order to leverage the power of svm text classification, texts have to be transformed into vectors.

So, when SVM determines the decision boundary we mentioned above, SVM decides where to draw the best “line” (or the best hyperplane) that divides the space into two subspaces: one for the vectors which belong to the given category and one for the vectors which do not belong to it.

### Train Models with Different Types of Features¶

In [45]:
field='statement'
feature_reps=['binary','counts','tfidf']
svm_results=[]
svm_clf = svm.LinearSVC()

for feature_rep in feature_reps:
        print(f'SVM Model - {feature_rep} features with statement')
        model,transformer,score=train_model(svm_clf,train_val,field=field,feature_rep=feature_rep)
        svm_results.append([field,feature_rep,score])

2020-11-19 15:10:07,456 : INFO : Starting model training...
2020-11-19 15:10:07,461 : INFO : Extracting features and creating vocabulary...


SVM Model - binary features with statement


2020-11-19 15:10:07,814 : INFO : Training a Classification Model...
2020-11-19 15:10:08,086 : INFO : Starting evaluation...
2020-11-19 15:10:08,097 : INFO : Done training and evaluation.
2020-11-19 15:10:08,099 : INFO : Starting model training...
2020-11-19 15:10:08,111 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.52      0.50      0.51      1273
           1       0.61      0.63      0.62      1608

    accuracy                           0.57      2881
   macro avg       0.57      0.57      0.57      2881
weighted avg       0.57      0.57      0.57      2881

[[ 639  634]
 [ 598 1010]]
SVM Model - counts features with statement


2020-11-19 15:10:08,565 : INFO : Training a Classification Model...
2020-11-19 15:10:09,260 : INFO : Starting evaluation...
2020-11-19 15:10:09,271 : INFO : Done training and evaluation.
2020-11-19 15:10:09,273 : INFO : Starting model training...
2020-11-19 15:10:09,278 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.52      0.50      0.51      1273
           1       0.61      0.63      0.62      1608

    accuracy                           0.57      2881
   macro avg       0.57      0.57      0.57      2881
weighted avg       0.57      0.57      0.57      2881

[[ 640  633]
 [ 599 1009]]
SVM Model - tfidf features with statement


2020-11-19 15:10:09,648 : INFO : Training a Classification Model...
2020-11-19 15:10:09,703 : INFO : Starting evaluation...
2020-11-19 15:10:09,722 : INFO : Done training and evaluation.


              precision    recall  f1-score   support

           0       0.55      0.49      0.52      1273
           1       0.63      0.68      0.65      1608

    accuracy                           0.60      2881
   macro avg       0.59      0.59      0.59      2881
weighted avg       0.59      0.60      0.59      2881

[[ 623  650]
 [ 511 1097]]


### SVM Results of Various Models

In [46]:
svm_df_results=pd.DataFrame(svm_results,columns=['text_fields','feature_representation','f1-score'])
svm_df_results.sort_values(by=['f1-score'],ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.653949
0,statement,binary,0.621156
1,statement,counts,0.620923


## Random Forest

Given the nature of random forests (a bagging decision tree), it is true that you may come up with a rather weak classifier, especially if only a couple of features are truly significant to determine the outcome.

However, keep in mind that in the case of text classification, a preprocessing phase is required to get either your TF or TF-IDF matrix, through which you have already made a selection of pertinent features. Potentially, all features are relevant in this matrix, so the random forest may be performant when you predict your outcome. (source: https://stats.stackexchange.com/questions/343954/random-forest-short-text-classification)

### Train Models with Different Types of Features¶

In [47]:
field='statement'
feature_reps=['binary','counts','tfidf']
rf_results=[]
rf_clf = RandomForestClassifier(n_estimators=1000)

for feature_rep in feature_reps:
        model,transformer,score=train_model(rf_clf,train_val,field=field,feature_rep=feature_rep)
        rf_results.append([field,feature_rep,score])

2020-11-19 15:10:28,877 : INFO : Starting model training...
2020-11-19 15:10:28,883 : INFO : Extracting features and creating vocabulary...
2020-11-19 15:10:29,215 : INFO : Training a Classification Model...
2020-11-19 15:11:35,436 : INFO : Starting evaluation...
2020-11-19 15:11:35,447 : INFO : Done training and evaluation.
2020-11-19 15:11:35,449 : INFO : Starting model training...
2020-11-19 15:11:35,454 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.61      0.40      0.49      1273
           1       0.63      0.79      0.70      1608

    accuracy                           0.62      2881
   macro avg       0.62      0.60      0.59      2881
weighted avg       0.62      0.62      0.61      2881

[[ 515  758]
 [ 333 1275]]


2020-11-19 15:11:35,793 : INFO : Training a Classification Model...
2020-11-19 15:12:42,217 : INFO : Starting evaluation...
2020-11-19 15:12:42,228 : INFO : Done training and evaluation.
2020-11-19 15:12:42,230 : INFO : Starting model training...
2020-11-19 15:12:42,235 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.61      0.39      0.48      1273
           1       0.63      0.80      0.70      1608

    accuracy                           0.62      2881
   macro avg       0.62      0.60      0.59      2881
weighted avg       0.62      0.62      0.60      2881

[[ 502  771]
 [ 317 1291]]


2020-11-19 15:12:42,626 : INFO : Training a Classification Model...
2020-11-19 15:13:45,806 : INFO : Starting evaluation...
2020-11-19 15:13:45,818 : INFO : Done training and evaluation.


              precision    recall  f1-score   support

           0       0.61      0.40      0.48      1273
           1       0.63      0.80      0.70      1608

    accuracy                           0.62      2881
   macro avg       0.62      0.60      0.59      2881
weighted avg       0.62      0.62      0.61      2881

[[ 511  762]
 [ 326 1282]]


### RF Results of Various Models¶

In [49]:
rf_df_results=pd.DataFrame(rf_results,columns=['text_fields','feature_representation','f1-score'])
rf_df_results.sort_values(by=['f1-score'],ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
1,statement,counts,0.703542
2,statement,tfidf,0.702081
0,statement,binary,0.700357


## K-fold cross validation

Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample.
Cross-validation is primarily used in applied machine learning to estimate the skill of a machine learning model on unseen data. That is, to use a limited sample in order to estimate how the model is expected to perform in general when used to make predictions on data not used during the training of the model.
[credit: https://machinelearningmastery.com/k-fold-cross-validation/#:~:text=Cross%2Dvalidation%20is%20a%20resampling,on%20a%20limited%20data%20sample.&text=That%20is%2C%20to%20use%20a,the%20training%20of%20the%20model.]

In [52]:
# cross validation with text classification
def apply_kfold(classifier,train_val,field,feature_rep):
    """
    K-fold cross validation on the the data
    """
    k_fold = KFold(n_splits=5, shuffle=True)
    scores = []
    confusion = np.array([[0,0],[0,0]])

    for fold_n, (train_index, valid_index) in enumerate(k_fold.split(train_val['statement'], train_val['label'])):
        print(fold_n, len(train_index), len(valid_index))
        train_x = train_val['statement'].iloc[train_index]
        train_y = train_val['label'].iloc[train_index]
    
        valid_x = train_val['statement'].iloc[valid_index]
        valid_y = train_val['label'].iloc[valid_index]
        
        # GET FEATURES
        train_features,val_features,feature_transformer=extract_features(field,train_x,valid_x,type=feature_rep)
        
        # INIT CLASSIFIER
        logging.info("Training a Classification Model...")
        classifier.fit(train_features, train_y)
        predictions = classifier.predict(val_features)
        
        confusion += confusion_matrix(valid_y,predictions)
        score = f1_score(valid_y,predictions)
        scores.append(score)
        
    return (print('Total statements classified:', len(train_val['statement'])),
    print('Score:', sum(scores)/len(scores)),
    print('score length', len(scores)),
    print('Confusion matrix:'),
    print(confusion))

In [54]:
field='statement'
feature_reps=['binary','counts','tfidf']
nb_results=[]
nb_clf = MultinomialNB()
for feature_rep in feature_reps:
        print(f'Model - {feature_rep} features with statement')
        apply_kfold(nb_clf,train_val,field=field,feature_rep=feature_rep)

2020-11-19 15:23:23,187 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:23,191 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement
0 9219 2305


2020-11-19 15:23:23,558 : INFO : Training a Classification Model...
2020-11-19 15:23:23,566 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:23,569 : INFO : Extracting features and creating vocabulary...


1 9219 2305


2020-11-19 15:23:23,927 : INFO : Training a Classification Model...
2020-11-19 15:23:23,935 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:23,941 : INFO : Extracting features and creating vocabulary...


2 9219 2305


2020-11-19 15:23:24,315 : INFO : Training a Classification Model...
2020-11-19 15:23:24,326 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:24,329 : INFO : Extracting features and creating vocabulary...


3 9219 2305


2020-11-19 15:23:24,694 : INFO : Training a Classification Model...
2020-11-19 15:23:24,701 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:24,703 : INFO : Extracting features and creating vocabulary...


4 9220 2304


2020-11-19 15:23:25,054 : INFO : Training a Classification Model...
2020-11-19 15:23:25,069 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:25,072 : INFO : Extracting features and creating vocabulary...


Total statements classified: 11524
Score: 0.6724790489821746
score length 5
Confusion matrix:
[[2406 2698]
 [1801 4619]]
Model - counts features with statement
0 9219 2305


2020-11-19 15:23:25,461 : INFO : Training a Classification Model...
2020-11-19 15:23:25,468 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:25,470 : INFO : Extracting features and creating vocabulary...


1 9219 2305


2020-11-19 15:23:25,807 : INFO : Training a Classification Model...
2020-11-19 15:23:25,815 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:25,818 : INFO : Extracting features and creating vocabulary...


2 9219 2305


2020-11-19 15:23:26,172 : INFO : Training a Classification Model...
2020-11-19 15:23:26,181 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:26,184 : INFO : Extracting features and creating vocabulary...


3 9219 2305


2020-11-19 15:23:26,522 : INFO : Training a Classification Model...
2020-11-19 15:23:26,530 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:26,532 : INFO : Extracting features and creating vocabulary...


4 9220 2304


2020-11-19 15:23:26,863 : INFO : Training a Classification Model...
2020-11-19 15:23:26,875 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:26,877 : INFO : Extracting features and creating vocabulary...


Total statements classified: 11524
Score: 0.6720010736588893
score length 5
Confusion matrix:
[[2515 2589]
 [1861 4559]]
Model - tfidf features with statement
0 9219 2305


2020-11-19 15:23:27,218 : INFO : Training a Classification Model...
2020-11-19 15:23:27,226 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:27,229 : INFO : Extracting features and creating vocabulary...


1 9219 2305


2020-11-19 15:23:27,576 : INFO : Training a Classification Model...
2020-11-19 15:23:27,583 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:27,585 : INFO : Extracting features and creating vocabulary...


2 9219 2305


2020-11-19 15:23:27,920 : INFO : Training a Classification Model...
2020-11-19 15:23:27,928 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:27,931 : INFO : Extracting features and creating vocabulary...


3 9219 2305


2020-11-19 15:23:28,323 : INFO : Training a Classification Model...
2020-11-19 15:23:28,331 : INFO : Training model using k-fold cross validation...
2020-11-19 15:23:28,334 : INFO : Extracting features and creating vocabulary...


4 9220 2304


2020-11-19 15:23:28,686 : INFO : Training a Classification Model...


Total statements classified: 11524
Score: 0.7120634153065452
score length 5
Confusion matrix:
[[1386 3718]
 [ 815 5605]]


## Grid-Search to select best hyperparameters

Grid search is designed with the notion that the loss function is affected by multiple hyper-parameter choices, hence we need to iterate through all the hyper parameter at some fix interval to assess all hyperparameters.

NB doesn't have any hyperparameters to tune.