In [1]:
import pandas as pd
import numpy as np
import string
import logging

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import  LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support

import nltk
import nltk.corpus 
from nltk.corpus import stopwords

## Read Data

In [23]:
train_news = pd.read_csv('../data/processed/train.csv')
val_news = pd.read_csv('../data/processed/val.csv')
test_news = pd.read_csv('../data/processed/test.csv')

In [24]:
display(train_news), display(test_news), display(val_news)

Unnamed: 0,label,statement
0,False,Says the Annies List political group supports ...
1,True,When did the decline of coal start? It started...
2,True,"Hillary Clinton agrees with John McCain ""by vo..."
3,False,Health care reform legislation is likely to ma...
4,True,The economic turnaround started at the end of ...
...,...,...
10235,True,There are a larger number of shark attacks in ...
10236,True,Democrats have now become the party of the [At...
10237,True,Says an alternative to Social Security that op...
10238,False,On lifting the U.S. Cuban embargo and allowing...


Unnamed: 0,label,statement
0,True,Building a wall on the U.S.-Mexico border will...
1,False,Wisconsin is on pace to double the number of l...
2,False,Says John McCain has done nothing to help the ...
3,True,Suzanne Bonamici supports a plan that will cut...
4,False,When asked by a reporter whether hes at the ce...
...,...,...
1262,True,Says his budget provides the highest state fun...
1263,False,Ive been here almost every day.
1264,False,"In the early 1980s, Sen. Edward Kennedy secret..."
1265,False,Says an EPA permit languished under Strickland...


Unnamed: 0,label,statement
0,False,We have less Americans working now than in the...
1,False,"When Obama was sworn into office, he DID NOT u..."
2,False,Says Having organizations parading as being so...
3,True,Says nearly half of Oregons children are poor.
4,True,On attacks by Republicans that various program...
...,...,...
1279,True,"For the first time in more than a decade, impo..."
1280,True,Says Donald Trump has bankrupted his companies...
1281,True,"John McCain and George Bush have ""absolutely n..."
1282,False,A new poll shows 62 percent support the presid...


(None, None, None)

## Merging train & val data for K-Fold

In [25]:
# Merging the training and validation data together, so that I can peroform k-fold cross validation 
#and shuffle the data to reduce the bias
labelEncoder = LabelEncoder()
frames = [train_news, val_news, test_news]
train_val = pd.concat(frames)
train_val['label'].value_counts()
train_val['label'] = labelEncoder.fit_transform(train_val['label'])

In [26]:
train_val

Unnamed: 0,label,statement
0,0,Says the Annies List political group supports ...
1,1,When did the decline of coal start? It started...
2,1,"Hillary Clinton agrees with John McCain ""by vo..."
3,0,Health care reform legislation is likely to ma...
4,1,The economic turnaround started at the end of ...
...,...,...
1262,1,Says his budget provides the highest state fun...
1263,0,Ive been here almost every day.
1264,0,"In the early 1980s, Sen. Edward Kennedy secret..."
1265,0,Says an EPA permit languished under Strickland...


In [27]:
def process_text(text):
    '''
    What will be covered:
    1. Remove punctuation
    2. Remove stopwords
    3. Return list of clean text words
    '''
    
    #1
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    #2
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    #3
    return clean_words

In [28]:
# count_vect = CountVectorizer(analyzer=process_text)
# tfidf_transformer = TfidfTransformer()

## Feature Weighting

Not all words are equally important to a particular document / category. For example, while words like ‘murder’, ‘knife’ and ‘abduction’ are important to a crime related document, words like ‘news’ and ‘reporter’ may not be quite as important. 

### Binary Weighting
The most basic form of feature weighting, is binary weighting. Where if a word is present in a document, the weight is ‘1’ and if the word is absent the weight is ‘0’. 

### CountVectorizer

It Convert a collection of text documents to a matrix of token counts.


### Tfidf Weighting 

TF-IDF weighting where words that are unique to a particular document would have higher weights compared to words that are used commonly across documents. 

1. TF (Term Frequency): The number of times a word appears in a document divded by the total number of words in the document. Every document has its own term frequency.

2. IDF (Inverse Data Frequency): The log of the number of documents divided by the number of documents that contain the word w. Inverse data frequency determines the weight of rare words across all documents in the corpus.

3. Lastly, the TF-IDF is simply the TF multiplied by IDF.

In [29]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def extract_features(field,training_data,testing_data,type):
    """Extract features using different methods"""
    
    logging.info("Extracting features and creating vocabulary...")
    
    if "binary" in type:
        
        # BINARY FEATURE REPRESENTATION
        cv= CountVectorizer(binary=True, max_df=0.95)
        cv.fit_transform(training_data.values)
        
        train_feature_set=cv.transform(training_data.values)
        test_feature_set=cv.transform(testing_data.values)
        
        return train_feature_set,test_feature_set,cv
  
    elif "counts" in type:
        
        # COUNT BASED FEATURE REPRESENTATION
        cv= CountVectorizer(binary=False, max_df=0.95)
        cv.fit_transform(training_data.values)
        
        train_feature_set=cv.transform(training_data.values)
        test_feature_set=cv.transform(testing_data.values)
        
        return train_feature_set,test_feature_set,cv
    
    else:    
        
        # TF-IDF BASED FEATURE REPRESENTATION
        tfidf_vectorizer=TfidfVectorizer(use_idf=True, max_df=0.95)
        tfidf_vectorizer.fit_transform(training_data.values)
        
        train_feature_set=tfidf_vectorizer.transform(training_data.values)
        test_feature_set=tfidf_vectorizer.transform(testing_data.values)
        
        return train_feature_set,test_feature_set,tfidf_vectorizer

In [31]:
def train_model(classifier, train_val, field="statement",feature_rep="binary"):
    """
    Training the classifier for the provided features.
    """
    
    logging.info("Starting model training...")
    
    scores = []
    confusion = np.array([[0,0],[0,0]])
    
    # GET A TRAIN TEST SPLIT (set seed for consistent results)
    training_data, testing_data = train_test_split(train_val,random_state = 2000,)

    # features
    X_train=training_data['statement']
    X_test=testing_data['statement']
    
    # GET LABELS
    Y_train=training_data['label'].values
    Y_test=testing_data['label'].values
     
    # GET FEATURES
    train_features,test_features,feature_transformer=extract_features(field,X_train,X_test,type=feature_rep)

    # INIT LOGISTIC REGRESSION CLASSIFIER
    logging.info("Training a Classification Model...")
#     scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
    model=classifier.fit(train_features,Y_train)

    # GET PREDICTIONS
    predictions = model.predict(test_features)
    
    # GET EVALUATION NUMBERS ON TEST SET -- HOW DID WE DO?
    logging.info("Starting evaluation...")
    score = f1_score(Y_test,predictions)
    print(classification_report(Y_test,predictions))
    print(confusion_matrix(Y_test,predictions))
    logging.info("Done training and evaluation.")
    
    return model,feature_transformer,score

## Metric

I need to minimize false positives (number of fake news predicted as real news) as it can be very misleadling . For class 0 i.e. 'fake', recall should be high as well as precision. Because we want our model to perform well on both classes (real & fake). In short, we need to maximize f1-score.

### Cases I considered to choose the right metric

**1. Maximizing recall of class 0 (fake) or minimizing false positives(FP)?**
Well, in extreme case, what if all the news predicted by model are labelled as 'fake'. Recall will still be 1, but overall model is really bad i.e. not able to predict class 1 ('real'). 

Ex=> TN = 553, FP = 0, TP = 0, FN = 714

Class0-Recall = TN / (TN + FP) = 1
Class0-Precision = TN / (TN + FN) = 0.43

F1-Score = 2 * Class0-Recall * Class0-Precision/(Class0-Recall + Class0-Precision) = 0.60

Recall, Precision and F1-score for class 1 will be 0.

**2. Considering an extreme case, if all the news classified as True (Even, fake news are predicted as True).**

Ex=>  TN = 0, FP = 553, TP = 714, FN =0
In that case, TN will be 0, which led to Precision 0, Recall 0 and F1 = 0 for class 0 ('fake').

For class 1, Class1-Recall = TP / (TP + FN) = 1
Class1-Precision = TP / (TP + FP) = 0.56

## Model Training

## Text Classification Algorithms

1. Naive Bayes (NB)
2. Logistics Regression
3. SVM
4. Random Forest

## Naive Bayes

Well, when assumption of independence holds, a Naive Bayes classifier performs better compare to other models like logistic regression and you need less training data. An advantage of naive Bayes is that it only requires a small number of training data to estimate the parameters necessary for classification. 

Bayes’ Theorem provides a way that we can calculate the probability of a piece of data belonging to a given class, given our prior knowledge. Bayes’ Theorem is stated as:

P(class|data) = (P(data|class) * P(class)) / P(data)

Where P(class|data) is the probability of class given the provided data.

Naive Bayes is a classification algorithm for binary (two-class) and multiclass classification problems. It is called Naive Bayes or idiot Bayes because the calculations of the probabilities for each class are simplified to make their calculations tractable.

Rather than attempting to calculate the probabilities of each attribute value, they are assumed to be conditionally independent given the class value.

This is a very strong assumption that is most unlikely in real data, i.e. that the attributes do not interact. Nevertheless, the approach performs surprisingly well on data where this assumption does not hold.

### Multinomial NB

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work

### Train Models with Different Types of Features

In [32]:
# model,transformer,score,confusion,report=train_model(nb_clf, train_val,field=field,feature_rep=feature_rep)
# print("\nF1-score={0}; confusion={1}; classification_report={2}".format(score,confusion,report))
field='statement'
feature_reps=['binary','counts','tfidf']
nb_results=[]
nb_clf = MultinomialNB()
for feature_rep in feature_reps:
        print(f'Model - {feature_rep} features with statement')
        nb_model,transformer,score=train_model(nb_clf,train_val,field=field,feature_rep=feature_rep)
        nb_results.append([field,feature_rep,score])

2020-11-20 15:40:58,826 : INFO : Starting model training...
2020-11-20 15:40:58,831 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement


2020-11-20 15:40:59,233 : INFO : Training a Classification Model...
2020-11-20 15:40:59,237 : INFO : Starting evaluation...
2020-11-20 15:40:59,250 : INFO : Done training and evaluation.
2020-11-20 15:40:59,257 : INFO : Starting model training...
2020-11-20 15:40:59,266 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.58      0.52      0.55      1391
           1       0.66      0.71      0.68      1807

    accuracy                           0.63      3198
   macro avg       0.62      0.61      0.61      3198
weighted avg       0.62      0.63      0.62      3198

[[ 718  673]
 [ 523 1284]]
Model - counts features with statement


2020-11-20 15:40:59,630 : INFO : Training a Classification Model...
2020-11-20 15:40:59,635 : INFO : Starting evaluation...
2020-11-20 15:40:59,647 : INFO : Done training and evaluation.
2020-11-20 15:40:59,649 : INFO : Starting model training...
2020-11-20 15:40:59,655 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.58      0.51      0.54      1391
           1       0.66      0.71      0.68      1807

    accuracy                           0.63      3198
   macro avg       0.62      0.61      0.61      3198
weighted avg       0.62      0.63      0.62      3198

[[ 716  675]
 [ 522 1285]]
Model - tfidf features with statement


2020-11-20 15:41:00,040 : INFO : Training a Classification Model...
2020-11-20 15:41:00,044 : INFO : Starting evaluation...
2020-11-20 15:41:00,056 : INFO : Done training and evaluation.


              precision    recall  f1-score   support

           0       0.61      0.30      0.40      1391
           1       0.61      0.86      0.71      1807

    accuracy                           0.61      3198
   macro avg       0.61      0.58      0.56      3198
weighted avg       0.61      0.61      0.58      3198

[[ 411  980]
 [ 260 1547]]


### Naive Bayes Results of Various Models

In [34]:
nb_df_results=pd.DataFrame(nb_results,columns=['text_fields','feature_representation','f1-score'])
nb_df_results.sort_values(by=['f1-score'],ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.71389
0,statement,binary,0.682253
1,statement,counts,0.682241


In [36]:
# nb_clf_pipeline = Pipeline([('vect', count_vect),
#                       ('tfidf', tfidf_transformer),
#                       ('nb_clf', MultinomialNB()),
#  ])
# nb_clf_pipeline.fit(train_news['statement'], train_label)
# predicted = nb_clf_pipeline.predict(test_news['statement'])
# print(np.mean(predicted == test_label))
# print(classification_report(test_label,predicted))
# print(confusion_matrix(test_label,predicted))

## logistic regression

The underlying algorithm is also fairly easy to understand. More importantly, in the NLP world, it’s generally accepted that Logistic Regression is a great starter algorithm for text related classification (https://web.stanford.edu/~jurafsky/slp3/5.pdf). 

**How hypothesis makes prediction in logistics regression?**

This algorithm uses sigmoid function(g(z)). If we want to predict y=1 or y=0.
If estimated probability of y=1 is h(x)>=0.5 then the ouput is more likely to be "y=1" 
but if  h(x) < 0.5, the output is more likely to be is "y=0".

### Train Models with Different Types of Features¶

In [35]:
field='statement'
feature_reps=['binary','counts','tfidf']
lr_results=[]
LogR_clf = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)

for feature_rep in feature_reps:
        print(f'Model - {feature_rep} features with statement')
        lr_model,transformer,score=train_model(LogR_clf,train_val,field=field,feature_rep=feature_rep)
        lr_results.append([field,feature_rep,score])

2020-11-20 15:41:24,090 : INFO : Starting model training...
2020-11-20 15:41:24,097 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement


2020-11-20 15:41:24,477 : INFO : Training a Classification Model...
2020-11-20 15:41:24,622 : INFO : Starting evaluation...
2020-11-20 15:41:24,636 : INFO : Done training and evaluation.
2020-11-20 15:41:24,639 : INFO : Starting model training...
2020-11-20 15:41:24,647 : INFO : Extracting features and creating vocabulary...


[LibLinear]              precision    recall  f1-score   support

           0       0.52      0.52      0.52      1391
           1       0.63      0.63      0.63      1807

    accuracy                           0.58      3198
   macro avg       0.58      0.58      0.58      3198
weighted avg       0.58      0.58      0.58      3198

[[ 727  664]
 [ 671 1136]]
Model - counts features with statement


2020-11-20 15:41:25,017 : INFO : Training a Classification Model...


[LibLinear]

2020-11-20 15:41:25,248 : INFO : Starting evaluation...
2020-11-20 15:41:25,306 : INFO : Done training and evaluation.
2020-11-20 15:41:25,313 : INFO : Starting model training...
2020-11-20 15:41:25,329 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.53      0.53      0.53      1391
           1       0.64      0.63      0.63      1807

    accuracy                           0.59      3198
   macro avg       0.58      0.58      0.58      3198
weighted avg       0.59      0.59      0.59      3198

[[ 740  651]
 [ 668 1139]]
Model - tfidf features with statement


2020-11-20 15:41:25,752 : INFO : Training a Classification Model...
2020-11-20 15:41:25,814 : INFO : Starting evaluation...
2020-11-20 15:41:25,827 : INFO : Done training and evaluation.


[LibLinear]              precision    recall  f1-score   support

           0       0.56      0.52      0.54      1391
           1       0.65      0.68      0.66      1807

    accuracy                           0.61      3198
   macro avg       0.60      0.60      0.60      3198
weighted avg       0.61      0.61      0.61      3198

[[ 728  663]
 [ 577 1230]]


### Logistics Regression Results of Various Models

In [36]:
lr_df_results=pd.DataFrame(lr_results,columns=['text_fields','feature_representation','f1-score'])
lr_df_results.sort_values(by=['f1-score'],ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.664865
1,statement,counts,0.633306
0,statement,binary,0.629886


Here you see how the performance of logistics model is improved using tfidf over counts and binary weightning.

## SVM

Support vector machines is an algorithm that determines the best decision boundary between vectors that belong to a given group (or category) and vectors that do not belong to it. That’s it. It can be applied to any kind of vectors which encode any kind of data. This means that in order to leverage the power of svm text classification, texts have to be transformed into vectors.

So, when SVM determines the decision boundary we mentioned above, SVM decides where to draw the best “line” (or the best hyperplane) that divides the space into two subspaces: one for the vectors which belong to the given category and one for the vectors which do not belong to it.

### Train Models with Different Types of Features¶

In [37]:
field='statement'
feature_reps=['binary','counts','tfidf']
svm_results=[]
svm_clf = svm.LinearSVC()

for feature_rep in feature_reps:
        print(f'SVM Model - {feature_rep} features with statement')
        svm_model,transformer,score=train_model(svm_clf,train_val,field=field,feature_rep=feature_rep)
        svm_results.append([field,feature_rep,score])

2020-11-20 15:41:33,995 : INFO : Starting model training...
2020-11-20 15:41:34,000 : INFO : Extracting features and creating vocabulary...


SVM Model - binary features with statement


2020-11-20 15:41:34,386 : INFO : Training a Classification Model...
2020-11-20 15:41:34,927 : INFO : Starting evaluation...
2020-11-20 15:41:34,941 : INFO : Done training and evaluation.
2020-11-20 15:41:34,946 : INFO : Starting model training...
2020-11-20 15:41:34,950 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.51      0.51      0.51      1391
           1       0.62      0.62      0.62      1807

    accuracy                           0.57      3198
   macro avg       0.57      0.57      0.57      3198
weighted avg       0.57      0.57      0.57      3198

[[ 716  675]
 [ 690 1117]]
SVM Model - counts features with statement


2020-11-20 15:41:35,433 : INFO : Training a Classification Model...
2020-11-20 15:41:36,410 : INFO : Starting evaluation...
2020-11-20 15:41:36,423 : INFO : Done training and evaluation.
2020-11-20 15:41:36,428 : INFO : Starting model training...
2020-11-20 15:41:36,431 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.51      0.52      0.52      1391
           1       0.63      0.62      0.63      1807

    accuracy                           0.58      3198
   macro avg       0.57      0.57      0.57      3198
weighted avg       0.58      0.58      0.58      3198

[[ 721  670]
 [ 680 1127]]
SVM Model - tfidf features with statement


2020-11-20 15:41:36,887 : INFO : Training a Classification Model...
2020-11-20 15:41:36,954 : INFO : Starting evaluation...
2020-11-20 15:41:36,977 : INFO : Done training and evaluation.


              precision    recall  f1-score   support

           0       0.55      0.53      0.54      1391
           1       0.65      0.66      0.66      1807

    accuracy                           0.61      3198
   macro avg       0.60      0.60      0.60      3198
weighted avg       0.61      0.61      0.61      3198

[[ 740  651]
 [ 606 1201]]


### SVM Results of Various Models

In [38]:
svm_df_results=pd.DataFrame(svm_results,columns=['text_fields','feature_representation','f1-score'])
svm_df_results.sort_values(by=['f1-score'],ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.656464
1,statement,counts,0.625416
0,statement,binary,0.620728


## Random Forest

Given the nature of random forests (a bagging decision tree), it is true that you may come up with a rather weak classifier, especially if only a couple of features are truly significant to determine the outcome.

However, keep in mind that in the case of text classification, a preprocessing phase is required to get either your TF or TF-IDF matrix, through which you have already made a selection of pertinent features. Potentially, all features are relevant in this matrix, so the random forest may be performant when you predict your outcome. (source: https://stats.stackexchange.com/questions/343954/random-forest-short-text-classification)

### Train Models with Different Types of Features¶

In [39]:
field='statement'
feature_reps=['binary','counts','tfidf']
rf_results=[]
rf_clf = RandomForestClassifier(n_estimators=1000)

for feature_rep in feature_reps:
        rf_model,transformer,score=train_model(rf_clf,train_val,field=field,feature_rep=feature_rep)
        rf_results.append([field,feature_rep,score])

2020-11-20 15:41:43,043 : INFO : Starting model training...
2020-11-20 15:41:43,047 : INFO : Extracting features and creating vocabulary...
2020-11-20 15:41:43,405 : INFO : Training a Classification Model...
2020-11-20 15:43:07,701 : INFO : Starting evaluation...
2020-11-20 15:43:07,724 : INFO : Done training and evaluation.
2020-11-20 15:43:07,795 : INFO : Starting model training...
2020-11-20 15:43:07,801 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.61      0.43      0.50      1391
           1       0.64      0.79      0.71      1807

    accuracy                           0.63      3198
   macro avg       0.63      0.61      0.61      3198
weighted avg       0.63      0.63      0.62      3198

[[ 597  794]
 [ 381 1426]]


2020-11-20 15:43:08,281 : INFO : Training a Classification Model...
2020-11-20 15:44:29,706 : INFO : Starting evaluation...
2020-11-20 15:44:29,718 : INFO : Done training and evaluation.
2020-11-20 15:44:29,720 : INFO : Starting model training...
2020-11-20 15:44:29,724 : INFO : Extracting features and creating vocabulary...


              precision    recall  f1-score   support

           0       0.60      0.42      0.50      1391
           1       0.64      0.79      0.71      1807

    accuracy                           0.63      3198
   macro avg       0.62      0.60      0.60      3198
weighted avg       0.62      0.63      0.61      3198

[[ 585  806]
 [ 384 1423]]


2020-11-20 15:44:30,134 : INFO : Training a Classification Model...
2020-11-20 15:45:48,937 : INFO : Starting evaluation...
2020-11-20 15:45:48,954 : INFO : Done training and evaluation.


              precision    recall  f1-score   support

           0       0.60      0.43      0.50      1391
           1       0.64      0.78      0.70      1807

    accuracy                           0.63      3198
   macro avg       0.62      0.60      0.60      3198
weighted avg       0.62      0.63      0.61      3198

[[ 592  799]
 [ 400 1407]]


### RF Results of Various Models¶

In [40]:
rf_df_results=pd.DataFrame(rf_results,columns=['text_fields','feature_representation','f1-score'])
rf_df_results.sort_values(by=['f1-score'],ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
0,statement,binary,0.70822
1,statement,counts,0.705154
2,statement,tfidf,0.701221


## K-fold cross validation

With K-fold cross validation, you are testing how well your model is able to get trained by some data and then predict data it hasn't seen. We use cross validation for this because if you train using all the data you have, you have none left for testing. You could do this once, say by using 80% of the data to train and 20% to test, but what if the 20% you happened to pick to test happens to contain a bunch of points that are particularly easy (or particularly hard) to predict? We will not have come up with the best estimate possible of the models ability to learn and predict.

In [41]:
#User defined functon for K-Fold cross validatoin
def apply_kfold(classifier,train_val,field,feature_rep):
    """
    K-fold cross validation on the the data
    """
    k_fold = KFold(n_splits=5, shuffle=True)
    scores = []
    confusion = np.array([[0,0],[0,0]])

    for fold_n, (train_index, valid_index) in enumerate(k_fold.split(train_val['statement'], train_val['label'])):
        print(fold_n, len(train_index), len(valid_index))
        train_x = train_val['statement'].iloc[train_index]
        train_y = train_val['label'].iloc[train_index]
    
        valid_x = train_val['statement'].iloc[valid_index]
        valid_y = train_val['label'].iloc[valid_index]
        
        # GET FEATURES
        train_features,val_features,feature_transformer=extract_features(field,train_x,valid_x,type=feature_rep)
        
        # INIT CLASSIFIER
        logging.info("Training a Classification Model...")
        classifier.fit(train_features, train_y)
        predictions = classifier.predict(val_features)
        
        confusion += confusion_matrix(valid_y,predictions)
        score = f1_score(valid_y,predictions)
        scores.append(score)
        
    return (print('Total statements classified:', len(train_val['statement'])),
    print('Score:', sum(scores)/len(scores)),
    print('score length', len(scores)),
    print('Confusion matrix:'),
    print(confusion))

## Naive Bayes with K-fold cross validation

In [42]:
field='statement'
feature_reps=['binary','counts','tfidf']
nb_results=[]
nb_clf = MultinomialNB()
for feature_rep in feature_reps:
        print(f'Model - {feature_rep} features with statement')
        apply_kfold(nb_clf,train_val,field=field,feature_rep=feature_rep)

2020-11-20 15:45:49,036 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement
0 10232 2559


2020-11-20 15:45:49,838 : INFO : Training a Classification Model...
2020-11-20 15:45:49,858 : INFO : Extracting features and creating vocabulary...


1 10233 2558


2020-11-20 15:45:50,866 : INFO : Training a Classification Model...
2020-11-20 15:45:50,886 : INFO : Extracting features and creating vocabulary...


2 10233 2558


2020-11-20 15:45:51,376 : INFO : Training a Classification Model...
2020-11-20 15:45:51,386 : INFO : Extracting features and creating vocabulary...


3 10233 2558


2020-11-20 15:45:51,782 : INFO : Training a Classification Model...
2020-11-20 15:45:51,791 : INFO : Extracting features and creating vocabulary...


4 10233 2558


2020-11-20 15:45:52,145 : INFO : Training a Classification Model...
2020-11-20 15:45:52,158 : INFO : Extracting features and creating vocabulary...


Total statements classified: 12791
Score: 0.6743321808352938
score length 5
Confusion matrix:
[[2731 2926]
 [2016 5118]]
Model - counts features with statement
0 10232 2559


2020-11-20 15:45:52,506 : INFO : Training a Classification Model...
2020-11-20 15:45:52,517 : INFO : Extracting features and creating vocabulary...


1 10233 2558


2020-11-20 15:45:52,872 : INFO : Training a Classification Model...
2020-11-20 15:45:52,882 : INFO : Extracting features and creating vocabulary...


2 10233 2558


2020-11-20 15:45:53,232 : INFO : Training a Classification Model...
2020-11-20 15:45:53,242 : INFO : Extracting features and creating vocabulary...


3 10233 2558


2020-11-20 15:45:53,787 : INFO : Training a Classification Model...
2020-11-20 15:45:53,797 : INFO : Extracting features and creating vocabulary...


4 10233 2558


2020-11-20 15:45:54,139 : INFO : Training a Classification Model...
2020-11-20 15:45:54,153 : INFO : Extracting features and creating vocabulary...


Total statements classified: 12791
Score: 0.6666163925388202
score length 5
Confusion matrix:
[[2766 2891]
 [2122 5012]]
Model - tfidf features with statement
0 10232 2559


2020-11-20 15:45:54,507 : INFO : Training a Classification Model...
2020-11-20 15:45:54,516 : INFO : Extracting features and creating vocabulary...


1 10233 2558


2020-11-20 15:45:54,867 : INFO : Training a Classification Model...
2020-11-20 15:45:54,878 : INFO : Extracting features and creating vocabulary...


2 10233 2558


2020-11-20 15:45:55,233 : INFO : Training a Classification Model...
2020-11-20 15:45:55,242 : INFO : Extracting features and creating vocabulary...


3 10233 2558


2020-11-20 15:45:55,591 : INFO : Training a Classification Model...
2020-11-20 15:45:55,599 : INFO : Extracting features and creating vocabulary...


4 10233 2558


2020-11-20 15:45:55,944 : INFO : Training a Classification Model...


Total statements classified: 12791
Score: 0.7094457956840234
score length 5
Confusion matrix:
[[1537 4120]
 [ 947 6187]]


## Logistics Regression with K-fold cross Validation

In [43]:
field='statement'
feature_reps=['binary','counts','tfidf']
LogR_clf = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)

for feature_rep in feature_reps:
        print(f'Model - {feature_rep} features with statement')
        apply_kfold(LogR_clf,train_val,field=field,feature_rep=feature_rep)

2020-11-20 15:47:07,313 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement
0 10232 2559


2020-11-20 15:47:07,699 : INFO : Training a Classification Model...
2020-11-20 15:47:07,869 : INFO : Extracting features and creating vocabulary...


[LibLinear]1 10233 2558


2020-11-20 15:47:08,382 : INFO : Training a Classification Model...
2020-11-20 15:47:08,539 : INFO : Extracting features and creating vocabulary...


[LibLinear]2 10233 2558


2020-11-20 15:47:08,938 : INFO : Training a Classification Model...


[LibLinear]

2020-11-20 15:47:09,169 : INFO : Extracting features and creating vocabulary...


3 10233 2558


2020-11-20 15:47:09,569 : INFO : Training a Classification Model...
2020-11-20 15:47:09,735 : INFO : Extracting features and creating vocabulary...


[LibLinear]4 10233 2558


2020-11-20 15:47:10,103 : INFO : Training a Classification Model...
2020-11-20 15:47:10,278 : INFO : Extracting features and creating vocabulary...


[LibLinear]Total statements classified: 12791
Score: 0.6300608298258705
score length 5
Confusion matrix:
[[2927 2730]
 [2595 4539]]
Model - counts features with statement
0 10232 2559


2020-11-20 15:47:10,657 : INFO : Training a Classification Model...
2020-11-20 15:47:10,824 : INFO : Extracting features and creating vocabulary...


[LibLinear]1 10233 2558


2020-11-20 15:47:11,212 : INFO : Training a Classification Model...


[LibLinear]

2020-11-20 15:47:11,499 : INFO : Extracting features and creating vocabulary...


2 10233 2558


2020-11-20 15:47:12,023 : INFO : Training a Classification Model...


[LibLinear]

2020-11-20 15:47:12,241 : INFO : Extracting features and creating vocabulary...


3 10233 2558


2020-11-20 15:47:12,707 : INFO : Training a Classification Model...


[LibLinear]

2020-11-20 15:47:13,062 : INFO : Extracting features and creating vocabulary...


4 10233 2558


2020-11-20 15:47:13,467 : INFO : Training a Classification Model...


[LibLinear]

2020-11-20 15:47:13,682 : INFO : Extracting features and creating vocabulary...


Total statements classified: 12791
Score: 0.6337859530999068
score length 5
Confusion matrix:
[[2861 2796]
 [2526 4608]]
Model - tfidf features with statement
0 10232 2559


2020-11-20 15:47:14,133 : INFO : Training a Classification Model...
2020-11-20 15:47:14,207 : INFO : Extracting features and creating vocabulary...


[LibLinear]1 10233 2558


2020-11-20 15:47:14,587 : INFO : Training a Classification Model...
2020-11-20 15:47:14,658 : INFO : Extracting features and creating vocabulary...


[LibLinear]2 10233 2558


2020-11-20 15:47:15,092 : INFO : Training a Classification Model...
2020-11-20 15:47:15,158 : INFO : Extracting features and creating vocabulary...


[LibLinear]3 10233 2558


2020-11-20 15:47:15,582 : INFO : Training a Classification Model...
2020-11-20 15:47:15,650 : INFO : Extracting features and creating vocabulary...


[LibLinear]4 10233 2558


2020-11-20 15:47:16,063 : INFO : Training a Classification Model...


[LibLinear]Total statements classified: 12791
Score: 0.6639971147599301
score length 5
Confusion matrix:
[[2799 2858]
 [2168 4966]]


## SVM with K-fold cross Validation

In [46]:
field='statement'
feature_reps=['binary','counts','tfidf']
svm_clf = svm.LinearSVC()

for feature_rep in feature_reps:
        print(f'Model - {feature_rep} features with statement')
        apply_kfold(svm_clf,train_val,field=field,feature_rep=feature_rep)

2020-11-20 16:09:23,264 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement
0 10232 2559


2020-11-20 16:09:23,683 : INFO : Training a Classification Model...
2020-11-20 16:09:24,174 : INFO : Extracting features and creating vocabulary...


1 10233 2558


2020-11-20 16:09:24,585 : INFO : Training a Classification Model...
2020-11-20 16:09:24,933 : INFO : Extracting features and creating vocabulary...


2 10233 2558


2020-11-20 16:09:25,309 : INFO : Training a Classification Model...
2020-11-20 16:09:25,714 : INFO : Extracting features and creating vocabulary...


3 10233 2558


2020-11-20 16:09:26,107 : INFO : Training a Classification Model...
2020-11-20 16:09:26,617 : INFO : Extracting features and creating vocabulary...


4 10233 2558


2020-11-20 16:09:26,998 : INFO : Training a Classification Model...
2020-11-20 16:09:27,543 : INFO : Extracting features and creating vocabulary...


Total statements classified: 12791
Score: 0.6227153624086335
score length 5
Confusion matrix:
[[2920 2737]
 [2669 4465]]
Model - counts features with statement
0 10232 2559


2020-11-20 16:09:27,939 : INFO : Training a Classification Model...
2020-11-20 16:09:28,851 : INFO : Extracting features and creating vocabulary...


1 10233 2558


2020-11-20 16:09:29,226 : INFO : Training a Classification Model...
2020-11-20 16:09:30,151 : INFO : Extracting features and creating vocabulary...


2 10233 2558


2020-11-20 16:09:30,566 : INFO : Training a Classification Model...
2020-11-20 16:09:31,317 : INFO : Extracting features and creating vocabulary...


3 10233 2558


2020-11-20 16:09:31,693 : INFO : Training a Classification Model...
2020-11-20 16:09:32,605 : INFO : Extracting features and creating vocabulary...


4 10233 2558


2020-11-20 16:09:32,994 : INFO : Training a Classification Model...
2020-11-20 16:09:33,858 : INFO : Extracting features and creating vocabulary...


Total statements classified: 12791
Score: 0.6225832289724649
score length 5
Confusion matrix:
[[2913 2744]
 [2669 4465]]
Model - tfidf features with statement
0 10232 2559


2020-11-20 16:09:34,238 : INFO : Training a Classification Model...
2020-11-20 16:09:34,333 : INFO : Extracting features and creating vocabulary...


1 10233 2558


2020-11-20 16:09:34,842 : INFO : Training a Classification Model...
2020-11-20 16:09:34,922 : INFO : Extracting features and creating vocabulary...


2 10233 2558


2020-11-20 16:09:35,324 : INFO : Training a Classification Model...
2020-11-20 16:09:35,399 : INFO : Extracting features and creating vocabulary...


3 10233 2558


2020-11-20 16:09:35,785 : INFO : Training a Classification Model...
2020-11-20 16:09:35,856 : INFO : Extracting features and creating vocabulary...


4 10233 2558


2020-11-20 16:09:36,262 : INFO : Training a Classification Model...


Total statements classified: 12791
Score: 0.6525636545634905
score length 5
Confusion matrix:
[[2841 2816]
 [2315 4819]]


## RF with K-fold cross Validation

In [45]:
field='statement'
feature_reps=['binary','counts','tfidf']
rf_clf = RandomForestClassifier(n_estimators=1000)

for feature_rep in feature_reps:
        print(f'Model - {feature_rep} features with statement')
        apply_kfold(rf_clf,train_val,field=field,feature_rep=feature_rep)

2020-11-20 15:47:29,331 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement
0 10232 2559


2020-11-20 15:47:29,724 : INFO : Training a Classification Model...
2020-11-20 15:48:59,125 : INFO : Extracting features and creating vocabulary...


1 10233 2558


2020-11-20 15:48:59,563 : INFO : Training a Classification Model...
2020-11-20 15:50:30,532 : INFO : Extracting features and creating vocabulary...


2 10233 2558


2020-11-20 15:50:30,997 : INFO : Training a Classification Model...
2020-11-20 15:52:02,014 : INFO : Extracting features and creating vocabulary...


3 10233 2558


2020-11-20 15:52:02,461 : INFO : Training a Classification Model...
2020-11-20 15:53:35,295 : INFO : Extracting features and creating vocabulary...


4 10233 2558


2020-11-20 15:53:35,669 : INFO : Training a Classification Model...
2020-11-20 15:55:06,945 : INFO : Extracting features and creating vocabulary...


Total statements classified: 12791
Score: 0.7090001590443251
score length 5
Confusion matrix:
[[2311 3346]
 [1378 5756]]
Model - counts features with statement
0 10232 2559


2020-11-20 15:55:07,403 : INFO : Training a Classification Model...
2020-11-20 15:56:37,501 : INFO : Extracting features and creating vocabulary...


1 10233 2558


2020-11-20 15:56:37,899 : INFO : Training a Classification Model...
2020-11-20 15:58:06,296 : INFO : Extracting features and creating vocabulary...


2 10233 2558


2020-11-20 15:58:06,663 : INFO : Training a Classification Model...
2020-11-20 15:59:36,646 : INFO : Extracting features and creating vocabulary...


3 10233 2558


2020-11-20 15:59:37,061 : INFO : Training a Classification Model...
2020-11-20 16:01:06,434 : INFO : Extracting features and creating vocabulary...


4 10233 2558


2020-11-20 16:01:06,872 : INFO : Training a Classification Model...
2020-11-20 16:02:34,596 : INFO : Extracting features and creating vocabulary...


Total statements classified: 12791
Score: 0.7031539332144441
score length 5
Confusion matrix:
[[2281 3376]
 [1435 5699]]
Model - tfidf features with statement
0 10232 2559


2020-11-20 16:02:34,959 : INFO : Training a Classification Model...
2020-11-20 16:03:55,720 : INFO : Extracting features and creating vocabulary...


1 10233 2558


2020-11-20 16:03:56,082 : INFO : Training a Classification Model...
2020-11-20 16:05:17,504 : INFO : Extracting features and creating vocabulary...


2 10233 2558


2020-11-20 16:05:17,895 : INFO : Training a Classification Model...
2020-11-20 16:06:38,275 : INFO : Extracting features and creating vocabulary...


3 10233 2558


2020-11-20 16:06:38,671 : INFO : Training a Classification Model...
2020-11-20 16:08:02,022 : INFO : Extracting features and creating vocabulary...


4 10233 2558


2020-11-20 16:08:02,418 : INFO : Training a Classification Model...


Total statements classified: 12791
Score: 0.6990642799191381
score length 5
Confusion matrix:
[[2331 3326]
 [1513 5621]]


## Best Model Selection

"""
Out of all the models fitted, we would take 2 best performing model. we would call them candidate models
from the confusion matrix, we can see that logistic regression and SVM (with either binary or tfidf features) are better performing 
in terms of precision and recall (take a look into false positive and true negative counts which appeares
to be low compared to rest of the models).

Using k-fold cross validation, we see the performance of the models on the entire dataset. And, the model's aren't performing well. We can apply other features to improve the performance, and grid-search can also help us to find best parameters to improve the perfromance.
"""

## Train the best Model on entire dataset

In [47]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def extract_final_features(field,training_data,type):
    """Extract features using different methods"""
    
    logging.info("Extracting features and creating vocabulary...")
    
    if "binary" in type:
        
        # BINARY FEATURE REPRESENTATION
        cv= CountVectorizer(binary=True, max_df=0.95)
        cv.fit_transform(training_data.values)
        
        train_feature_set=cv.transform(training_data.values)
        
        return train_feature_set,cv
  
    elif "counts" in type:
        
        # COUNT BASED FEATURE REPRESENTATION
        cv= CountVectorizer(binary=False, max_df=0.95)
        cv.fit_transform(training_data.values)
        
        train_feature_set=cv.transform(training_data.values)
        
        return train_feature_set,cv
    
    else:    
        
        # TF-IDF BASED FEATURE REPRESENTATION
        tfidf_vectorizer=TfidfVectorizer(use_idf=True, max_df=0.95)
        tfidf_vectorizer.fit_transform(training_data.values)
        
        train_feature_set=tfidf_vectorizer.transform(training_data.values)
        
        return train_feature_set,tfidf_vectorizer

In [50]:
def train_final_model(classifier, train_val, field="statement",feature_rep="binary"):
    """
    Training the best classifier on entire dataset for the provided features.
    """
    
    logging.info("Starting model training...")    

    # features
    train_x=train_val['statement']
    
    # GET LABELS
    target=train_val['label'].values
     
    # GET FEATURES
    features,feature_transformer=extract_final_features(field,train_x,type=feature_rep)

    # INIT LOGISTIC REGRESSION CLASSIFIER
    logging.info("Training a Final Model...")
#     scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
    model=classifier.fit(features,target)

    logging.info("Done training.")
    
    return model,feature_transformer

In [57]:
def get_predictions(model,X_test):
    
    # get predicted labels
    pred = model.predict(X_test)
    
    return pred

In [51]:
field='statement'
LogR_clf_final = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
lr_final_model,transformer=train_final_model(LogR_clf_final,train_val,field=field,feature_rep='counts')

2020-11-20 16:26:16,433 : INFO : Starting model training...
2020-11-20 16:26:16,435 : INFO : Extracting features and creating vocabulary...
2020-11-20 16:26:16,859 : INFO : Training a Final Model...


[LibLinear]

2020-11-20 16:26:17,140 : INFO : Done training.


## Check predictions on unseen data

In [64]:
# https://www.snopes.com/fact-check/alaska-town-60-days-without-sun/
test_features=transformer.transform(["The sun does not rise in Utqiagvik, Alaska, for more than 60 days during the winter."])
ouput = get_predictions(lr_final_model,test_features)


In [65]:
ouput[0]

1

In [66]:
# https://www.politifact.com/factchecks/2020/nov/20/viral-image/no-passage-about-defeat-isnt-donald-trumps-art-dea/
test_features=transformer.transform(["Says Donald Trump’s book “The Art of the Deal” advises: “Never admit defeat. You win. If you don’t win, claim they cheated.”"])
ouput = get_predictions(lr_final_model,test_features)

In [74]:
ouput[0] # this information is predicted as true, however, it should be false

1

## Save Model for Future Use

In [70]:
import pickle

model_path="../models/lr_final_model.pkl"
transformer_path="../models/transformer.pkl"

# we need to save both the transformer -> to encode a document and the model itself to make predictions based on the weight vectors 
pickle.dump(lr_final_model,open(model_path, 'wb'))
pickle.dump(transformer,open(transformer_path,'wb'))