In [37]:
import pandas as pd
import numpy as np
import string
import logging

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import  LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support

import nltk
import nltk.corpus 
from nltk.corpus import stopwords

## Read Data

In [38]:
train_news = pd.read_csv('../data/processed/train.csv')
val_news = pd.read_csv('../data/processed/val.csv')
test_news = pd.read_csv('../data/processed/test.csv')

## Merging train & val data for K-Fold

In [40]:
frames = [train_news, val_news]
train_val = pd.concat(frames)
train_val['label'].value_counts()
train_val['label'] = Encoder.fit_transform(train_val['label'])

In [41]:
display(train_news), display(test_news), display(val_news)

Unnamed: 0,label,statement
0,False,Says the Annies List political group supports ...
1,True,When did the decline of coal start? It started...
2,True,"Hillary Clinton agrees with John McCain ""by vo..."
3,False,Health care reform legislation is likely to ma...
4,True,The economic turnaround started at the end of ...
...,...,...
10235,True,There are a larger number of shark attacks in ...
10236,True,Democrats have now become the party of the [At...
10237,True,Says an alternative to Social Security that op...
10238,False,On lifting the U.S. Cuban embargo and allowing...


Unnamed: 0,label,statement
0,True,Building a wall on the U.S.-Mexico border will...
1,False,Wisconsin is on pace to double the number of l...
2,False,Says John McCain has done nothing to help the ...
3,True,Suzanne Bonamici supports a plan that will cut...
4,False,When asked by a reporter whether hes at the ce...
...,...,...
1262,True,Says his budget provides the highest state fun...
1263,False,Ive been here almost every day.
1264,False,"In the early 1980s, Sen. Edward Kennedy secret..."
1265,False,Says an EPA permit languished under Strickland...


Unnamed: 0,label,statement
0,False,We have less Americans working now than in the...
1,False,"When Obama was sworn into office, he DID NOT u..."
2,False,Says Having organizations parading as being so...
3,True,Says nearly half of Oregons children are poor.
4,True,On attacks by Republicans that various program...
...,...,...
1279,True,"For the first time in more than a decade, impo..."
1280,True,Says Donald Trump has bankrupted his companies...
1281,True,"John McCain and George Bush have ""absolutely n..."
1282,False,A new poll shows 62 percent support the presid...


(None, None, None)

In [43]:
def process_text(text):
    '''
    What will be covered:
    1. Remove punctuation
    2. Remove stopwords
    3. Return list of clean text words
    '''
    
    #1
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    #2
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    #3
    return clean_words

In [7]:
# count_vect = CountVectorizer(analyzer=process_text)
# tfidf_transformer = TfidfTransformer()

## Feature Weighting

Not all words are equally important to a particular document / category. For example, while words like ‘murder’, ‘knife’ and ‘abduction’ are important to a crime related document, words like ‘news’ and ‘reporter’ may not be quite as important. 

### Binary Weighting
The most basic form of feature weighting, is binary weighting. Where if a word is present in a document, the weight is ‘1’ and if the word is absent the weight is ‘0’. 

### CountVectorizer

It Convert a collection of text documents to a matrix of token counts.


### Tfidf Weighting 

TF-IDF weighting where words that are unique to a particular document would have higher weights compared to words that are used commonly across documents. 

1. TF (Term Frequency): The number of times a word appears in a document divded by the total number of words in the document. Every document has its own term frequency.

2. IDF (Inverse Data Frequency): The log of the number of documents divided by the number of documents that contain the word w. Inverse data frequency determines the weight of rare words across all documents in the corpus.

3. Lastly, the TF-IDF is simply the TF multiplied by IDF.

In [44]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def extract_features(field,training_data,testing_data,type):
    """Extract features using different methods"""
    
    logging.info("Extracting features and creating vocabulary...")
    
    if "binary" in type:
        
        # BINARY FEATURE REPRESENTATION
        cv= CountVectorizer(binary=True, max_df=0.95)
        cv.fit_transform(training_data[field].values)
        
        train_feature_set=cv.transform(training_data[field].values)
        test_feature_set=cv.transform(testing_data[field].values)
        
        return train_feature_set,test_feature_set,cv
  
    elif "counts" in type:
        
        # COUNT BASED FEATURE REPRESENTATION
        cv= CountVectorizer(binary=False, max_df=0.95)
        cv.fit_transform(training_data[field].values)
        
        train_feature_set=cv.transform(training_data[field].values)
        test_feature_set=cv.transform(testing_data[field].values)
        
        return train_feature_set,test_feature_set,cv
    
    else:    
        
        # TF-IDF BASED FEATURE REPRESENTATION
        tfidf_vectorizer=TfidfVectorizer(use_idf=True, max_df=0.95)
        tfidf_vectorizer.fit_transform(training_data[field].values)
        
        train_feature_set=tfidf_vectorizer.transform(training_data[field].values)
        test_feature_set=tfidf_vectorizer.transform(testing_data[field].values)
        
        return train_feature_set,test_feature_set,tfidf_vectorizer

In [45]:
def train_model(classifier, train_val, field="statement",feature_rep="binary",top_k=3):
    
    logging.info("Starting model training...")
    
    scores = []
    confusion = np.array([[0,0],[0,0]])
    
    # GET A TRAIN TEST SPLIT (set seed for consistent results)
    training_data, testing_data = train_test_split(train_val,random_state = 2000,)

    # GET LABELS
    Y_train=training_data['label'].values
    Y_test=testing_data['label'].values
     
    # GET FEATURES
    X_train,X_test,feature_transformer=extract_features(field,training_data,testing_data,type=feature_rep)

    # INIT LOGISTIC REGRESSION CLASSIFIER
    logging.info("Training a Classification Model...")
#     scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
    model=classifier.fit(X_train,Y_train)

    # GET PREDICTIONS
    predictions = model.predict(X_test)
    
    # GET EVALUATION NUMBERS ON TEST SET -- HOW DID WE DO?
    logging.info("Starting evaluation...")
    confusion = confusion_matrix(Y_test,predictions)
    score = f1_score(Y_test,predictions)
    report = classification_report(test_label,predicted)
    
    logging.info("Done training and evaluation.")
    
    return model,feature_transformer,score,confusion,report

## Metric

I need to minimize false positives (number of fake news predicted as real) as it can -vely impact people by misleadling them. For class 0 i.e. 'fake', recall should be high as well as precision. Because we want our model to perform well on both classes (real & fake). In short, we need to maximize f1-score.

### Cases to be considered to choose the right metric

**1. Maximizing recall of class 0 (fake) or minimizing false positives(FP)?**
Well, in extreme case, what if all the news predicted by model are labelled as 'fake'. Recall will still be 1, but overall model is really bad i.e. not able to predict class 1 ('real'). 

Ex=> TN = 553, FP = 0, TP = 0, FN = 714

Class0-Recall = TN / (TN + FP) = 1
Class0-Precision = TN / (TN + FN) = 0.43

F1-Score = 2 * Class0-Recall * Class0-Precision/(Class0-Recall + Class0-Precision) = 0.60

Recall, Precision and F1-score for class 1 will be 0.

**2. Considering an extreme case, if all the news classified as True (Even, fake news are predicted as True).**

Ex=>  TN = 0, FP = 553, TP = 714, FN =0
In that case, TN will be 0, which led to Precision 0, Recall 0 and F1 = 0 for class 0 ('fake').

For class 1, Class1-Recall = TP / (TP + FN) = 1
Class1-Precision = TP / (TP + FP) = 0.56

## Model Training

## Text Classification Algorithms

## Naive Bayes Algorithm

Well, when assumption of independence holds, a Naive Bayes classifier performs better compare to other models like logistic regression and you need less training data. An advantage of naive Bayes is that it only requires a small number of training data to estimate the parameters necessary for classification. 

Bayes’ Theorem provides a way that we can calculate the probability of a piece of data belonging to a given class, given our prior knowledge. Bayes’ Theorem is stated as:

P(class|data) = (P(data|class) * P(class)) / P(data)

Where P(class|data) is the probability of class given the provided data.

Naive Bayes is a classification algorithm for binary (two-class) and multiclass classification problems. It is called Naive Bayes or idiot Bayes because the calculations of the probabilities for each class are simplified to make their calculations tractable.

Rather than attempting to calculate the probabilities of each attribute value, they are assumed to be conditionally independent given the class value.

This is a very strong assumption that is most unlikely in real data, i.e. that the attributes do not interact. Nevertheless, the approach performs surprisingly well on data where this assumption does not hold.

### Train Different Types of Models

In [58]:
# model,transformer,score,confusion,report=train_model(nb_clf, train_val,field=field,feature_rep=feature_rep)
# print("\nF1-score={0}; confusion={1}; classification_report={2}".format(score,confusion,report))
field='statement'
feature_reps=['binary','counts','tfidf']
nb_results=[]
nb_clf = MultinomialNB()
for feature_rep in feature_reps:
        print(f'Model - {feature_rep} features with statement')
        model,transformer,score,confusion,report=train_model(nb_clf,train_val,field=field,feature_rep=feature_rep)
        nb_results.append([field,feature_rep,score])

2020-11-18 23:58:08,481 : INFO : Starting model training...
2020-11-18 23:58:08,487 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement


2020-11-18 23:58:08,838 : INFO : Training a Classification Model...
2020-11-18 23:58:08,842 : INFO : Starting evaluation...
2020-11-18 23:58:08,851 : INFO : Done training and evaluation.
2020-11-18 23:58:08,853 : INFO : Starting model training...
2020-11-18 23:58:08,857 : INFO : Extracting features and creating vocabulary...


Model - counts features with statement


2020-11-18 23:58:09,243 : INFO : Training a Classification Model...
2020-11-18 23:58:09,249 : INFO : Starting evaluation...
2020-11-18 23:58:09,258 : INFO : Done training and evaluation.
2020-11-18 23:58:09,260 : INFO : Starting model training...
2020-11-18 23:58:09,269 : INFO : Extracting features and creating vocabulary...


Model - tfidf features with statement


2020-11-18 23:58:09,760 : INFO : Training a Classification Model...
2020-11-18 23:58:09,764 : INFO : Starting evaluation...
2020-11-18 23:58:09,772 : INFO : Done training and evaluation.


### Naive Bayes Results of Various Models

In [59]:
nb_df_results=pd.DataFrame(nb_results,columns=['text_fields','feature_representation','f1-score'])
nb_df_results.sort_values(by=['f1-score'],ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.714648
1,statement,counts,0.677438
0,statement,binary,0.675999


In [36]:
# nb_clf_pipeline = Pipeline([('vect', count_vect),
#                       ('tfidf', tfidf_transformer),
#                       ('nb_clf', MultinomialNB()),
#  ])
# nb_clf_pipeline.fit(train_news['statement'], train_label)
# predicted = nb_clf_pipeline.predict(test_news['statement'])
# print(np.mean(predicted == test_label))
# print(classification_report(test_label,predicted))
# print(confusion_matrix(test_label,predicted))

## logistic regression

How hypothesis makes prediction in logistics regression?

This algorithm uses sigmoid function(g(z)). If we want to predict y=1 or y=0.
If estimated probability of y=1 is h(x)>=0.5 then the ouput is more likely to be "y=1" 
but if  h(x) < 0.5, the output is more likely to be is "y=0".

The underlying algorithm is also fairly easy to understand. More importantly, in the NLP world, it’s generally accepted that Logistic Regression is a great starter algorithm for text related classification (https://web.stanford.edu/~jurafsky/slp3/5.pdf). 

### Train a Single Model

### Model - 1 (binary features with statement)

In [27]:
field='statement'
feature_rep='binary'
LogR_clf = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)

model,transformer,score,confusion,report=train_model(LogR_clf, train_val,field=field,feature_rep=feature_rep)
print("\nF1-score={0}; confusion={1}; classification_report={2}".format(score,confusion,report))

2020-11-18 22:49:33,166 : INFO : Starting model training...
2020-11-18 22:49:33,171 : INFO : Extracting features and creating vocabulary...
2020-11-18 22:49:33,524 : INFO : Training a Logistic Regression Model...
2020-11-18 22:49:33,649 : INFO : Starting evaluation...
2020-11-18 22:49:33,654 : INFO : Done training and evaluation.


[LibLinear]
F1-score=0.6378838552751597; confusion=[[ 641  632]
 [ 559 1049]]


### Model - 2 (counts features with statement)¶

In [31]:
field='statement'
feature_rep='counts'

model,transformer,score,confusion=train_model(train_val,field=field,feature_rep=feature_rep)
print("\nF1-score={0}; confusion={1}".format(score,confusion))

2020-11-18 22:53:42,497 : INFO : Starting model training...
2020-11-18 22:53:42,503 : INFO : Extracting features and creating vocabulary...
2020-11-18 22:53:42,840 : INFO : Training a Logistic Regression Model...
2020-11-18 22:53:42,997 : INFO : Starting evaluation...
2020-11-18 22:53:43,004 : INFO : Done training and evaluation.


[LibLinear]
F1-score=0.6424021838034577; confusion=[[ 643  630]
 [ 549 1059]]


### Model - 3 (tfidf features with statement)¶

In [30]:
field='statement'
feature_rep='tfidf'

model,transformer,score,confusion=train_model(train_val,field=field,feature_rep=feature_rep)
print("\nF1-score={0}; confusion={1}".format(score,confusion))

2020-11-18 22:53:34,271 : INFO : Starting model training...
2020-11-18 22:53:34,275 : INFO : Extracting features and creating vocabulary...
2020-11-18 22:53:34,644 : INFO : Training a Logistic Regression Model...
2020-11-18 22:53:34,692 : INFO : Starting evaluation...
2020-11-18 22:53:34,698 : INFO : Done training and evaluation.


[LibLinear]
F1-score=0.6595807499261884; confusion=[[ 611  662]
 [ 491 1117]]


### Train Different Types of Models

In [60]:
field='statement'
feature_reps=['binary','counts','tfidf']
lr_results=[]
LogR_clf = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)

for feature_rep in feature_reps:
        print(f'Model - {feature_rep} features with statement')
        model,transformer,score,confusion,report=train_model(LogR_clf,train_val,field=field,feature_rep=feature_rep)
        lr_results.append([field,feature_rep,score])

2020-11-18 23:59:00,334 : INFO : Starting model training...
2020-11-18 23:59:00,340 : INFO : Extracting features and creating vocabulary...


Model - binary features with statement


2020-11-18 23:59:00,683 : INFO : Training a Classification Model...
2020-11-18 23:59:00,808 : INFO : Starting evaluation...
2020-11-18 23:59:00,817 : INFO : Done training and evaluation.
2020-11-18 23:59:00,820 : INFO : Starting model training...
2020-11-18 23:59:00,824 : INFO : Extracting features and creating vocabulary...


[LibLinear]Model - counts features with statement


2020-11-18 23:59:01,247 : INFO : Training a Classification Model...
2020-11-18 23:59:01,433 : INFO : Starting evaluation...
2020-11-18 23:59:01,443 : INFO : Done training and evaluation.
2020-11-18 23:59:01,446 : INFO : Starting model training...
2020-11-18 23:59:01,450 : INFO : Extracting features and creating vocabulary...


[LibLinear]Model - tfidf features with statement


2020-11-18 23:59:01,841 : INFO : Training a Classification Model...
2020-11-18 23:59:01,887 : INFO : Starting evaluation...
2020-11-18 23:59:01,900 : INFO : Done training and evaluation.


[LibLinear]

### Logistics Regression Results of Various Models

In [61]:
lr_df_results=pd.DataFrame(lr_results,columns=['text_fields','feature_representation','f1-score'])
lr_df_results.sort_values(by=['f1-score'],ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.659581
1,statement,counts,0.642402
0,statement,binary,0.637884


Here you see how the performance of logistics model is improved using tfidf over counts and binary weightning.

## SVM

Support vector machines is an algorithm that determines the best decision boundary between vectors that belong to a given group (or category) and vectors that do not belong to it. That’s it. It can be applied to any kind of vectors which encode any kind of data. This means that in order to leverage the power of svm text classification, texts have to be transformed into vectors.

So, when SVM determines the decision boundary we mentioned above, SVM decides where to draw the best “line” (or the best hyperplane) that divides the space into two subspaces: one for the vectors which belong to the given category and one for the vectors which do not belong to it.

### Train Different Types of Models

In [62]:
field='statement'
feature_reps=['binary','counts','tfidf']
svm_results=[]
svm_clf = svm.LinearSVC()

for feature_rep in feature_reps:
        print(f'SVM Model - {feature_rep} features with statement')
        model,transformer,score,confusion,report=train_model(svm_clf,train_val,field=field,feature_rep=feature_rep)
        svm_results.append([field,feature_rep,score])

2020-11-18 23:59:18,218 : INFO : Starting model training...
2020-11-18 23:59:18,224 : INFO : Extracting features and creating vocabulary...


SVM Model - binary features with statement


2020-11-18 23:59:18,571 : INFO : Training a Classification Model...
2020-11-18 23:59:19,157 : INFO : Starting evaluation...
2020-11-18 23:59:19,166 : INFO : Done training and evaluation.
2020-11-18 23:59:19,169 : INFO : Starting model training...
2020-11-18 23:59:19,174 : INFO : Extracting features and creating vocabulary...


SVM Model - counts features with statement


2020-11-18 23:59:19,604 : INFO : Training a Classification Model...
2020-11-18 23:59:20,295 : INFO : Starting evaluation...
2020-11-18 23:59:20,305 : INFO : Done training and evaluation.
2020-11-18 23:59:20,308 : INFO : Starting model training...
2020-11-18 23:59:20,313 : INFO : Extracting features and creating vocabulary...


SVM Model - tfidf features with statement


2020-11-18 23:59:20,725 : INFO : Training a Classification Model...
2020-11-18 23:59:20,795 : INFO : Starting evaluation...
2020-11-18 23:59:20,808 : INFO : Done training and evaluation.


### SVM Results of Various Models

In [63]:
svm_df_results=pd.DataFrame(svm_results,columns=['text_fields','feature_representation','f1-score'])
svm_df_results.sort_values(by=['f1-score'],ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
2,statement,tfidf,0.653949
0,statement,binary,0.621156
1,statement,counts,0.620923


## Random Forest

Given the nature of random forests (a bagging decision tree), it is true that you may come up with a rather weak classifier, especially if only a couple of features are truly significant to determine the outcome.

However, keep in mind that in the case of text classification, a preprocessing phase is required to get either your TF or TF-IDF matrix, through which you have already made a selection of pertinent features. Potentially, all features are relevant in this matrix, so the random forest may be performant when you predict your outcome. (source: https://stats.stackexchange.com/questions/343954/random-forest-short-text-classification)

### Train Different Types of Models

In [65]:
field='statement'
feature_reps=['binary','counts','tfidf']
rf_results=[]
rf_clf = RandomForestClassifier(n_estimators=1000)

for feature_rep in feature_reps:
        model,transformer,score,confusion,report=train_model(rf_clf,train_val,field=field,feature_rep=feature_rep)
        rf_results.append([field,feature_rep,score])

2020-11-19 00:01:57,192 : INFO : Starting model training...
2020-11-19 00:01:57,198 : INFO : Extracting features and creating vocabulary...
2020-11-19 00:01:57,519 : INFO : Training a Classification Model...
2020-11-19 00:03:04,644 : INFO : Starting evaluation...
2020-11-19 00:03:04,656 : INFO : Done training and evaluation.
2020-11-19 00:03:04,658 : INFO : Starting model training...
2020-11-19 00:03:04,661 : INFO : Extracting features and creating vocabulary...
2020-11-19 00:03:04,999 : INFO : Training a Classification Model...
2020-11-19 00:04:11,020 : INFO : Starting evaluation...
2020-11-19 00:04:11,028 : INFO : Done training and evaluation.
2020-11-19 00:04:11,030 : INFO : Starting model training...
2020-11-19 00:04:11,034 : INFO : Extracting features and creating vocabulary...
2020-11-19 00:04:11,402 : INFO : Training a Classification Model...
2020-11-19 00:05:12,190 : INFO : Starting evaluation...
2020-11-19 00:05:12,204 : INFO : Done training and evaluation.


## RF Results of Various Models¶

In [66]:
rf_df_results=pd.DataFrame(rf_results,columns=['text_fields','feature_representation','f1-score'])
rf_df_results.sort_values(by=['f1-score'],ascending=False)

Unnamed: 0,text_fields,feature_representation,f1-score
0,statement,binary,0.706107
2,statement,tfidf,0.704757
1,statement,counts,0.702436


## K-fold cross validation

In [19]:
# cross validation with cat boost classification
def apply_crossvalidation(classifier):

    k_fold = KFold(n_splits=5, shuffle=True)
    scores = []
    confusion = np.array([[0,0],[0,0]])

    for fold_n, (train_index, valid_index) in enumerate(k_fold.split(train_val['statement'], train_val['label'])):
        print(fold_n, len(train_index), len(valid_index))
        train_x = train_val['statement'].iloc[train_index]
        train_y = train_val['label'].iloc[train_index]
    
        valid_x = train_val['statement'].iloc[valid_index]
        valid_y = train_val['label'].iloc[valid_index]
    
        classifier.fit(train_x, train_y)
        predictions = classifier.predict(valid_x)
        
        confusion += confusion_matrix(valid_y,predictions)
        score = f1_score(valid_y,predictions)
        scores.append(score)
        
    return (print('Total statements classified:', len(train_val['statement'])),
    print('Score:', sum(scores)/len(scores)),
    print('score length', len(scores)),
    print('Confusion matrix:'),
    print(confusion))

In [20]:
apply_crossvalidation(nb_clf_pipeline)

0 9219 2305
1 9219 2305
2 9219 2305
3 9219 2305
4 9220 2304
Total statements classified: 11524
Score: 0.7043800897131771
score length 5
Confusion matrix:
[[1444 3660]
 [ 940 5480]]


(None, None, None, None, None)

In [21]:
apply_crossvalidation(logR_pipeline)

0 9219 2305
1 9219 2305
2 9219 2305
3 9219 2305


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


4 9220 2304
Total statements classified: 11524
Score: 0.6415364639602688
score length 5
Confusion matrix:
[[2542 2562]
 [2178 4242]]


(None, None, None, None, None)

In [22]:
apply_crossvalidation(svm_pipeline)

0 9219 2305
1 9219 2305
2 9219 2305
3 9219 2305
4 9220 2304
Total statements classified: 11524
Score: 0.6062161766400181
score length 5
Confusion matrix:
[[2528 2576]
 [2507 3913]]


(None, None, None, None, None)

In [23]:
apply_crossvalidation(random_forest)

0 9219 2305
1 9219 2305
2 9219 2305
3 9219 2305
4 9220 2304
Total statements classified: 11524
Score: 0.6814466924391752
score length 5
Confusion matrix:
[[2299 2805]
 [1652 4768]]


(None, None, None, None, None)