In [12]:
import pandas as pd
import numpy as np
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import  LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold
import nltk
import nltk.corpus 
from nltk.corpus import stopwords

In [3]:
train_news = pd.read_csv('../data/processed/train.csv')
val_news = pd.read_csv('../data/processed/val.csv')
test_news = pd.read_csv('../data/processed/test.csv')

## LabelEncoding the target

In [4]:
Encoder = LabelEncoder()
train_label = Encoder.fit_transform(train_news.label)
val_label = Encoder.fit_transform(val_news.label)
test_label = Encoder.fit_transform(test_news.label)

In [5]:
display(train_news), display(test_news), display(val_news)

Unnamed: 0,label,statement
0,False,Says the Annies List political group supports ...
1,True,When did the decline of coal start? It started...
2,True,"Hillary Clinton agrees with John McCain ""by vo..."
3,False,Health care reform legislation is likely to ma...
4,True,The economic turnaround started at the end of ...
...,...,...
10235,True,There are a larger number of shark attacks in ...
10236,True,Democrats have now become the party of the [At...
10237,True,Says an alternative to Social Security that op...
10238,False,On lifting the U.S. Cuban embargo and allowing...


Unnamed: 0,label,statement
0,True,Building a wall on the U.S.-Mexico border will...
1,False,Wisconsin is on pace to double the number of l...
2,False,Says John McCain has done nothing to help the ...
3,True,Suzanne Bonamici supports a plan that will cut...
4,False,When asked by a reporter whether hes at the ce...
...,...,...
1262,True,Says his budget provides the highest state fun...
1263,False,Ive been here almost every day.
1264,False,"In the early 1980s, Sen. Edward Kennedy secret..."
1265,False,Says an EPA permit languished under Strickland...


Unnamed: 0,label,statement
0,False,We have less Americans working now than in the...
1,False,"When Obama was sworn into office, he DID NOT u..."
2,False,Says Having organizations parading as being so...
3,True,Says nearly half of Oregons children are poor.
4,True,On attacks by Republicans that various program...
...,...,...
1279,True,"For the first time in more than a decade, impo..."
1280,True,Says Donald Trump has bankrupted his companies...
1281,True,"John McCain and George Bush have ""absolutely n..."
1282,False,A new poll shows 62 percent support the presid...


(None, None, None)

In [6]:
train_label

array([0, 1, 1, ..., 1, 0, 0])

In [7]:
def process_text(text):
    '''
    What will be covered:
    1. Remove punctuation
    2. Remove stopwords
    3. Return list of clean text words
    '''
    
    #1
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    #2
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    #3
    return clean_words

In [8]:
count_vect = CountVectorizer(analyzer=process_text)
tfidf_transformer = TfidfTransformer()

## Naive Bayes Algorithm

Well, when assumption of independence holds, a Naive Bayes classifier performs better compare to other models like logistic regression and you need less training data. An advantage of naive Bayes is that it only requires a small number of training data to estimate the parameters necessary for classification. 

Bayes’ Theorem provides a way that we can calculate the probability of a piece of data belonging to a given class, given our prior knowledge. Bayes’ Theorem is stated as:

P(class|data) = (P(data|class) * P(class)) / P(data)

Where P(class|data) is the probability of class given the provided data.

Naive Bayes is a classification algorithm for binary (two-class) and multiclass classification problems. It is called Naive Bayes or idiot Bayes because the calculations of the probabilities for each class are simplified to make their calculations tractable.

Rather than attempting to calculate the probabilities of each attribute value, they are assumed to be conditionally independent given the class value.

This is a very strong assumption that is most unlikely in real data, i.e. that the attributes do not interact. Nevertheless, the approach performs surprisingly well on data where this assumption does not hold.



## Metric to use? 

I need to minimize false positives (number of fake news predicted as real) as it can -vely impact people by misleadling them. For class 0 i.e. 'fake', recall should be high as well as precision. Because we want our model to perform well on both classes (real & fake). In short, we need to maximize f1-score.

### Cases I considered to choose metric?

**1. If I just care to about maximizing recall of class 0 (fake) or minimizing false positives (FP)?**
Well, in extreme case, what if all the news predicted by model are labelled as 'fake'. Recall will still be 1, but overall model is really bad i.e. not able to predict class 1 ('real'). 

Ex=> TN = 553, FP = 0, TP = 0, FN = 714

Class0-Recall = TN / (TN + FP) = 1
Class0-Precision = TN / (TN + FN) = 0.43

F1-Score = 2 * Class0-Recall * Class0-Precision/(Class0-Recall + Class0-Precision) = 0.60

Recall, Precision and F1-score for class 1 will be 0.

**2. Considering an extreme case, if all the news predicted are labelled as True. Even, fake news are predicted as True.**

Ex=>  TN = 0, FP = 553, TP = 714, FN =0
In that case, TN will be 0, which led to Precision 0, Recall 0 and F1 = 0 for class 0 ('fake').

For class 1, Class1-Recall = TP / (TP + FN) = 1
Class1-Precision = TP / (TP + FP) = 0.56

Hence, we care about model's performance in both classes i.e. precision and recall for both class 0 and class 1. 

In [13]:
nb_clf_pipeline = Pipeline([('vect', count_vect),
                      ('tfidf', tfidf_transformer),
                      ('nb_clf', MultinomialNB()),
 ])
nb_clf_pipeline.fit(train_news['statement'], train_label)
predicted = nb_clf_pipeline.predict(test_news['statement'])
print(np.mean(predicted == test_label))
print(classification_report(test_label,predicted))
print(confusion_matrix(test_label,predicted))

0.6006314127861089
              precision    recall  f1-score   support

           0       0.60      0.25      0.36       553
           1       0.60      0.87      0.71       714

    accuracy                           0.60      1267
   macro avg       0.60      0.56      0.53      1267
weighted avg       0.60      0.60      0.56      1267

[[140 413]
 [ 93 621]]


## logistic regression

How hypothesis makes prediction in logistics regression?

This algorithm uses sigmoid function(g(z)). If we want to predict y=1 or y=0.
If estimated probability of y=1 is h(x)>=0.5 then the ouput is more likely to be "y=1" 
but if  h(x) < 0.5, the output is more likely to be is "y=0".

In [14]:
logR_pipeline = Pipeline([
        ('LogRCV',count_vect),
        ('LogR_clf',LogisticRegression())
        ])

logR_pipeline.fit(train_news['statement'],train_label)
predicted_LogR = logR_pipeline.predict(test_news['statement'])
print(np.mean(predicted_LogR == test_label))
print(classification_report(test_label,predicted_LogR))
print(confusion_matrix(test_label,predicted_LogR))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6077348066298343
              precision    recall  f1-score   support

           0       0.56      0.50      0.53       553
           1       0.64      0.69      0.66       714

    accuracy                           0.61      1267
   macro avg       0.60      0.60      0.60      1267
weighted avg       0.60      0.61      0.60      1267

[[278 275]
 [222 492]]


## SVM

In [15]:
svm_pipeline = Pipeline([
        ('svmCV',count_vect),
        ('svm_clf',svm.LinearSVC())
        ])

svm_pipeline.fit(train_news['statement'],train_label)
predicted_svm = svm_pipeline.predict(test_news['statement'])
print(np.mean(predicted_svm == test_label))
print(classification_report(test_label,predicted_svm))
print(confusion_matrix(test_label,predicted_svm))

0.5730071033938438
              precision    recall  f1-score   support

           0       0.51      0.50      0.51       553
           1       0.62      0.63      0.62       714

    accuracy                           0.57      1267
   macro avg       0.57      0.56      0.56      1267
weighted avg       0.57      0.57      0.57      1267

[[277 276]
 [265 449]]


## Random Forest

In [16]:
random_forest = Pipeline([
        ('rfCV', count_vect),
        ('rf_clf',RandomForestClassifier(n_estimators=200))
        ])
    
random_forest.fit(train_news['statement'],train_label)
predicted_rf = random_forest.predict(test_news['statement'])
print(np.mean(predicted_rf == test_label))
print(classification_report(test_label,predicted_rf))
print(confusion_matrix(test_label,predicted_rf))

0.6132596685082873
              precision    recall  f1-score   support

           0       0.58      0.41      0.48       553
           1       0.63      0.77      0.69       714

    accuracy                           0.61      1267
   macro avg       0.60      0.59      0.59      1267
weighted avg       0.61      0.61      0.60      1267

[[226 327]
 [163 551]]


## Merging train, val and test data for K-Fold

In [17]:
frames = [train_news.drop('length', axis=1), val_news]
train_val = pd.concat(frames)
train_val

KeyError: "['length'] not found in axis"

In [None]:
train_val['label'].value_counts()

In [None]:
train_val['label'] = Encoder.fit_transform(train_val['label'])

In [None]:
train_val['label']

So, we have merged all three datasets (train, test & val) together, so that we can run Naive Bayes with k-fold cross validation.

## K-fold cross validation

In [None]:
# cross validation with cat boost classification
def apply_crossvalidation(classifier):

    k_fold = KFold(n_splits=5, shuffle=True)
    scores = []
    confusion = np.array([[0,0],[0,0]])

    for fold_n, (train_index, valid_index) in enumerate(k_fold.split(train_val['statement'], train_val['label'])):
        print(fold_n, len(train_index), len(valid_index))
        train_x = train_val['statement'].iloc[train_index]
        train_y = train_val['label'].iloc[train_index]
    
        valid_x = train_val['statement'].iloc[valid_index]
        valid_y = train_val['label'].iloc[valid_index]
    
        classifier.fit(train_x, train_y)
        predictions = classifier.predict(valid_x)
        
        confusion += confusion_matrix(valid_y,predictions)
        score = f1_score(valid_y,predictions)
        scores.append(score)
        
    return (print('Total statements classified:', len(train_val['statement'])),
    print('Score:', sum(scores)/len(scores)),
    print('score length', len(scores)),
    print('Confusion matrix:'),
    print(confusion))

In [None]:
apply_crossvalidation(nb_clf_pipeline)

In [None]:
apply_crossvalidation(logR_pipeline)

In [None]:
apply_crossvalidation(svm_pipeline)

In [None]:
apply_crossvalidation(random_forest)