In [1]:
import sys

In [2]:
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install nltk



In [3]:
import ast
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, plot_confusion_matrix
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.preprocessing import LabelBinarizer

In [5]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

<h3>Importing the dataset</h3>

In [6]:
cwd = os.getcwd()
dataset_dir = os.path.join(cwd,'Dataset')

In [7]:
df = pd.read_csv(os.path.join(dataset_dir,'train_news_preprocessed.csv'), low_memory=False, 
                 usecols = ['label','clean_news_tokens','clean_headline_tokens'])
# 'headline','news','headline_len','news_len','caps_in_headline','caps_in_news',

<h3>Setting environment variables</h3>

In [8]:
RANDOM_STATE = 1973
pd.options.display.max_seq_items = 20
pd.options.display.max_rows = 50

In [9]:
print("Dataset shape:", df.shape)

Dataset shape: (19865, 3)


In [10]:
df.head()

Unnamed: 0,label,clean_news_tokens,clean_headline_tokens
0,0,"['WASHINGTON', 'in', 'sonny', 'perdue', 'telli...","['ethics', 'questions', 'dogged', 'agriculture..."
1,0,"['HOUSTON', 'venezuela', 'had', 'plan', 'it', ...","['U.S.', 'must', 'dig', 'deep', 'to', 'stop', ..."
2,0,"['on', 'abc', 'this', 'week', 'while', 'discus...","['cotton', 'to', 'house', 'do', 'not', 'walk',..."
3,0,"['AUGUSTA', 'me', 'the', 'beleaguered', 'repub...","['paul', 'lepage', 'besieged', 'maine', 'gover..."
4,1,"['finian', 'cunningham', 'has', 'written', 'ex...","['digital', 'nine-eleven', 'if', 'trump', 'wins']"


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19865 entries, 0 to 19864
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   label                  19865 non-null  int64 
 1   clean_news_tokens      19865 non-null  object
 2   clean_headline_tokens  19865 non-null  object
dtypes: int64(1), object(2)
memory usage: 465.7+ KB


In [12]:
df.label.value_counts()

0    10387
1     9478
Name: label, dtype: int64

In [13]:
df.clean_news_tokens = df.clean_news_tokens.map(ast.literal_eval)
df.clean_headline_tokens = df.clean_headline_tokens.map(ast.literal_eval)

In [14]:
y = df.label
X = df.drop('label', axis=1)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                   test_size = 0.2,
                                                   random_state = RANDOM_STATE,
                                                   stratify = y)

In [16]:
X_train.shape

(15892, 2)

In [17]:
X_test.shape

(3973, 2)

In [18]:
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

<h2>Train Model</h2>

In [19]:
def passthrough(doc):
    """passthrough function for use in the pipeline because the text is already tokenized"""
    return doc

In [20]:
graph_dir = os.path.join(cwd,'Graphs')

In [21]:
def confustion_matrix_and_classification_report(estimator, X, y, labels, set_name):
    """
    Display a Classfication Report and Confusion Matrix for the given data.
    """

    predictions = estimator.predict(X)
    labels = ['true','false']
    print(f'Classification Report for {set_name} Set')
    print(classification_report(y, predictions, target_names=labels, digits=4))
    
    """
    matrix = plot_confusion_matrix(estimator,
                                   X,
                                   y,
                                   display_labels = labels,
                                   cmap = plt.cm.Blues,
                                   xticks_rotation = 70,
                                   values_format = 'd')
    title=f'{set_name} Set Confustion Matrix, without Normalization'
    matrix.ax_.set_title(title)
    plt.show()
    plt.close()

    matrix = plot_confusion_matrix(estimator,
                                   X,
                                   y,
                                   display_labels = labels,
                                   cmap = plt.cm.Blues,
                                   xticks_rotation = 70,
                                   normalize = 'true')
    titlen=f'{set_name} Set Confustion Matrix, with Normalization'
    matrix.ax_.set_title(titlen)
    plt.show()
    plt.close()
    """

In [22]:
class LemmaTokenizer:
    def __init__(self):
         self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in doc]

In [23]:
def remove_stopwords(doc):
    """Remove the stopwords from the input document"""
    stop_words = stopwords.words('english')
    return [token for token in doc if ((token not in stop_words) and (token.lower() not in stop_words))]

In [24]:
def lowercase_tokens(doc):
    """lowercase all letters in doc"""
    return [token.lower() for token in doc]

In [25]:
def lowercase_and_remove_stopwords(doc):
    """Remove stopwords and lowercase tokens"""
    stop_words = stopwords.words('english')
    return [token.lower() for token in doc if token.lower() not in stop_words]

In [26]:
def train_and_eval_model(X_train, X_test, y_train_enc, y_test_enc, classes_,
                        preprocessor, tokenizer, stopwords, max_df=1.0, ngram_range = (1, 1)):
    """
    Train and Evaluate and Bag of Words Representation with a Naive Bayes
    classifier.
    """
    
    pipeline = Pipeline([
    ('bow',CountVectorizer(min_df = 5, 
                           stop_words = stopwords,
                           preprocessor = preprocessor, 
                           tokenizer = tokenizer, 
                           max_df = max_df, 
                           ngram_range = ngram_range)),  
    ('classifier', MultinomialNB()),
    ])
    
    pipeline.fit(X_train,y_train_enc)
    
    confustion_matrix_and_classification_report(pipeline, 
                                                X_train, 
                                                y_train_enc, 
                                                classes_, 
                                               'Training')

    confustion_matrix_and_classification_report(pipeline, 
                                                X_test, 
                                                y_test_enc, 
                                                classes_,
                                                'Test')
    """
    fake_class_prob_sorted = pipeline['classifier'].feature_log_prob_[0, :].argsort()
    true_class_prob_sorted = pipeline['classifier'].feature_log_prob_[1, :].argsort()
    
    print('fake ', np.take(pipeline['bow'].get_feature_names(), fake_class_prob_sorted[::-1][:25]))
    print('')
    print('true ', np.take(pipeline['bow'].get_feature_names(), true_class_prob_sorted[::-1][:25]))
    """

<h3>Headline only test</h3>

In [27]:
train_and_eval_model(X_train['clean_headline_tokens'], X_test['clean_headline_tokens'], 
                     y_train_enc, y_test_enc, le.classes_, passthrough, passthrough, None)

Classification Report for Training Set
              precision    recall  f1-score   support

        true     0.9185    0.9468    0.9324      8310
       false     0.9397    0.9079    0.9235      7582

    accuracy                         0.9283     15892
   macro avg     0.9291    0.9274    0.9280     15892
weighted avg     0.9286    0.9283    0.9282     15892

Classification Report for Test Set
              precision    recall  f1-score   support

        true     0.8894    0.9336    0.9110      2077
       false     0.9230    0.8729    0.8973      1896

    accuracy                         0.9046      3973
   macro avg     0.9062    0.9032    0.9041      3973
weighted avg     0.9055    0.9046    0.9044      3973



<h3>News only test</h3>

In [28]:
train_and_eval_model(X_train['clean_news_tokens'], X_test['clean_news_tokens'], 
                     y_train_enc, y_test_enc, le.classes_, passthrough, passthrough, None)

Classification Report for Training Set
              precision    recall  f1-score   support

        true     0.9352    0.9619    0.9483      8310
       false     0.9568    0.9269    0.9416      7582

    accuracy                         0.9452     15892
   macro avg     0.9460    0.9444    0.9450     15892
weighted avg     0.9455    0.9452    0.9451     15892

Classification Report for Test Set
              precision    recall  f1-score   support

        true     0.9093    0.9562    0.9322      2077
       false     0.9491    0.8956    0.9216      1896

    accuracy                         0.9273      3973
   macro avg     0.9292    0.9259    0.9269      3973
weighted avg     0.9283    0.9273    0.9271      3973



<h3>Headline and news test</h3>

In [29]:
X_train['headline_and_news'] = X_train['clean_headline_tokens'] + X_train['clean_news_tokens']
X_test['headline_and_news'] = X_test['clean_headline_tokens'] + X_test['clean_news_tokens']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['headline_and_news'] = X_train['clean_headline_tokens'] + X_train['clean_news_tokens']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['headline_and_news'] = X_test['clean_headline_tokens'] + X_test['clean_news_tokens']


In [30]:
train_and_eval_model(X_train['headline_and_news'], X_test['headline_and_news'], 
                     y_train_enc, y_test_enc, le.classes_, passthrough, passthrough, None)

Classification Report for Training Set
              precision    recall  f1-score   support

        true     0.9334    0.9659    0.9494      8310
       false     0.9612    0.9244    0.9424      7582

    accuracy                         0.9461     15892
   macro avg     0.9473    0.9452    0.9459     15892
weighted avg     0.9466    0.9461    0.9461     15892

Classification Report for Test Set
              precision    recall  f1-score   support

        true     0.9095    0.9586    0.9334      2077
       false     0.9518    0.8956    0.9228      1896

    accuracy                         0.9285      3973
   macro avg     0.9307    0.9271    0.9281      3973
weighted avg     0.9297    0.9285    0.9284      3973



<h3>Headline and news article passed through LemmaTokenizer</h3>

In [31]:
train_and_eval_model(X_train['headline_and_news'], X_test['headline_and_news'], 
                     y_train_enc, y_test_enc, le.classes_, passthrough, LemmaTokenizer(), None)

Classification Report for Training Set
              precision    recall  f1-score   support

        true     0.9293    0.9649    0.9467      8310
       false     0.9598    0.9195    0.9392      7582

    accuracy                         0.9432     15892
   macro avg     0.9446    0.9422    0.9430     15892
weighted avg     0.9439    0.9432    0.9432     15892

Classification Report for Test Set
              precision    recall  f1-score   support

        true     0.9076    0.9552    0.9308      2077
       false     0.9480    0.8935    0.9199      1896

    accuracy                         0.9257      3973
   macro avg     0.9278    0.9243    0.9254      3973
weighted avg     0.9269    0.9257    0.9256      3973



<h3>Headline and news article with stopwords removed</h3>

In [32]:
train_and_eval_model(X_train['headline_and_news'], X_test['headline_and_news'], 
                     y_train_enc, y_test_enc, le.classes_, passthrough, passthrough, stopwords.words('english'))



Classification Report for Training Set
              precision    recall  f1-score   support

        true     0.9374    0.9734    0.9551      8310
       false     0.9696    0.9288    0.9487      7582

    accuracy                         0.9521     15892
   macro avg     0.9535    0.9511    0.9519     15892
weighted avg     0.9528    0.9521    0.9521     15892

Classification Report for Test Set
              precision    recall  f1-score   support

        true     0.9115    0.9624    0.9363      2077
       false     0.9562    0.8977    0.9260      1896

    accuracy                         0.9315      3973
   macro avg     0.9339    0.9301    0.9312      3973
weighted avg     0.9328    0.9315    0.9314      3973



<h3>Headline and news article with stopwords removed and all lowercase</h3>

In [33]:
train_and_eval_model(X_train['headline_and_news'], X_test['headline_and_news'], 
                     y_train_enc, y_test_enc, le.classes_, lowercase_tokens, remove_stopwords, None)

Classification Report for Training Set
              precision    recall  f1-score   support

        true     0.9343    0.9710    0.9523      8310
       false     0.9668    0.9252    0.9455      7582

    accuracy                         0.9492     15892
   macro avg     0.9506    0.9481    0.9489     15892
weighted avg     0.9498    0.9492    0.9491     15892

Classification Report for Test Set
              precision    recall  f1-score   support

        true     0.9089    0.9610    0.9342      2077
       false     0.9544    0.8945    0.9235      1896

    accuracy                         0.9293      3973
   macro avg     0.9317    0.9278    0.9289      3973
weighted avg     0.9306    0.9293    0.9291      3973



<h3>Headline and news article with stopwords removed, all lowercase and LemmaTokenizer</h3>

In [34]:
train_and_eval_model(X_train['headline_and_news'], X_test['headline_and_news'], 
                     y_train_enc, y_test_enc, le.classes_, lowercase_and_remove_stopwords, LemmaTokenizer(), None)

Classification Report for Training Set
              precision    recall  f1-score   support

        true     0.9315    0.9699    0.9503      8310
       false     0.9655    0.9218    0.9431      7582

    accuracy                         0.9470     15892
   macro avg     0.9485    0.9459    0.9467     15892
weighted avg     0.9477    0.9470    0.9469     15892

Classification Report for Test Set
              precision    recall  f1-score   support

        true     0.9059    0.9591    0.9317      2077
       false     0.9521    0.8908    0.9204      1896

    accuracy                         0.9265      3973
   macro avg     0.9290    0.9249    0.9261      3973
weighted avg     0.9279    0.9265    0.9263      3973



<h3>Headline and news article with stopwords removed, all lowercase and LemmaTokenizer and ngram range (1,2)</h3>

In [35]:
train_and_eval_model(X_train['headline_and_news'], X_test['headline_and_news'], 
                     y_train_enc, y_test_enc, le.classes_, lowercase_and_remove_stopwords, LemmaTokenizer(), None, ngram_range = (1,2))

Classification Report for Training Set
              precision    recall  f1-score   support

        true     0.9566    0.9901    0.9731      8310
       false     0.9888    0.9508    0.9694      7582

    accuracy                         0.9714     15892
   macro avg     0.9727    0.9705    0.9713     15892
weighted avg     0.9720    0.9714    0.9713     15892

Classification Report for Test Set
              precision    recall  f1-score   support

        true     0.9137    0.9793    0.9454      2077
       false     0.9754    0.8987    0.9355      1896

    accuracy                         0.9409      3973
   macro avg     0.9446    0.9390    0.9404      3973
weighted avg     0.9432    0.9409    0.9407      3973

