# Data import & exploration 

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Path of data
downloaded from: https://www.kaggle.com/c/fake-news/data?select=test.csv
```sh
├── raw_data
│   └── fake-news
│       ├── submit.csv
│       ├── test.csv
│       └── train.csv
```

In [6]:
!ls ../raw_data/fake-news/

submit.csv test.csv   train.csv


## load data & basic exploration

In [7]:
df = pd.read_csv('../raw_data/fake-news/train.csv')

In [8]:
df.shape

(20800, 5)

In [9]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [10]:
df['label'].value_counts()

1    10413
0    10387
Name: label, dtype: int64

>**The data is balanced, between fake and real** 👍

In [11]:
df.dtypes

id         int64
title     object
author    object
text      object
label      int64
dtype: object

In [12]:
df['text'].isna().sum()

39

>**data contains `NaN` - will remove** and at the same time drop `title` and `author`

In [13]:
df = df[['text', 'label']].dropna(axis=0)
df.head()

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


**Let's look at a full text...** (below)

Things to fix,a apart from the usual = remove new-line `\n`

In [14]:
df.loc[0,'text']

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-BY license) \nWith apologies to Keith Olbermann, there is no doubt who the Worst Person in The World is this week–FBI Director James Comey. But according to a House Democratic aide, it looks like we also know who the second-worst person is as well. It turns out that when Comey sent his now-infamous letter announcing that the FBI was looking into emails that may be related to Hillary Clinton’s email server, the ranking Democrats on the relevant committees didn’t hear about it from Comey. They found out via a tweet from one of the Republican committee chairmen. \nAs we now know, Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence, Judiciary, and Oversight committees that his agency was reviewing emai

# Text Preprocessing (as a function)

## Data load function

In [15]:
def load_train_data():
    df = pd.read_csv('../raw_data/fake-news/train.csv')
    df = df[['text', 'label']].dropna(axis=0)
    return df.copy()

## Clean Text Function

In [16]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
import unidecode
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


def clean_text(text):
    # remove new-line: /n
    text = text.replace('\n', ' ')
    
    # Remove Punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ')
    
    # Lower Case
    text = text.lower() 
    
    # Remove Accents
    text = unidecode.unidecode(text) 
    
    # Tokenize --> (make word list) -> useful for following operation
    token_text = word_tokenize(text) 
    
    # Remove numbers
    token_text = [word for word in token_text if word.isalpha()] 
    
    # Stemming 👉 cut to the common root (sometimes no at real word)
    #stemmer = PorterStemmer()
    #token_text = [stemmer.stem(word) for word in token_text]
    
    # Lemming 👉 base word by meaning (laanguage correct)
    lemmatizer = WordNetLemmatizer()
    token_text = [lemmatizer.lemmatize(word) for word in token_text]
    
    # Remove Stop Words
    # 👍 useful for topic modelling, sentiment analysis
    # 👎 Useless for authorship attribution 🍎
    stop_words = set(stopwords.words('english'))     
    token_text = [word for word in token_text if not word in stop_words] 
    
    return " ".join(token_text)


# Load & Clean

In [17]:
# Load data
df = load_train_data()

In [18]:
df.shape

(20761, 2)

**Split data into Train & Test**

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[['text']], df['label'], test_size=0.3)

In [20]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((14532, 1), (6229, 1), (14532,), (6229,))

**Run clening function on `X_train` and `X_test`**

🚨 this will take som time... ⏱ 2.5 min on my machine, which is fairly fast (I hope..)

In [17]:
%%time
X_train_clean = X_train['text'].apply(clean_text)
X_test_clean = X_test['text'].apply(clean_text)

CPU times: user 2min 17s, sys: 2.22 s, total: 2min 19s
Wall time: 2min 21s


# Vectorizing X - with `TfidfVectorizer()`

**Running with default**, but following params could be tweeked:
* ngram_range=(1, 1),
* max_df=1.0,
* min_df=1,
* max_features=None,

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train_vector = vectorizer.fit_transform(X_train_clean)
X_test_vector = vectorizer.transform(X_test_clean)

**result shapes** 🥺

In [19]:
X_train_vector.shape, X_test_vector.shape

((14532, 129320), (6229, 129320))

# Naive Bayes Algorithm - `MultinomialNB`

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
%%time
# Modle
nb_model = MultinomialNB()

# Model fit
nb_model.fit(X_train_vector, y_train)

CPU times: user 33.9 ms, sys: 5.65 ms, total: 39.6 ms
Wall time: 37.1 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
# model score on TRAIN-data
nb_model.score(X_train_vector, y_train)

0.9089595375722543

In [23]:
# model score on TEST-data
nb_model.score(X_test_vector, y_test)

0.8734949430085086

# ⚠️ Below code is for reading - is not functional ⚠️

## Encoder function for `Pipelines` - 🧰 work in progress

In [108]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [107]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
This function clean column(s) of text, row by row, according to
initiated parameters.

Clean is initiated by TextPreprocessor().transform(X)

Input (X) should be either:
- pandas.Series
- pandas.DataFrame

Cleaning process can handle handle both single and multiple column 
DataFrames. Return format depends on input, se 'Returns below'

Parameters
----------
new_line : bool, default True
    Remove all new-line characters in the text (\\n)
    
punct : bool, default True
    Remove all punctuation characters from text
    !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
    
lower : bool, default True
    Change all characters to lower case
    
accent : bool, default True
    Replace all accent charcters with standard (non accented) characters
    
numbers : bool, default True
    Remove all numbers
    
stemm : bool, default False
    
lemm : bool, default True
    
stop_words : bool, default True


Returns
-------
Return depends on input
    - if pd.Series or pd.DataFrame with 1 column -> return pd.Series
    - if pd.DataFrame with 2 or more columns -> return pd.DataFrame 
      (same size as input)
"""

    def __init__(self, 
                 new_line=True, punct=True, lower=True, 
                 accent=True, numbers=True, stemm=False, 
                 lemm=True, stop_words=True):
        
        self.new_line = new_line
        self.punct = punct
        self.lower = lower
        self.accent = accent
        self.numbers = numbers
        self.stemm = stemm
        self.lemm = lemm
        self.stop_words = stop_words

    
    def clean_text(self, text):
        if self.new_line: # remove new-line: /n
            text = text.replace('\n', ' ')
        
        if self.punct: # Remove Punctuation
            for punctuation in string.punctuation:
                text = text.replace(punctuation, ' ')

        if self.lower: # Lower Case
            text = text.lower() 

        if self.accent: # Remove Accents
            text = unidecode.unidecode(text) 

        # Tokenize --> (make word list) -> useful for following operation
        token_text = word_tokenize(text) 

        if self.numbers: # Remove numbers
            token_text = [word for word in token_text if word.isalpha()] 

        if self.stemm: # Stemming 👉 cut to the common root (sometimes no at real word)
            stemmer = PorterStemmer()
            token_text = [stemmer.stem(word) for word in token_text]

        if self.lemm: # Lemming 👉 base word by meaning (laanguage correct)
            lemmatizer = WordNetLemmatizer()
            token_text = [lemmatizer.lemmatize(word) for word in token_text]

        if self.stop_words: # Remove Stop Words
            # 👍 useful for topic modelling, sentiment analysis
            # 👎 Useless for authorship attribution 🍎
            stop_words = set(stopwords.words('english'))     
            token_text = [word for word in token_text if not word in stop_words] 

        return " ".join(token_text)
        
        
    def fit(self, X, y=None):
        return self      
        

    def transform(self, X, y=None):
        # Check if input is DataFrame or Series
        if isinstance(X, pd.DataFrame):
            
            if X.shape[-1] > 1: # DataFrame with 2 or more columns
                for col in X.columns:
                    X[col] = X[col].apply(self.clean_text)
                # will return a DataFrame
                return X
            
            else: # DataFrame with 1 column
                X = X.iloc[:,0]

        return X.apply(self.clean_text)
    

In [100]:
df_test = df.head().copy()
df_test

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


In [102]:
XX_test = df_test[['text']]
XX_test

Unnamed: 0,text
0,House Dem Aide: We Didn’t Even See Comey’s Let...
1,Ever get the feeling your life circles the rou...
2,"Why the Truth Might Get You Fired October 29, ..."
3,Videos 15 Civilians Killed In Single US Airstr...
4,Print \nAn Iranian woman has been sentenced to...


In [103]:
X2_test = df_test[['text']]
X2_test['text2'] = df_test['text']
X2_test

Unnamed: 0,text,text2
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...
1,Ever get the feeling your life circles the rou...,Ever get the feeling your life circles the rou...
2,"Why the Truth Might Get You Fired October 29, ...","Why the Truth Might Get You Fired October 29, ..."
3,Videos 15 Civilians Killed In Single US Airstr...,Videos 15 Civilians Killed In Single US Airstr...
4,Print \nAn Iranian woman has been sentenced to...,Print \nAn Iranian woman has been sentenced to...


In [104]:
processor = TextPreprocessor()
processor.fit(XX_test)
processor.transform(XX_test)

0    house dem aide even see comey letter jason cha...
1    ever get feeling life circle roundabout rather...
2    truth might get fired october tension intellig...
3    video civilian killed single u airstrike ident...
4    print iranian woman ha sentenced six year pris...
Name: text, dtype: object

In [105]:
processor = TextPreprocessor()
processor.fit(X2_test)
processor.transform(X2_test)

Unnamed: 0,text,text2
0,house dem aide even see comey letter jason cha...,house dem aide even see comey letter jason cha...
1,ever get feeling life circle roundabout rather...,ever get feeling life circle roundabout rather...
2,truth might get fired october tension intellig...,truth might get fired october tension intellig...
3,video civilian killed single u airstrike ident...,video civilian killed single u airstrike ident...
4,print iranian woman ha sentenced six year pris...,print iranian woman ha sentenced six year pris...


## Example of - Feature engineering
Sometimes, you may want to extract your own features from the texts. Some common features are:

* Vocabulary Richness
* Average word per line
* Digit/Character ratio
* Anything you can think of that relates to the task!

In [16]:
data = pd.DataFrame([
    'i do not love football', 
    'i love football not basketball',
    'football football football'
], columns=['text'])
data

Unnamed: 0,text
0,i do not love football
1,i love football not basketball
2,football football football


In [18]:
from nltk.tokenize import word_tokenize

def vocab_richness(text):
    tokens = word_tokenize(text)
    total_length = len(tokens)
    unique_words = set(tokens)
    unique_word_length = len(unique_words)
    return unique_word_length/total_length

data['vocab richness'] = data.text.apply(vocab_richness)

data

Unnamed: 0,text,vocab richness
0,i do not love football,1.0
1,i love football not basketball,1.0
2,football football football,0.333333


## Tuning vectorizer and model simultanously
Different vectorizing hyperparameters will affect model performance. As such, it is important to tune the hyperparameters of both the vectorizer and the model simultaneously. This can be done by using a Pipeline.

In [88]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'tfidf__ngram_range': ((1,1), (2,2)),
    'nb__alpha': (0.1,1)}

# Perform grid search
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

test = grid_search.fit(data.text,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    3.3s finished


In [89]:
grid_search.best_params_, grid_search.best_score_

({'nb__alpha': 0.1, 'tfidf__ngram_range': (1, 1)}, 0.9789872008903728)

## Combining vectorizer output and engineered features

In [102]:
from nltk.tokenize import word_tokenize

def vocab_richness(text):
    tokens = word_tokenize(text)
    total_length = len(tokens)
    unique_words = set(tokens)
    unique_word_length = len(unique_words)
    return unique_word_length/total_length

In [103]:
data['vocab_richness'] = data.text.apply(vocab_richness)

In [104]:
data.head()

Unnamed: 0,text,label,vocab_richness
3320,Subject: tenaska iv receivables\r\ndaren :\r\n...,0,0.433206
2064,Subject: re : noms / actual flow for 03 / 15\r...,0,0.558559
1615,Subject: re : license\r\nlet me know if you ev...,0,0.460526
962,"Subject: enron / hpl actuals for sept . 13 , 2...",0,0.842105
2460,Subject: cornhusker contact information\r\nlon...,0,0.581731


In [111]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate


# Data
X = data[['text', 'vocab_richness']]
y = data.label

# selective column transformer for the pipeline
column_trans = ColumnTransformer([('vec', CountVectorizer(), 'text')]
                                 , remainder='passthrough')

# Assemble Pipeline
pipeline = Pipeline([
    ('transformer', column_trans),
    ('nb', MultinomialNB()),])

# 5-Fold Cross validate model
cv_results = cross_validate(pipeline, X, y, 
                            cv=5, 
                            scoring=['accuracy',
                                     'precision',
                                     'recall'],
                            n_jobs=-1
                           )
pd.DataFrame(cv_results) # Cross validation output

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall
0,0.332056,0.073501,0.975,0.989691,0.96
1,0.32766,0.087,0.97,0.976351,0.963333
2,0.398551,0.074748,0.971667,0.982935,0.96
3,0.396014,0.091583,0.976628,0.979798,0.973244
4,0.339298,0.073544,0.981636,0.983278,0.98


In [110]:
cv_results['test_accuracy'].mean(), cv_results['test_precision'].mean()

(0.9749860879243183, 0.9824105596711356)

In [125]:
final_model = pipeline.fit(X, y)