# Data import & exploration 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Path of data
downloaded from: https://www.kaggle.com/c/fake-news/data?select=test.csv
```sh
├── raw_data
│   └── fake-news
│       ├── submit.csv
│       ├── test.csv
│       └── train.csv
```

In [2]:
!ls ../raw_data/fake-news/

ls: ../raw_data/fake-news/: No such file or directory


## load data & basic exploration

In [3]:
df = pd.read_csv('../raw_data/train.csv')

In [4]:
df.shape

(20800, 5)

In [5]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [6]:
df['label'].value_counts()

1    10413
0    10387
Name: label, dtype: int64

>**The data is balanced, between fake and real** 👍

In [7]:
df.dtypes

id         int64
title     object
author    object
text      object
label      int64
dtype: object

In [8]:
df['text'].isna().sum()

39

>**data contains `NaN` - will remove** and at the same time drop `title` and `author`

In [9]:
df = df[['text', 'label']].dropna(axis=0)
df.head()

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


**Let's look at a full text...** (below)

Things to fix,a apart from the usual = remove new-line `\n`

In [10]:
df.loc[0,'text']

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-BY license) \nWith apologies to Keith Olbermann, there is no doubt who the Worst Person in The World is this week–FBI Director James Comey. But according to a House Democratic aide, it looks like we also know who the second-worst person is as well. It turns out that when Comey sent his now-infamous letter announcing that the FBI was looking into emails that may be related to Hillary Clinton’s email server, the ranking Democrats on the relevant committees didn’t hear about it from Comey. They found out via a tweet from one of the Republican committee chairmen. \nAs we now know, Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence, Judiciary, and Oversight committees that his agency was reviewing emai

# Text Preprocessing (as a function)

## Data load function

In [11]:
def load_train_data():
    df = pd.read_csv('../raw_data/train.csv')
    df = df[['text', 'label']].dropna(axis=0)
    return df.copy()

## Clean Text Function

In [12]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
import unidecode
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


def clean_text(text):
    # remove new-line: /n
    text = text.replace('\n', ' ')
    
    # Remove Punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ')
    
    # Lower Case
    text = text.lower() 
    
    # Remove Accents
    text = unidecode.unidecode(text) 
    
    # Tokenize --> (make word list) -> useful for following operation
    token_text = word_tokenize(text) 
    
    # Remove numbers
    token_text = [word for word in token_text if word.isalpha()] 
    
    # Stemming 👉 cut to the common root (sometimes no at real word)
    #stemmer = PorterStemmer()
    #token_text = [stemmer.stem(word) for word in token_text]
    
    # Lemming 👉 base word by meaning (laanguage correct)
    lemmatizer = WordNetLemmatizer()
    token_text = [lemmatizer.lemmatize(word) for word in token_text]
    
    # Remove Stop Words
    # 👍 useful for topic modelling, sentiment analysis
    # 👎 Useless for authorship attribution 🍎
    stop_words = set(stopwords.words('english'))     
    token_text = [word for word in token_text if not word in stop_words] 
    
    return " ".join(token_text)


# Load & Clean

In [13]:
# Load data
df = load_train_data()

In [14]:
df.shape

(20761, 2)

**Split data into Train & Test**

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[['text']], df['label'], test_size=0.3)

In [16]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((14532, 1), (6229, 1), (14532,), (6229,))

**Run clening function on `X_train` and `X_test`**

🚨 this will take som time... ⏱ 2.5 min on my machine, which is fairly fast (I hope..)

In [17]:
%%time
X_train_clean = X_train['text'].apply(clean_text)
X_test_clean = X_test['text'].apply(clean_text)

CPU times: user 2min 7s, sys: 583 ms, total: 2min 8s
Wall time: 2min 8s


# Vectorizing X - with `TfidfVectorizer()`

**Running with default**, but following params could be tweeked:
* ngram_range=(1, 1),
* max_df=1.0,
* min_df=1,
* max_features=None,

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(2, 2))

X_train_vector = vectorizer.fit_transform(X_train_clean)
X_test_vector = vectorizer.transform(X_test_clean)

**result shapes** 🥺

In [19]:
X_train_vector.shape, X_test_vector.shape

((14532, 3121799), (6229, 3121799))

# Naive Bayes Algorithm - `MultinomialNB`

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB


In [28]:
%%time
# Modle
nb_model = MultinomialNB(alpha=2)

# Model fit
nb_model.fit(X_train_vector, y_train)

CPU times: user 143 ms, sys: 23.7 ms, total: 167 ms
Wall time: 169 ms


MultinomialNB(alpha=2, class_prior=None, fit_prior=True)

In [29]:
# model score on TRAIN-data
nb_model.score(X_train_vector, y_train)

0.9945637214423342

In [30]:
# model score on TEST-data
nb_model.score(X_test_vector, y_test)

0.9017498795954407

# ⚠️ Below code is for reading - is not functional ⚠️

## Encoder function for `Pipelines` - 🧰 work in progress

In [24]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class TimeFeaturesEncoder(BaseEstimator, TransformerMixin):
    """
        kjsnbjkfbeafv
    """

    def __init__(self, time_column, time_zone_name='America/New_York'):
        self.time_column = time_column
        self.time_zone_name = time_zone_name

    def fit(self, X, y=None):
        return self
    
    def clean_text(self, text):
        
        return text

    def transform(self, X, y=None):
        assert isinstance(X, pd.DataFrame)
        X_ = X.copy()
        
        # X_ = X.apply(clean_text)
        
        X_.index = pd.to_datetime(X[self.time_column])
        X_.index = X_.index.tz_convert(self.time_zone_name)
        X_["dow"] = X_.index.weekday
        X_["hour"] = X_.index.hour
        X_["month"] = X_.index.month
        X_["year"] = X_.index.year
        return X_[['dow', 'hour', 'month', 'year']]
    

## Example of - Feature engineering
Sometimes, you may want to extract your own features from the texts. Some common features are:

* Vocabulary Richness
* Average word per line
* Digit/Character ratio
* Anything you can think of that relates to the task!

In [25]:
data = pd.DataFrame([
    'i do not love football', 
    'i love football not basketball',
    'football football football'
], columns=['text'])
data

Unnamed: 0,text
0,i do not love football
1,i love football not basketball
2,football football football


In [26]:
from nltk.tokenize import word_tokenize

def vocab_richness(text):
    tokens = word_tokenize(text)
    total_length = len(tokens)
    unique_words = set(tokens)
    unique_word_length = len(unique_words)
    return unique_word_length/total_length

data['vocab richness'] = data.text.apply(vocab_richness)

data

Unnamed: 0,text,vocab richness
0,i do not love football,1.0
1,i love football not basketball,1.0
2,football football football,0.333333


## Tuning vectorizer and model simultanously
Different vectorizing hyperparameters will affect model performance. As such, it is important to tune the hyperparameters of both the vectorizer and the model simultaneously. This can be done by using a Pipeline.

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'tfidf__ngram_range': ((1,1), (2,2)),
    'nb__alpha': (0.1,1)}

# Perform grid search
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

test = grid_search.fit(data.text,y)

NameError: name 'y' is not defined

In [None]:
grid_search.best_params_, grid_search.best_score_

## Combining vectorizer output and engineered features

In [None]:
from nltk.tokenize import word_tokenize

def vocab_richness(text):
    tokens = word_tokenize(text)
    total_length = len(tokens)
    unique_words = set(tokens)
    unique_word_length = len(unique_words)
    return unique_word_length/total_length

In [None]:
data['vocab_richness'] = data.text.apply(vocab_richness)

In [None]:
data.head()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate


# Data
X = data[['text', 'vocab_richness']]
y = data.label

# selective column transformer for the pipeline
column_trans = ColumnTransformer([('vec', CountVectorizer(), 'text')]
                                 , remainder='passthrough')

# Assemble Pipeline
pipeline = Pipeline([
    ('transformer', column_trans),
    ('nb', MultinomialNB()),])

# 5-Fold Cross validate model
cv_results = cross_validate(pipeline, X, y, 
                            cv=5, 
                            scoring=['accuracy',
                                     'precision',
                                     'recall'],
                            n_jobs=-1
                           )
pd.DataFrame(cv_results) # Cross validation output

In [None]:
cv_results['test_accuracy'].mean(), cv_results['test_precision'].mean()

In [None]:
final_model = pipeline.fit(X, y)