In [83]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import cross_validate, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [47]:
data = pd.read_csv('../raw_data/train.csv')
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [48]:
data.shape

(20800, 5)

In [57]:
data.dtypes

id                   int64
title               object
author              object
text                object
label                int64
sans_punctuation    object
dtype: object

## clean dataset

In [50]:
data.duplicated().sum()

0

In [51]:
data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [52]:
data = data.dropna(subset=['text'])

In [53]:
data.isnull().sum()

id           0
title      558
author    1918
text         0
label        0
dtype: int64

## preprocessing

In [55]:
def remove_punctuation(document):
    for punct in string.punctuation:
        document = document.replace(punct, '')
    return document

In [56]:
data['sans_punctuation'] = data.text.apply(remove_punctuation)
data.head()

Unnamed: 0,id,title,author,text,label,sans_punctuation
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide We Didn’t Even See Comey’s Lett...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,Ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired October 29 2...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Videos 15 Civilians Killed In Single US Airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Print \nAn Iranian woman has been sentenced to...


In [58]:
data['lowercase'] = data['sans_punctuation'].str.lower()
data.head()

Unnamed: 0,id,title,author,text,label,sans_punctuation,lowercase
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide We Didn’t Even See Comey’s Lett...,house dem aide we didn’t even see comey’s lett...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,Ever get the feeling your life circles the rou...,ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired October 29 2...,why the truth might get you fired october 29 2...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Videos 15 Civilians Killed In Single US Airstr...,videos 15 civilians killed in single us airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Print \nAn Iranian woman has been sentenced to...,print \nan iranian woman has been sentenced to...


In [62]:
data['tokenized'] = data['lowercase'].apply(word_tokenize)

In [63]:
data.head()

Unnamed: 0,id,title,author,text,label,sans_punctuation,lowercase,tokenized
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide We Didn’t Even See Comey’s Lett...,house dem aide we didn’t even see comey’s lett...,"[house, dem, aide, we, didn, ’, t, even, see, ..."
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,Ever get the feeling your life circles the rou...,ever get the feeling your life circles the rou...,"[ever, get, the, feeling, your, life, circles,..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired October 29 2...,why the truth might get you fired october 29 2...,"[why, the, truth, might, get, you, fired, octo..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Videos 15 Civilians Killed In Single US Airstr...,videos 15 civilians killed in single us airstr...,"[videos, 15, civilians, killed, in, single, us..."
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Print \nAn Iranian woman has been sentenced to...,print \nan iranian woman has been sentenced to...,"[print, an, iranian, woman, has, been, sentenc..."


In [67]:
def remove_stopwords(tokens):
    sans_stopwords = [word for word in tokens if word not in stop_words]
    return sans_stopwords

In [68]:
stop_words = set(stopwords.words('english'))

In [69]:
data['sw_removed'] = data['tokenized'].apply(remove_stopwords)
data.head()

Unnamed: 0,id,title,author,text,label,sans_punctuation,lowercase,tokenized,sw_removed
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide We Didn’t Even See Comey’s Lett...,house dem aide we didn’t even see comey’s lett...,"[house, dem, aide, we, didn, ’, t, even, see, ...","[house, dem, aide, ’, even, see, comey, ’, let..."
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,Ever get the feeling your life circles the rou...,ever get the feeling your life circles the rou...,"[ever, get, the, feeling, your, life, circles,...","[ever, get, feeling, life, circles, roundabout..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired October 29 2...,why the truth might get you fired october 29 2...,"[why, the, truth, might, get, you, fired, octo...","[truth, might, get, fired, october, 29, 2016, ..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Videos 15 Civilians Killed In Single US Airstr...,videos 15 civilians killed in single us airstr...,"[videos, 15, civilians, killed, in, single, us...","[videos, 15, civilians, killed, single, us, ai..."
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Print \nAn Iranian woman has been sentenced to...,print \nan iranian woman has been sentenced to...,"[print, an, iranian, woman, has, been, sentenc...","[print, iranian, woman, sentenced, six, years,..."


In [71]:
def token_to_lemma_string(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized)

In [72]:
data['lemmatized'] = data['sw_removed'].apply(token_to_lemma_string)
data.head()

Unnamed: 0,id,title,author,text,label,sans_punctuation,lowercase,tokenized,sw_removed,lemmatized
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide We Didn’t Even See Comey’s Lett...,house dem aide we didn’t even see comey’s lett...,"[house, dem, aide, we, didn, ’, t, even, see, ...","[house, dem, aide, ’, even, see, comey, ’, let...",house dem aide ’ even see comey ’ letter jason...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,Ever get the feeling your life circles the rou...,ever get the feeling your life circles the rou...,"[ever, get, the, feeling, your, life, circles,...","[ever, get, feeling, life, circles, roundabout...",ever get feeling life circle roundabout rather...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired October 29 2...,why the truth might get you fired october 29 2...,"[why, the, truth, might, get, you, fired, octo...","[truth, might, get, fired, october, 29, 2016, ...",truth might get fired october 29 2016 tension ...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Videos 15 Civilians Killed In Single US Airstr...,videos 15 civilians killed in single us airstr...,"[videos, 15, civilians, killed, in, single, us...","[videos, 15, civilians, killed, single, us, ai...",video 15 civilian killed single u airstrike id...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Print \nAn Iranian woman has been sentenced to...,print \nan iranian woman has been sentenced to...,"[print, an, iranian, woman, has, been, sentenc...","[print, iranian, woman, sentenced, six, years,...",print iranian woman sentenced six year prison ...


In [73]:
data['clean_text'] = data['lemmatized']
data.head()

Unnamed: 0,id,title,author,text,label,sans_punctuation,lowercase,tokenized,sw_removed,lemmatized,clean_text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide We Didn’t Even See Comey’s Lett...,house dem aide we didn’t even see comey’s lett...,"[house, dem, aide, we, didn, ’, t, even, see, ...","[house, dem, aide, ’, even, see, comey, ’, let...",house dem aide ’ even see comey ’ letter jason...,house dem aide ’ even see comey ’ letter jason...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,Ever get the feeling your life circles the rou...,ever get the feeling your life circles the rou...,"[ever, get, the, feeling, your, life, circles,...","[ever, get, feeling, life, circles, roundabout...",ever get feeling life circle roundabout rather...,ever get feeling life circle roundabout rather...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired October 29 2...,why the truth might get you fired october 29 2...,"[why, the, truth, might, get, you, fired, octo...","[truth, might, get, fired, october, 29, 2016, ...",truth might get fired october 29 2016 tension ...,truth might get fired october 29 2016 tension ...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Videos 15 Civilians Killed In Single US Airstr...,videos 15 civilians killed in single us airstr...,"[videos, 15, civilians, killed, in, single, us...","[videos, 15, civilians, killed, single, us, ai...",video 15 civilian killed single u airstrike id...,video 15 civilian killed single u airstrike id...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Print \nAn Iranian woman has been sentenced to...,print \nan iranian woman has been sentenced to...,"[print, an, iranian, woman, has, been, sentenc...","[print, iranian, woman, sentenced, six, years,...",print iranian woman sentenced six year prison ...,print iranian woman sentenced six year prison ...


## vectorizing and fitting to model

In [75]:
X_bow = CountVectorizer().fit_transform(data['clean_text'])

In [77]:
%%time
X_bow_scores = cross_val_score(MultinomialNB(), X_bow, data['label'], cv=5, scoring='accuracy')

CPU times: user 5.63 s, sys: 0 ns, total: 5.63 s
Wall time: 10.4 s


In [79]:
X_bow_scores.mean()

0.9085304538771564

In [80]:
X_two_gram = TfidfVectorizer(ngram_range=(2,2)).fit_transform(data['clean_text'])

In [81]:
%%time
X_two_gram_scores = cross_val_score(MultinomialNB(), X_two_gram, data['label'], cv=5, scoring='accuracy')

CPU times: user 3.19 s, sys: 1.27 s, total: 4.46 s
Wall time: 4.46 s


In [82]:
X_two_gram_scores.mean()

0.9146960063690986

In [84]:
tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

In [85]:
param_grid = {
    'tfidf__max_df': [0.8,1],
    'tfidf__min_df': [0,0.2],
    'tfidf__max_features': [10,100,1000],
    'tfidf__ngram_range': [(1,1),(2,2),(3,3)],
    'nb__alpha': [0.1,0.5,1.0]
}

In [86]:
grid_search = GridSearchCV(
    tfidf_pipe,
    param_grid,
    n_jobs=-1,
    cv=5,
    scoring='accuracy'
)

In [None]:
%%time
grid_search.fit(data['clean_text'], data['labels'])