In [25]:
import pandas as pd
import numpy as np 
import re
from collections import Counter

In [3]:
import spacy
nlp = spacy.load('en')

In [4]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser



In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [5]:
# Update puncuation list in spacy
nlp.vocab["$"].is_punct = True
nlp.vocab["|"].is_punct = True
nlp.vocab["+"].is_punct = True
nlp.vocab["<"].is_punct = True
nlp.vocab[">"].is_punct = True
nlp.vocab["="].is_punct = True
nlp.vocab["^"].is_punct = True
nlp.vocab["`"].is_punct = True
nlp.vocab["~"].is_punct = True

In [6]:
clean_final = pd.read_csv("clean_final_news.csv", encoding = "utf8", index_col = 0)

In [7]:
clean_final.head()

Unnamed: 0,author,published,text,domain_rank,site_url,spam_score,title,response,length
0,SAM TANENHAUS,2016-11-20T03:51:00.000+02:00,Privacy Policy Eisenhowers two terms bore this...,98.0,nytimes.com,0.0,Opinion: Donald Trump’s Art of the New Deal?,Not fake,949
1,,2016-11-20T00:52:00.000+02:00,Can Trump Save Their Jobs? by Nelson D. Sc...,98.0,nytimes.com,0.0,Carrier Workers for Trump,Not fake,372
2,Kathleen Elkins,2016-11-20T06:18:00.000+02:00,"Tuesday, 18 Oct 2016 | 10:25 AM ET CNBC.com Ed...",767.0,cnbc.com,0.008,Tennis star Caroline Wozniacki shares the mone...,Not fake,386
3,JEFF SOMMER,2016-11-20T00:04:00.000+02:00,Continue reading the main story Yet it is poss...,98.0,nytimes.com,0.0,Strategies: It’s Not Just the White House. Cha...,Not fake,1005
4,James Rufus Koren,2016-11-20T02:42:00.000+02:00,Wells Fargo hit with new sanctions following f...,609.0,latimes.com,0.264,Wells Fargo hit with new sanctions following f...,Not fake,809


### Preprocessing Functions

In [9]:
# Convert text and response to array 
x_text = clean_final.text.values
y_response = clean_final.response.values

In [10]:
# Remove twitter handles from text 
for idx in range(len(x_text)):
    x_text[idx] = re.sub(r'@([A-Za-z0-9_]+)', "", str(x_text[idx]))

In [11]:
# Remove hyperlinks from text
for idx in range(len(x_text)):
    x_text[idx] = re.sub(r"(https|http)\S+", "", str(x_text[idx]))

In [12]:
# helper function to eliminate tokens that are pure punctuation, whitespace, or stopword
# can be updated based on desired filtering 

def process_txt(token):
    return token.is_punct or token.is_space or token.is_stop or token.like_num

In [13]:
# function to take array of articles and turn them into nested list of tokens

def lemmatize_txt(array):
    lemma = []
    
    for doc in nlp.pipe(array, batch_size=50,
                        n_threads=-1):
        if doc.is_parsed:
            lemma.append([n.lemma_ for n in doc if not process_txt(n)])
        
        else:
            lemma.append(None)
    
    return lemma

In [14]:
# function to recombine nested list of tokens into full articles 

def lemma_combine(lis):
    parsed_articles = []
    
    for i in range(len(lis)):
        concat_art = ' '.join(lis[i])
        parsed_articles.append(concat_art)
    
    return parsed_articles

In [15]:
# function to match "cleaned" text back up with response variable

def zip_response(observations, response):
    response = response.tolist()
    
    return list(zip(observations, response))

### Process Text

In [16]:
# Create nested list of tokens for each article
lem = lemmatize_txt(x_text)

In [17]:
# Create bi-grams for our text 
phrases = Phrases(lem)
bigram = Phraser(phrases)
bigram_lem = list(bigram[lem])

In [18]:
# Create tri-grams for our text
phrases2 = Phrases(bigram_lem)
trigram = Phraser(phrases2)
trigram_lem = list(trigram[bigram_lem])

In [19]:
# Recombine full article text for unigrams, bigrams, and trigrams
uni_lem_comb = lemma_combine(lem)
bi_lem_comb = lemma_combine(bigram_lem)
tri_lem_comb = lemma_combine(trigram_lem)

### Model

In [20]:
# Split test and train data using trigram text
x_train, x_test, y_train, y_test = train_test_split(tri_lem_comb, y_response, test_size = 0.2, stratify = y_response)

In [22]:
# Set up pipeline to run TFIDF and Naive Bayes
text_pipe = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [27]:
# Use pipeline to predict on train data and find accuracy
predicted_train = text_pipe.predict(x_train)
np.mean(predicted_train == y_train) 

0.82682028315515743

In [26]:
# Use pipeline to predict on test data and find accuracy
predicted = text_pipe.predict(x_test)
np.mean(predicted == y_test) 

0.80687563195146617

In [28]:
# Get more detailed performance metrics
from sklearn import metrics
print(metrics.classification_report(y_test, predicted, target_names = ["Not fake", "fake"]))

             precision    recall  f1-score   support

   Not fake       0.78      1.00      0.88      4715
       fake       0.99      0.40      0.57      2208

avg / total       0.85      0.81      0.78      6923



In [30]:
parameters = {'tfidf__max_df': (0.75, 0.80, 0.85, 0.90, 0.95, 1.0),\
              'tfidf__min_df': (0.001,0.01,0.1),\
              'clf__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)}

In [31]:
gs_clf = GridSearchCV(text_pipe, parameters, n_jobs=-1)

In [None]:
gs_clf = gs_clf.fit(x_train, y_train)

In [None]:
gs_clf.best_score_ 
gs_clf.best_params_