In [86]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib

import matplotlib.pyplot as plt
import numpy as np

# Read data

In [87]:
# Load the labelled articles, discard rows with missing values and the CSV's
# leftover 'Unnamed: 0' column, then split into features and a binary target.
df = (
    pd.read_csv("../data/fake_or_real_news.csv")
    .dropna()
    .drop('Unnamed: 0', axis=1)
)

# Features: every remaining column except the label.
X = df.drop('label', axis=1)

# Target: 1 for 'FAKE', 0 for anything else.
y = (df['label'] == 'FAKE').astype('int64')

df.head(5)

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [88]:
# Class balance of the encoded target (1 = 'FAKE', 0 = everything else).
y.value_counts()

0    3171
1    3164
Name: label, dtype: int64

In [89]:
# Summary of the cleaned frame; useful here for spotting duplicated titles/texts
# (compare `count` with `unique`).
df.describe()

Unnamed: 0,title,text,label
count,6335,6335,6335
unique,6256,6060,2
top,OnPolitics | 's politics blog,"Killing Obama administration rules, dismantlin...",REAL
freq,5,58,3171


# Train-test-split

In [90]:
# Hold out 10% of the rows for evaluation. `random_state` fixes the shuffle so
# the same rows land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 104)

# Pipeline setup

In [91]:
class Selector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts ``X[cols]`` from its input.

    It lets a column-specific step (for example a text vectorizer) be placed
    inside a ``Pipeline`` / ``FeatureUnion`` that is fed a whole DataFrame.

    Parameters
    ----------
    cols : object
        Key used to index the input in ``transform``. In this notebook it is
        a single column name such as ``'text'`` or ``'title'``.
    """

    def __init__(self, cols):
        # Stored under the same name as the constructor argument.
        self.cols = cols

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, X):
        """Return the selected part of ``X``, i.e. ``X[self.cols]``."""
        selected = X[self.cols]
        return selected

In [92]:
# Raw term-count variants: one pipeline per text column, each selecting its
# column and vectorizing it with default CountVectorizer settings.
pipeline_text = Pipeline([
    ('select1', Selector('text')),
    ('tf1', CountVectorizer()),
])
pipeline_title = Pipeline([
    ('select2', Selector('title')),
    ('tf2', CountVectorizer()),
])

# TF-IDF variants with English stop words removed and a capped vocabulary:
# at most 10000 terms for the article body and 2000 for the headline.
pipeline_text_tfidf = Pipeline([
    ('select1', Selector('text')),
    ('tf1', TfidfVectorizer(stop_words='english', max_features=10000)),
])
pipeline_title_tfidf = Pipeline([
    ('select2', Selector('title')),
    ('tf2', TfidfVectorizer(stop_words='english', max_features=2000)),
])

In [93]:
# Seed for the stochastic estimators below, so that Restart & Run All yields
# the same fitted models and metrics. Same value as the train/test split seed.
RANDOM_STATE = 104

# Concatenate the TF-IDF features of the article body and the headline into a
# single feature matrix; the two branches are processed with 2 parallel jobs.
fu = FeatureUnion(
    n_jobs=2,
    transformer_list=[
        ('p1', pipeline_text_tfidf),
        ('p2', pipeline_title_tfidf),
    ],
)

# Base learners for the ensemble.
knn = KNeighborsClassifier()
# random_state only matters for solvers that shuffle the data; it is set so
# the result does not depend on which solver is the library default.
lr = LogisticRegression(random_state=RANDOM_STATE)
# The MLP's weight initialisation and batch shuffling are random, so without
# a seed every run produces a different network.
nn = MLPClassifier(hidden_layer_sizes=[5, 10], random_state=RANDOM_STATE)

# Full model: feature extraction followed by a soft-voting ensemble, which
# averages the predicted class probabilities of the three learners.
pipeline_all = Pipeline(steps=[
    ('etl', fu),
    ('classifier', VotingClassifier(
        [
            ('knn', knn),
            ('lr', lr),
            ('nn', nn),
        ],
        voting='soft',
    )),
])

# Check dimensions

In [94]:
# Sanity check of the combined feature matrix shape (rows x features).
# NOTE(review): this fits the shared `fu` object on the whole frame, test rows
# included. `pipeline_all.fit(X_train, ...)` later refits the same object, so
# the fit performed here should not carry over into the model — confirm if the
# cell order ever changes.
fu.fit_transform(df).shape

(6335, 12000)

# Test pipeline

In [95]:
# Fit the feature extraction and the voting ensemble on the training split,
# then predict hard labels for the held-out rows.
pipeline_fitted = pipeline_all.fit(X_train, y_train)
y_pred = pipeline_fitted.predict(X_test)

  if diff:


In [96]:
# Soft-vote class probabilities for the test rows: one row per sample, one
# column per class (ordered as in the classifier's `classes_`).
pipeline_fitted.predict_proba(X_test)

array([[0.12, 0.88],
       [0.29, 0.71],
       [0.66, 0.34],
       ...,
       [0.7 , 0.3 ],
       [0.07, 0.93],
       [0.84, 0.16]])

In [97]:
# How many test rows were predicted as each class.
pd.Series(y_pred).value_counts()

1    335
0    299
dtype: int64

In [98]:
# Confusion matrix of the default-parameter ensemble on the test split
# (rows: true label, columns: predicted label).
confusion_matrix(y_test, y_pred)

array([[277,  22],
       [ 22, 313]])

# GridSearch setup

In [99]:
# Hyper-parameters to search, addressed as <pipeline step>__<voter>__<param>.
# 2 x 2 x 2 = 8 candidate combinations.
param_grid = {
    'classifier__knn__n_neighbors': [5, 6],
    'classifier__nn__hidden_layer_sizes': [[5, 10], [6, 8]],
    'classifier__lr__C': [1, 10],
}

# Exhaustive search with 2-fold cross-validation, run sequentially and ranked
# by recall on the positive class (label 1, i.e. 'FAKE').
pipeline_gs = GridSearchCV(
    estimator=pipeline_all,
    param_grid=param_grid,
    scoring='recall',
    cv=2,
    n_jobs=1,
)

# Fit GridSearch

In [100]:
# Run the grid search on the training split. `fit` returns the search object
# itself, and its `predict` delegates to the refitted best candidate.
pipeline_fitted_gs = pipeline_gs.fit(X_train, y_train)
# NOTE: this overwrites the `y_pred` produced by the default-parameter pipeline.
y_pred = pipeline_fitted_gs.predict(X_test)
# Rows: true label, columns: predicted label.
confusion_matrix(y_test, y_pred)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


array([[278,  21],
       [ 22, 313]])

In [104]:
# Persist the fitted grid-search object (the whole search, not only the best
# estimator) to disk.
# NOTE(review): `joblib` is imported from `sklearn.externals`, which was removed
# in scikit-learn 0.23; newer environments need a plain `import joblib`.
# NOTE(review): the pickled pipeline contains `Selector` instances. Pickle
# stores classes by reference, so whatever process loads this file presumably
# needs a `Selector` class importable under the same module path — confirm for
# the consumer of ../Keramik/nn.pkl.
joblib.dump(pipeline_fitted_gs, '../Keramik/nn.pkl')

['../Keramik/nn.pkl']

# Test deserialization

In [105]:
# Round-trip check: reload the file written above and confirm that it gives the
# same test-set confusion matrix as the in-memory model. Loading runs pickle
# under the hood, so only load files from a trusted source.
confusion_matrix(y_test, joblib.load('../Keramik/nn.pkl').predict(X_test))

  if diff:


array([[278,  21],
       [ 22, 313]])