In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib

In [None]:
# Read the CSV, discard rows with missing values, and remove the
# 'Unnamed: 0' column (presumably a saved row index -- confirm against the file).
df = (
    pd.read_csv("../data/fake_or_real_news.csv")
    .dropna()
    .drop(columns='Unnamed: 0')
)
# Target is the 'label' column; the features are every remaining column.
y = df['label']
X = df.drop(columns='label')
df.head()

In [None]:
# How many rows carry each value of the 'label' column.
label_column = df["label"]
label_column.value_counts()

In [None]:
# Per-column summary statistics of the cleaned frame.
df_summary = df.describe()
df_summary

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=105,
)
# Print the shape of each partition, in the order they were unpacked.
for partition in (X_train, X_test, y_train, y_test):
    print(partition.shape)

In [None]:
class Selector(BaseEstimator, TransformerMixin):
    """Stateless transformer that indexes its input with a fixed key.

    Parameters
    ----------
    cols
        Key handed to ``X[...]`` when :meth:`transform` is called.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured ``cols``."""
        selection = X[self.cols]
        return selection

In [None]:

def _column_pipeline(column, select_step, vectorizer_step, vectorizer):
    """Build a two-step pipeline: pick ``column`` with Selector, then vectorize it."""
    return Pipeline(steps=[(select_step, Selector(column)),
                           (vectorizer_step, vectorizer)])


# Term-count features for the 'text' and 'title' columns.
pipeline_text = _column_pipeline('text', 'select1', 'tf1', CountVectorizer())
pipeline_title = _column_pipeline('title', 'select2', 'tf2', CountVectorizer())

# TF-IDF features for the same two columns.
pipeline_text_tfidf = _column_pipeline('text', 'select1', 'tf1', TfidfVectorizer())
pipeline_title_tfidf = _column_pipeline('title', 'select2', 'tf2', TfidfVectorizer())

In [None]:
# Combine the TF-IDF features of the 'text' and 'title' pipelines into one
# feature matrix; the two branches are evaluated with up to 2 parallel jobs.
fu = FeatureUnion(n_jobs=2,
                  transformer_list=
                  [('p1', pipeline_text_tfidf),
                   ('p2', pipeline_title_tfidf)])

knn = KNeighborsClassifier()
lr = LogisticRegression()
# MLP training is stochastic (weight initialisation, batch shuffling). Pin the
# seed -- same value as the train/test split -- so a fresh "Restart & Run All"
# reproduces the same model and the same confusion matrix.
nn = MLPClassifier(hidden_layer_sizes=[5, 10], random_state=105)

# Full model: feature extraction followed by a majority ('hard') vote over the
# three classifiers above.
pipeline_all = Pipeline(steps=[('etl', fu),
                                ('classifier', VotingClassifier([
                                    ('knn',knn),
                                    ('lr', lr),
                                    ('nn', nn)], 
                                    voting='hard'))])

In [None]:
# Fit the feature-extraction + voting pipeline on the training split, then
# predict labels for the held-out rows.
pipeline_fitted = pipeline_all.fit(X=X_train, y=y_train)
y_pred = pipeline_fitted.predict(X=X_test)

In [None]:
# How many held-out rows were assigned to each predicted label.
predicted_labels = pd.Series(y_pred)
predicted_labels.value_counts()

In [None]:
# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Persist the fitted pipeline, load it straight back, and re-score the held-out
# rows with the reloaded copy as a round-trip check.
# NOTE(review): the pipeline contains the notebook-defined Selector class;
# loading 'ens.pkl' in another process presumably needs that class defined
# first -- confirm before shipping the file.
joblib.dump(pipeline_fitted, 'ens.pkl')
pipeline_reloaded = joblib.load('ens.pkl')
confusion_matrix(y_test, pipeline_reloaded.predict(X_test))