## Ensemble Code

## Importing

In [None]:
import csv
import datetime as dt
import gc  # garbage collector for gpu memory
import json

import numpy as np
import pandas as pd
from sklearn import linear_model, metrics, model_selection, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, plot_roc_curve,
                             roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tqdm import tqdm

%matplotlib inline
import pickle

import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Colab-only setup: mount Google Drive so the files under gdrive/MyDrive that
# later cells read become available in this runtime.
# NOTE(review): later cells use the relative path 'gdrive/...', which only
# resolves to this mount point when the working directory is /content -- confirm.
from google.colab import drive, files

drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Loading Data

In [None]:
# Read the cleaned PolitiFact and GossipCop article dumps from Drive.
# Each file is opened in a context manager so its handle is closed as soon as
# parsing finishes, instead of staying open until garbage collection.
with open("gdrive/MyDrive/BT4222/Data/politifact_clean.json", "r") as f:
    politifact_data = json.load(f)

with open("gdrive/MyDrive/BT4222/Data/gossipcop_clean.json", "r") as f:
    gossipcop_data = json.load(f)

In [None]:
# Wrap the parsed JSON payloads in DataFrames, one per dataset.
politifact_df, gossipcop_df = (
    pd.DataFrame(raw_records) for raw_records in (politifact_data, gossipcop_data)
)

In [None]:
# Binary label used by every model below: 1 when the article is labelled
# 'real', 0 for any other label value.
for news_df in (politifact_df, gossipcop_df):
    news_df['target'] = news_df['label'].apply(
        lambda label: 1 if label == 'real' else 0
    )

In [None]:
def _publish_month(timestamp):
    """Return the zero-padded month ('01'-'12') of a publish timestamp.

    Missing timestamps map to the sentinel string '0'.
    """
    # NOTE(review): fromtimestamp() converts using the machine's local
    # timezone, so results can differ between environments.
    # Assumes publish_date holds POSIX seconds -- TODO confirm.
    if pd.isna(timestamp):
        return '0'
    return dt.datetime.fromtimestamp(timestamp).strftime("%m")


for news_df in (politifact_df, gossipcop_df):
    news_df['parsed_month'] = news_df['publish_date'].apply(_publish_month)

In [None]:
def _publish_hour(timestamp):
    """Return the zero-padded hour ('00'-'23') of a publish timestamp.

    Missing timestamps map to the sentinel string '0', which stays distinct
    from midnight ('00').
    """
    # NOTE(review): fromtimestamp() converts using the machine's local
    # timezone, so the hour depends on where the notebook runs.
    # Assumes publish_date holds POSIX seconds -- TODO confirm.
    if pd.isna(timestamp):
        return '0'
    return dt.datetime.fromtimestamp(timestamp).strftime("%H")


for news_df in (politifact_df, gossipcop_df):
    news_df['parsed_hour'] = news_df['publish_date'].apply(_publish_hour)

In [None]:
# Replace missing publishers with the literal string 'None' so the column
# contains no NaN values.
for news_df in (politifact_df, gossipcop_df):
    news_df['publisher'] = news_df['publisher'].fillna('None')

In [None]:
# Combined text column: cleaned article body, a single space, then the
# cleaned title.
for news_df in (politifact_df, gossipcop_df):
    news_df['text_and_title'] = news_df['text_clean'] + ' ' + news_df['title_clean']

## Gossipcop

We obtain the results from the best-performing models and load them in.

We then train an aggregator model, logistic regression, to accept the predictions of other models as input features.

We then use this newly created logistic regression model for prediction

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only load artifacts produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Saved outputs of the four GossipCop base models. Each loaded object is
# indexed further down with '<model>_train' / '<model>_test' keys.
# Note: `svm` rebinds the name imported from sklearn in the import cell.
svm = _load_pickle('gdrive/MyDrive/BT4222/Code/machine_learning/neil/gossipcop/svm.pkl')
logistic = _load_pickle('gdrive/MyDrive/BT4222/Code/machine_learning/neil/gossipcop/logistic.pkl')
deep_learning = _load_pickle('gdrive/MyDrive/BT4222/Code/machine_learning/neil/gossipcop/deep_learning.pkl')
# This artifact path has no '.pkl' suffix, unlike the other three.
naive_bayes = _load_pickle('gdrive/MyDrive/BT4222/Code/machine_learning/neil/gossipcop/naive_bayes')

In [None]:
# Text/label pair for GossipCop. X is only passed through train_test_split
# below; the meta-model's inputs come from the pickled base-model outputs.
X, y = gossipcop_df['text_clean'], gossipcop_df['target']

In [None]:
# Stage 1: hold out 30% of the rows; only the training labels are kept from
# the 70% side.
# Presumably this mirrors the split used when the base models were trained,
# so their saved outputs line up with these labels -- confirm.
_, X_rest, y_train, y_rest = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Stage 2: halve the held-out rows into validation and test labels.
_, _, y_val, y_test = train_test_split(
    X_rest, y_rest, test_size=0.5, random_state=42
)

In [None]:
# Meta-model inputs: one column per base model. The test dict reuses the
# 'train_N' column names so both frames expose identical feature names.
gossipcop = {
    'train_1': svm['svm_train'],
    'train_2': logistic['lr_train'],
    'train_3': deep_learning['deep_train'],
    'train_4': naive_bayes['nb_train'],
}

gossipcop_test = {
    'train_1': svm['svm_test'],
    'train_2': logistic['lr_test'],
    'train_3': deep_learning['deep_test'],
    'train_4': naive_bayes['nb_test'],
}

In [None]:
# Fit the logistic-regression aggregator on the base models' training
# outputs, then predict classes from their test outputs.
gossipcop_meta_train = pd.DataFrame(gossipcop)
gossipcop_meta_test = pd.DataFrame(gossipcop_test)

LogR = LogisticRegression().fit(gossipcop_meta_train, y_train)
y_pred_class = LogR.predict(gossipcop_meta_test)

In [None]:
# Evaluate the GossipCop aggregator on the held-out test labels.

# The confusion matrix gets its own variable so it is not overwritten by the
# classification report; `matrix` keeps holding the text report.
conf_matrix = confusion_matrix(y_test, y_pred_class)
matrix = classification_report(y_test, y_pred_class)

accuracy = metrics.accuracy_score(y_test, y_pred_class)
fscore = metrics.f1_score(y_test, y_pred_class)

# ROC AUC measures ranking quality, so it is scored with the predicted
# probability of the positive class rather than hard 0/1 predictions.
# The target is encoded as 0/1, so column 1 of predict_proba is P(target == 1).
y_score = LogR.predict_proba(pd.DataFrame(gossipcop_test))[:, 1]
roc = metrics.roc_auc_score(y_test, y_score)

print('Accuracy: ', accuracy)
print('F1 score: ', fscore)
print('ROC', roc)
print(conf_matrix)
print(matrix)

Accuracy:  0.8912898936170213
F1 score:  0.9299935773924214
ROC 0.8304403359041617
              precision    recall  f1-score   support

           0       0.80      0.71      0.76       712
           1       0.91      0.95      0.93      2296

    accuracy                           0.89      3008
   macro avg       0.86      0.83      0.84      3008
weighted avg       0.89      0.89      0.89      3008



## Politifact

We obtain the results from the best-performing models and load them in.

We then train an aggregator model, logistic regression, to accept the predictions of other models as input features.

We then use this newly created logistic regression model for prediction

In [None]:
# Saved outputs of the four PolitiFact base models, in the order
# svm, logistic, deep_learning, naive_bayes. The last path has no '.pkl'
# suffix, unlike the other three.
politifact_artifact_paths = (
    'gdrive/MyDrive/BT4222/Code/machine_learning/neil/politifact/svm.pkl',
    'gdrive/MyDrive/BT4222/Code/machine_learning/neil/politifact/logistic.pkl',
    'gdrive/MyDrive/BT4222/Code/machine_learning/neil/politifact/deep_learning.pkl',
    'gdrive/MyDrive/BT4222/Code/machine_learning/neil/politifact/naive_bayes',
)

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by this project.
politifact_artifacts = []
for artifact_path in politifact_artifact_paths:
    with open(artifact_path, 'rb') as f:
        politifact_artifacts.append(pickle.load(f))

# These names now refer to the PolitiFact artifacts.
svm, logistic, deep_learning, naive_bayes = politifact_artifacts

In [None]:
# Text/label pair for PolitiFact. X is only passed through train_test_split
# below; the meta-model's inputs come from the pickled base-model outputs.
X, y = politifact_df['text_clean'], politifact_df['target']

In [None]:
# Stage 1: hold out 30% of the rows; only the training labels are kept from
# the 70% side.
# Presumably this mirrors the split used when the base models were trained,
# so their saved outputs line up with these labels -- confirm.
_, X_rest, y_train, y_rest = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Stage 2: halve the held-out rows into validation and test labels.
_, _, y_val, y_test = train_test_split(
    X_rest, y_rest, test_size=0.5, random_state=42
)

In [None]:
# Meta-model inputs: one column per base model. The test dict reuses the
# 'train_N' column names so both frames expose identical feature names.
# NOTE(review): the logistic artifact is read with 'nb_train' / 'nb_test'
# here, whereas the GossipCop section reads 'lr_train' / 'lr_test' -- confirm
# these are the keys the PolitiFact logistic pickle was saved with.
politifact = {
    'train_1': svm['svm_train'],
    'train_2': logistic['nb_train'],
    'train_3': deep_learning['deep_train'],
    'train_4': naive_bayes['nb_train'],
}

politifact_test = {
    'train_1': svm['svm_test'],
    'train_2': logistic['nb_test'],
    'train_3': deep_learning['deep_test'],
    'train_4': naive_bayes['nb_test'],
}

In [None]:
# Fit the logistic-regression aggregator on the base models' training
# outputs, then predict classes from their test outputs.
politifact_meta_train = pd.DataFrame(politifact)
politifact_meta_test = pd.DataFrame(politifact_test)

LogR = LogisticRegression().fit(politifact_meta_train, y_train)
y_pred_class = LogR.predict(politifact_meta_test)

In [None]:
# Evaluate the PolitiFact aggregator on the held-out test labels.

# The confusion matrix gets its own variable so it is not overwritten by the
# classification report; `matrix` keeps holding the text report.
conf_matrix = confusion_matrix(y_test, y_pred_class)
matrix = classification_report(y_test, y_pred_class)

accuracy = metrics.accuracy_score(y_test, y_pred_class)
fscore = metrics.f1_score(y_test, y_pred_class)

# ROC AUC measures ranking quality, so it is scored with the predicted
# probability of the positive class rather than hard 0/1 predictions.
# The target is encoded as 0/1, so column 1 of predict_proba is P(target == 1).
y_score = LogR.predict_proba(pd.DataFrame(politifact_test))[:, 1]
roc = metrics.roc_auc_score(y_test, y_score)

print('Accuracy: ', accuracy)
print('F1 score: ', fscore)
print('ROC', roc)
print(conf_matrix)
print(matrix)

Accuracy:  0.9027777777777778
F1 score:  0.9156626506024096
ROC 0.9046859421734795
              precision    recall  f1-score   support

           0       0.86      0.92      0.89        59
           1       0.94      0.89      0.92        85

    accuracy                           0.90       144
   macro avg       0.90      0.90      0.90       144
weighted avg       0.91      0.90      0.90       144

