# Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix, vstack, load_npz, save_npz
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import torch

import pipeline as pp
import model_tests as mt

import importlib
import math
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Swap the module-level hook so that every later call to warnings.warn
# (from any library that looks it up on the module) is silently discarded.
warnings.warn = warn

Be careful to choose the correct input file in each cell of this notebook: the dataset paths are hard-coded and differ from cell to cell.

# Preprocessing

Convert types to binary labels in LIAR - either True (reliable) or False (fake news).

In [None]:
importlib.reload(pp)

# One pipeline step; presumably the tuple is (step, input column, output column).
# NOTE(review): this uses the LIAR variant of the binary-label step on the
# large corpus file, while Clean_data further down uses pp.Binary_labels() — confirm.
binary_label_steps = [(pp.Binary_labels_LIAR(), 'type', 'type_binary')]

pp.apply_pipeline(
    "../datasets/large/cleaned_file.csv",
    binary_label_steps,
    new_file="../datasets/large/cleaned_file_bin.csv",
    progress_bar=True,
)

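Create the following input files (each is divided into three splits; only the first split, presumably the training set, is ever balanced):
* All splits are unbalanced
* The training set is balanced according to the types (e.g. satire, reliable...), and the validation and test sets are unbalanced
* The training set is balanced according to the binary classes, and the validation and test sets are unbalanced
* The training set is balanced over only the `reliable` and `fake` types, and the validation and test sets are unbalanced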

In [None]:
# Row count handed to pp.get_dataframe_with_distribution when the
# datasets are sampled (the number of rows used to train the model).
BATCH_SIZE = 1_000_000

In [None]:
importlib.reload(pp)
from_file = "../datasets/large/cleaned_file.csv"

# 1) No split is balanced.
pp.get_dataframe_with_distribution(
    from_file,
    BATCH_SIZE,
    [0.1,0.1,0.1],
    [False, False, False],
    out_file="../datasets/large/dataset_unbalanced_100k.csv",
    get_frame=False,
)

# 2) First split balanced over the default type column.
pp.get_dataframe_with_distribution(
    from_file,
    BATCH_SIZE,
    [0.5 ,0.1,0.1],
    [True, False, False],
    out_file="../datasets/sample/dataset_balanced_types.csv",
    get_frame=False,
)

# 3) First split balanced over the binary label.
# NOTE(review): `type_binary` is written to cleaned_file_bin.csv by the
# preprocessing cell; confirm that `from_file` also contains that column.
pp.get_dataframe_with_distribution(
    from_file,
    BATCH_SIZE,
    [0.8,0.1,0.1],
    [True, False, False],
    out_file="../datasets/sample/dataset_balanced_bin.csv",
    get_frame=False,
    classes=[True,False],
    type_col="type_binary",
)

# 4) First split balanced over the "reliable" and "fake" classes only.
pp.get_dataframe_with_distribution(
    from_file,
    BATCH_SIZE,
    [0.8,0.1,0.1],
    [True, False, False],
    out_file="../datasets/sample/dataset_balanced_reliable_fake.csv",
    get_frame=False,
    classes=["reliable", "fake"],
)

Cleaning the files. This may have already been done by running some of the other notebooks

In [None]:
importlib.reload(pp)

def Clean_data(file, new_file):
    """Run the full cleaning pipeline on ``file`` and write the result to ``new_file``.

    The steps are applied in this order: binary labels, content cleaning,
    author cleaning, title cleaning, domain cleaning, and finally the joined
    feature columns built from the cleaned fields.

    Args:
        file: path of the CSV handed to ``pp.apply_pipeline`` as input.
        new_file: path passed to ``pp.apply_pipeline`` as ``new_file``.
    """
    stopwords_lst = stopwords.words('english')

    # binary labels
    label_steps = [
        (pp.Binary_labels(), 'type', 'type_binary'),
    ]

    # Clean content; the combined text goes to its own column.
    content_steps = [
        (pp.Clean_data(), 'content'),
        (pp.Tokenizer(), "content"),
        (pp.Remove_stopwords(stopwords_lst), "content"),
        (pp.Stem(), "content"),
        (pp.Combine_Content(), "content", "content_combined"),
    ]

    # Clean authors
    author_steps = [
        (pp.Clean_author(), "authors"),
    ]

    # Clean title; same chain as content but no separate output column is given.
    title_steps = [
        (pp.Clean_data(), 'title'),
        (pp.Tokenizer(), "title"),
        (pp.Remove_stopwords(stopwords_lst), "title"),
        (pp.Stem(), "title"),
        (pp.Combine_Content(), "title"),
    ]

    # Clean domain
    domain_steps = [
        (pp.Clean_domain(), 'domain'),
    ]

    # Combine columns (used as features)
    feature_steps = [
        (pp.Join_str_columns(["content_combined", "authors"]), None, "content_authors"),
        (pp.Join_str_columns(["content_combined", "title"]), None, "content_title"),
        (pp.Join_str_columns(["content_combined", "domain"]), None, "content_domain"),
        (pp.Join_str_columns(["content_combined", "domain", "authors", "title"]), None, "content_domain_authors_title"),
    ]

    all_steps = (
        label_steps
        + content_steps
        + author_steps
        + title_steps
        + domain_steps
        + feature_steps
    )

    pp.apply_pipeline(file, all_steps, new_file=new_file, progress_bar=True)

# Alternative inputs, currently disabled:
#Clean_data("../datasets/sample/dataset_unbalanced.csv", "../datasets/sample/dataset_unbalanced_cleaned.csv")
#Clean_data("../datasets/sample/dataset_balanced_types.csv", "../datasets/sample/dataset_balanced_types_cleaned.csv")
#Clean_data("../datasets/sample/dataset_balanced_bin.csv", "../datasets/sample/dataset_balanced_bin_cleaned.csv")

# NOTE(review): the sampling cell writes "dataset_balanced_reliable_fake.csv";
# confirm that the input name below points at the intended file.
reliable_fake_in = "../datasets/sample/dataset_reliable_fake.csv"
reliable_fake_out = "../datasets/sample/dataset_reliable_fake_cleaned.csv"
Clean_data(reliable_fake_in, reliable_fake_out)

Delete NaNs for the combined columns. This is necessary for some models.

In [None]:
# Apply pp.Delete_nan to each of the combined feature columns, in this order.
nan_checked_columns = (
    'content_title',
    'content_domain',
    'content_authors',
    'content_domain_authors_title',
)

pp.apply_pipeline(
    "../datasets/sample/dataset_unbalanced_1M.csv",
    [(pp.Delete_nan(), column) for column in nan_checked_columns],
    new_file="../datasets/sample/dataset_unbalanced_1M_.csv",
    progress_bar=True,
)

# Training the logistic model

Extracting liar data

In [None]:
# Load the cleaned LIAR set: the statement text is the feature, the binary
# label (cast to int) is the target.
liar_csv_path = "../datasets/liar_dataset/cleaned/combined_cleaned.csv"
liar_data = pd.read_csv(liar_csv_path)

statement_column = liar_data["statement_combined"]
label_column = liar_data["label_binary"]

X_liar = statement_column.values
y_liar = label_column.astype(int)

Testing models (other than logistic) - may take a long time.

In [None]:

# (estimator, result name) pairs evaluated on count vectors.
baseline_models = [
    (MultinomialNB(), "naive_bayes"),
    (RandomForestClassifier(max_depth=5), "random_forest"), #25
    (DecisionTreeClassifier(max_depth=2), "decision_tree"),
    (AdaBoostClassifier(n_estimators=2), "ada_boost"), #2
    (SVC(kernel='linear', max_iter=10), "svm"),
    (KNeighborsClassifier(n_neighbors=2, algorithm='kd_tree'), "knn"), #15
    (PassiveAggressiveClassifier(), "passive_aggressive"),
]

# One entry: (dataset file, text column, vectorizer factory, models).
info_list = [
    (
        "../datasets/large/dataset_unbalanced_1M.csv",
        "content_no_swords_combined",
        mt.create_count_vector,
        baseline_models,
    ),
]

test_stats_base = mt.Test_statistic()

# The same pickle path is passed to the create step and then to the test step.
baseline_vectors_pickle = "../datasets/large/dataset_count_vectors.pickle"
mt.create_vectors_from_infolist(baseline_vectors_pickle, info_list, X_liar, y_liar)
mt.test_vectors_from_infolist(baseline_vectors_pickle, info_list, tests=test_stats_base)

test_stats_base.metrics.sort_values(by=["split","f1"], ascending=False)


Testing logistic model on different label distributions

In [None]:
importlib.reload(mt)

unbalanced = "../datasets/large/dataset_unbalanced_1M.csv"
balanced_types = "../datasets/sample/dataset_balanced_types_1M.csv"
balanced_bin = "../datasets/sample/dataset_balanced_bin_1M.csv"
balanced_reliable_fake = "../datasets/sample/dataset_balanced_reliable_fake_1M.csv"

# Every model needs a unique result name, otherwise its rows cannot be told
# apart in the metrics table. Previously both regularisation variants were
# called "content_count_hyper". That name is kept for the C=250 model (so the
# confusion-matrix cell still finds it) and the C=0.1 model gets the
# "_hyper_2" suffix that the other cells use for C=0.1.
info_list = [
    (unbalanced, "content_combined", mt.create_count_vector, [
        (LogisticRegression(max_iter=300), "content_count"),
        (LogisticRegression(max_iter=300, C=0.1), "content_count_hyper_2"),
        (LogisticRegression(max_iter=300, C=250), "content_count_hyper"),
    ]),
    (balanced_types, "content_combined", mt.create_count_vector, [
        (LogisticRegression(max_iter=300), "content_count_balanced_types"),
    ]),
    (balanced_bin, "content_combined", mt.create_count_vector, [
        (LogisticRegression(max_iter=300), "content_count_balanced_bin"),
    ]),
    (balanced_reliable_fake, "content_combined", mt.create_count_vector, [
        (LogisticRegression(max_iter=300), "content_count_reliable_fake"),
    ]),
]

test_stats_simple = mt.Test_statistic()

mt.create_vectors_from_infolist("../datasets/sample/dataset_count_vectors.pickle", info_list, X_liar, y_liar)
mt.test_vectors_from_infolist("../datasets/sample/dataset_count_vectors.pickle", info_list, tests=test_stats_simple)
test_stats_simple.metrics.sort_values(by=["split","f1"], ascending=False)

Testing logistic models with tf-idf.

In [None]:
importlib.reload(mt)

# Each entry: (dataset file, text column, vectorizer factory, [(model, result name), ...]).
# Result names follow "content_tfidf_<ngram>[_hyper_<k>]". The last trigram
# model was previously named "content_tfidf_hyper_2" (missing "_tri"), which
# made it look like a non-trigram result in the metrics table.
info_list = [
    ("../datasets/large/dataset_unbalanced_1M.csv", "content_combined", mt.create_tdfidf_vector_unigram, [
        (LogisticRegression(max_iter=300), "content_tfidf_uni"),
        (LogisticRegression(max_iter=300, C=250), "content_tfidf_uni_hyper_1"),
        (LogisticRegression(max_iter=300, C=0.1), "content_tfidf_uni_hyper_2")]),
    ("../datasets/large/dataset_unbalanced_1M.csv", "content_combined", mt.create_tdfidf_vector_bigram, [
        (LogisticRegression(max_iter=300), "content_tfidf_bi")]),
    # NOTE(review): the trigram run reads the 10K sample, not the 1M file — confirm intended.
    ("../datasets/sample/dataset_unbalanced_10K.csv", "content_combined", mt.create_tdfidf_vector_trigram, [
        (LogisticRegression(max_iter=300), "content_tfidf_tri"),
        (LogisticRegression(max_iter=300, C=250), "content_tfidf_tri_hyper_1"),
        (LogisticRegression(max_iter=300, C=0.1), "content_tfidf_tri_hyper_2")]),
]

test_stats_tdidf = mt.Test_statistic()

mt.create_vectors_from_infolist("../datasets/sample/dataset_tdidf_vectors.pickle", info_list, X_liar, y_liar)
mt.test_vectors_from_infolist("../datasets/sample/dataset_tdidf_vectors.pickle", info_list, tests=test_stats_tdidf)
test_stats_tdidf.metrics.sort_values(by=["split","f1"], ascending=False)

Testing baseline model with meta fields

In [None]:
importlib.reload(mt)

# NOTE(review): the NaN-removal cell writes "dataset_unbalanced_1M_.csv";
# confirm that the file below is the NaN-free one.
unbalanced = "../datasets/sample/dataset_unbalanced_1M.csv"

# One default logistic model per single-meta-field column, named "<column>_count".
single_field_entries = [
    (unbalanced, column, mt.create_count_vector, [(LogisticRegression(max_iter=300), column + "_count")])
    for column in ("content_title", "content_domain", "content_authors")
]

# All meta fields together: default model plus the two C variants.
all_fields_entry = (
    unbalanced,
    "content_domain_authors_title",
    mt.create_count_vector,
    [
        (LogisticRegression(max_iter=300), "all_count"),
        (LogisticRegression(max_iter=300, C=250), "all_count_hyper_1"),
        (LogisticRegression(max_iter=300, C=0.1), "all_count_hyper_2"),
    ],
)

info_list = single_field_entries + [all_fields_entry]

test_stats_meta = mt.Test_statistic()

meta_vectors_pickle = "../datasets/sample/dataset_count_vectors_meta.pickle"
mt.create_vectors_from_infolist(meta_vectors_pickle, info_list, X_liar, y_liar)
mt.test_vectors_from_infolist(meta_vectors_pickle, info_list, tests=test_stats_meta)
test_stats_meta.metrics.sort_values(by=["split","f1"], ascending=False)

Printing confusion matrix

In [None]:
# Name of the model (as registered in the info_list) whose test-split
# confusion matrix should be plotted.
model_name = "content_count_hyper"

metrics = test_stats_simple.metrics
metrics_test = metrics[metrics["split"] == "test"]
selected = metrics_test[metrics_test["name"] == model_name]

# Fail with a readable message instead of an IndexError from `.values[0]`
# when the requested name has no test-split row.
if selected.empty:
    raise ValueError(
        "No test-split metrics found for model '{}'. Available names: {}".format(
            model_name, sorted(metrics_test["name"].unique())
        )
    )

# If several rows share the name, the first one is used.
cm = selected["confusion_matrix"].values[0]
tn, fp, fn, tp = cm.ravel()

# NOTE(review): tp/fn are placed in the row labelled "Fake", i.e. this layout
# treats the positive class as "Fake". Confirm this matches the label encoding
# (the LIAR labels are described as True = reliable).
fig, ax = plt.subplots()
sns.heatmap(
    [[tp, fn], [fp, tn]],
    annot=True,
    cmap="Blues",
    xticklabels=["Fake", "True"],
    yticklabels=["Fake", "True"],
    fmt="d",
    annot_kws={"size": 12},
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion matrix (test split): {}".format(model_name))
plt.show()

Hyperparameter tuning - the best parameters found were C=300 and max_iter=700. The code below takes around 5 hours to run for 1M entries.

In [None]:
importlib.reload(mt)

# 3-fold grid search over C for logistic regression, scored and refit on F1.
grid = GridSearchCV(
    estimator  = LogisticRegression(),
    param_grid = {"C": [200, 250, 300, 350], "max_iter": [500]},#[500, 600, 700, 800]},
    cv         = 3,
    scoring    = ['f1'],
    refit      = 'f1',
    verbose    = 2
)

unbalanced = "../datasets/sample/dataset_unbalanced.csv"

# The grid-search object is passed in place of a plain estimator.
info_list = [
    (unbalanced, "content_combined", mt.create_count_vector, [(grid, "content_count")]),
]

test_stats_hyper_opt = mt.Test_statistic()

mt.create_vectors_from_infolist("../datasets/sample/hyper_opt.pickle", info_list, X_liar, y_liar)
mt.test_vectors_from_infolist("../datasets/sample/hyper_opt.pickle", info_list, tests=test_stats_hyper_opt)

# best params
# NOTE(review): assumes mt fits this `grid` object itself (not a clone);
# otherwise best_params_ is not set — confirm.
print(grid.best_params_)

# The sorted table must be the final expression of the cell to be rendered.
# Previously the print came after it, so the table was computed and discarded.
test_stats_hyper_opt.metrics.sort_values(by="f1", ascending=False)