# Packages

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

import transformers as ppb # pytorch-pretrained-bert
import torch

import pipeline as pp
import models as ml

import importlib
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\madsv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocessing

Convert types to binary labels - either True (reliable) or False (fake news)

In [None]:
# Run the Binary_labels step over the raw dataset and write the result to
# dataset_bin.csv. The step tuple is presumably (step, input column, output
# column), i.e. 'type' -> 'type_binary' — TODO confirm against pipeline.py.
pp.apply_pipeline(
    "../datasets/big/dataset.csv", 
    [(pp.Binary_labels(), 'type', 'type_binary')], 
    new_file="../datasets/big/dataset_bin.csv", 
    progress_bar=True
)

Create the following input files:
* All sets are unbalanced
* The training and validation sets are balanced according to the types (e.g. satire, reliable...), and the test set is unbalanced
* The training and validation sets are balanced according to the binary classes, and the test set is unbalanced

In [276]:
# Number of rows to work with; handed to pp.get_dataframe_with_distribution
# as its second argument in the next cell.
BATCH_SIZE = 1_000_000

In [287]:
# Reload the local pipeline module (imported as `pp`) so recent edits to it are used.
importlib.reload(pp)
from_file = "../datasets/big/dataset_bin.csv"

# Each call below writes one CSV variant of the binary-labelled dataset.
# Presumably the first list holds the train/val/test ratios and the second
# says which of those splits gets balanced — TODO confirm against pipeline.py.

# Variant 1: no split is balanced.
pp.get_dataframe_with_distribution(from_file, BATCH_SIZE, [0.8,0.1,0.1], [False, False, False], 
                                   out_file="../datasets/big/dataset_unbalanced.csv", get_frame=False)
# Variant 2: first two splits balanced, using the default classes/type column.
# NOTE(review): the ratios here are [0.6, 0.1, 0.1] while every other call
# uses [0.8, 0.1, 0.1] — confirm this is intentional.
pp.get_dataframe_with_distribution(from_file, BATCH_SIZE, [0.6,0.1,0.1], [True, True, False], 
                                   out_file="../datasets/big/dataset_balanced_types.csv", get_frame=False)
# Variant 3: first two splits balanced over the True/False values of 'type_binary'.
pp.get_dataframe_with_distribution(from_file, BATCH_SIZE, [0.8,0.1,0.1], [True, True, False],
                                   out_file="../datasets/big/dataset_balanced_bin.csv", get_frame=False, classes=[True,False], type_col="type_binary")
# Variant 4: same flags, with classes restricted to "reliable" and "fake".
pp.get_dataframe_with_distribution(from_file, BATCH_SIZE, [0.8,0.1,0.1], [True, True, False], 
                                   out_file="../datasets/big/dataset_reliable_fake.csv", get_frame=False, classes=["reliable", "fake"])

100%|██████████| 200000/200000 [00:01<00:00, 152296.33it/s]
100%|██████████| 200000/200000 [00:01<00:00, 176616.40it/s]
100%|██████████| 200000/200000 [00:01<00:00, 164883.53it/s]
100%|██████████| 200000/200000 [00:01<00:00, 185039.16it/s]
100%|██████████| 200000/200000 [00:01<00:00, 186903.95it/s]
100%|██████████| 200000/200000 [00:01<00:00, 187540.29it/s]
100%|██████████| 200000/200000 [00:01<00:00, 190906.82it/s]
100%|██████████| 200000/200000 [00:01<00:00, 191636.46it/s]
100%|██████████| 200000/200000 [00:01<00:00, 193830.14it/s]
100%|██████████| 200000/200000 [00:01<00:00, 193353.57it/s]
100%|██████████| 200000/200000 [00:00<00:00, 566782.14it/s]
100%|██████████| 200000/200000 [00:00<00:00, 558838.17it/s]
100%|██████████| 200000/200000 [00:00<00:00, 581432.65it/s]
100%|██████████| 200000/200000 [00:00<00:00, 555758.89it/s]
100%|██████████| 200000/200000 [00:00<00:00, 577280.55it/s]
100%|██████████| 200000/200000 [00:00<00:00, 498551.82it/s]
100%|██████████| 200000/200000 [00:00<00

entries read: 6800000


100%|██████████| 200000/200000 [00:00<00:00, 498589.16it/s]
100%|██████████| 200000/200000 [00:00<00:00, 488578.70it/s]
100%|██████████| 200000/200000 [00:00<00:00, 516741.00it/s]
100%|██████████| 200000/200000 [00:00<00:00, 483314.14it/s]
100%|██████████| 200000/200000 [00:00<00:00, 587605.42it/s]
100%|██████████| 200000/200000 [00:00<00:00, 756896.48it/s]


entries read: 1200000


100%|██████████| 200000/200000 [00:00<00:00, 668309.54it/s]
100%|██████████| 200000/200000 [00:00<00:00, 622432.36it/s]
100%|██████████| 200000/200000 [00:00<00:00, 670391.97it/s]
100%|██████████| 200000/200000 [00:00<00:00, 642936.75it/s]
100%|██████████| 200000/200000 [00:00<00:00, 674643.20it/s]
100%|██████████| 200000/200000 [00:00<00:00, 659918.07it/s]
100%|██████████| 200000/200000 [00:00<00:00, 553486.85it/s]
100%|██████████| 200000/200000 [00:00<00:00, 637649.29it/s]
100%|██████████| 200000/200000 [00:00<00:00, 691680.24it/s]
100%|██████████| 200000/200000 [00:00<00:00, 705313.26it/s]
100%|██████████| 200000/200000 [00:00<00:00, 700668.04it/s]
100%|██████████| 200000/200000 [00:00<00:00, 631871.65it/s]
100%|██████████| 200000/200000 [00:00<00:00, 674796.24it/s]
100%|██████████| 200000/200000 [00:00<00:00, 680684.11it/s]
100%|██████████| 200000/200000 [00:00<00:00, 709069.22it/s]
100%|██████████| 200000/200000 [00:00<00:00, 680064.95it/s]
100%|██████████| 200000/200000 [00:00<00

entries read: 4000000


Check distribution of labels (just to show that everything works)

In [135]:
def get_distribution(data, is_percentage=True, col = "type", labels=None):
    """Print how often each label occurs in ``data[col]`` on a single line.

    The output has the form ``"label_a: value, label_b: value"`` with no
    trailing separator and no trailing newline, so the caller decides where
    the line ends.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column ``col``.
    is_percentage : bool, default True
        If True, print each label's share of all rows (a fraction between
        0 and 1); otherwise print the raw row count.
    col : str, default "type"
        Column holding the labels.
    labels : iterable, optional
        Labels to report, in print order. Defaults to ``pp.labels``.

    Returns
    -------
    None
    """
    if labels is None:
        labels = pp.labels

    total_rows = data.shape[0]
    column = data[col]
    parts = []
    for label in labels:
        count = int((column == label).sum())
        if is_percentage:
            # An empty frame has no meaningful share; report 0.0 instead of
            # raising ZeroDivisionError.
            value = count / total_rows if total_rows else 0.0
        else:
            value = count
        parts.append(f"{label}: {value}")
    # Joining avoids the previous conditional expression, whose else-branch
    # referenced `_` (only defined inside an interactive IPython session).
    print(", ".join(parts), end="")

# Sanity check: for every generated file, print the label distribution of each
# split (the "set" column holds 0 = train, 1 = val, 2 = test).
for file in ["../datasets/big/dataset_unbalanced.csv", "../datasets/big/dataset_balanced_types.csv", "../datasets/big/dataset_balanced_bin.csv"]:
    data = pd.read_csv(file)
    print(f"File: {file} ----------------------------------")
    # find distribution of labels
    for i, set_name in enumerate(["train", "val", "test"]):
        # Named `subset` rather than `set` so the builtin `set` is not
        # shadowed for the rest of the kernel session.
        subset = data[data["set"] == i]
        print(f"Distribution of {set_name} with size {subset.shape[0]}:")
        get_distribution(subset)
        # Explicit comparisons (rather than truthiness) so only exact
        # True / False values are counted.
        n_true = int((subset["type_binary"] == True).sum())
        n_false = int((subset["type_binary"] == False).sum())
        print(f"\nTrue: {n_true}, Fake: {n_false}")

File: ../datasets/big/dataset_unbalanced.csv ----------------------------------
Distribution of train with size 8000:
fake: 0.121625, conspiracy: 0.123125, junksci: 0.0155, hate: 0.01, unreliable: 0.04575, bias: 0.15475, satire: 0.016125, reliable: 0.259125, clickbait: 0.032, political: 0.222
True: 4105, Fake: 3895
Distribution of val with size 1000:
fake: 0.129, conspiracy: 0.124, junksci: 0.012, hate: 0.014, unreliable: 0.045, bias: 0.147, satire: 0.022, reliable: 0.262, clickbait: 0.021, political: 0.224
True: 507, Fake: 493
Distribution of test with size 1000:
fake: 0.106, conspiracy: 0.125, junksci: 0.017, hate: 0.008, unreliable: 0.05, bias: 0.172, satire: 0.014, reliable: 0.255, clickbait: 0.032, political: 0.221
True: 508, Fake: 492
File: ../datasets/big/dataset_balanced_types.csv ----------------------------------
Distribution of train with size 8000:
fake: 0.1, conspiracy: 0.1, junksci: 0.1, hate: 0.1, unreliable: 0.1, bias: 0.1, satire: 0.1, reliable: 0.1, clickbait: 0.1, po

Cleaning the files

In [4]:
# Reload the local pipeline module (imported as `pp`) so recent edits to it are used.
importlib.reload(pp)

def Clean_data(file, new_file):
    """Run the full cleaning pipeline over ``file`` and write ``new_file``.

    The steps are handed to ``pp.apply_pipeline`` in this order:

    1. content: Clean_data, Tokenizer, Remove_stopwords, Stem, then
       Combine_Content into the new column "content_combined"
    2. authors: Clean_author
    3. title: Clean_data, Tokenizer, Remove_stopwords, Stem, Combine_Content
    4. domain: Clean_domain
    5. Join_str_columns to build the feature columns "content_authors",
       "content_title", "content_domain" and "content_domain_authors_title"

    Parameters
    ----------
    file : str
        Path of the CSV to clean.
    new_file : str
        Path the pipeline output is written to.
    """
    english_stopwords = stopwords.words('english')

    def text_steps(column):
        # Shared Clean_data -> Tokenizer -> Remove_stopwords -> Stem chain;
        # fresh step instances are created on every call.
        return [
            (pp.Clean_data(), column),
            (pp.Tokenizer(), column),
            (pp.Remove_stopwords(english_stopwords), column),
            (pp.Stem(), column),
        ]

    # (source columns, name of the joined feature column)
    joined_features = [
        (["content_combined", "authors"], "content_authors"),
        (["content_combined", "title"], "content_title"),
        (["content_combined", "domain"], "content_domain"),
        (["content_combined", "domain", "authors", "title"], "content_domain_authors_title"),
    ]

    steps = [
        # Clean content
        *text_steps("content"),
        (pp.Combine_Content(), "content", "content_combined"),
        # Clean authors
        (pp.Clean_author(), "authors"),
        # Clean title
        *text_steps("title"),
        (pp.Combine_Content(), "title"),
        # Clean domain
        (pp.Clean_domain(), "domain"),
        # Combine columns (used as features)
        *[(pp.Join_str_columns(columns), None, target) for columns, target in joined_features],
    ]

    pp.apply_pipeline(file, steps, new_file=new_file, progress_bar=True)

# NOTE(review): the first two calls are commented out, yet the later `files`
# list reads "dataset_unbalanced_cleaned.csv" and
# "dataset_balanced_types_cleaned.csv". Those files must already exist from an
# earlier run, otherwise a fresh Restart & Run All fails at the evaluation loop.
# The recorded progress output shows a single pipeline step taking about 2h37m,
# so re-enable them deliberately.
#Clean_data("../datasets/big/dataset_unbalanced.csv", "../datasets/big/dataset_unbalanced_cleaned.csv")
#Clean_data("../datasets/big/dataset_balanced_types.csv", "../datasets/big/dataset_balanced_types_cleaned.csv")
Clean_data("../datasets/big/dataset_balanced_bin.csv", "../datasets/big/dataset_balanced_bin_cleaned.csv")
Clean_data("../datasets/big/dataset_reliable_fake.csv", "../datasets/big/dataset_reliable_fake_cleaned.csv")

100%|██████████| 200000/200000 [06:07<00:00, 543.87it/s]
100%|██████████| 200000/200000 [00:08<00:00, 22498.70it/s]
100%|██████████| 200000/200000 [02:44<00:00, 1218.72it/s]
100%|██████████| 200000/200000 [2:37:40<00:00, 21.14it/s]     
100%|██████████| 200000/200000 [00:05<00:00, 37714.99it/s]
100%|██████████| 200000/200000 [00:01<00:00, 174595.96it/s]
100%|██████████| 200000/200000 [00:10<00:00, 18988.77it/s]
100%|██████████| 200000/200000 [00:04<00:00, 40743.86it/s] 
100%|██████████| 200000/200000 [00:03<00:00, 60886.85it/s]
100%|██████████| 200000/200000 [00:29<00:00, 6803.84it/s]
100%|██████████| 200000/200000 [00:00<00:00, 673690.16it/s]
100%|██████████| 200000/200000 [00:00<00:00, 331466.09it/s]
100%|██████████| 200000/200000 [00:04<00:00, 44450.21it/s]
100%|██████████| 200000/200000 [00:04<00:00, 43561.39it/s]
100%|██████████| 200000/200000 [00:05<00:00, 39662.88it/s]
100%|██████████| 200000/200000 [00:07<00:00, 25032.76it/s]
 65%|██████▍   | 129952/200000 [04:10<02:09, 539.74i

# Training the logistic model

In [201]:
def vectorize_content(data, col="content", new_col="count_vectorized"):
    """Store one TF-IDF row vector per document in ``data[new_col]``.

    The text in ``data[col]`` is turned into unigram + bigram counts with
    ``CountVectorizer(ngram_range=(1, 2))`` and re-weighted with
    ``TfidfTransformer(smooth_idf=False)``. ``data`` is modified in place;
    passing ``new_col == col`` overwrites the text with its vectors.

    NOTE(review): the vocabulary and IDF weights are fitted on every row of
    ``data``. When ``data`` also holds validation/test rows (as in the calls
    in this notebook), information from those rows leaks into the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column; mutated in place.
    col : str, default "content"
        Column with one text document per row.
    new_col : str, default "count_vectorized"
        Column that receives one sparse row vector per document.

    Returns
    -------
    None
        Prints the elapsed time as a side effect.
    """
    start_time = time()

    # Missing text (e.g. an empty field read back from CSV as NaN) would make
    # the vectorizer fail on a float; treat it as an empty document instead.
    documents = data[col].fillna("").astype(str).values

    count_vectorizer = CountVectorizer(ngram_range=(1, 2))
    tf_idf_transformer = TfidfTransformer(smooth_idf=False)

    # fit_transform is equivalent to the separate fit + transform calls.
    term_counts = count_vectorizer.fit_transform(documents)
    tf_idf = tf_idf_transformer.fit_transform(term_counts)

    # Iterating the sparse matrix yields one single-row sparse matrix per
    # document, which is what split_csr_data later stacks back together.
    data[new_col] = [row for row in tf_idf]

    end_time = time()
    print(f"Time elapsed of TF IDF transform for {col}:", end_time - start_time)

In [164]:
def split_csr_data(data, features="content", y="type", set="set", get_val=True):
    """Split ``data`` into train/val/test feature matrices and label vectors.

    Rows are assigned by the value in the ``set`` column: 0 = train,
    1 = validation, 2 = test. The per-row sparse vectors in ``features`` are
    stacked into one sparse matrix per split and the labels in ``y`` are cast
    to ``int``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with one sparse row vector per row in ``features``.
    features : str, default "content"
        Column holding the per-row sparse vectors.
    y : str, default "type"
        Label column; must be castable to int (callers in this notebook pass
        "type_binary").
    set : str, default "set"
        Column holding the split id. The name shadows the builtin inside this
        function only; it is kept because callers may pass it by keyword.
    get_val : bool, default True
        Whether to build and return the validation split.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` when ``get_val``
        is True, otherwise ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        From ``vstack`` if a requested split contains no rows.
    """
    def _extract(split_id):
        # Select one split and stack its row vectors into a single matrix.
        rows = data[data[set] == split_id]
        return vstack(list(rows[features])), rows[y].astype(int)

    X_train, y_train = _extract(0)
    X_test, y_test = _extract(2)
    if not get_val:
        # Skip the validation split entirely: it is not returned, and an
        # empty one would make vstack raise for no reason.
        return X_train, X_test, y_train, y_test
    X_val, y_val = _extract(1)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [169]:
def try_models(models, X_train, X_test, y_train, y_test, name=None):
    """Fit each model and collect its train/test metrics in a DataFrame.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : array-like or sparse matrix
        Feature matrices.
    y_train, y_test : array-like
        Binary labels.
    name : str, optional
        Label stored in the "name" column for every model. When omitted,
        each model is labelled with its own class name.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns name, train_acc, test_acc,
        precision, recall, f1 and time (training seconds, formatted as a
        string with two decimals).
    """
    metrics = []
    for model in models:
        # Resolve the label per model. Assigning back to `name` would make
        # every later model inherit the first model's class name.
        model_name = type(model).__name__ if name is None else name

        start_time = time()
        model.fit(X_train, y_train)
        train_time = time() - start_time
        y_train_pred = model.predict(X_train)
        y_pred = model.predict(X_test)

        metrics.append({
            "name": model_name,
            "train_acc": accuracy_score(y_train, y_train_pred),
            "test_acc": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1": f1_score(y_test, y_pred),
            "time": "{:.2f}".format(train_time)
        })
        # The printed duration also covers prediction, unlike the "time" metric.
        print(f"{model_name} finished in {(time() - start_time):.2f} seconds")
    return pd.DataFrame(metrics)

In [229]:
class Test_baseline():
    """Accumulates the metrics of successive model evaluations.

    Every ``test_*`` call appends its result row(s) to ``self.metrics`` (a
    DataFrame with the columns produced by ``try_models``), so different
    datasets and feature columns can be compared in one table.
    """

    def __init__(self):
        # Starts empty; rows are appended by test_baseline.
        self.metrics = pd.DataFrame()

    def test_baseline(self, X_train, X_test, y_train, y_test, name=None, model=None):
        """Evaluate one model on the given split and record its metrics.

        ``model`` defaults to a fresh ``LogisticRegression()``; ``name`` is
        the label stored in the "name" column of the result.
        """
        if model is None:
            model = LogisticRegression()
        metric = try_models([model], X_train, X_test, y_train, y_test, name=name)
        if self.metrics.empty:
            self.metrics = metric.reset_index(drop=True)
        else:
            # ignore_index gives each run its own row label instead of every
            # row carrying index 0.
            self.metrics = pd.concat([self.metrics, metric], ignore_index=True)

    def test_col(self, data, col, name, model=None):
        """Evaluate ``model`` using ``data[col]`` as features and "type_binary" as labels."""
        self.test_baseline(*split_csr_data(data, features=col, y="type_binary", get_val=False), name=name, model=model)

    def test_cols(self, data, cols_to_test, model=None):
        """Run ``test_col`` for every ``(column, name)`` pair in ``cols_to_test``."""
        for col, name in cols_to_test:
            self.test_col(data, col, name, model=model)

In [None]:
# Collector for the metrics of every run below.
tests = Test_baseline()
# Cleaned dataset files to evaluate, as (path, group name) pairs; the group
# name becomes the suffix of each run's name in the metrics dataframe.
files = [
    ("../datasets/big/dataset_unbalanced_cleaned.csv", "unbalanced"), 
    ("../datasets/big/dataset_balanced_types_cleaned.csv", "balanced_types"), 
    ("../datasets/big/dataset_balanced_bin_cleaned.csv", "balanced_bin"),
    ("../datasets/big/dataset_reliable_fake_cleaned.csv", "reliable_fake")
]
# Feature columns to evaluate, as (column in the CSV, run name prefix) pairs.
cols_to_test = [
    ("content_combined", "content"),
    ("content_authors", "content_authors"), 
    ("content_title", "content_title"),
    ("content_domain", "content_domain"),
    ("content_domain_authors_title", "content_domain_authors_title")
]

In [None]:
# The columns needed from each CSV are the same for every file, so build the
# list once: every feature column plus the label and split columns.
cols_to_read = [column for column, _entry in cols_to_test] + ["type_binary", "set"]

for file, name in files:
    print(f"Proccessing: {name}")
    vectorized_data = pd.read_csv(file, usecols=cols_to_read)
    print("Read data into dataframe")
    for col, entry_name in cols_to_test:
        # Overwrite the text column with its TF-IDF vectors in place rather
        # than adding a second column, to keep memory usage down.
        vectorize_content(vectorized_data, col=col, new_col=col)
        tests.test_col(vectorized_data, col, f"{entry_name}_{name}")
        del vectorized_data[col] # free up memory

Proccessing: unbalanced...
Time elapsed of TF IDF transform for content_combined: 9.632106304168701
content_unbalanced finished in 5.60 seconds
Time elapsed of TF IDF transform for content_authors: 7.3649742603302
content_authors_unbalanced finished in 4.12 seconds
Time elapsed of TF IDF transform for content_title: 7.952449083328247
content_title_unbalanced finished in 5.75 seconds
Time elapsed of TF IDF transform for content_domain: 8.076876640319824
content_domain_unbalanced finished in 4.44 seconds
Time elapsed of TF IDF transform for content_domain_authors_title: 7.574965000152588
content_domain_authors_title_unbalanced finished in 5.15 seconds
Proccessing: balanced_types...
Time elapsed of TF IDF transform for content_combined: 7.003679037094116
content_balanced_types finished in 3.78 seconds
Time elapsed of TF IDF transform for content_authors: 6.9939961433410645
content_authors_balanced_types finished in 3.42 seconds
Time elapsed of TF IDF transform for content_title: 7.6031830

In [None]:
# Rank all runs recorded so far by F1 score, best first.
tests.metrics.sort_values(by="f1", ascending=False)

Unnamed: 0,name,train_acc,test_acc,precision,recall,f1,time
0,content_domain_authors_title_balanced_bin,0.996,0.841,0.854127,0.842803,0.848427,4.71
0,content_domain_authors_title_unbalanced,0.996,0.838,0.83829,0.857414,0.847744,4.64
0,content_domain_balanced_bin,0.995,0.83,0.836466,0.842803,0.839623,4.4
0,content_domain_unbalanced,0.995,0.825,0.812834,0.86692,0.839006,4.57
0,content_authors_balanced_bin,0.9955,0.818,0.839216,0.810606,0.824663,4.15
0,content_authors_unbalanced,0.9945,0.813,0.820416,0.825095,0.822749,4.03
0,content_title_balanced_bin,0.994,0.803,0.820116,0.80303,0.811483,3.96
0,content_title_unbalanced,0.994,0.793,0.79159,0.823194,0.807083,5.17
0,content_balanced_bin,0.9945,0.794,0.808429,0.799242,0.80381,4.01
0,content_unbalanced,0.9945,0.785,0.78018,0.823194,0.80111,5.35


Best file and features

In [220]:
# Dataset file and feature column used for hyperparameter tuning below.
# NOTE(review): in the recorded results table the top F1 for this column was
# on the balanced_bin file (0.848427 vs 0.847744 for unbalanced) — confirm
# that choosing the unbalanced file is intended.
best_file = "../datasets/big/dataset_unbalanced_cleaned.csv"
best_col = "content_domain_authors_title"

In [271]:
# Load the chosen file and replace the chosen text column with its TF-IDF
# vectors in place (new_col == col).
data = pd.read_csv(best_file)
vectorize_content(data, col=best_col, new_col=best_col)

Time elapsed of TF IDF transform for content_domain_authors_title: 9.814547061920166


Hyperparameter tuning - the best found was C=300 and max_iter=200

In [None]:
# Grid-search LogisticRegression over C and max_iter with 3-fold
# cross-validation, scoring on F1 and refitting the best F1 candidate.
model = LogisticRegression()
param_grid = {"C": [250, 300, 350], "max_iter": [150, 200, 250]} # 200 won - the same
#param_grid = {'penalty': ['l1', 'l2'],'C': [350], "maxiter": [200], 'solver': ['liblinear', 'saga']}

grid = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    cv=3,
                    scoring=['f1'],
                    refit='f1',
                    verbose=2) #'accuracy'


# The GridSearchCV object is passed in as the model, so test_col fits the
# whole search on the training split and records its metrics as "hyper_1".
tests.test_col(data, best_col, "hyper_1", model=grid)
print(grid.best_estimator_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ................................C=300, max_iter=100; total time=  10.0s
[CV] END ................................C=300, max_iter=100; total time=  10.8s
[CV] END ................................C=300, max_iter=100; total time=   7.1s
[CV] END ................................C=300, max_iter=150; total time=   8.8s
[CV] END ................................C=300, max_iter=150; total time=  11.2s
[CV] END ................................C=300, max_iter=150; total time=   6.7s
[CV] END ................................C=300, max_iter=200; total time=   8.6s
[CV] END ................................C=300, max_iter=200; total time=  10.6s
[CV] END ................................C=300, max_iter=200; total time=   7.2s
hyper_1 finished in 88.89 seconds
LogisticRegression(C=300)


In [None]:
# Re-read the file so the text columns are strings again, then evaluate the
# tuned parameters (C=300, max_iter=200) on the best feature column.
data = pd.read_csv(best_file)
vectorize_content(data, col=best_col, new_col=best_col)
tests.test_col(data, best_col, "hyper_all", model=LogisticRegression(C=300, max_iter=200))
# test the best parameters on the other features
vectorize_content(data, col="content_combined", new_col="content_combined")
tests.test_col(data, "content_combined", "hyper_content", model=LogisticRegression(C=300, max_iter=200))

hyper_all finished in 8.66 seconds
Time elapsed of TF IDF transform for content_combined: 7.822778701782227
hyper_content finished in 9.59 seconds


In [None]:
# Rank all runs recorded so far (baselines and tuned models) by F1 score, best first.
tests.metrics.sort_values(by="f1", ascending=False)

Unnamed: 0,name,train_acc,test_acc,precision,recall,f1,time
0,content_domain_reliable_fake,0.9965,0.934,0.979332,0.920777,0.949153,4.29
0,content_domain_authors_title_reliable_fake,0.997,0.929,0.98539,0.907324,0.944747,4.51
0,content_authors_reliable_fake,0.9955,0.903,0.976667,0.875934,0.923562,3.97
0,content_reliable_fake,0.9955,0.898,0.970149,0.874439,0.919811,3.21
0,content_title_reliable_fake,0.9955,0.896,0.976391,0.865471,0.917591,4.1
0,hyper_all,1.0,0.867,0.881553,0.863118,0.872238,8.65
0,hyper_all,1.0,0.867,0.881553,0.863118,0.872238,8.21
0,content_domain_authors_title_balanced_bin,0.996,0.841,0.854127,0.842803,0.848427,4.34
0,content_domain_authors_title_unbalanced,0.996,0.838,0.83829,0.857414,0.847744,5.14
0,content_domain_balanced_bin,0.995,0.83,0.836466,0.842803,0.839623,4.29
