# Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix, vstack, load_npz, save_npz
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import torch

import pipeline as pp
import model_tests as mt

import importlib
import math
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# Preproccessing

Covert types to binary labels - either True (reliable) or False (fake news)

In [None]:
importlib.reload(pp)

pp.apply_pipeline(
    "../datasets/large/cleaned_file.csv", 
    [(pp.Binary_labels_LIAR(), 'type', 'type_binary')], 
    new_file="../datasets/large/cleaned_file_bin.csv", 
    progress_bar=True
)

In [None]:
# delete nans
pp.apply_pipeline(
    "../datasets/sample/dataset_unbalanced_1M.csv",
    [(pp.Delete_nan(), 'content_title'),
     (pp.Delete_nan(), 'content_domain'),
     (pp.Delete_nan(), 'content_authors'),
     (pp.Delete_nan(), 'content_domain_authors_title')],
     new_file="../datasets/sample/dataset_unbalanced_1M_.csv",
     progress_bar=True
)

Create the follwoing input files:
* All are unbalanced
* The test and validation set are balanced according to the types (e.g. satire, reliable...), and the test set is unbalanced
* The test and validation set are balanced according to the binary classes, and the test set is unbalanced

In [32]:
# The number of rows to train the model
BATCH_SIZE = 1000000

In [33]:
importlib.reload(pp)
from_file = "../datasets/large/cleaned_file.csv"

pp.get_dataframe_with_distribution(from_file, BATCH_SIZE, [0.1,0.1,0.1], [False, False, False], 
                                    out_file="../datasets/large/dataset_unbalanced_100k.csv", get_frame=False)
#pp.get_dataframe_with_distribution(from_file, BATCH_SIZE, [0.5 ,0.1,0.1], [True, False, False], 
 #                                   out_file="../datasets/sample/dataset_balanced_types.csv", get_frame=False)
#pp.get_dataframe_with_distribution(from_file, BATCH_SIZE, [0.8,0.1,0.1], [True, False, False],
#                                    out_file="../datasets/sample/dataset_balanced_bin.csv", get_frame=False, classes=[True,False], type_col="type_binary")
#pp.get_dataframe_with_distribution(from_file, BATCH_SIZE, [0.8,0.1,0.1], [True, False, False], 
#                                    out_file="../datasets/sample/dataset_balanced_reliable_fake.csv", get_frame=False, classes=["reliable", "fake"])

100%|██████████| 500000/500000 [00:00<00:00, 763187.13it/s]


entries read: 500000


Check distribution of labels (just to show that everything works)

Cleaning the files

In [None]:
importlib.reload(pp)

def Clean_data(file, new_file):
    stopwords_lst = stopwords.words('english')
    pp.apply_pipeline(file, [
            # binary labels
            (pp.Binary_labels(), 'type', 'type_binary'),
            # Clean content
            (pp.Clean_data(), 'content'),
            (pp.Tokenizer(), "content"),
            (pp.Remove_stopwords(stopwords_lst), "content"),
            (pp.Stem(), "content"),
            (pp.Combine_Content(), "content", "content_combined"),
            # Clean authors
            (pp.Clean_author(), "authors"),
            # Clean title
            (pp.Clean_data(), 'title'),
            (pp.Tokenizer(), "title"),
            (pp.Remove_stopwords(stopwords_lst), "title"),
            (pp.Stem(), "title"),
            (pp.Combine_Content(), "title"),
            # Clean domain
            (pp.Clean_domain(), 'domain'),
            # Combine columns (used as features)
            (pp.Join_str_columns(["content_combined", "authors"]), None, "content_authors"),
            (pp.Join_str_columns(["content_combined", "title"]), None, "content_title"),
            (pp.Join_str_columns(["content_combined", "domain"]), None, "content_domain"),
            (pp.Join_str_columns(["content_combined", "domain", "authors", "title"]), None, "content_domain_authors_title")
        ],
        new_file=new_file,
        progress_bar=True,
    )

#Clean_data("../datasets/sample/dataset_unbalanced.csv", "../datasets/sample/dataset_unbalanced_cleaned.csv")
#Clean_data("../datasets/sample/dataset_balanced_types.csv", "../datasets/sample/dataset_balanced_types_cleaned.csv")
#Clean_data("../datasets/sample/dataset_balanced_bin.csv", "../datasets/sample/dataset_balanced_bin_cleaned.csv")
Clean_data("../datasets/sample/dataset_reliable_fake.csv", "../datasets/sample/dataset_reliable_fake_cleaned.csv")

# Training the logistic model

Extracting liar data

In [16]:
liar_data = pd.read_csv("../datasets/liar_dataset/cleaned/combined_cleaned.csv")
X_liar =  liar_data["statement_combined"].values
y_liar = liar_data["label_binary"].astype(int)

Testing models (other than logistic)

In [19]:

info_list = [(
    "../datasets/large/dataset_unbalanced_1M.csv", "content_no_swords_combined", mt.create_count_vector, [
        (MultinomialNB(), "naive_bayes"),
        (RandomForestClassifier(max_depth=5), "random_forest"), #25
        (DecisionTreeClassifier(max_depth=2), "decision_tree"),
        (AdaBoostClassifier(n_estimators=2), "ada_boost"), #2
        #(SVC(kernel='linear', max_iter=10), "svm"),
        #(KNeighborsClassifier(n_neighbors=2, algorithm='kd_tree'), "knn"), #15
        (PassiveAggressiveClassifier(), "passive_aggressive")
        ])
]

test_stats_base = mt.Test_statistic()

mt.create_vectors_from_infolist("../datasets/large/dataset_count_vectors.pickle", info_list, X_liar, y_liar)
mt.test_vectors_from_infolist("../datasets/large/dataset_count_vectors.pickle", info_list, tests=test_stats_base)
test_stats_base.metrics.sort_values(by=["split","f1"], ascending=False)


Creating vector 0 (data read in 249.15490126609802 seconds)
Saved vector 0 in 305.8764753341675 seconds
naive_bayes finished in 2.04 seconds
random_forest finished in 82.84 seconds
decision_tree finished in 24.52 seconds
ada_boost finished in 34.10 seconds
passive_aggressive finished in 50.92 seconds


Unnamed: 0,name,split,train_acc,acc,precision,recall,f1,time,confusion_matrix,model
0,passive_aggressive,val,0.891746,0.84536,0.82978,0.851294,0.8404,49.88,"[[43822, 8352], [7112, 40714]]",PassiveAggressiveClassifier()
0,naive_bayes,val,0.817662,0.8055,0.791838,0.804918,0.798324,0.7,"[[42054, 10120], [9330, 38496]]",MultinomialNB()
0,decision_tree,val,0.65207,0.65332,0.590572,0.89696,0.712213,23.05,"[[22434, 29740], [4928, 42898]]",DecisionTreeClassifier(max_depth=2)
0,ada_boost,val,0.65207,0.65332,0.590572,0.89696,0.712213,31.49,"[[22434, 29740], [4928, 42898]]","(DecisionTreeClassifier(max_depth=1, random_st..."
0,random_forest,val,0.685954,0.68455,0.756936,0.501443,0.603252,20.79,"[[44473, 7701], [23844, 23982]]","(DecisionTreeClassifier(max_depth=5, max_featu..."
1,passive_aggressive,test,0.891746,0.84885,0.832606,0.856451,0.84436,49.88,"[[43885, 8243], [6872, 41000]]",PassiveAggressiveClassifier()
1,naive_bayes,test,0.817662,0.8079,0.793355,0.809596,0.801394,0.7,"[[42033, 10095], [9115, 38757]]",MultinomialNB()
1,decision_tree,test,0.65207,0.65148,0.589364,0.89685,0.711299,23.05,"[[22214, 29914], [4938, 42934]]",DecisionTreeClassifier(max_depth=2)
1,ada_boost,test,0.65207,0.65148,0.589364,0.89685,0.711299,31.49,"[[22214, 29914], [4938, 42934]]","(DecisionTreeClassifier(max_depth=1, random_st..."
1,random_forest,test,0.685954,0.68894,0.763434,0.507478,0.609682,20.79,"[[44600, 7528], [23578, 24294]]","(DecisionTreeClassifier(max_depth=5, max_featu..."


In [21]:

info_list = [(
    "../datasets/large/dataset_unbalanced_1M.csv", "content_no_swords_combined", mt.create_count_vector, [
        #(MultinomialNB(), "naive_bayes"),
        #(RandomForestClassifier(max_depth=5), "random_forest"),  # 25
        #(DecisionTreeClassifier(max_depth=2), "decision_tree"),
        #(AdaBoostClassifier(n_estimators=2), "ada_boost"),  # 2
        (SVC(kernel='linear', max_iter=100), "svm"),
        #(KNeighborsClassifier(n_neighbors=2, algorithm='kd_tree'), "knn"), #15
        #(PassiveAggressiveClassifier(), "passive_aggressive")
    ])
]

test_stats_base_svm = mt.Test_statistic()

#mt.create_vectors_from_infolist("../datasets/large/dataset_count_vectors.pickle", info_list, X_liar, y_liar)
mt.test_vectors_from_infolist("../datasets/large/dataset_count_vectors.pickle", info_list, tests=test_stats_base_svm)
test_stats_base_svm.metrics.sort_values(by=["split", "f1"], ascending=False)


svm finished in 527.08 seconds


Unnamed: 0,name,split,train_acc,acc,precision,recall,f1,time,confusion_matrix,model
0,svm,val,0.477913,0.47881,0.478504,0.99908,0.647089,251.51,"[[99, 52075], [44, 47782]]","SVC(kernel='linear', max_iter=100)"
1,svm,test,0.477913,0.47944,0.479048,0.999164,0.647604,251.51,"[[112, 52016], [40, 47832]]","SVC(kernel='linear', max_iter=100)"
2,svm,liar,0.477913,0.442264,0.442264,1.0,0.613291,251.51,"[[0, 7134], [0, 5657]]","SVC(kernel='linear', max_iter=100)"


In [22]:
# FINISHED
importlib.reload(mt)

unbalanced = "../datasets/large/dataset_unbalanced_1M.csv"
#balanced_types = "../datasets/sample/dataset_balanced_types_1M.csv"
#balanced_bin = "../datasets/sample/dataset_balanced_bin_1M.csv"
#balanced_reliable_fake = "../datasets/sample/dataset_balanced_reliable_fake_1M.csv"

info_list = [
    (unbalanced, "content_combined", mt.create_count_vector, [(LogisticRegression(max_iter=300), "content_count"), 
                                                              (LogisticRegression(max_iter=300, C=0.1), "content_count_hyper"),
                                                              (LogisticRegression(max_iter=300, C=250), "content_count_hyper")]),
    #(balanced_types, "content_combined", mt.create_count_vector, [(LogisticRegression(max_iter=300), "content_count_balanced_types")]),
    #(balanced_bin, "content_combined", mt.create_count_vector, [(LogisticRegression(max_iter=300), "content_count_balanced_bin")]),
    #(balanced_reliable_fake, "content_combined", mt.create_count_vector, [(LogisticRegression(max_iter=300), "content_count_reliable_fake")]),
]

test_stats_simple = mt.Test_statistic()

mt.create_vectors_from_infolist("../datasets/sample/dataset_count_vectors.pickle", info_list, X_liar, y_liar)
mt.test_vectors_from_infolist("../datasets/sample/dataset_count_vectors.pickle", info_list, tests=test_stats_simple)
test_stats_simple.metrics.sort_values(by=["split","f1"], ascending=False)

Creating vector 0 (data read in 157.5944595336914 seconds)
Saved vector 0 in 321.01699686050415 seconds
content_count finished in 198.35 seconds
content_count_hyper finished in 191.77 seconds
content_count_hyper finished in 198.20 seconds


Unnamed: 0,name,split,train_acc,acc,precision,recall,f1,time,confusion_matrix,model
0,content_count_hyper,val,0.897497,0.87317,0.892116,0.860238,0.875887,190.7,"[[42564, 5412], [7271, 44753]]","LogisticRegression(C=0.1, max_iter=300)"
0,content_count,val,0.900826,0.87192,0.890256,0.859795,0.87476,197.26,"[[42462, 5514], [7294, 44730]]",LogisticRegression(max_iter=300)
0,content_count_hyper,val,0.900057,0.87097,0.889256,0.85895,0.87384,196.89,"[[42411, 5565], [7338, 44686]]","LogisticRegression(C=250, max_iter=300)"
1,content_count_hyper,test,0.897497,0.87137,0.890553,0.860517,0.875278,190.7,"[[42002, 5547], [7316, 45135]]","LogisticRegression(C=0.1, max_iter=300)"
1,content_count,test,0.900826,0.8701,0.887915,0.861032,0.874267,197.26,"[[41848, 5701], [7289, 45162]]",LogisticRegression(max_iter=300)
1,content_count_hyper,test,0.900057,0.86952,0.887634,0.860117,0.873659,196.89,"[[41838, 5711], [7337, 45114]]","LogisticRegression(C=250, max_iter=300)"
2,content_count_hyper,liar,0.900057,0.520053,0.413805,0.204525,0.273749,196.89,"[[5495, 1639], [4500, 1157]]","LogisticRegression(C=250, max_iter=300)"
2,content_count,liar,0.900826,0.523024,0.411058,0.181368,0.251686,197.26,"[[5664, 1470], [4631, 1026]]",LogisticRegression(max_iter=300)
2,content_count_hyper,liar,0.897497,0.527168,0.415331,0.169524,0.240773,190.7,"[[5784, 1350], [4698, 959]]","LogisticRegression(C=0.1, max_iter=300)"


In [23]:
importlib.reload(mt)

info_list = [
      ("../datasets/large/dataset_unbalanced_1M.csv", "content_combined", mt.create_tdfidf_vector_unigram, [
        (LogisticRegression(max_iter=300), "content_tfidf_uni"),
        (LogisticRegression(max_iter=300, C=250), "content_tfidf_uni_hyper_1"),
        (LogisticRegression(max_iter=300, C=0.1), "content_tfidf_uni_hyper_2")]),
     ("../datasets/large/dataset_unbalanced_1M.csv", "content_combined", mt.create_tdfidf_vector_bigram, [
        (LogisticRegression(max_iter=300), "content_tfidf_bi")]),
 #    ("../datasets/sample/dataset_unbalanced_10K.csv", "content_combined", mt.create_tdfidf_vector_trigram, [
 #       (LogisticRegression(max_iter=300), "content_tfidf_tri"),
 #       (LogisticRegression(max_iter=300, C=250), "content_tfidf_tri_hyper_1"),
  #      (LogisticRegression(max_iter=300, C=0.1), "content_tfidf_hyper_2")]),
]

test_stats_tdidf = mt.Test_statistic()

mt.create_vectors_from_infolist("../datasets/sample/dataset_tdidf_vectors.pickle", info_list, X_liar, y_liar)
mt.test_vectors_from_infolist("../datasets/sample/dataset_tdidf_vectors.pickle", info_list, tests=test_stats_tdidf)
test_stats_tdidf.metrics.sort_values(by=["split","f1"], ascending=False)

Creating vector 0 (data read in 155.73694348335266 seconds)
Saved vector 0 in 318.0987198352814 seconds
Creating vector 1 (data read in 99.3843047618866 seconds)
Saved vector 1 in 1135.6230821609497 seconds
content_tfidf_bi finished in 198.51 seconds
content_tfidf_bi_hyper_1 finished in 200.57 seconds
content_tfidf_bi_hyper_2 finished in 59.96 seconds
content_tfidf_bi finished in 1939.47 seconds


KeyboardInterrupt: 

In [24]:
test_stats_tdidf_bitri.metrics.sort_values(by=["split","f1"], ascending=False)

Unnamed: 0,name,split,train_acc,acc,precision,recall,f1,time,confusion_matrix,model
0,content_tfidf_bi,val,0.94509,0.90483,0.902536,0.915981,0.909209,1936.2,"[[42830, 5146], [4371, 47653]]",LogisticRegression(max_iter=300)
0,content_tfidf_bi_hyper_1,val,0.932771,0.87995,0.884717,0.884496,0.884606,199.62,"[[41980, 5996], [6009, 46015]]","LogisticRegression(C=250, max_iter=300)"
0,content_tfidf_bi,val,0.894134,0.87886,0.881578,0.886187,0.883877,197.55,"[[41783, 6193], [5921, 46103]]",LogisticRegression(max_iter=300)
0,content_tfidf_bi_hyper_2,val,0.867195,0.86361,0.86744,0.870925,0.869179,59.07,"[[41052, 6924], [6715, 45309]]","LogisticRegression(C=0.1, max_iter=300)"
1,content_tfidf_bi,test,0.94509,0.90378,0.902702,0.915197,0.908907,1936.2,"[[42375, 5174], [4448, 48003]]",LogisticRegression(max_iter=300)
1,content_tfidf_bi,test,0.894134,0.87891,0.882084,0.887819,0.884942,197.55,"[[41324, 6225], [5884, 46567]]",LogisticRegression(max_iter=300)
1,content_tfidf_bi_hyper_1,test,0.932771,0.87918,0.884137,0.885722,0.884929,199.62,"[[41461, 6088], [5994, 46457]]","LogisticRegression(C=250, max_iter=300)"
1,content_tfidf_bi_hyper_2,test,0.867195,0.86189,0.866353,0.871061,0.868701,59.07,"[[40501, 7048], [6763, 45688]]","LogisticRegression(C=0.1, max_iter=300)"
2,content_tfidf_bi,liar,0.94509,0.443124,0.411259,0.600495,0.48818,1936.2,"[[2271, 4863], [2260, 3397]]",LogisticRegression(max_iter=300)
2,content_tfidf_bi_hyper_2,liar,0.867195,0.45962,0.41807,0.566024,0.480925,59.07,"[[2677, 4457], [2455, 3202]]","LogisticRegression(C=0.1, max_iter=300)"


In [None]:
importlib.reload(mt)

info_list = [

    ("../datasets/large/dataset_unbalanced_100k.csv", "content_no_swords_combined", mt.create_tdfidf_vector_trigram, [
       (LogisticRegression(max_iter=300), "content_tfidf_tri"),
       (LogisticRegression(max_iter=300, C=250), "content_tfidf_tri_hyper_1"),
       (LogisticRegression(max_iter=300, C=0.1), "content_tfidf_hyper_2")]),
]

test_stats_tdidf_tri = mt.Test_statistic()

mt.create_vectors_from_infolist("../datasets/sample/dataset_tdidf_vectors_tri.pickle", info_list, X_liar, y_liar)
mt.test_vectors_from_infolist("../datasets/sample/dataset_tdidf_vectors_tri.pickle", info_list, tests=test_stats_tdidf_tri)
test_stats_tdidf_tri.metrics.sort_values(by=["split","f1"], ascending=False)

In [36]:
test_stats_tdidf_tri.metrics.sort_values(by=["split","f1"], ascending=False)

Unnamed: 0,name,split,train_acc,acc,precision,recall,f1,time,confusion_matrix,model
0,content_tfidf_tri,val,0.9526,0.87048,0.872862,0.851522,0.86206,469.5,"[[46576, 5895], [7057, 40472]]",LogisticRegression(max_iter=300)
1,content_tfidf_tri,test,0.9526,0.86836,0.871823,0.849733,0.860636,469.5,"[[46189, 5976], [7188, 40647]]",LogisticRegression(max_iter=300)
2,content_tfidf_tri,liar,0.9526,0.547025,0.483984,0.365918,0.416751,469.5,"[[4927, 2207], [3587, 2070]]",LogisticRegression(max_iter=300)


In [None]:
importlib.reload(mt)

unbalanced = "../datasets/sample/dataset_unbalanced_1M.csv"

info_list = [
    (unbalanced, "content_title", mt.create_count_vector, [(LogisticRegression(max_iter=300), "content_title_count")]),
    (unbalanced, "content_domain", mt.create_count_vector, [(LogisticRegression(max_iter=300), "content_domain_count")]),
    (unbalanced, "content_authors", mt.create_count_vector, [(LogisticRegression(max_iter=300), "content_authors_count")]),
    (unbalanced, "content_domain_authors_title", mt.create_count_vector, [(LogisticRegression(max_iter=300), "all_count"), 
                                                                          (LogisticRegression(max_iter=300, C=250), "all_count_hyper_1"), 
                                                                          (LogisticRegression(max_iter=300, C=0.1), "all_count_hyper_2")]),
]

test_stats_meta = mt.Test_statistic()

mt.create_vectors_from_infolist("../datasets/sample/dataset_count_vectors_meta.pickle", info_list, X_liar, y_liar)
mt.test_vectors_from_infolist("../datasets/sample/dataset_count_vectors_meta.pickle", info_list, tests=test_stats_meta)
test_stats_meta.metrics.sort_values(by=["split","f1"], ascending=False)

In [None]:
model_name = "content_count_hyper"

metrics = test_stats_simple.metrics
metrics_test = metrics[metrics["split"] == "test"]
cm = metrics_test[metrics_test["name"] == model_name]["confusion_matrix"].values[0]
tn, fp, fn, tp = cm.ravel()
sns.heatmap([[tp, fn],[fp, tn]], annot=True, cmap="Blues", xticklabels=["Fake","True"], yticklabels=["Fake","True"], fmt="d", annot_kws={"size": 12})
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
print(test_stats_simple.metrics.sort_values(by=["split","f1"], ascending=False).to_latex(index=False))

Hyperparameter tuning - the best found was C=300 and max_iter=700. The code down below takes around 5 hours to run for 1M entries.

In [None]:
importlib.reload(mt)

grid = GridSearchCV(
    estimator  = LogisticRegression(),
    param_grid = {"C": [200, 250, 300, 350], "max_iter": [500]},#[500, 600, 700, 800]},
    cv         = 3,
    scoring    = ['f1'],
    refit      = 'f1',
    verbose    = 2
)

unbalanced = "../datasets/sample/dataset_unbalanced.csv"

info_list = [
    (unbalanced, "content_combined", mt.create_count_vector, [(grid, "content_count")]),
]

test_stats_hyper_opt = mt.Test_statistic()

mt.create_vectors_from_infolist("../datasets/sample/hyper_opt.pickle", info_list, X_liar, y_liar) 
mt.test_vectors_from_infolist("../datasets/sample/hyper_opt.pickle", info_list, tests=test_stats_hyper_opt)
test_stats_hyper_opt.metrics.sort_values(by="f1", ascending=False)
# best params
print(grid.best_params_)

# ...

In [None]:
def test_files(files, cols_to_test, vec_funcs, tests = None):
    if tests == None:
        tests = Test_statistic()
    for file, name in files:
        print(f"Proccessing: {name}")
        cols_to_read = list(list(zip(*cols_to_test))[0]) + ["type_binary", "set"]
        data = pd.read_csv(file, usecols=cols_to_read)
        print("Read data into dataframe")

        for col, entry_name in cols_to_test:
            for func, model, func_name in vec_funcs:
                X_train, X_val, X_test, y_train, y_val, y_test = split_data(data, col, "type_binary")
                X_train_vec, X_val_vec, X_test_vec = func(X_train, X_val, X_test)
                print(f"Vectorized {entry_name} with {func_name}")
                tests.test_baseline(X_train_vec, X_val_vec, y_train, y_val, name=f"{entry_name}_{name}_{func_name}", model=model)
    return tests

In [None]:
importlib.reload(mt)
importlib.reload(pp)

def test_on_liar(test, file):
    liar_data = pp.apply_pipeline_pd_tqdm(pd.read_csv(file), [(pp.Binary_labels_LIAR(), 'label', 'type_binary')])

    metrics = pd.DataFrame()
    for row in info_list:
        model_name = row[-1]
        model = test.metrics[test.metrics["name"] == model_name]["model"].values[0]
        vectorizer = test.metrics[test.metrics["name"] == model_name]["vectorizer"].values[0]
        X = vectorizer.transform(liar_data["statement_combined"].values)
        #print(liar_data["type_binary"].astype(int).value_counts())
        metrics = pd.concat([mt.get_predict_metrics(model, X, liar_data["type_binary"].astype(int), name=model_name), metrics])

        
    return metrics.sort_values(by="f1", ascending=False)


In [None]:
def get_distribution(data, is_percentage=True, col = "type"):
    for i, label in enumerate(pp.labels):
        if is_percentage:
            percent = len(data[data[col] == label]) / (data.shape[0])
        else:
            percent = len(data[data[col] == label])
        print(f"{label}: {percent}", end="")
        print(", ", end="") if i != len(pp.labels) - 1 else _

unbalanced = "../datasets/sample/dataset_unbalanced_cleaned.csv"
balanced_types = "../datasets/sample/dataset_balanced_types_cleaned.csv"
balanced_bin = "../datasets/sample/dataset_balanced_bin_cleaned.csv"
balanced_reliable_fake = "../datasets/sample/dataset_reliable_fake_cleaned.csv"

for file in [unbalanced, balanced_types, balanced_bin, balanced_reliable_fake]:
    data = pd.read_csv(file)
    print(f"File: {file} ----------------------------------")
    # find distribution of labels
    for i, set_name in enumerate(["train", "val", "test"]):
        set = data[data["set"] == i]
        print(f"Distribution of {set_name} with size {set.shape[0]}:")
        get_distribution(set)
        print(f"\nTrue: {len(set[set['type_binary'] == True])}, Fake: {len(set[set['type_binary'] == False])}")