In [38]:
import pandas as pd
from nltk.corpus import stopwords

import pipeline as pp
import models as ml
from tqdm import tqdm
import importlib
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [44]:
def evaluate_models(models, X, y_true, name=None):
    """Score every model on the same data and tabulate the results.

    Args:
        models: Iterable of objects exposing ``predict(X)``.
        X: Input passed unchanged to each model's ``predict``.
        y_true: Ground-truth labels the predictions are compared against.
        name: Optional label written to the "name" column of every row.
            When omitted, each row is labelled with the class name of the
            model that produced it.

    Returns:
        pandas.DataFrame with one row per model (in input order) and the
        columns "name", "accuracy", "precision", "recall" and "f1". The
        metric functions are called with their default settings.
    """
    rows = []
    for model in models:
        y_pred = model.predict(X)

        # Resolve the label per model. Writing the fallback back into `name`
        # would make the first model's class name stick for all later models.
        row_name = type(model).__name__ if name is None else name
        rows.append({
            "name": row_name,
            "accuracy": accuracy_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred),
            "recall": recall_score(y_true, y_pred),
            "f1": f1_score(y_true, y_pred),
        })
    return pd.DataFrame(rows)

# FakeNewsCorpus test data

# LIAR data

In [40]:
# Convert the LIAR training split from TSV to CSV.

# Do not truncate long text cells when a frame is displayed in the notebook.
pd.set_option('display.max_colwidth', None)

# The TSV is read with header=None, so the column names are supplied here.
column_names_list = [
    'id',
    'type',
    'content',
    'subject',
    'speaker',
    'speaker job title',
    'state info',
    'party affiliation',
    'barely true counts',
    'false counts',
    'half true counts',
    'mostly true counts',
    'pants on fire counts',
    'context',
]

df = pd.read_csv(
    "../datasets/liar_dataset/train.tsv",
    sep="\t",
    header=None,
    names=column_names_list,
)

# Written with pandas' defaults, so the row index is saved as the first column.
df.to_csv("../datasets/liar_dataset/train.csv")

Clean data

In [41]:
# Re-import the local `pipeline` module (bound to `pp`) so that edits made to
# it since the last import are used by the code below.
importlib.reload(pp)

def Clean_data(file, new_file):
    """Run the content-cleaning pipeline on ``file`` and write to ``new_file``.

    The following ``pp`` steps are handed to ``pp.apply_pipeline`` in this
    order, all targeting "content": ``Clean_data``, ``Tokenizer``,
    ``Remove_stopwords`` (fed NLTK's English stop-word list), ``Stem`` and
    finally ``Combine_Content``, which is additionally given
    "content_combined".

    Args:
        file: Path forwarded to ``pp.apply_pipeline`` as the input.
        new_file: Path forwarded to ``pp.apply_pipeline`` as ``new_file``.
    """
    english_stopwords = stopwords.words('english')

    # Each entry is (step, column[, extra]); what the entries mean to the
    # pipeline is defined in pipeline.py.
    content_steps = [
        (pp.Clean_data(), 'content'),
        (pp.Tokenizer(), "content"),
        (pp.Remove_stopwords(english_stopwords), "content"),
        (pp.Stem(), "content"),
        (pp.Combine_Content(), "content", "content_combined"),
    ]

    pp.apply_pipeline(file, content_steps, new_file=new_file, progress_bar=True)

# Clean the LIAR training CSV; the result is written to train_cleaned.csv.
Clean_data("../datasets/liar_dataset/train.csv", "../datasets/liar_dataset/train_cleaned.csv")

100%|██████████| 10240/10240 [00:01<00:00, 9959.79it/s]
100%|██████████| 10240/10240 [00:00<00:00, 331156.50it/s]
100%|██████████| 10240/10240 [00:00<00:00, 36975.63it/s]
100%|██████████| 10240/10240 [00:01<00:00, 5444.37it/s]
100%|██████████| 10240/10240 [00:00<00:00, 792619.50it/s]


processed 200000 rows
finish time: 3.46828293800354


Convert to binary labels

In [42]:
# Re-import the local `pipeline` module so recent edits to it take effect.
importlib.reload(pp)

# Apply the Binary_labels_LIAR step to the cleaned file and write the result to
# train_cleaned_bin.csv.
# NOTE(review): presumably 'type' is the source column and 'type_binary' the
# new column holding the binarised label — confirm against pipeline.py.
pp.apply_pipeline(
    "../datasets/liar_dataset/train_cleaned.csv", 
    [(pp.Binary_labels_LIAR(), 'type', 'type_binary')], 
    new_file="../datasets/liar_dataset/train_cleaned_bin.csv", 
    progress_bar=True
)

100%|██████████| 10240/10240 [00:00<00:00, 1212411.38it/s]

processed 200000 rows
finish time: 0.19042396545410156





In [43]:
# Vectorisation
#
# Re-import the local `pipeline` module (bound to `pp`) first so that recent
# edits to it are used by the helpers defined below.

importlib.reload(pp)

def Get_unique_words(file):
    """Run a ``pp.Generate_unique_word_list`` step over ``file`` and return it.

    The step object is created, passed to ``pp.apply_pipeline`` as the only
    step (paired with ``None``), and then returned to the caller. No
    ``new_file`` is given to the pipeline.

    Args:
        file: Path forwarded to ``pp.apply_pipeline`` as the input.

    Returns:
        The ``pp.Generate_unique_word_list`` instance that was run.
    """
    word_collector = pp.Generate_unique_word_list()
    steps = [(word_collector, None)]
    pp.apply_pipeline(file, steps, progress_bar=True)
    return word_collector

# Run the unique-word step over the binarised training file and keep the step object.
unique_words = Get_unique_words("../datasets/liar_dataset/train_cleaned_bin.csv")

# NOTE(review): the meaning of the (0, 1) arguments is defined in pipeline.py — confirm.
unique_words_list = unique_words.get_unique_words(0,1)

def Vectorize_content(file, new_file, unique_words):
    """Run the vectorisation steps on the "content" column of ``file``.

    Two ``pp`` steps are handed to ``pp.apply_pipeline`` in order, both
    targeting "content": ``Create_word_vector`` (constructed with
    ``unique_words``) followed by ``Save_numpy_arr``.

    Args:
        file: Path forwarded to ``pp.apply_pipeline`` as the input.
        new_file: Path forwarded to ``pp.apply_pipeline`` as ``new_file``.
        unique_words: Value passed to ``pp.Create_word_vector``.
            NOTE(review): presumably the vocabulary list — confirm in pipeline.py.
    """
    vector_steps = [
        (pp.Create_word_vector(unique_words), "content"),
        (pp.Save_numpy_arr(), "content"),
    ]
    pp.apply_pipeline(file, vector_steps, new_file=new_file, progress_bar=True)

# Vectorise the binarised training file with the collected word list; output goes to train_vectorized.csv.
Vectorize_content("../datasets/liar_dataset/train_cleaned_bin.csv", "../datasets/liar_dataset/train_vectorized.csv", unique_words_list)

100%|██████████| 10240/10240 [00:00<00:00, 23302.40it/s]


processed 200000 rows
finish time: 0.4933781623840332


100%|██████████| 10240/10240 [00:16<00:00, 603.36it/s]
  result = libops.scalar_compare(x.ravel(), y, op)
100%|██████████| 10240/10240 [00:07<00:00, 1288.20it/s]


processed 200000 rows
finish time: 29.131235122680664


In [None]:
#Add features 