In [28]:
import pandas as pd
from nltk.corpus import stopwords

import pipeline as pp
import models as ml
from tqdm import tqdm
import importlib
from time import time
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# FakeNewsCorpus test

In [29]:
def evaluate_models(models, X, y_true, name=None):
    """Score each fitted model on one labelled data set.

    Parameters
    ----------
    models : iterable
        Fitted estimators exposing ``predict(X)``.
    X : array-like
        Features passed unchanged to every model's ``predict``.
    y_true : array-like
        Ground-truth labels compared against each model's predictions.
    name : str, optional
        Label written to the ``"name"`` column of every row. When omitted,
        each row is labelled with its own model's class name.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``name``, ``test_acc``,
        ``precision``, ``recall`` and ``f1``.
    """
    metrics = []
    for model in models:
        y_pred = model.predict(X)

        # Resolve the label per model instead of assigning back to `name`:
        # overwriting `name` would make the first model's class name stick
        # for every later model in the loop.
        label = type(model).__name__ if name is None else name
        metrics.append({
            "name": label,
            "test_acc": accuracy_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred),
            "recall": recall_score(y_true, y_pred),
            "f1": f1_score(y_true, y_pred),
        })
    return pd.DataFrame(metrics)

# LIAR data

In [18]:
# Convert the LIAR training split from TSV to CSV.
# The TSV is read with header=None, so its first line is treated as data and
# the column names are supplied explicitly below.
column_names_list = [
    'id',
    'type',
    'content',
    'subject',
    'speaker',
    'speaker job title',
    'state info',
    'party affiliation',
    'barely true counts',
    'false counts',
    'half true counts',
    'mostly true counts',
    'pants on fire counts',
    'context',
]
df = pd.read_table(
    "../datasets/liar_dataset/train.tsv",
    header=None,
    names=column_names_list,
)

# Display-only setting: do not truncate long cell text when frames are shown.
pd.set_option('display.max_colwidth', None)

# to_csv is called with its defaults, so the row index is written as well.
df.to_csv("../datasets/liar_dataset/train.csv")

## Clean data

In [20]:
def Clean_data(file, new_file):
    """Run the content-cleaning pipeline on ``file`` and write ``new_file``.

    The steps are handed to ``pp.apply_pipeline`` in this order, all keyed on
    ``"content"``: ``pp.Clean_data``, ``pp.Tokenizer``, ``pp.Remove_stopwords``
    (built from NLTK's English stop-word list), ``pp.Stem`` and finally
    ``pp.Combine_Content``, which is also given ``"content_combined"``.

    Parameters
    ----------
    file : str
        Path passed to ``pp.apply_pipeline`` as the input file.
    new_file : str
        Path passed to ``pp.apply_pipeline`` as ``new_file``.
    """
    english_stopwords = stopwords.words('english')

    # NOTE(review): the tuples look like (step, source column[, target column]);
    # confirm against pp.apply_pipeline.
    content_steps = [
        (pp.Clean_data(), "content"),
        (pp.Tokenizer(), "content"),
        (pp.Remove_stopwords(english_stopwords), "content"),
        (pp.Stem(), "content"),
        (pp.Combine_Content(), "content", "content_combined"),
    ]

    pp.apply_pipeline(
        file,
        content_steps,
        new_file=new_file,
        progress_bar=True,
    )

# Clean the CSV produced by the conversion step; the result goes to train_cleaned.csv.
Clean_data("../datasets/liar_dataset/train.csv", "../datasets/liar_dataset/train_cleaned.csv")

100%|██████████| 10240/10240 [00:01<00:00, 8996.04it/s]
100%|██████████| 10240/10240 [00:00<00:00, 283271.82it/s]
100%|██████████| 10240/10240 [00:00<00:00, 34601.03it/s]
100%|██████████| 10240/10240 [00:01<00:00, 5155.23it/s]
100%|██████████| 10240/10240 [00:00<00:00, 812717.33it/s]


processed 200000 rows
finish time: 3.744737148284912


## Convert to binary labels

In [27]:
# Re-import the pipeline module so edits made to it since the last import
# take effect in this kernel.
importlib.reload(pp)

# Run pp.Binary_labels_LIAR over the cleaned file and write the result to
# train_cleaned_bin.csv.
# NOTE(review): 'type' / 'type_binary' are presumably the source and
# destination columns — confirm against pp.apply_pipeline.
pp.apply_pipeline(
    "../datasets/liar_dataset/train_cleaned.csv", 
    [(pp.Binary_labels_LIAR(), 'type', 'type_binary')], 
    new_file="../datasets/liar_dataset/train_cleaned_bin.csv", 
    progress_bar=True
)

100%|██████████| 10240/10240 [00:00<00:00, 1089236.21it/s]

processed 200000 rows
finish time: 0.2721900939941406





In [31]:
# Vectorisation

# Re-import the pipeline module so edits made to it since the last import
# take effect in this kernel.
importlib.reload(pp)

def Get_unique_words(file):
    """Run a ``pp.Generate_unique_word_list`` step over ``file`` and return it.

    The step object is created here, passed to ``pp.apply_pipeline`` as the
    only pipeline step (paired with ``None``), and then returned to the caller.
    No ``new_file`` is given to the pipeline.
    """
    word_collector = pp.Generate_unique_word_list()
    steps = [(word_collector, None)]
    pp.apply_pipeline(file, steps, progress_bar=True)
    return word_collector

# Run the unique-word pass over the cleaned, binary-labelled training file.
unique_words = Get_unique_words("../datasets/liar_dataset/train_cleaned_bin.csv")

# NOTE(review): the meaning of the (0, 1) arguments is defined in pp, not in
# this notebook — confirm before changing them.
unique_words_list = unique_words.get_unique_words(0,1)

def Vectorize_content(file, new_file, unique_words):
    """Run the word-vector pipeline on ``file`` and write ``new_file``.

    Two steps are handed to ``pp.apply_pipeline``, both keyed on ``"content"``:
    ``pp.Create_word_vector`` (constructed from ``unique_words``) followed by
    ``pp.Save_numpy_arr``.

    Parameters
    ----------
    file : str
        Path passed to ``pp.apply_pipeline`` as the input file.
    new_file : str
        Path passed to ``pp.apply_pipeline`` as ``new_file``.
    unique_words
        Value forwarded unchanged to ``pp.Create_word_vector``.
    """
    vector_steps = [
        (pp.Create_word_vector(unique_words), "content"),
        (pp.Save_numpy_arr(), "content"),
    ]
    pp.apply_pipeline(
        file,
        vector_steps,
        new_file=new_file,
        progress_bar=True,
    )

# Vectorise the binary-labelled training file using unique_words_list; the result goes to train_vectorized.csv.
Vectorize_content("../datasets/liar_dataset/train_cleaned_bin.csv", "../datasets/liar_dataset/train_vectorized.csv", unique_words_list)

100%|██████████| 10240/10240 [00:00<00:00, 23857.01it/s]


processed 200000 rows
finish time: 0.5099701881408691


100%|██████████| 10240/10240 [00:14<00:00, 711.57it/s]
  result = libops.scalar_compare(x.ravel(), y, op)
100%|██████████| 10240/10240 [00:09<00:00, 1027.28it/s]


processed 200000 rows
finish time: 29.30839991569519
