## Data loading (LIAR)

In [None]:
from datasets import load_dataset
# Load the dataset registered under the name "liar". Later cells read its
# 'train', 'validation' and 'test' splits and the 'statement' / 'label' columns.
dataset = load_dataset("liar")

In [None]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pickle

# Fetch each split once, then pull out the statement text (model input) and
# the label column (target) for that split.
train_split = dataset['train']
val_split = dataset['validation']
test_split = dataset['test']

X_train, y_train = train_split['statement'], train_split['label']
X_val, y_val = val_split['statement'], val_split['label']
X_test, y_test = test_split['statement'], test_split['label']

## BASELINE 1

In [None]:
# Number of '.' characters in each training statement.
# str.count replaces the former character-by-character loop, and the str()
# coercion matches how the validation and test cells treat their items.
total_full_stop_train = [str(article).count('.') for article in X_train]

In [None]:
# Decision threshold for the full-stop baseline: the midpoint between the mean
# full-stop count of the first half and of the second half of the training
# statements.
# NOTE(review): the two halves are taken purely by position and no labels are
# consulted here. This is only meaningful if the training data is ordered so
# that each half corresponds to one class -- confirm against the dataset.
dat_size = len(total_full_stop_train)
half = dat_size // 2
first_half = total_full_stop_train[:half]
second_half = total_full_stop_train[half:]

# Divide each sum by the real length of its half. With an odd dat_size the
# halves differ in length by one, so dividing both by dat_size / 2 would not
# give true means. An empty half yields 0.0 instead of a division by zero.
pos = sum(first_half) / len(first_half) if first_half else 0.0
neg = sum(second_half) / len(second_half) if second_half else 0.0
print(pos)
print(neg)
boundary = (pos + neg) / 2
print(boundary)

In [None]:
# Number of '.' characters in each validation statement; every item is passed
# through str() before counting.
total_full_stop_val = [str(article).count('.') for article in X_val]

In [None]:
from sklearn.metrics import accuracy_score
# Full-stop baseline on the validation split: a statement with more full stops
# than the boundary is assigned 0, every other statement is assigned 1.
# NOTE(review): predictions are binary (0/1); confirm that y_val uses the same
# label set, otherwise the accuracy below is not meaningful.
predictions = [int(not count > boundary) for count in total_full_stop_val]
acc = accuracy_score(y_true=y_val, y_pred=predictions)
print('Accuracy: ', acc)

In [None]:
# Number of '.' characters in each test statement; every item is passed
# through str() before counting.
total_full_stop_test = [str(article).count('.') for article in X_test]

In [None]:
# Full-stop baseline on the test split: a statement with more full stops than
# the boundary is assigned 0, every other statement is assigned 1.
# NOTE(review): predictions are binary (0/1); confirm that y_test uses the same
# label set, otherwise the accuracy below is not meaningful.
predictions = [int(not count > boundary) for count in total_full_stop_test]
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print('Accuracy: ', acc)

## BASELINE 2

In [None]:
import gensim
import gensim.downloader

# Fetch the pretrained embedding set named 'word2vec-google-news-300' through
# gensim's downloader. The cells below index it by token and treat every
# vector as 300-dimensional (see the np.zeros(300) fallbacks).
word2vec = gensim.downloader.load('word2vec-google-news-300')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

def score(model, word):
    """Look up the embedding of *word* in *model*.

    Args:
        model: Any object supporting ``model[word]`` lookup that raises
            ``KeyError`` for unknown keys.
        word: The token to look up.

    Returns:
        ``model[word]`` when the lookup succeeds, otherwise a 300-element
        zero vector standing in for the out-of-vocabulary token.
    """
    try:
        vector = model[word]
    except KeyError:
        # Unknown token: fall back to an all-zero embedding.
        vector = np.zeros(300)
    return vector

Train_Data = []

# Number of leading tokens of each statement that get embedded. Shorter
# statements are zero-padded up to this length, so every feature row has the
# same size (max_words * 300).
max_words = 10
print("Processing train articles ...")
for article in X_train:
    # One vector per whitespace-separated token, keeping only the first
    # max_words tokens. The limit comes from max_words rather than a literal
    # so truncation and padding can never disagree.
    article_matrix = [score(word2vec, word) for word in article.split()[:max_words]]
    # Zero-pad statements that have fewer than max_words tokens.
    padding_size = max_words - len(article_matrix)
    if padding_size > 0:
        article_matrix += [np.zeros(300)] * padding_size
    # Collapse the per-token vectors into a single flat feature row.
    Train_Data.append(np.asarray(article_matrix).flatten())


Train_Data = np.array(Train_Data)

print("Articles processed!")

# Baseline 2 classifier: logistic regression on the flattened embeddings.
model = LogisticRegression(max_iter=1000)

model.fit(Train_Data, y_train)

In [None]:
Val_Data = []
print("Processing val articles ...")
for article in X_val:
    # One vector per whitespace-separated token, keeping only the first
    # max_words tokens. The limit comes from max_words rather than a literal
    # so truncation and padding can never disagree.
    article_matrix = [score(word2vec, word) for word in article.split()[:max_words]]

    # Zero-pad statements that have fewer than max_words tokens.
    padding_size = max_words - len(article_matrix)
    if padding_size > 0:
        article_matrix += [np.zeros(300)] * padding_size
    # Collapse the per-token vectors into a single flat feature row.
    Val_Data.append(np.asarray(article_matrix).flatten())

Val_Data = np.array(Val_Data)

print("Articles processed!")

# Evaluate the fitted Baseline 2 model on the validation split.
predictions = model.predict(Val_Data)


acc = accuracy_score(y_val, predictions)


print('Scores for Baseline 2:')
print('Accuracy: ', acc)

In [None]:
Test_Data = []
print("Processing test articles ...")
for article in X_test:
    # One vector per whitespace-separated token, keeping only the first
    # max_words tokens. The limit comes from max_words rather than a literal
    # so truncation and padding can never disagree.
    article_matrix = [score(word2vec, word) for word in article.split()[:max_words]]
    # Zero-pad statements that have fewer than max_words tokens.
    padding_size = max_words - len(article_matrix)
    if padding_size > 0:
        article_matrix += [np.zeros(300)] * padding_size

    # Collapse the per-token vectors into a single flat feature row.
    Test_Data.append(np.asarray(article_matrix).flatten())

Test_Data = np.array(Test_Data)

print("Articles processed!")

# Evaluate the fitted Baseline 2 model on the test split.
predictions = model.predict(Test_Data)


acc = accuracy_score(y_test, predictions)

print('Scores for Baseline 2:')
print('Accuracy: ', acc)