## MLP
Simple Multilayer Perceptron to use to train the model.

In [1]:
import gensim.downloader as api

# get word embeddings
import numpy as np

vectors = api.load('fasttext-wiki-news-subwords-300')
vectors['hello']

array([-3.6086e-02,  5.2419e-02,  7.1797e-02,  5.5754e-02,  4.3034e-02,
       -3.7642e-02,  3.1122e-02, -1.0537e-01, -8.5471e-02, -3.8272e-02,
       -3.6829e-02,  7.1718e-02,  3.2325e-02,  5.0349e-02, -1.4943e-02,
        3.4765e-02,  9.9398e-02,  1.0352e-01,  4.9882e-02, -9.2280e-02,
       -4.8455e-02, -2.2770e-02, -1.1397e-01,  6.6670e-02,  4.6059e-02,
       -4.8253e-03,  3.1790e-02,  8.5202e-02,  7.6544e-02, -5.6798e-02,
        2.2006e-03,  1.0122e-01,  3.3008e-02,  1.6444e-02, -1.2835e-02,
       -6.4925e-02,  7.4660e-02,  5.5163e-02,  2.6431e-04, -3.3244e-02,
       -7.6224e-02, -1.9530e-01, -5.3684e-03, -2.2878e-02,  3.4950e-02,
        5.2780e-02,  1.5911e-02, -8.4575e-02,  5.8473e-03,  1.3065e-01,
        1.8299e-02,  1.6318e-02,  1.1358e-01, -1.1667e-01,  5.6145e-02,
        2.8836e-03, -3.5630e-02,  3.9919e-02, -1.0776e-01, -4.8945e-02,
        9.7135e-03,  1.3764e-02,  2.3084e-02,  8.7231e-03, -4.8894e-02,
        9.1422e-02,  7.2054e-02, -5.0373e-03, -1.6506e-02, -8.73

In [2]:
dir(vectors)
vectors.vector_size

300

In [3]:
# load the training data
import pandas as pd
df = pd.read_csv("../../train.csv")

print(df.columns)
df.head(3)

Index(['Unnamed: 0', 'claim', 'label', 'evidence_title', 'evidence_sentence'], dtype='object')


Unnamed: 0.1,Unnamed: 0,claim,label,evidence_title,evidence_sentence
0,0,Nikolaj Coster-Waldau worked with the Fox Broa...,SUPPORTS,Fox_Broadcasting_Company,The Fox Broadcasting Company -LRB- often short...
1,1,Nikolaj Coster-Waldau worked with the Fox Broa...,SUPPORTS,Nikolaj_Coster-Waldau,He then played Detective John Amsterdam in the...
2,2,Roman Atwood is a content creator.,SUPPORTS,Roman_Atwood,"He is best known for his vlogs , where he post..."


In [4]:
# possible augmentation - coreference resolution, titles -- then add this as another column

In [5]:
# basic cleaning

In [6]:
# write the sentence encoder (tf idf weighted?)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(df['evidence_sentence'])

print(tfidf.get_feature_names_out()[11244:11254])
matrix.shape

['disenfranchise' 'disenfranchised' 'disengaged' 'disengagement'
 'disenrolled' 'disfigured' 'disgorging' 'disgrace' 'disguise' 'disguises']


(202563, 43113)

In [7]:
tfidf.vocabulary_.get("disengaged")

11246

In [8]:
from typing import List
def tfidf_weights(tokens: List[str]):
    # from feature names out get the index
    scores = np.zeros(len(tokens))
    for i, token in enumerate(tokens):
        scores[i] = tfidf.vocabulary_.get(token.lower())
    scores = np.nan_to_num(scores, copy=False)
    return scores / np.sum(scores)      # normalised

sent = df['evidence_sentence'][0]
print(sent)
tfidf_weights(sent.split())

The Fox Broadcasting Company -LRB- often shortened to Fox and stylized as FOX -RRB- is an American English language commercial broadcast television network that is owned by the Fox Entertainment Group subsidiary of 21st Century Fox .


array([0.05881791, 0.02334309, 0.00999008, 0.01384026, 0.        ,
       0.04236151, 0.05370663, 0.05944368, 0.02334309, 0.00471161,
       0.05670015, 0.00589628, 0.02334309, 0.        , 0.03050207,
       0.00461607, 0.00447754, 0.02039734, 0.03381723, 0.01374472,
       0.0099853 , 0.0584676 , 0.04098258, 0.05880995, 0.03050207,
       0.04332644, 0.01057286, 0.05881791, 0.02334309, 0.0205088 ,
       0.02591466, 0.05682276, 0.04230737, 0.00136938, 0.01187377,
       0.02334309, 0.        ])

In [215]:
from string import punctuation
import re
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))

def encode(sent: str):
    # 1. get tokens
    sent = re.sub(rf"[{punctuation}]", '', sent)
    tokens = sent.split()

    # UPDATE: remove stop words to see if it converges
    tokens = [t for t in tokens if t not in sw]
    # weights = np.ones(len(tokens))
    weights = tfidf_weights(tokens)

    encoded = np.empty((len(tokens), vectors.vector_size))
    for i, t in enumerate(tokens):
        t = t.lower()
        try:
            v = vectors[t]
        except:
            v = np.zeros(vectors.vector_size)
        encoded[i] = np.multiply(v, weights[i])
    return np.sum(encoded, axis=0)

sent = df['evidence_sentence'][1]
print(sent)
print(encode(sent).shape)
from scipy.stats import describe
describe(encode(sent))

He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot .
(300,)


DescribeResult(nobs=300, minmax=(-0.13246785661613103, 0.11091277527157217), mean=0.0008606826803999231, variance=0.0008328520100926018, skewness=-0.22878857512640563, kurtosis=4.465815401409027)

In [201]:
def batch_encode(sents):
    batch = np.zeros((len(sents), vectors.vector_size))
    for i in range(len(batch)):
        batch[i] = encode(sents.iloc[i])
    return batch

batch = batch_encode(df['claim'][:100])
batch_df = pd.DataFrame(batch)
batch_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.019157,-0.144394,0.015852,0.137294,0.017582,-0.093853,0.045612,-0.461705,0.082373,0.036538,...,-0.014472,0.009861,-0.109073,-0.007156,-0.020911,0.001668,0.054783,0.075496,0.054024,-0.038319
std,0.101482,0.180438,0.123218,0.125897,0.136218,0.113029,0.089369,0.311769,0.197031,0.15642,...,0.099175,0.095277,0.150877,0.123255,0.115458,0.098582,0.112653,0.10801,0.094888,0.095547
min,-0.39905,-0.987619,-0.158387,-0.172244,-0.451728,-0.526309,-0.154055,-1.613457,-0.274779,-0.393937,...,-0.272612,-0.249547,-0.609977,-0.545749,-0.413466,-0.241736,-0.280266,-0.162211,-0.217214,-0.317617
25%,-0.016076,-0.237129,-0.077598,0.065403,-0.05082,-0.132759,-0.001251,-0.631087,-0.034739,-0.035613,...,-0.064821,-0.046887,-0.193663,-0.07008,-0.090097,-0.046632,-0.024143,0.021358,0.007385,-0.105858
50%,0.01923,-0.100224,-0.002687,0.129528,0.000878,-0.052955,0.053616,-0.455672,0.054266,0.013974,...,-0.012535,-0.003117,-0.081208,-0.001896,-0.013274,0.002956,0.075382,0.068907,0.072715,-0.055485
75%,0.074536,-0.041155,0.081869,0.180884,0.095905,-0.03138,0.103908,-0.25624,0.141267,0.117012,...,0.04377,0.052359,-0.002575,0.070761,0.04985,0.039971,0.112181,0.117673,0.108643,0.041211
max,0.374026,0.224877,0.753815,0.567,0.401476,0.117404,0.221061,0.182474,0.918773,0.653752,...,0.268188,0.335998,0.283057,0.294124,0.272128,0.523862,0.488391,0.579751,0.376641,0.260027


In [230]:
# TODO: balance the dataset
print(df['label'].value_counts())

num_refutes = len(df[df['label'].str.contains('REFUTE') == True])
print(num_refutes)

support_indices = df[df['label'].str.contains('SUPPORT') == True].index
len(support_indices)
# randomly draw {num_refutes} amount from this index

SUPPORTS    146033
REFUTES      56523
Name: label, dtype: int64
56523


146033

In [222]:
# build the claim and evidence sets
import torch
# print(len(df))
# df = df.drop(labels=[157030, 157031, 160536], axis=0)
# df = df.drop(labels=[96165, 96166, 188267], axis=0)
df = df.drop(labels=[188264], axis=0)
# print(len(df))

train_claims = torch.tensor(df['claim'].apply(lambda s: encode(s)), dtype=torch.float32)
train_evidence = torch.tensor(df['evidence_sentence'].apply(lambda s: encode(s)), dtype=torch.float32)
from datetime import datetime
print(f"Applied encoding... {datetime.now()}")

train_claims[:3, :10], train_evidence[:3, :10]

Applied encoding... 2022-04-10 20:39:39.609122


(tensor([[ 0.0022, -0.0258,  0.0154,  0.0088,  0.0190, -0.0197, -0.0086, -0.0917,
          -0.0023, -0.0234],
         [ 0.0022, -0.0258,  0.0154,  0.0088,  0.0190, -0.0197, -0.0086, -0.0917,
          -0.0023, -0.0234],
         [-0.0015, -0.0599,  0.0691, -0.0066,  0.0265,  0.0033,  0.0086, -0.1273,
           0.0243,  0.0355]]),
 tensor([[ 0.0056, -0.0247,  0.0173,  0.0148, -0.0021, -0.0274, -0.0056, -0.0850,
           0.0022, -0.0029],
         [-0.0015, -0.0249,  0.0139,  0.0255,  0.0107, -0.0208,  0.0048, -0.0991,
           0.0108,  0.0234],
         [ 0.0024, -0.0179,  0.0074, -0.0208,  0.0222, -0.0266, -0.0075, -0.1099,
           0.0107,  0.0072]]))

In [223]:

nans_in_claims = 0
nans_in_evidence = 0
for i in range(len(train_claims)):
    if torch.isnan(train_claims[i]).any():
        print(f"Index: {i}\tText: {df['claim'].iloc[i]}")
        nans_in_claims+=1
    if torch.isnan(train_evidence[i]).any():
        print(f"Index: {i}\tText: {df['evidence_sentence'].iloc[i]}")
        nans_in_evidence+=1

nans_in_claims, nans_in_evidence

Index: 188263	Text: Spider-Man is relatable.


(1, 0)

# Training

In [207]:
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

class Train(Dataset):
    def __init__(self, claims, evidences, labels):
        self._claims = claims
        self._evidences = evidences
        self._labels = labels

    def __len__(self):
        return len(self._labels)

    def __getitem__(self, idx):
        claim = self._claims[idx]
        evidence = self._evidences[idx]
        label = self._labels[idx]
        return claim, evidence, label


labels = pd.Categorical(df['label'])
print(labels[0], labels.codes[0])
labels = F.one_hot(torch.tensor(labels.codes, dtype=torch.long), num_classes=2).float()
train = Train(claims=train_claims, evidences=train_evidence, labels=labels)

train_dl = DataLoader(train, batch_size=12, shuffle=True)
data = next(iter(train_dl))
data

SUPPORTS 1


[tensor([[-2.3241e-01, -3.1076e-02,  1.3272e-01,  ..., -4.5082e-03,
          -8.5342e-03, -2.4965e-02],
         [-9.7410e-02, -2.5672e-01,  6.5306e-02,  ...,  2.3662e-01,
          -2.7858e-02, -3.4912e-02],
         [-1.6641e-01, -2.5534e-02, -8.4005e-02,  ...,  1.2521e-01,
           8.5710e-02, -6.7112e-02],
         ...,
         [-1.0712e-01, -3.6259e-01, -1.1233e-01,  ..., -3.6882e-02,
           9.3531e-02, -1.8838e-01],
         [-2.1391e-03, -1.4190e-01,  1.9514e-01,  ...,  1.3104e-01,
           7.0668e-02,  1.0340e-01],
         [-1.1295e-01, -1.5569e-01, -3.3710e-02,  ...,  1.1398e-01,
          -2.2742e-01, -1.9100e-04]]),
 tensor([[-0.3277, -0.1098,  0.3066,  ...,  0.1991,  0.2710,  0.1359],
         [ 0.0822, -0.0989,  0.1281,  ...,  0.0185,  0.0387, -0.3607],
         [ 0.0873, -0.2155, -0.1350,  ..., -0.1718,  0.0236,  0.0783],
         ...,
         [-0.2675, -0.8068,  0.2540,  ...,  0.2972,  0.6239,  0.2465],
         [-0.3402, -0.5360,  0.1442,  ...,  0.2735,  0.4

In [125]:
# # debug cell
# import torch.nn as nn
# cri = nn.CrossEntropyLoss()
#
# model = Head(embedding_length=vectors.vector_size, interaction=False)
# labels = data[2]
# preds = model(data[0], data[1])
# print(preds[:3], labels[:3])
#
# loss = cri(preds, labels)
# loss.item()

tensor([[0.5078, 0.4922],
        [0.5058, 0.4942],
        [0.5079, 0.4921]], grad_fn=<SliceBackward0>) tensor([[0., 1.],
        [0., 1.],
        [0., 1.]])


0.6970160603523254

In [232]:
import torch.optim as optim
import torch.nn as nn
from fvs.nli.mlp import Head

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(log_dir='../../runs')

model = Head(embedding_length=vectors.vector_size, interaction=True)

criterion = nn.CrossEntropyLoss()
# optimiser = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
optimiser = optim.Adam(model.parameters(), lr=0.01)


epochs = 10
for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(train_dl, 0):
        claims, evidence, labels = data
        optimiser.zero_grad()

        outputs = model(claims, evidence)
        loss = criterion(outputs, labels)
        l = loss.item()
        loss.backward()
        optimiser.step()

        if torch.isnan(torch.tensor(l)):
            print("LOSS DTYPE IS NAN")
            print("claim ", claims)
            print("evidence ", evidence)
            print("output ", outputs)
            print("labels ", labels)
            print("loss ", loss)
            print(list(model.parameters()))
            break

        running_loss += l
        if (i + 1) % 1000 == 0:
            print(f"[{epoch+1}, {i+1}] loss: {running_loss/1000}")
            writer.add_scalar("Loss/train", running_loss/1000, (epoch+1)*i)
            params = list(model.parameters())
            writer.add_histogram("Linear_0_grad", params[0].grad.view(-1), global_step=(epoch+1)*i)
            writer.add_scalar("Linear_0_grad_mean", params[0].grad.view(-1).mean(), (epoch+1)*i)
            writer.add_histogram("Linear_0", params[0], global_step=(epoch+1)*i)
            writer.add_histogram("Linear_1_grad", params[2].grad.view(-1), global_step=(epoch+1)*i)
            writer.add_histogram("Linear_1", params[2], global_step=(epoch+1)*i)
            running_loss = 0.0
print("Training complete.")

[1, 1000] loss: 0.5941623095571995
[1, 2000] loss: 0.5923449902236462
[1, 3000] loss: 0.5950953635573387
[1, 4000] loss: 0.5926783253252507
[1, 5000] loss: 0.5909283387362957
[1, 6000] loss: 0.5945949888527393
[1, 7000] loss: 0.599261655330658
[1, 8000] loss: 0.5923449884653091
[1, 9000] loss: 0.5930949890613556
[1, 10000] loss: 0.5950949883759021
[1, 11000] loss: 0.5922616565227509
[1, 12000] loss: 0.5931783213019372
[1, 13000] loss: 0.583178324252367
[1, 14000] loss: 0.5898449904918671
[1, 15000] loss: 0.5907616551220417
[1, 16000] loss: 0.5883450075089931
[2, 1000] loss: 0.589511657267809
[2, 2000] loss: 0.5884283211827278
[2, 3000] loss: 0.5975116538107396
[2, 4000] loss: 0.5970116551816463
[2, 5000] loss: 0.5891783232688904
[2, 6000] loss: 0.5825949899554252
[2, 7000] loss: 0.5884283227026462
[2, 8000] loss: 0.5985949896872044
[2, 9000] loss: 0.5940949901044369
[2, 10000] loss: 0.5905116545557976
[2, 11000] loss: 0.5938449874520302
[2, 12000] loss: 0.5842616584897041
[2, 13000] lo

In [133]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='../../runs')
writer.add_scalar("Loss", 1, 1)

In [162]:
%load_ext autoreload
%autoreload 2

from fvs.nli.mlp import Head
head = Head(1, True)
for k, param in head.state_dict().items():
    print(k, param.shape)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
linear_0.weight torch.Size([128, 3])
linear_0.bias torch.Size([128])
linear_1.weight torch.Size([2, 128])
linear_1.bias torch.Size([2])
