# Import and Setup

In [None]:
import os
import sys
import time
import math
import string
import random
import json
import codecs

In [None]:
from importlib import reload
from collections import Counter

In [None]:
import nltk
import numpy as np
# import pandas as pd
from matplotlib import pyplot as plt

In [None]:
import sklearn
from sklearn import metrics

In [None]:
import torch
from torch import nn
from torch import optim
from torch.functional import F
from torch.utils.data import DataLoader

In [None]:
# Absolute path of the directory one level above the current working directory;
# every data/model path in this notebook is built from it.
BASE_DIR = os.path.abspath(os.pardir)
print(BASE_DIR)

In [None]:
# Put the project root first on the import path (once) so that `utils` resolves
# to the project-local package.
if BASE_DIR not in sys.path:
    sys.path = [BASE_DIR] + sys.path

In [None]:
# Add the project-local directory BASE_DIR/data/nltk/ to NLTK's resource search path.
nltk.data.path.append(BASE_DIR + "/data/nltk/")

In [None]:
from utils import preprocess, evaluate, training

## Loading data

In [None]:
# Load the preprocessed data bundle. The cells below read its "train_dataset",
# "train_fdataset" and "vocab" entries.
# NOTE(review): torch.load unpickles the file -- only load files from a trusted source.
data_dict = torch.load(BASE_DIR + "/saves/data/clean_data.pt")

In [None]:
# Start from a copy so the list stored in `data_dict` is left untouched.
train_data = data_dict["train_dataset"].copy()

# Add 3100 randomly drawn samples from "train_fdataset".
# (Call random.seed(32) first if the draw has to be reproducible.)
train_data += random.sample(data_dict["train_fdataset"], 3100)

vocab = data_dict["vocab"]

In [None]:
# Report how many samples and vocabulary entries were loaded.
print(f"Data length: {len(train_data)}")
print(f"Vocab size: {len(vocab)}")

In [None]:
# Show the first three samples, separated by a blank line.
print("\n\n".join(str(sample) for sample in train_data[:3]))

In [None]:
# Token -> index lookup (the index is the token's position in `vocab`) and its inverse.
token_to_ix = dict(zip(vocab, range(len(vocab))))
ix_to_token = dict((index, token) for token, index in token_to_ix.items())

In [None]:
# Re-import the project's `evaluate` module so edits made on disk take effect
# without restarting the kernel.
reload(evaluate)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Sentence-pair dataset for related / not-related and stance training.

    Every item is a pair of integer-encoded, padded sentences plus a
    two-element target ``[is_related, vote]``:

    * ``is_related`` is 1 when the second sentence was drawn from the
      relatives of the first one, otherwise 0.
    * ``vote`` is ``vote_map[label]`` of the chosen relation for related
      pairs and 0 for unrelated pairs.

    The second sentence is sampled randomly on every access, so indexing the
    same position twice generally yields different pairs.
    """

    # Sampling weights for the relatives of a claim, keyed by relation label.
    _CLAIM_WEIGHTS = {"REFUTES": 6 * 1, "SUPPORTS": 2 * 4}
    _CLAIM_DEFAULT_WEIGHT = 4

    # Sampling weights for the siblings of an evidence, keyed by relation label.
    _EVIDENCE_WEIGHTS = {"SUPPORTS": 6 * 4, "REFUTES": 14 * 1}
    _EVIDENCE_DEFAULT_WEIGHT = 2

    def __init__(self, data_as_str, _map, vote_map):
        """Encode every sample and record the sequence-length range.

        Args:
            data_as_str: iterable of sample dicts. Each dict is shallow-copied
                and its "sentence" entry is replaced by the result of
                ``evaluate.keys_to_values``. ``__getitem__`` additionally
                reads the "id", "label" and "related" entries.
            _map: token -> index mapping; must contain "</unk>" and "</pad>".
            vote_map: relation label -> class index mapping.
        """
        self.vote_map = vote_map
        # Take the padding index from the mapping this dataset was built with
        # instead of relying on a notebook-level global.
        self.pad_ix = _map["</pad>"]

        self.data_as_int = []
        self.max_seqlen = float("-inf")
        self.min_seqlen = float("inf")

        # Convert data to integers
        for dt in data_as_str:
            dt_as_int = dt.copy()
            # presumably maps each token through `_map`, using the "</unk>"
            # index for unknown tokens -- verify against utils.evaluate
            dt_as_int["sentence"] = evaluate.keys_to_values(
                dt["sentence"], _map, _map["</unk>"])

            self.data_as_int.append(dt_as_int)
            self.max_seqlen = max(self.max_seqlen, len(dt_as_int["sentence"]))
            self.min_seqlen = min(self.min_seqlen, len(dt_as_int["sentence"]))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data_as_int)

    def get_item_with_id(self, _id):
        """Return the stored sample whose "id" equals ``_id``.

        The scan starts at list position ``abs(_id - 5)`` and then wraps
        around to the beginning of the list.

        NOTE: when no sample matches, a *random* sample is returned instead
        of ``None``; callers depend on always receiving a sample.
        """
        for dp in self.data_as_int[abs(_id-5):]+self.data_as_int[:abs(_id+5)]:
            if dp["id"] == _id:
                return dp
        return random.choice(self.data_as_int)

    def _sample_unrelated(self, excluded_ids):
        """Draw random samples until one has an id outside ``excluded_ids``.

        NOTE: loops forever if every stored sample is excluded.
        """
        while True:
            candidate = random.choice(self.data_as_int)
            if candidate["id"] not in excluded_ids:
                return candidate

    def _pad(self, sentence):
        """Return a padded *copy* of ``sentence`` of length ``max_seqlen + 1``."""
        return list(sentence) + [self.pad_ix] * (
            self.max_seqlen + 1 - len(sentence))

    def __getitem__(self, ix):
        """Build one sentence pair for the sample stored at position ``ix``.

        Returns:
            ``((x1, x1_len), (x2, x2_len)), y`` where ``x1``/``x2`` are padded
            index tensors, ``x*_len`` are the unpadded lengths and ``y`` is
            the ``[is_related, vote]`` target tensor.
        """
        item_1 = self.data_as_int[ix]
        # Stays None when an unrelated (negative) second sentence is chosen.
        relative = None

        # if sentence is a claim and has relatives
        if item_1["label"] == "CLAIM" and len(item_1["related"]) > 0:
            # 85%: pair with a relative, 15%: pair with an unrelated sentence
            if bool(np.random.choice([0, 1], p=[0.15, 0.85])):
                weights = [
                    self._CLAIM_WEIGHTS.get(r["label"], self._CLAIM_DEFAULT_WEIGHT)
                    for r in item_1["related"]]
                total = sum(weights)
                probs = [w / total for w in weights]
                relative = np.random.choice(item_1["related"], p=probs)
                item_2 = self.get_item_with_id(relative["id"])
            else:
                excluded = {item_1["id"]} | {r["id"] for r in item_1["related"]}
                item_2 = self._sample_unrelated(excluded)

        # if sentence is an evidence
        elif item_1["label"] == "EVIDENCE":
            choose_relative = bool(np.random.choice([0, 1], p=[0.15, 0.85]))
            parent = self.get_item_with_id(item_1["related"][0]["id"])
            siblings = parent["related"]

            # The parent's own relation entry for this evidence. A direct
            # lookup replaces random rejection sampling, which never
            # terminated when the parent did not list this evidence (e.g.
            # after the random fallback of get_item_with_id).
            current = next(
                (r for r in siblings if r["id"] == item_1["id"]), None)

            # This evidence itself gets weight 0 so it is never paired with
            # itself.
            weights = [
                0 if r["id"] == item_1["id"] else self._EVIDENCE_WEIGHTS.get(
                    r["label"], self._EVIDENCE_DEFAULT_WEIGHT)
                for r in siblings]
            total = sum(weights)

            if (choose_relative and len(siblings) > 1
                    and current is not None and total > 0):
                probs = [w / total for w in weights]
                # Copy before relabelling so the stored relation is unchanged.
                relative = np.random.choice(siblings, p=probs).copy()
                item_2 = self.get_item_with_id(relative["id"])

                # Derive the evidence-vs-evidence label from how both relate
                # to their shared parent.
                if current["label"] == relative["label"] and current["label"] == "NOT_ENOUGH_INFO":
                    relative["label"] = "NOT_ENOUGH_INFO"
                elif current["label"] == relative["label"]:
                    relative["label"] = "SUPPORTS"
                else:
                    relative["label"] = "REFUTES"
            else:
                relative = None
                excluded = {item_1["id"]} | {r["id"] for r in siblings}
                item_2 = self._sample_unrelated(excluded)

        # else, for every other scenario
        else:
            item_2 = self._sample_unrelated({item_1["id"]})

        # Lengths are taken from the stored (unpadded) sentences.
        x1_len = len(item_1["sentence"])
        x2_len = len(item_2["sentence"])

        if relative is not None:
            y_pad = [1, self.vote_map[relative["label"]]]
        else:
            y_pad = [0, 0]

        # Pad copies to max_seqlen + 1. Padding in place would grow the stored
        # sentences and corrupt the reported lengths on later accesses.
        x1_pad = self._pad(item_1["sentence"])
        x2_pad = self._pad(item_2["sentence"])

        return (
            (torch.tensor(x1_pad), torch.tensor(x1_len)),
            (torch.tensor(x2_pad), torch.tensor(x2_len))
        ), torch.tensor(y_pad)

In [None]:
# Class index of each relation label (second element of every target).
vote_map = {"NOT_ENOUGH_INFO": 0, "SUPPORTS": 1, "REFUTES": 2}
dataset = Dataset(train_data, token_to_ix, vote_map)
# Shuffled mini-batches of 8 sentence pairs.
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [None]:
# Summarise the encoded dataset.
print(f"Dataset size: {len(dataset)}")
print(f"Max sequence length: {dataset.max_seqlen}")
print(f"Min sequence length: {dataset.min_seqlen}")

In [None]:
# Count how often each "<is_related>_<vote>" target combination is produced
# over one full pass of the dataloader (also a smoke test of Dataset).
var_counter = Counter()

# Pre-set so the error report below works even if the very first batch fails.
i = -1

try:
    for i, __d in enumerate(dataloader):
        var_counter.update("_".join(map(str, l)) for l in __d[1].tolist())
except Exception:
    # Index of the last batch that was fetched, then re-raise with the
    # original traceback.
    print(i)
    raise

print(var_counter.most_common())

In [None]:
# Inspect the last batch produced by the counting loop above.
print(__d)

## Model

In [None]:
class Model(nn.Module):
    """Bidirectional-LSTM sentence-pair classifier with two output heads.

    Both sentences of a pair are encoded by the same embedding + LSTM. For
    each sentence the forward-direction output at its last real token and the
    backward-direction output at its first token are concatenated and divided
    by the sentence length. The two sentence vectors are concatenated and fed
    through a shared linear layer into:

    * ``classifier``: one logit (related vs. not related), and
    * ``multi_classifier``: ``out_size`` logits (relation class).
    """

    def __init__(self, _map, h_size, out_size, emb_dim=128, n_layers=2, dropout_p=0.2):
        """Build the network.

        Args:
            _map: token -> index mapping; its size sets the vocabulary size
                and its "</pad>" entry is used as the embedding padding index.
            h_size: LSTM hidden size per direction.
            out_size: number of classes of the multi-class head.
            emb_dim: embedding dimension.
            n_layers: number of LSTM layers; values below 2 are raised to 2.
            dropout_p: dropout probability (between LSTM layers and before
                the shared linear layer).
        """
        super().__init__()

        self.vocab_size  = len(_map)
        self.hidden_size = h_size
        self.output_size = out_size
        self.emb_dim     = emb_dim
        self.n_layers    = max(n_layers, 2)
        self.dropout_p   = dropout_p

        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim =self.emb_dim,
            padding_idx   =_map["</pad>"])

        self.lstm = nn.LSTM(
            input_size =self.emb_dim,
            hidden_size=self.hidden_size,
            num_layers =self.n_layers,
            batch_first=True,
            dropout    =self.dropout_p,
            bidirectional=True)

        self.dropout = nn.Dropout(p=self.dropout_p)

        # Input: two sentences x two directions x hidden_size.
        self.fc = nn.Linear(
            in_features =self.hidden_size*4,
            out_features=self.hidden_size)

        self.classifier = nn.Linear(
            in_features =self.hidden_size,
            out_features=1)

        self.multi_classifier = nn.Linear(
            in_features =self.hidden_size,
            out_features=self.output_size)

    def forward(self, x, prev_state, *, verbose=False):
        """Score a batch of sentence pairs.

        Args:
            x: ``((ids_1, len_1), (ids_2, len_2))`` where ``ids_*`` are
                ``(batch, seq)`` index tensors and ``len_*`` are ``(batch,)``
                tensors holding the unpadded lengths.
            prev_state: accepted for interface compatibility but ignored; the
                LSTM always starts from a zero state.
            verbose: print intermediate shapes when True.

        Returns:
            ``((out, out_cls), state)`` with ``out`` of shape ``(batch, 1)``,
            ``out_cls`` of shape ``(batch, output_size)`` and ``state`` the
            final ``(h_n, c_n)`` of the LSTM.
        """
        if verbose:
            print("*"*10, "INPUT", "*"*10)
            print(x[0][0].shape)
            print(x[0][1].shape)
        n_b, n_s = x[0][0].shape

        # Stack both sentences along the batch axis so they share one LSTM
        # pass: rows [0, n_b) are sentence 1, rows [n_b, 2*n_b) sentence 2.
        embed = self.embedding(torch.cat([x[0][0], x[1][0]], dim=0))
        if verbose:
            print("\n")
            print("*"*10, "EMBED", "*"*10)
            print(embed.shape)

        x_len = torch.cat([x[0][1], x[1][1]], dim=0)

        embed = nn.utils.rnn.pack_padded_sequence(
            embed, x_len, batch_first=True, enforce_sorted=False)
        yhat, state = self.lstm(embed, None)
        yhat, _ = nn.utils.rnn.pad_packed_sequence(
            yhat, batch_first=True, total_length=n_s)

        # Split the feature axis into (direction, hidden).
        yhat = yhat.view(n_b*2, n_s, 2, self.hidden_size)
        if verbose:
            print("\n")
            print("*"*10, "Sanity check: Last timestep", "*"*10)
            print(yhat.shape)
            print("should not be zero (0):", yhat[range(n_b*2), x_len-1, :].sum())

        # Forward direction at the last real token, backward direction at the
        # first token, scaled by 1 / sentence length.
        yhat = torch.cat([yhat[range(n_b*2), x_len-1, 0, :],
                          yhat[range(n_b*2), x_len*0, 1, :]], dim=-1) / x_len.view(n_b*2, 1)
        # Put the two sentence vectors of each pair side by side.
        yhat = torch.cat([yhat[:n_b], yhat[n_b:]], dim=-1)
        if verbose:
            print("\n")
            print("*"*10, "Y_HAT", "*"*10)
            print(yhat.shape)

        yhat = self.dropout(yhat)
        yhat = self.fc(yhat)
        out = self.classifier(yhat)
        out_cls = self.multi_classifier(yhat)
        if verbose:
            print("\n")
            print("*"*10, "OUTPUT", "*"*10)
            print(out.shape)
            print(out_cls.shape)
            print([s.shape for s in state])
        return (out, out_cls), state

    def init_state(self, b_size=1):
        """Return a zero ``(h_0, c_0)`` pair usable with ``self.lstm``.

        The LSTM is bidirectional, so the leading dimension is
        ``2 * n_layers``.
        """
        shape = (self.n_layers * 2, b_size, self.hidden_size)
        return (torch.zeros(*shape), torch.zeros(*shape))

In [None]:
# Small configuration: 16 hidden units per direction, 3 relation classes.
model = Model(
    _map=token_to_ix,
    h_size=16,
    out_size=3,
    emb_dim=32,
    n_layers=2,
    dropout_p=0.2)
model

In [None]:
# Loss values accumulated across training rounds; it is stored with the model
# by save_model and replaced by load_model's second return value.
loss_history = []

In [None]:
# Smoke-test the forward pass on the inputs of the last batch fetched earlier
# (`__d`), printing the intermediate shapes.
__out = model(__d[0], None, verbose=True)

## Loading and Saving Model

In [None]:
def load_model(path):
    """Rebuild a Model from a checkpoint written by ``save_model``.

    Args:
        path: location of the checkpoint file.

    Returns:
        ``(model, loss_history)``: the restored model and the loss history
        stored alongside it.
    """
    # Deserialise onto the CPU so checkpoints saved from a GPU model also load
    # on machines without CUDA; load_state_dict copies the values into the
    # freshly built (CPU) model either way.
    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    m_data = torch.load(path, map_location="cpu")

    m = Model(
        _map      =m_data["_map"],
        h_size    =m_data["hidden_size"],
        out_size  =m_data["output_size"],
        emb_dim   =m_data["emb_dim"],
        n_layers  =m_data["n_layers"],
        dropout_p =m_data["dropout_p"])

    m.load_state_dict(m_data["state_dict"])
    l_hist = m_data["loss_history"]
    return m, l_hist

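**Optional: run the cell below only to resume from a previously saved model. It replaces the freshly initialised `model` and `loss_history`; skip it to train from scratch.**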

In [None]:
# Replace `model` and `loss_history` with the ones stored in the saved checkpoint.
model, loss_history = load_model(BASE_DIR + "/saves/model/r-vs-nonr-classifier.pt")
model

In [None]:
def save_model(m, l_hist, _map, path=None):
    """Write a checkpoint holding the model's weights, settings and loss history.

    Args:
        m: model to save; its ``hidden_size``, ``emb_dim``, ``output_size``,
            ``n_layers`` and ``dropout_p`` attributes and its ``state_dict()``
            are stored.
        l_hist: loss history stored under "loss_history".
        _map: token -> index mapping stored under "_map".
        path: target file; a falsy value selects the default location under
            BASE_DIR.
    """
    target = path or BASE_DIR + "/saves/model/r-vs-nonr-classifier.pt"

    checkpoint = dict(
        _map=_map,
        hidden_size=m.hidden_size,
        emb_dim=m.emb_dim,
        output_size=m.output_size,
        n_layers=m.n_layers,
        dropout_p=m.dropout_p,
        state_dict=m.state_dict(),
        loss_history=l_hist,
    )
    torch.save(checkpoint, target)

## Training

In [None]:
bce_loss = nn.BCEWithLogitsLoss()
# Targets of this loss are relation classes, not vocabulary indices, so no
# `ignore_index` is set; unrelated pairs are masked out explicitly in
# `criterion` instead.
ce_loss = nn.CrossEntropyLoss(reduction="none")

def criterion(out, y):
    """Combined loss for the two model heads.

    Args:
        out: ``(logit, class_logits)`` with shapes ``(batch, 1)`` and
            ``(batch, n_classes)``.
        y: ``(batch, 2)`` integer tensor of ``[is_related, vote]`` targets.

    Returns:
        Scalar tensor: binary cross-entropy of the related/not-related head
        plus the mean cross-entropy of the relation head taken over the
        related pairs only (0 when the batch has no related pair).
    """
    related = y[:, 0].float()
    cls_loss = bce_loss(out[0], related.unsqueeze(1))

    # Keep the mask 1-D like the per-sample loss: a (batch, 1) mask would
    # broadcast the product to (batch, batch) and cancel the masking.
    per_sample = ce_loss(out[1], y[:, 1])
    # clamp avoids 0/0 = NaN for batches without any related pair.
    cat_loss = (per_sample * related).sum() / related.sum().clamp(min=1.0)
    return cls_loss + cat_loss

In [None]:
# Re-import the project's `training` module so edits made on disk take effect
# without restarting the kernel.
reload(training)

In [None]:
# Train in rounds of `per_iter`, saving a checkpoint after every round so an
# interrupted run loses at most one round.
iteration = 200
per_iter = 20
start_t = time.time()

for _ti in range(iteration // per_iter):
    model, costs = training.train(
        model, dataloader, per_iter, criterion,
        print_every=5, sleep=20, sleep_every=5)

    loss_history.extend(costs)
    save_model(model, loss_history, token_to_ix)
    time.sleep(5)

    separator = "=" * 50
    print("\n" + separator)
    print(
        f"Round: {_ti + 1:2} of {iteration // per_iter:2}, "
        f"Running Time: {time.time() - start_t:7.2f} sec")
    print(separator + "\n")

In [None]:
# Plot the loss averaged over consecutive chunks of `cum` recorded values.
cum = 20
plt.xlabel("Iteration")
plt.ylabel("Cross-Entropy Loss")
plt.plot([
    # Divide by the real chunk length: the last chunk may hold fewer than
    # `cum` values, and dividing it by `cum` would understate its mean.
    sum(chunk) / len(chunk)
    for chunk in (
        loss_history[i:i+cum] for i in range(0, len(loss_history), cum))
])

In [None]:
# Summary statistics of the recorded losses (needs at least one recorded value).
print(
    f"Iter: {len(loss_history)} | Min: {min(loss_history):.4f} | "
    f"Max: {max(loss_history):.4f} | Last: {loss_history[-1]:.4f} | "
    f"Ave: {sum(loss_history) / len(loss_history):.4f}")

## Evaluating

In [None]:
# Quick evaluation on the first four batches of `dataloader` (the training data).
# Labels are folded into one value: 0 = not related, vote + 1 = related pair
# with that relation class.
var_y_true, var_y_pred = [], []

# Evaluate with dropout disabled and without building autograd graphs, then
# restore whatever mode the model was in so later training is unaffected.
var_was_training = model.training
model.eval()
try:
    with torch.no_grad():
        for i, __dd in enumerate(dataloader):
            var_y_true.extend(
                (__dd[1][:, [0]] * (__dd[1][:, [1]] + 1)).flatten().tolist())

            var_out = model(__dd[0], None)[0]
            var_y_pred.extend((
                (torch.sigmoid(var_out[0]) >= 0.5) *\
                (torch.topk(var_out[1], k=1, dim=-1)[1] + 1)
            ).flatten().tolist())

            if i == 3:
                break
finally:
    model.train(var_was_training)

print("y true:", len(var_y_true), "; y pred:", len(var_y_pred))
print("y true class count:", Counter(var_y_true))
print("y pred class count:", Counter(var_y_pred))

In [None]:
# Overall accuracy plus per-class precision / recall / F1
# (average=None returns one score per class instead of a single average).
print("Accuracy:  ", metrics.accuracy_score(var_y_true, var_y_pred))
print("Precision: ", metrics.precision_score(var_y_true, var_y_pred, average=None))
print("Recall:    ", metrics.recall_score(var_y_true, var_y_pred, average=None))
print("F1 Score:  ", metrics.f1_score(var_y_true, var_y_pred, average=None))

In [None]:
# Row-normalised confusion matrix, computed once and reused for print and plot.
var_conf_matrix = metrics.confusion_matrix(
    var_y_true, var_y_pred, normalize="true")
print("Confusion Matrix:\n", var_conf_matrix)

plt.imshow(var_conf_matrix, cmap=plt.cm.gray_r)
# scikit-learn puts the true labels on the rows and the predicted labels on
# the columns; imshow draws rows along y and columns along x.
plt.xlabel("Prediction")
plt.ylabel("Actual")
plt.show()