# Import and Setup

In [None]:
import os
import sys
import time
import math
import string
import random
import json
import codecs

In [None]:
from importlib import reload
from collections import Counter

In [None]:
import nltk
import numpy as np
# import pandas as pd
from matplotlib import pyplot as plt

In [None]:
import torch
from torch import nn
from torch import optim
from torch.functional import F
from torch.utils.data import DataLoader

In [None]:
# Project root: one level above the directory the notebook is run from.
# The data/ and saves/ paths used throughout the notebook are built from it.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), "../"))
print(BASE_DIR)

In [None]:
# Put the project root at the front of the import path, but only once, so
# re-running the cell does not keep prepending it.
# NOTE(review): presumably needed so the local `utils` import resolves - confirm.
if BASE_DIR not in sys.path:
    sys.path = [BASE_DIR, *sys.path]

In [None]:
# Add the project's data/nltk/ directory to the paths NLTK searches for resources.
var_nltk_data_dir = BASE_DIR + "/data/nltk/"
nltk.data.path.append(var_nltk_data_dir)

In [None]:
from utils import preprocess, evaluate, training

### Preprocess Data

**Climate FEVER**

In [None]:
# Parse the Climate-FEVER JSON file straight from the open stream.
with codecs.open(BASE_DIR + "/data/climate-fever.json", encoding="utf-8") as fp:
    data = json.load(fp)

# Show how many entries were loaded and peek at the first one.
print(len(data))
data[:1]

In [None]:
# Tally claim labels, evidence labels and votes over the whole Climate-FEVER
# data. Plain dicts are used so keys stay in first-seen order when printed.
var_clabel_count = {}
var_elabel_count = {}
var_evote_count = {}

for var_d in data:
    var_clabel = var_d["claim_label"]
    var_clabel_count[var_clabel] = var_clabel_count.get(var_clabel, 0) + 1

    for var_evid in var_d["evidences"]:
        var_elabel = var_evid["evidence_label"]
        var_elabel_count[var_elabel] = var_elabel_count.get(var_elabel, 0) + 1

        # Each evidence carries a list of votes; count every occurrence.
        for var_vote in var_evid["votes"]:
            var_evote_count[var_vote] = var_evote_count.get(var_vote, 0) + 1

print("Claim labels:\n{}".format(var_clabel_count))
print("\nEvidence labels:\n{}".format(var_elabel_count))
print("\nEvidence Votes:\n{}".format(var_evote_count))

**First FEVER**

```python
# train.jsonl holds one JSON document per line; decode each line on its own.
with codecs.open(BASE_DIR + "/data/train.jsonl", encoding="utf-8") as fp:
    fever_data = [json.loads(var_line) for var_line in fp.readlines()]

print(len(fever_data))
fever_data[:1]
```

```python
# Build a reproducible FEVER subsample: up to 8000 "NOT VERIFIABLE" claims
# plus up to 6500 "VERIFIABLE" ones, shuffled together.
# The RNG is re-seeded with the same value before every shuffle, so each of
# the three shuffles is reproducible independently of the others.
random.seed(32)
var_ignore = [l for l in fever_data if l["verifiable"] == "NOT VERIFIABLE"]
random.shuffle(var_ignore)
var_choosen = var_ignore[:8000]

random.seed(32)
var_verify = [l for l in fever_data if l["verifiable"] == "VERIFIABLE"]
random.shuffle(var_verify)
var_choosen.extend(var_verify[:6500])

random.seed(32)
random.shuffle(var_choosen)

# Open the output file only once the sample is complete: opening it in "w"
# mode truncates it, so an error while sampling must not happen in between.
with codecs.open(BASE_DIR + "/saves/data/first-fever.json", mode="w", encoding="utf-8") as fp:
    json.dump(var_choosen, fp, indent=2)
```

In [None]:
# Load the previously saved FEVER subsample directly from the stream.
with codecs.open(BASE_DIR + "/saves/data/first-fever.json", encoding="utf-8") as fp:
    fever_data = json.load(fp)

# Show how many entries were loaded and peek at the first one.
print(len(fever_data))
fever_data[:1]

In [None]:
# Distribution of the "verifiable" flag and of the claim label in the FEVER subsample.
var_fcverifi_count = Counter(d["verifiable"] for d in fever_data)
var_fclabel_count = Counter(d["label"] for d in fever_data)

print("First FEVER Dataset\n===\n")
print("Claim Verifiable:\n{}".format(var_fcverifi_count))
print("\nClaim Label:\n{}".format(var_fclabel_count))

**Build Dataset**

In [None]:
def clean_sent(sent):
    """Lower-case `sent`, tokenize it with `nltk.word_tokenize` and return
    the tokens joined back together with single spaces."""
    tokens = nltk.word_tokenize(sent.lower())
    return " ".join(tokens)

```python
# Flatten Climate-FEVER into one record per sentence. Each claim record is
# followed by its evidence records; claims and evidences point at each other
# by id through their "related" lists. Ids are handed out from one running
# counter that is shared with the FEVER records built afterwards.
dataset = []
var_curr_id = 0

for c in data:
    # Claim record (always the first element of the group)
    var_curr_id += 1
    var_claim = {
        "id": var_curr_id,
        "sentence": clean_sent(c["claim"]),
        "label": "CLAIM",
        "related": [],
    }
    var_related_set = [var_claim]

    for e in c["evidences"]:
        # Evidence record, linked back to its claim
        var_curr_id += 1
        var_related_set.append({
            "id": var_curr_id,
            "sentence": clean_sent(e["evidence"]),
            "label": "EVIDENCE",
            "related": [{"id": var_claim["id"], "label": e["evidence_label"]}],
        })

        # Forward link from the claim to this evidence
        var_claim["related"].append({"id": var_curr_id, "label": e["evidence_label"]})

    # Claim plus all of its evidences go into the dataset together
    dataset += var_related_set

# FEVER contributes claims only: verifiable ones are labelled "CLAIM",
# everything else "IGNORE"; none of them has related sentences.
fdataset = []

for c in fever_data:
    var_curr_id += 1
    fdataset.append({
        "id": var_curr_id,
        "sentence": clean_sent(c["claim"]),
        "label": "CLAIM" if c["verifiable"] == "VERIFIABLE" else "IGNORE",
        "related": [],
    })

print("Climate Dataset:", Counter([d["label"] for d in dataset]))
print("FEVER Dataset:", Counter([d["label"] for d in fdataset]))
```

```python
# Store both sentence-level datasets together in a single file.
torch.save(
    dict(climate_fever=dataset, first_fever=fdataset),
    BASE_DIR + "/saves/data/dataset.pt",
)
```

In [None]:
# Deserialize the saved file once and take both datasets from the result,
# instead of reading and unpickling the same file once per key.
# NOTE(review): torch.load unpickles its input - only load files written by this project.
var_saved_datasets = torch.load(BASE_DIR + "/saves/data/dataset.pt")
dataset = var_saved_datasets["climate_fever"]
fdataset = var_saved_datasets["first_fever"]

print(len(dataset))
print(dataset[:1], "\n")
print(len(fdataset))
print(fdataset[:1])

**Split dataset**

In [None]:
# Sequential 75% / 5% / 20% split (train / validation / test) of the
# Climate-FEVER records; the records are taken in their stored order.
var_dataset_len = len(dataset)
var_train_end = int(0.75*var_dataset_len)
var_val_end = int(0.80*var_dataset_len)

train_dataset = dataset[:var_train_end]
val_dataset = dataset[var_train_end:var_val_end]
test_dataset = dataset[var_val_end:]

# The three slices must cover the dataset exactly.
assert len(train_dataset)+len(val_dataset)+len(test_dataset) == len(dataset)

print("Total Data Length:", len(dataset))
print("Training Data Length:", len(train_dataset))
print("Validation Data Length:", len(val_dataset))
print("Testing Data Length:", len(test_dataset))

In [None]:
# Sequential 75% / 5% / 20% split (train / validation / test) of the FEVER
# records; the records are taken in their stored order.
var_fdataset_len = len(fdataset)
var_ftrain_end = int(0.75*var_fdataset_len)
var_fval_end = int(0.80*var_fdataset_len)

train_fdataset = fdataset[:var_ftrain_end]
val_fdataset = fdataset[var_ftrain_end:var_fval_end]
test_fdataset = fdataset[var_fval_end:]

# The three slices must cover the dataset exactly.
assert len(train_fdataset)+len(val_fdataset)+len(test_fdataset) == len(fdataset)

print("Total Data Length:", len(fdataset))
print("Training Data Length:", len(train_fdataset))
print("Validation Data Length:", len(val_fdataset))
print("Testing Data Length:", len(test_fdataset))

**Gather Training claims and evidences to create vocab**

In [None]:
# Collect every training sentence (Climate-FEVER first, then FEVER) and
# gather label and word statistics over them.
var_label_count = {}
var_word_list = []
var_sent_lines = []

for s in train_dataset+train_fdataset:
    var_sent_lines.append("{}\n".format(s["sentence"]))
    var_label_count[s["label"]] = var_label_count.get(s["label"], 0) + 1
    var_word_list.extend(s["sentence"].split())

# The word/character analysis cells further down read `var_all_sent`, but no
# cell in the notebook assigns it, so they raise NameError on a fresh kernel.
# NOTE(review): assumed to be exactly the text written to train-sent.txt
# (one sentence per line) - confirm this matches the original intent.
var_all_sent = "".join(var_sent_lines)

# One sentence per line, in the same order as train_dataset + train_fdataset.
with open(BASE_DIR + "/saves/data/train-sent.txt", mode="w", encoding='utf-8') as fp:
    fp.write(var_all_sent)

print("Word count:", len(var_word_list))
print("Unique Word count:", len(set(var_word_list)))
print("Label counts:", var_label_count)

**Analyse dataset words and characters**

In [None]:
def str_replace(w, ch_list, rep=""):
    """Return `w` with every entry of `ch_list` replaced by `rep`.

    The replacements are applied one after another in `ch_list` order, so a
    later entry operates on the result of the earlier ones. With the default
    `rep` the entries are simply removed.
    """
    cleaned = w
    for needle in ch_list:
        cleaned = cleaned.replace(needle, rep)
    return cleaned

In [None]:
# Families of special characters that may show up inside tokens.
var_dash_forms = list({'-', '‐', '–', '—', '−', '‑'})
print("No. of dash forms:", len(var_dash_forms))

var_quote_forms = list({'\'', '"', '‘', '’', '“', '”', '′', '″'})
print("No. of quote forms:", len(var_quote_forms))

var_other_forms = ['°', '˚', '…', '€', '£', '±']
print("No. of other forms:", len(var_other_forms))

# Every special character, plus ASCII punctuation, as one flat list.
var_sp_ch = [*var_dash_forms, *var_quote_forms, *var_other_forms, *string.punctuation]


def _is_non_alpha_word(w):
    """Tell whether `w` still contains something unusual once the known
    special characters have been removed from it.

    A word qualifies when it is not purely alphanumeric, is not made of
    special characters only, and - unless it is a single character - its
    stripped form is still not alphanumeric.
    """
    if w.isalnum():
        return False
    stripped = str_replace(w, var_sp_ch)
    if not stripped:
        return False
    return len(w) == 1 or not stripped.isalnum()


var_non_alpha = [w for w in set(var_all_sent.strip().split()) if _is_non_alpha_word(w)]

print("\nNo. of non-alphabetic words:", len(var_non_alpha))
print(var_non_alpha)

In [None]:
print("Number of unique characters:", len(set(var_all_sent)))

# Characters treated as known: lowercase ASCII letters, digits, whitespace,
# ASCII punctuation and the special characters listed in var_sp_ch.
var_known_ch = set(
    string.ascii_lowercase + string.digits + string.whitespace +
    string.punctuation + "".join(var_sp_ch)
)

# Everything else that occurs in the training text, in sorted order.
var_oov = [ch for ch in sorted(set(var_all_sent)) if ch not in var_known_ch]

print("\nNo. of OOV characters:", len(var_oov))
print(var_oov)

**Get vocab list, complete it, and save it along with training dataset**

In [None]:
# Read the whole vocabulary file as text; it is parsed in the following cell.
var_vocab_path = BASE_DIR + "/saves/data/train-vocab.txt"
with open(var_vocab_path, mode="r", encoding="utf-8") as fp:
    var_vocab_text = fp.read()

In [None]:
# Entries from the vocab file: skip its first line, keep only lines made of
# exactly two whitespace-separated pieces, and drop the spaces between them.
var_vocab_lines = var_vocab_text.splitlines()[1:]
var_merged_tokens = [
    var_line.replace(" ", "")
    for var_line in var_vocab_lines
    if len(var_line.split()) == 2
]

# Every distinct character that occurs in a training sentence.
var_train_chars = {
    ch
    for s in train_dataset+train_fdataset
    for ch in "".join(set(s["sentence"]))
}

# Final vocabulary: "</eos>" first, the sorted entries, then "</unk>" and "</pad>".
var_vocab = ["</eos>", *sorted(var_merged_tokens + list(var_train_chars)), "</unk>", "</pad>"]

print(len(var_vocab))
print(var_vocab[:10], "...", var_vocab[-10:])

In [None]:
# Read the tokenized training sentences, one per line, splitting each line
# on whitespace. Lines whose last token is "." get an explicit "</eos>" marker.
with open(BASE_DIR + "/saves/data/train-split.txt", encoding="utf-8") as fp:
    var_train_data = []
    for var_line in fp.read().splitlines():
        var_tokens = var_line.split()
        # A blank line has no tokens; keep it as an empty entry (instead of
        # failing on var_tokens[-1]) so that entry i still corresponds to
        # line i when the tokens are assigned back to the training sentences.
        if var_tokens and var_tokens[-1] == ".":
            var_tokens.append("</eos>")
        var_train_data.append(var_tokens)

print(len(var_train_data))

In [None]:
print(var_train_data[:3])

In [None]:
# Replace every training sentence by its token list from var_train_data.
# The token lists are in the order Climate-FEVER first, then FEVER, i.e. the
# order of train_dataset + train_fdataset. Concatenating the lists creates a
# new list but keeps the same record dicts, so the assignment below updates
# train_dataset / train_fdataset themselves.
var_train_entries = train_dataset + train_fdataset

# A length mismatch means sentences and token lists are no longer aligned;
# stop here rather than attach tokens to the wrong sentences.
assert len(var_train_data) == len(var_train_entries), \
    "train-split.txt has {} lines but there are {} training sentences".format(
        len(var_train_data), len(var_train_entries))

for var_entry, var_dt in zip(var_train_entries, var_train_data):
    var_entry["sentence"] = var_dt

print(train_dataset[0], "\n")
print(train_fdataset[0])

In [None]:
# Bundle the tokenized training sets with the vocabulary and save them together.
data_dict = dict(
    train_dataset=train_dataset,
    train_fdataset=train_fdataset,
    vocab=var_vocab,
)
var_clean_data_path = BASE_DIR + "/saves/data/clean_data.pt"
torch.save(data_dict, var_clean_data_path)