In [28]:
from gensim.utils import deaccent
from nltk import pos_tag
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

import re
import time

import pandas as pd
import json

import pickle
import nltk
nltk.download('averaged_perceptron_tagger')

#
# When True, the preprocessed evidence is loaded from its pickle further down
# instead of being rebuilt with evidence_preprocessing.
LOAD_FILES = True

#d_evidence = pd.read_json("data/evidence.json", typ='series')

# Shared lemmatizer instance used by sentence_preprocessing.
lemmatizer = WordNetLemmatizer()

# contraction_dict from WS7
# Maps an English contraction to its expanded form. sentence_preprocessing
# iterates over these items. Pronoun "I" entries are present in both
# capitalised and lower-case spellings.
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have",
                    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not",
                    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did",
                    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have",
                    "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us",
                    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
                    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                    "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
                    "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have",
                    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will",
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have",
                    "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}


# https://stackoverflow.com/a/46231553
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching ``wordnet`` POS constant.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.

    Args:
        treebank_tag: the tag string produced by the POS tagger.

    Returns:
        The wordnet constant, or ``None`` when the tag starts with any other
        character (or is empty) so callers can test the result directly.
    """
    # First character of the tag -> attribute name on the wordnet module.
    attr_by_initial = {"J": "ADJ", "V": "VERB", "N": "NOUN", "R": "ADV"}

    attr_name = attr_by_initial.get(treebank_tag[:1])
    if attr_name is None:
        return None # for easy if-statement
    return getattr(wordnet, attr_name)


def sentence_preprocessing(sentence):
    """Turn one raw sentence into a list of lower-case lemmas.

    Pipeline: lower-case and de-accent the text, expand contractions from
    ``contraction_dict``, tokenise with ``word_tokenize``, keep only tokens
    that start with ``[a-z0-9]`` (stripping any other characters from them),
    then lemmatise each token using its POS tag where one maps to WordNet.

    Args:
        sentence: the raw sentence text.

    Returns:
        list[str]: the lemmas in sentence order (possibly empty).
    """
    # Use gensim deaccent to match more characters to [a-z]
    sentence = deaccent(sentence.lower())

    # str.replace returns a new string, so the result has to be re-bound;
    # otherwise no contraction is ever expanded.
    # Keys and values are lower-cased because the sentence already is (an
    # upper-case "I" in a replacement would be dropped by the filter below).
    # Longest keys go first so that e.g. "how'd'y" is handled before "how'd".
    for old in sorted(contraction_dict, key=len, reverse=True):
        sentence = sentence.replace(old.lower(), contraction_dict[old].lower())

    tokenized = word_tokenize(sentence)

    # Keep only tokens whose first character is alphanumeric (re.match is
    # anchored at the start), then strip the remaining non-alphanumeric
    # characters from the tokens that are kept.
    tokenized = [re.sub(r"[^a-z0-9\s]", "", token) for token in tokenized if re.match(r"[a-z0-9\s]", token)]

    # now lemmatize with pos
    out_list = []
    for token, tag in pos_tag(tokenized):
        wntag = get_wordnet_pos(tag)

        if wntag is None: # do not supply tag in case of None
            lemma = lemmatizer.lemmatize(token)
        else:
            lemma = lemmatizer.lemmatize(token, pos=wntag)

        out_list.append(lemma)

    return out_list


def evidence_preprocessing(evidences):
    """Build the evidence table from a mapping of evidence id to raw text.

    Each text is split into sentences with ``sent_tokenize`` and every
    sentence is run through ``sentence_preprocessing``. Progress is printed
    every 50000 rows together with the elapsed seconds.

    Args:
        evidences: object exposing ``.items()`` that yields (id, text) pairs.

    Returns:
        pandas.DataFrame with columns "id", "raw evidence",
        "processed evidence" (list of token lists) and "embeddings"
        (an empty list per row, to be filled in later).
    """
    started = time.time()
    rows = []

    for count, (evidence_id, text) in enumerate(evidences.items(), start=1):
        # break the text into sentences before tokenizing by each sentence
        sentences = [sentence_preprocessing(part) for part in sent_tokenize(text)]

        # The trailing empty list is a placeholder populated with embeddings later
        rows.append([evidence_id, text, sentences, []])

        if count % 50000 == 0:
            print(f"{time.time() - started:.2f} - {count} rows processed")

    return pd.DataFrame(rows, columns = ["id", "raw evidence", "processed evidence", "embeddings"])


# Evidence processing
# Either rebuild the preprocessed evidence table from the raw JSON and cache
# it, or load the cached pickle, depending on LOAD_FILES.
if not LOAD_FILES:
    # The raw evidence is only needed on this branch, so it is read here;
    # without this line `d_evidence` is undefined and the branch raises
    # NameError.
    d_evidence = pd.read_json("data/evidence.json", typ='series')
    evidence = evidence_preprocessing(d_evidence)
    with open("evidence_preprocessed_v3.pkl", "wb") as f:
        pickle.dump(evidence, f)
else:
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # files produced by this notebook.
    with open("evidence_preprocessed_v3.pkl", "rb") as f:
        evidence = pickle.load(f)

# Kept at top level so the notebook actually renders the preview (an
# expression nested inside the if/else is not displayed).
evidence.head()









[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mrpea\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [49]:
from gensim.utils import deaccent
from nltk import pos_tag
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

import re
import time

import pandas as pd
import json

import pickle
import nltk
nltk.download('averaged_perceptron_tagger')

#d_evidence = pd.read_json("data/evidence.json", typ='series')

# Shared lemmatizer instance used by sentence_preprocessing.
lemmatizer = WordNetLemmatizer()

# contraction_dict from WS7
# Maps an English contraction to its expanded form. sentence_preprocessing
# iterates over these items. Pronoun "I" entries are present in both
# capitalised and lower-case spellings.
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have",
                    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not",
                    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did",
                    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have",
                    "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us",
                    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
                    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                    "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
                    "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have",
                    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will",
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have",
                    "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}




# https://stackoverflow.com/a/46231553
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching ``wordnet`` POS constant.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.

    Args:
        treebank_tag: the tag string produced by the POS tagger.

    Returns:
        The wordnet constant, or ``None`` when the tag starts with any other
        character (or is empty) so callers can test the result directly.
    """
    # First character of the tag -> attribute name on the wordnet module.
    attr_by_initial = {"J": "ADJ", "V": "VERB", "N": "NOUN", "R": "ADV"}

    attr_name = attr_by_initial.get(treebank_tag[:1])
    if attr_name is None:
        return None # for easy if-statement
    return getattr(wordnet, attr_name)
    
def sentence_preprocessing(sentence):
    """Turn one raw sentence into a list of lower-case lemmas.

    Pipeline: lower-case and de-accent the text, expand contractions from
    ``contraction_dict``, tokenise with ``word_tokenize``, keep only tokens
    that start with ``[a-z0-9]`` (stripping any other characters from them),
    then lemmatise each token using its POS tag where one maps to WordNet.

    Args:
        sentence: the raw sentence text.

    Returns:
        list[str]: the lemmas in sentence order (possibly empty).
    """
    # Use gensim deaccent to match more characters to [a-z]
    sentence = deaccent(sentence.lower())

    # str.replace returns a new string, so the result has to be re-bound;
    # otherwise no contraction is ever expanded.
    # Keys and values are lower-cased because the sentence already is (an
    # upper-case "I" in a replacement would be dropped by the filter below).
    # Longest keys go first so that e.g. "how'd'y" is handled before "how'd".
    for old in sorted(contraction_dict, key=len, reverse=True):
        sentence = sentence.replace(old.lower(), contraction_dict[old].lower())

    tokenized = word_tokenize(sentence)

    # Keep only tokens whose first character is alphanumeric (re.match is
    # anchored at the start), then strip the remaining non-alphanumeric
    # characters from the tokens that are kept.
    tokenized = [re.sub(r"[^a-z0-9\s]", "", token) for token in tokenized if re.match(r"[a-z0-9\s]", token)]

    # now lemmatize with pos
    out_list = []
    for token, tag in pos_tag(tokenized):
        wntag = get_wordnet_pos(tag)

        if wntag is None: # do not supply tag in case of None
            lemma = lemmatizer.lemmatize(token)
        else:
            lemma = lemmatizer.lemmatize(token, pos=wntag)

        out_list.append(lemma)

    return out_list


# https://huggingface.co/learn/nlp-course/chapter6/6
def encode_word(word):
    """Split a word into sub-word pieces by greedy longest-prefix matching.

    The longest prefix of the remaining text that is in the global ``vocab``
    is taken as the next piece; whatever is left over is prefixed with
    ``##`` before matching continues.

    Args:
        word: the word to encode.

    Returns:
        list[str]: the pieces in order, ``["[UNK]"]`` as soon as no prefix of
        the remaining text is in ``vocab``, or ``[]`` for an empty word.
    """
    pieces = []
    remaining = word

    while remaining:
        # Try prefixes from longest to shortest.
        for end in range(len(remaining), 0, -1):
            if remaining[:end] in vocab:
                break
        else:
            # Nothing matched: the whole word is reported as unknown.
            return ["[UNK]"]

        pieces.append(remaining[:end])
        remaining = remaining[end:]
        if remaining:
            # Mark the rest as a continuation piece.
            remaining = f"##{remaining}"

    return pieces

# adapted from https://huggingface.co/learn/nlp-course/chapter6/6
def tokenize(sentence):
    """Encode a sentence into a flat list of sub-word tokens.

    Args:
        sentence: either raw text, or a list of already preprocessed words.

    Returns:
        list[str]: the concatenated ``encode_word`` output of every word.
    """
    # janky workaround for preprocessed sentences: anything that is not
    # already a list of words is run through sentence_preprocessing first
    if not isinstance(sentence, list):
        sentence = sentence_preprocessing(sentence)

    # Flatten in one pass; sum(list_of_lists, []) copies the accumulator on
    # every addition, which is quadratic in the number of tokens.
    return [piece for word in sentence for piece in encode_word(word)]



# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open("BPETokenizer_merge_rules_v1.5.pkl", "rb") as f:
    merge_rules = pickle.load(f)


# Reconstruct vocab from merge rules due to lack of foresight
# This grabs all vocab of length 2 or above (if contains first letter)
# or 4 or above (##__)
vocab = list(merge_rules.values())

# Membership is tracked in a set so the reconstruction stays linear; testing
# `not in vocab` against the growing list is quadratic.
# `vocab` itself stays a list with the same contents and order as before.
# NOTE(review): encode_word also tests membership against this list; a set
# would make those lookups constant-time if nothing relies on list semantics.
_vocab_seen = set(vocab)

# So iterate through merge rules again to find starting letters
# and one letter suffixes
for pair in merge_rules:
    first, second = pair[0], pair[1]
    if len(first) == 1 and first not in _vocab_seen:
        vocab.append(first)
        _vocab_seen.add(first)
    if len(second) == 3 and second not in _vocab_seen:
        vocab.append(second)
        _vocab_seen.add(second)


def processed_evidence_to_bpe(paragraph):
    """BPE-tokenise a preprocessed paragraph or a single sentence.

    NOTE: this definition is replaced by a later one of the same name in
    this cell that additionally reports progress.

    Args:
        paragraph: a list of sentences (each a list of words), or a single
            sentence.

    Returns:
        A list of token lists for a paragraph, a flat token list for a
        single sentence, or ``[]`` for empty input.
    """
    # Empty input has no first element to inspect; indexing it would raise
    # IndexError.
    if len(paragraph) == 0:
        return []

    # 2d array -> paragraph
    if isinstance(paragraph[0], list):
        return [tokenize(sentence) for sentence in paragraph]

    # 1 sentence -> tokenize as is
    return tokenize(paragraph)


# Number of calls to processed_evidence_to_bpe so far (progress reporting).
counter = 0
def processed_evidence_to_bpe(paragraph):
    """BPE-tokenise a preprocessed paragraph or a single sentence.

    Every call increments the module-level ``counter`` and a progress line
    is printed on each 1000th call.

    Args:
        paragraph: a list of sentences (each a list of words), or a single
            sentence.

    Returns:
        A list of token lists for a paragraph, a flat token list for a
        single sentence, or ``[]`` for empty input.
    """
    global counter
    counter += 1
    if counter % 1000 == 0:
        print(f"{counter} rows processed")

    # Empty input has no first element to inspect; indexing it would raise
    # IndexError.
    if len(paragraph) == 0:
        return []

    #2d array -> paragraph
    if isinstance(paragraph[0], list):
        return [tokenize(sentence) for sentence in paragraph]

    # 1 sentence -> tokenize as is
    return tokenize(paragraph)


# Save
# One-off step, disabled by wrapping it in a string literal: add a
# "bpe evidence" column via processed_evidence_to_bpe and pickle the frame.
# NOTE(review): `e` is not defined in the visible cells - confirm which
# DataFrame it referred to before re-enabling.

"""
e["bpe evidence"] = e["processed evidence"].apply(processed_evidence_to_bpe)
with open("BPETokenized_evidence_v3.pkl", "wb") as f:
    pickle.dump(e, f)
"""

# Load
# Rebinds `evidence` to the pickled, BPE-tokenised frame.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open("BPETokenized_evidence_v3.pkl", "rb") as f:
    evidence = pickle.load(f)

# Disabled: flatten every paragraph's BPE sentences into one list of
# sentences, which the (also disabled) Word2Vec training below consumes.
"""
sentences = []

for paragraph in evidence["bpe evidence"]:
    if type(paragraph[0]) is list:
        for sentence in paragraph:
            sentences.append(sentence)
    else:
        sentences.append(paragraph)
"""

# Now do word2vec
from gensim.models import Word2Vec


# Embedding size: passed as vector_size in the disabled training code below
# and used by sentence_embedding to size its zero vectors.
EMBEDDING_DIM = 200
# Disabled: train the Word2Vec model on `sentences` and pickle it.
"""
embedding_model = Word2Vec(sentences=sentences,
                           vector_size=EMBEDDING_DIM,
                           window=4,
                           min_count=3,
                           workers=10,
                           negative=5
                           )

version = 3
with open(f"BPE Tokenizer to embedding/embeddings_BPE_v{version}.pkl", "wb") as f:
    pickle.dump(embedding_model, f)
"""

# Load embedding
# NOTE(review): the disabled save above writes into the
# "BPE Tokenizer to embedding/" directory while this reads from the working
# directory - confirm both refer to the same file.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open("embeddings_BPE_v3.pkl", "rb") as f:
    embedding_model = pickle.load(f)

import numpy as np
def sentence_embedding(sentence):
    """Return the mean word vector of a sentence.

    Args:
        sentence: raw text or a flat list of tokens. Both are passed through
            ``tokenize`` before lookup.

    Returns:
        numpy.ndarray of length ``EMBEDDING_DIM``: the sum of the token
        vectors from ``embedding_model.wv`` divided by the number of tokens.
        Tokens missing from the model count as zero vectors. A zero vector
        is returned when there are no tokens at all.
    """
    # Failsafe
    if len(sentence) == 0:
        return np.zeros(EMBEDDING_DIM)

    if type(sentence[0]) is not list:
        sentence = tokenize(sentence)

    # Tokenising can leave nothing (e.g. text made only of punctuation);
    # without this guard the mean below divides by zero and yields NaNs.
    if len(sentence) == 0:
        return np.zeros(EMBEDDING_DIM)

    embedding = np.zeros(EMBEDDING_DIM)
    for word in sentence:
        # get word vector for given word
        # if not found, ignore (treat as having the zero vector)
        try:
            embedding += embedding_model.wv[str(word)]
        except KeyError:
            continue

    return embedding / len(sentence)


def paragraph_embedding(paragraph):
    """Embed every sentence of a paragraph with ``sentence_embedding``.

    Args:
        paragraph: a list of sentences (each a list of tokens), or a single
            sentence given as a flat sequence.

    Returns:
        list: one embedding per sentence; a single-element list for a lone
        sentence; ``[]`` for empty input.
    """
    # Empty input has no first element to inspect; indexing it would raise
    # IndexError.
    if len(paragraph) == 0:
        return []

    # One sentence
    if not isinstance(paragraph[0], list):
        return [sentence_embedding(paragraph)]

    return [sentence_embedding(sentence) for sentence in paragraph]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mrpea\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# Preview the first rows of the evidence DataFrame.
evidence.head()

Unnamed: 0,id,raw evidence,processed evidence,embeddings
0,evidence-0,"John Bennet Lawes, English entrepreneur and ag...","[[john, bennet, lawes, english, entrepreneur, ...",[]
1,evidence-1,Lindberg began his professional career at the ...,"[[lindberg, begin, his, professional, career, ...",[]
2,evidence-2,``Boston (Ladies of Cambridge)'' by Vampire We...,"[[boston, lady, of, cambridge, by, vampire, we...",[]
3,evidence-3,"Gerald Francis Goyer (born October 20, 1936) w...","[[gerald, francis, goyer, born, october, 20, 1...",[]
4,evidence-4,He detected abnormalities of oxytocinergic fun...,"[[he, detect, abnormality, of, oxytocinergic, ...",[]


In [11]:
# Baseline retrieval: immediately use the raw embeddings to retrieve closest sentences
# Train a cutoff distance threshold.

from scipy.spatial.distance import cosine

# Cosine similarity between embeddings (range [-1, 1]; higher means more similar)
def similarity(text, evidence_ids):
    """Score a text against the stored embeddings of the given evidence ids.

    For each id, the 'embeddings' value of the first matching row of the
    global ``evidence`` frame is fetched; the text is then embedded with
    ``sentence_embedding``.

    Args:
        text: the query passed to ``sentence_embedding``.
        evidence_ids: iterable of values to match against ``evidence['id']``.

    Returns:
        list: ``1 - cosine(query, stored)`` for every id, in input order.
    """
    # Seems stupid and retrieving everything from w2v is probably cleaner
    # TODO: make this better
    stored_vectors = []
    for evidence_id in evidence_ids:
        matching = evidence.loc[evidence['id'] == evidence_id, 'embeddings']
        stored_vectors.append(matching.values[0])

    query_vector = sentence_embedding(text)

    return [1-cosine(query_vector, vector) for vector in stored_vectors]


# Using 1 - fscore as the loss
def retrieval_loss(prediction, target):
    """Return ``1 - F1`` between predicted and target items.

    F1 is computed as ``2*TP / (2*TP + FP + FN)``: every predicted item found
    in ``target`` is a true positive, every other predicted item a false
    positive, and every target item missing from ``prediction`` a false
    negative.

    Args:
        prediction: container of predicted items.
        target: container of expected items.

    Returns:
        float: the loss in [0, 1]. When both inputs are empty there is
        nothing to retrieve and nothing was retrieved, so the loss is 0.0
        (a plain division would raise ZeroDivisionError).
    """
    true_positives = 0
    false_positives = 0
    for p in prediction:
        if p in target:
            true_positives += 1
        else:
            false_positives += 1

    false_negatives = sum(1 for t in target if t not in prediction)

    denominator = 2 * true_positives + false_positives + false_negatives
    if denominator == 0:
        return 0.0

    return 1 - 2 * true_positives / denominator

In [42]:
import json

# Load the dev claims. The encoding is given explicitly because JSON text is
# UTF-8, while open() otherwise falls back to the platform locale encoding
# and can mis-decode non-ASCII characters.
with open("dev-claims.json", encoding="utf-8") as f:
    dev = json.load(f)


In [40]:
def claim_preprocessing(claims):
    """Build the claims table from a mapping of claim id to claim fields.

    Args:
        claims: mapping whose values expose ``.get`` for "claim_text",
            "claim_label" and "evidences".

    Returns:
        pandas.DataFrame with columns "id", "claim_text", "processed text"
        (the text run through ``sentence_preprocessing``), "claim_label" and
        "evidences".
    """
    column_names = ["id", "claim_text", "processed text", "claim_label", "evidences"]

    rows = [
        [
            claim_id,
            fields.get("claim_text"),
            sentence_preprocessing(fields.get("claim_text")),
            # No label or evidence for unlabelled set
            fields.get("claim_label"),
            fields.get("evidences"),
        ]
        for claim_id, fields in claims.items()
    ]

    return pd.DataFrame(rows, columns = column_names)

In [44]:
# Turn the loaded dev claims into a DataFrame.
d_dev = claim_preprocessing(dev)
d_dev.head()

# BPE-tokenise each claim's processed text. The result is stored under
# "bpe evidence", the same column name the evidence frame uses.
d_dev["bpe evidence"] = d_dev["processed text"].apply(processed_evidence_to_bpe)

In [45]:
# Preview the dev claims, now including the "bpe evidence" column.
d_dev.head()

Unnamed: 0,id,claim_text,processed text,claim_label,evidences,bpe evidence
0,claim-752,[South Australia] has the most expensive elect...,"[south, australia, have, the, most, expensive,...",SUPPORTS,"[evidence-67732, evidence-572512]","[south, australia, have, the, most, expensive,..."
1,claim-375,when 3 per cent of total annual global emissio...,"[when, 3, per, cent, of, total, annual, global...",NOT_ENOUGH_INFO,"[evidence-996421, evidence-1080858, evidence-2...","[when, 3, per, cent, of, total, annual, global..."
2,claim-1266,This means that the world is now 1C warmer tha...,"[this, mean, that, the, world, be, now, 1c, wa...",SUPPORTS,"[evidence-889933, evidence-694262]","[this, mean, that, the, world, be, now, 1, ##c..."
3,claim-871,"“As it happens, Zika may also be a good model ...","[a, it, happen, zika, may, also, be, a, good, ...",NOT_ENOUGH_INFO,"[evidence-422399, evidence-702226, evidence-28...","[a, it, happen, z, ##ika, may, also, be, a, go..."
4,claim-2164,Greenland has only lost a tiny fraction of its...,"[greenland, have, only, lose, a, tiny, fractio...",REFUTES,"[evidence-52981, evidence-264761, evidence-947...","[greenland, have, only, lose, a, tiny, fractio..."


In [48]:
# Load the BPE-tokenised evidence frame into `d_evidence`.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open("BPETokenized_evidence_v3.pkl", "rb") as f:
    d_evidence = pickle.load(f)
d_evidence.head()

Unnamed: 0,id,raw evidence,processed evidence,embeddings,bpe evidence
0,evidence-0,"John Bennet Lawes, English entrepreneur and ag...","[[john, bennet, lawes, english, entrepreneur, ...",[],"[[john, benn, ##et, law, ##es, english, entrep..."
1,evidence-1,Lindberg began his professional career at the ...,"[[lindberg, begin, his, professional, career, ...",[],"[[lind, ##berg, begin, his, professional, care..."
2,evidence-2,``Boston (Ladies of Cambridge)'' by Vampire We...,"[[boston, lady, of, cambridge, by, vampire, we...",[],"[[boston, lady, of, cambridge, by, vampire, we..."
3,evidence-3,"Gerald Francis Goyer (born October 20, 1936) w...","[[gerald, francis, goyer, born, october, 20, 1...",[],"[[gerald, francis, go, ##yer, born, october, 2..."
4,evidence-4,He detected abnormalities of oxytocinergic fun...,"[[he, detect, abnormality, of, oxytocinergic, ...",[],"[[he, detect, ab, ##normal, ##ity, of, oxy, ##..."


In [55]:
# Compute one embedding per sentence for every evidence row and store the
# result in the existing "embeddings" column. The key has to be the string:
# no variable named `embeddings` is defined in the visible cells, and since
# the right-hand side is evaluated first, the bare name would only raise
# NameError after the whole (slow) apply has finished.
d_evidence["embeddings"] = d_evidence["bpe evidence"].apply(paragraph_embedding)

KeyboardInterrupt: 

In [54]:
def sentence_embedding(sentence):
    """Return the mean word vector of a sentence.

    Args:
        sentence: raw text or a flat list of tokens. Both are passed through
            ``tokenize`` before lookup.

    Returns:
        numpy.ndarray of length ``EMBEDDING_DIM``: the sum of the token
        vectors from ``embedding_model.wv`` divided by the number of tokens.
        Tokens missing from the model count as zero vectors. A zero vector
        is returned when there are no tokens at all.
    """
    # Failsafe
    if len(sentence) == 0:
        return np.zeros(EMBEDDING_DIM)

    if type(sentence[0]) is not list:
        sentence = tokenize(sentence)

    # Tokenising can leave nothing (e.g. text made only of punctuation);
    # without this guard the mean below divides by zero and yields NaNs.
    if len(sentence) == 0:
        return np.zeros(EMBEDDING_DIM)

    embedding = np.zeros(EMBEDDING_DIM)
    for word in sentence:
        # get word vector for given word
        # if not found, ignore (treat as having the zero vector)
        try:
            embedding += embedding_model.wv[str(word)]
        except KeyError:
            continue

    return embedding / len(sentence)

In [59]:
# Sanity check: embed only the rows labelled 0 through 3 (.loc slicing is
# inclusive) and print the result.
print(d_evidence.loc[:3, "bpe evidence"].apply(paragraph_embedding))


0    [[0.012712392210960387, 0.5824892453849315, 0....
1    [[-0.4110906516250811, -0.13801924715210734, 0...
2    [[-0.5756708331006978, 0.24082601070404053, 0....
3    [[-0.23389491179715033, -0.2673430044365966, 0...
Name: bpe evidence, dtype: object
