# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [91]:
#### word embedding pipeline

from gensim.utils import deaccent
from nltk import pos_tag
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

import re
import time

import pandas as pd
import json

import pickle
import nltk
nltk.download('averaged_perceptron_tagger')

#
LOAD_FILES = True

#d_evidence = pd.read_json("data/evidence.json", typ='series')

lemmatizer = WordNetLemmatizer()

# contraction_dict from WS7
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have",
                    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not",
                    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did",
                    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have",
                    "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us",
                    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
                    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                    "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
                    "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have",
                    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will",
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have",
                    "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}


# https://stackoverflow.com/a/46231553
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None # for easy if-statement 


def sentence_preprocessing(sentence):

    out_list = []
    # Use gensim deaccent to match more characters to [a-z]
    sentence = deaccent(sentence.lower())

    for old, new in contraction_dict.items():
        sentence.replace(old, new)

    tokenized = word_tokenize(sentence)

    # now remove all tokens that don't contain any alphanumeric characters
    # then strip non alphanumeric characters afterwards
    tokenized = [re.sub(r"[^a-z0-9\s]", "", token) for token in tokenized if re.match(r"[a-z0-9\s]", token)]

    # now lemmatize with pos
    tagged = pos_tag(tokenized)
    for token, tag in tagged:
        wntag = get_wordnet_pos(tag)

        if wntag is None: # do not supply tag in case of None
            lemma = lemmatizer.lemmatize(token) 
        else:
            lemma = lemmatizer.lemmatize(token, pos=wntag) 

        out_list.append(lemma)
    
    return out_list


def evidence_preprocessing(evidences):
  t = time.time()
  processed = []
  for index, item in enumerate(evidences.items()):
    id, evidence = item

    row = []
    
    row.append(id)
    row.append(evidence)

    # break the text into sentences before tokenizing by each sentence
    processed_sentences = [sentence_preprocessing(sentence) for sentence in sent_tokenize(evidence)]
    row.append(processed_sentences)


    # Appending an empty list to populate with embeddings later
    row.append([])

    processed.append(row)

    if (index + 1) % 50000 == 0:
        print(f"{time.time() - t:.2f} - {index+1} rows processed")

  return pd.DataFrame(processed, columns = ["id", "raw evidence", "processed evidence", "embeddings"])


# Evidence processing
if not LOAD_FILES:
    evidence = evidence_preprocessing(d_evidence)
    with open("../pipeline/evidence_preprocessed_v3.pkl", "wb") as f:
        pickle.dump(evidence, f)
else:
    with open("../pipeline/evidence_preprocessed_v3.pkl", "rb") as f:
        evidence = pickle.load(f)
    
    evidence.head()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [93]:
from gensim.utils import deaccent
from nltk import pos_tag
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

import re
import time

import pandas as pd
import json

import pickle
import nltk
nltk.download('averaged_perceptron_tagger')

#d_evidence = pd.read_json("data/evidence.json", typ='series')

lemmatizer = WordNetLemmatizer()

# contraction_dict from WS7
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have",
                    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not",
                    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did",
                    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have",
                    "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us",
                    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
                    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                    "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
                    "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have",
                    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will",
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have",
                    "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}




# https://stackoverflow.com/a/46231553
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None # for easy if-statement 
    
def sentence_preprocessing(sentence):

    out_list = []
    # Use gensim deaccent to match more characters to [a-z]
    sentence = deaccent(sentence.lower())

    for old, new in contraction_dict.items():
        sentence.replace(old, new)

    tokenized = word_tokenize(sentence)

    # now remove all tokens that don't contain any alphanumeric characters
    # then strip non alphanumeric characters afterwards
    tokenized = [re.sub(r"[^a-z0-9\s]", "", token) for token in tokenized if re.match(r"[a-z0-9\s]", token)]

    # now lemmatize with pos
    tagged = pos_tag(tokenized)
    for token, tag in tagged:
        wntag = get_wordnet_pos(tag)

        if wntag is None: # do not supply tag in case of None
            lemma = lemmatizer.lemmatize(token) 
        else:
            lemma = lemmatizer.lemmatize(token, pos=wntag) 

        out_list.append(lemma)
    
    return out_list


# https://huggingface.co/learn/nlp-course/chapter6/6
def encode_word(word):
    tokens = []
    while len(word) > 0:
        i = len(word)
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            return ["[UNK]"]
        tokens.append(word[:i])
        word = word[i:]
        if len(word) > 0:
            word = f"##{word}"
    return tokens

# adapted from https://huggingface.co/learn/nlp-course/chapter6/6
def tokenize(sentence):

    # janky workaround for preprocessed sentences
    if type(sentence) is not list:
        sentence = sentence_preprocessing(sentence)
        
    encoded_words = [encode_word(word) for word in sentence]
    return sum(encoded_words, [])



with open("../pipeline/BPETokenizer_merge_rules_v1.5.pkl", "rb") as f:
    merge_rules = pickle.load(f)
    

# Reconstruct vocab from merge rules due to lack of foresight
# This grabs all vocab of length 2 or above (if contains first letter)
# or 4 or above (##__)
vocab = [v for v in merge_rules.values()]

# So iterate through merge rules again to find starting letters
# and one letter suffixes
for pair, merge in merge_rules.items():
    if len(pair[0]) == 1 and pair[0] not in vocab:
        vocab.append(pair[0])
    if len(pair[1]) == 3 and pair[1] not in vocab:
        vocab.append(pair[1])


def processed_evidence_to_bpe(paragraph):
    # 2d array -> paragraph
    if type(paragraph[0]) is list:
        return [tokenize(sentence) for sentence in paragraph]

    # 1 sentence -> tokenize as is 
    else:
        return tokenize(paragraph)


counter = 0
def processed_evidence_to_bpe(paragraph):
    global counter
    counter += 1
    if counter % 1000 == 0:
        print(f"{counter} rows processed")
    #2d array -> paragraph
    if type(paragraph[0]) is list:
        return [tokenize(sentence) for sentence in paragraph]

    # 1 sentence -> tokenize as is 
    else:
        return tokenize(paragraph)


# Save

"""
e["bpe evidence"] = e["processed evidence"].apply(processed_evidence_to_bpe)
with open("BPETokenized_evidence_v3.pkl", "wb") as f:
    pickle.dump(e, f)
"""

# Load
with open("../pipeline/BPETokenized_evidence_v3.pkl", "rb") as f:
    evidence = pickle.load(f)

"""
sentences = []

for paragraph in evidence["bpe evidence"]:
    if type(paragraph[0]) is list:
        for sentence in paragraph:
            sentences.append(sentence)
    else:
        sentences.append(paragraph)
"""

# Now do word2vec
from gensim.models import Word2Vec


EMBEDDING_DIM = 200
"""
embedding_model = Word2Vec(sentences=sentences,
                           vector_size=EMBEDDING_DIM,
                           window=4,
                           min_count=3,
                           workers=10,
                           negative=5
                           )

version = 3
with open(f"BPE Tokenizer to embedding/embeddings_BPE_v{version}.pkl", "wb") as f:
    pickle.dump(embedding_model, f)
"""

# Load embedding
with open("../pipeline/embeddings_BPE_v3.pkl", "rb") as f:
    embedding_model = pickle.load(f)

import numpy as np
def sentence_embedding(sentence):

  # Failsafe
  if len(sentence) == 0:
    return np.zeros(EMBEDDING_DIM)

  if type(sentence[0]) is not list:
      sentence = tokenize(sentence)


  embedding = np.zeros(EMBEDDING_DIM)
  for word in sentence:
    word_embedding = np.zeros(EMBEDDING_DIM)

    # get word vector for given word
    # if not found, ignore (treat as having the zero vector)
    try:
      word_embedding = embedding_model.wv[str(word)]
    except KeyError:
      pass

    embedding += word_embedding

  return embedding / len(sentence)


def paragraph_embedding(paragraph):
    out = []

    # One sentence
    if type(paragraph[0]) is not list:
        return [sentence_embedding(paragraph)]

    else:
        for sentence in paragraph:
            out.append(sentence_embedding(sentence))
    return out

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [94]:
# Baseline retrieval: immediately use the raw embeddings to retrieve closest sentences
# Train a cutoff distance threshold.

from scipy.spatial.distance import cosine

# Similarity based on cosine similarity ([0-1], higher the more similar)
def similarity(text, evidence_ids):

    # Seems stupid and retrieving everything from w2v is probably cleaner
    # TODO: make this better
    evidence_embeddings = [evidence.loc[evidence['id'] == id, 'embeddings'].values[0] for id in evidence_ids]
    key_embedding = sentence_embedding(text)
    
    similarities = []
    for evidence_embedding in evidence_embeddings:
        similarities.append(1-cosine(key_embedding, evidence_embedding))

    return similarities


# Using 1 - fscore as the loss
def retrieval_loss(prediction, target):
    numerator = 0
    denominator = 0
    
    for p in prediction:
        if p in target:
            denominator += 2
            numerator += 2
        else:
            denominator += 1
    
    for t in target:
        if t not in prediction:
            denominator += 1
    
    return 1 - numerator/denominator

In [95]:
def sentence_embedding(sentence):

  # Failsafe
  if len(sentence) == 0:
    return np.zeros(EMBEDDING_DIM)

  if type(sentence[0]) is not list:
      sentence = tokenize(sentence)


  embedding = np.zeros(EMBEDDING_DIM)
  for word in sentence:
    word_embedding = np.zeros(EMBEDDING_DIM)

    # get word vector for given word
    # if not found, ignore (treat as having the zero vector)
    try:
      word_embedding = embedding_model.wv[str(word)]
    except KeyError:
      pass

    embedding += word_embedding

  return embedding / len(sentence)

In [21]:
#TODO: combined preprocessing of embedding part with this:

LOCAL_DEV = True # to switch between developing locally and on colab

if not LOCAL_DEV:
    # TODO: need to upload data files on Google Drive?
    from google.colab import drive
    drive.mount('/content/drive')

In [22]:
#Imports
import numpy as np
import torch
import pandas as pd

In [23]:
#visualising training data
if LOCAL_DEV:
    train = pd.read_json("../data/train-claims.json") # for local dev
    dev = pd.read_json("../data/train-claims.json")
    
else:
    train = pd.read_json("/content/drive/MyDrive/data/train-claims.json") # on colab


train = train.transpose()
train.head()

Unnamed: 0,claim_text,claim_label,evidences
claim-1937,Not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1..."
claim-126,El Niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]"
claim-2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]"
claim-2021,Weather Channel co-founder John Coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5..."
claim-2449,"""January 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72..."


In [24]:
#visualising evidence data
#visualising evidence data
if LOCAL_DEV:
    evidence = pd.read_json("../data/evidence.json",typ='series')
else:
    evidence = pd.read_json("/content/drive/MyDrive/data/evidence.json",typ='series')

In [25]:
print(len(evidence))
evidence.head()

1208827


evidence-0    John Bennet Lawes, English entrepreneur and ag...
evidence-1    Lindberg began his professional career at the ...
evidence-2    ``Boston (Ladies of Cambridge)'' by Vampire We...
evidence-3    Gerald Francis Goyer (born October 20, 1936) w...
evidence-4    He detected abnormalities of oxytocinergic fun...
dtype: object

In [26]:
if LOCAL_DEV:
    test = pd.read_json("../data/test-claims-unlabelled.json")
else:
    test = pd.read_json("/content/drive/MyDrive/data/test-claims-unlabelled.json")
test = test.transpose()
test.head()


Unnamed: 0,claim_text
claim-2967,The contribution of waste heat to the global c...
claim-979,“Warm weather worsened the most recent five-ye...
claim-1609,Greenland has only lost a tiny fraction of its...
claim-1020,“The global reef crisis does not necessarily m...
claim-2599,Small amounts of very active substances can ca...


In [27]:
#preprocessing
# punctuations should be removed, common words such as the, is, are, should be removed. all words also should be lemmentised and stemmed.


In [28]:
pip install keras

Note: you may need to restart the kernel to use updated packages.


In [29]:
import string
import contractions
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [30]:


def preprocess_data(data: pd.Series, limit=10000) -> pd.Series:
  preprocessed_data = {}
  stop_words = set(stopwords.words('english'))
  stop_words.remove('not')
  count = 0
  for id, text in data.items():
    text = text.lower()
    text = contractions.fix(text)
    tokens = word_tokenize(text)
    wnl = WordNetLemmatizer()
    lemmatized_tokens = [wnl.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    preprocessed_data[id] = " ".join(lemmatized_tokens)
    count += 1
    if count >= limit:
      break

  return pd.Series(preprocessed_data)

processed_evidence = preprocess_data(evidence)

test_claims = test['claim_text']
train_claims = train['claim_text']
processed_test = preprocess_data(test_claims)
processed_test.head()
processed_train = preprocess_data(train_claims)
processed_train.head()


claim-1937    not scientific evidence pollutant higher conce...
claim-126     el niño drove record high global temperature s...
claim-2510                              pdo switched cool phase
claim-2021    weather channel john coleman provided evidence...
claim-2449    january capped month period global temperature...
dtype: object

In [31]:
processed_evidence = processed_evidence[processed_evidence.str.strip().str.len() > 0]
processed_evidence.head()

evidence-0    john bennet lawes english entrepreneur agricul...
evidence-1    lindberg began professional career age eventua...
evidence-2                boston lady cambridge vampire weekend
evidence-3    gerald francis goyer born october professional...
evidence-4    detected abnormality oxytocinergic function sc...
dtype: object

# Two steps for the this task
# first. find all relavent evidence, either use contextual embedding or similarity scoring
# second. classify the evidents into 4 classes.

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
from tensorflow import keras
from keras.layers import Dense
from keras.models import Sequential, load_model
from keras.layers import Embedding, Flatten, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, Flatten, Dense

In [97]:
#Vectorizing preprocessed text
vectorizer = TfidfVectorizer()
all_texts = pd.concat([processed_evidence, processed_train])
vectorizer.fit(all_texts)
evidence_tfidf = vectorizer.transform(processed_evidence)
test_tfidf = vectorizer.transform(processed_test)
train_tfidf = vectorizer.transform(processed_train)

print(vectorizer.get_feature_names_out()) # why does this contain non-alphabetic?
print(test_tfidf[0])
#print(test_tfidf.shape)




['aa' 'aaa' 'aabis' ... '楊璟翊' '盧翰' '민주국민당']
  (0, 24182)	0.5894022428982739
  (0, 9912)	0.43153385604356104
  (0, 9096)	0.3217964341887153
  (0, 4988)	0.5097918621830975
  (0, 4330)	0.32084706535977053
evidence-0       john bennet lawes english entrepreneur agricul...
evidence-1       lindberg began professional career age eventua...
evidence-2                   boston lady cambridge vampire weekend
evidence-3       gerald francis goyer born october professional...
evidence-4       detected abnormality oxytocinergic function sc...
                                       ...                        
evidence-9995    rising step load testing rsl testing testing s...
evidence-9996               one single would big league hit season
evidence-9997    stream tributary include panther creek plank b...
evidence-9998    kerry glen simon june september american celeb...
evidence-9999    survivor retreated across river joined spaniar...
Length: 9994, dtype: object


john bennet lawes english entrepreneur agricultural scientist
[ 9.19078158e-02  6.90522912e-01  2.78793582e-01 -1.88396685e-02
 -2.28649519e-01 -4.43825321e-01 -2.57578073e-02 -7.96965988e-02
  1.56434759e-01 -4.25711121e-01 -4.77746917e-01 -5.09624938e-01
 -1.12946350e-01 -5.53536547e-01  4.47556537e-01 -2.55390281e-01
 -8.45387330e-02 -2.80425862e-01  2.29172011e-01 -1.44044688e-01
  1.98405743e-01  2.12396421e-01  3.82673217e-01  2.49713839e-01
 -4.64526597e-01  8.77822631e-01  4.16555944e-01  4.88641779e-01
  7.62103965e-01 -6.33041772e-02  5.51566457e-02 -2.50505329e-01
 -5.33437444e-01 -2.24365794e-01 -6.79122785e-02  3.87498591e-01
  1.91612665e-01 -3.02849983e-01  1.51370910e-01  5.73424382e-01
 -1.48332053e-01 -1.97941481e-01  7.50132249e-02  5.45069735e-01
  3.11545775e-01 -3.15839024e-01 -1.87085857e-01 -8.14146393e-01
  9.50082157e-01 -3.44185258e-01 -1.33396986e-01 -9.43473491e-01
 -6.64342791e-02 -1.11255514e-01  5.68910679e-01  4.18735663e-01
 -4.06766784e-01 -8.01613104

In [34]:
similarity_matrix = cosine_similarity(test_tfidf, evidence_tfidf)

def getTopN(similarity_matrix, test, evidence, n):
  test = test.to_frame(name='claim_text')
  top_indices = np.argsort(-similarity_matrix, axis = 1)[:, :n]
  top_evidence = [[str(evidence.index[i]) for i in row] for row in top_indices]
  test['evidences'] = top_evidence
  return test

test_with_evi = getTopN(similarity_matrix, processed_test, processed_evidence, 5)
test_with_evi.head()


Unnamed: 0,claim_text,evidences
claim-2967,contribution waste heat global climate,"[evidence-8950, evidence-4903, evidence-1294, ..."
claim-979,warm weather worsened recent drought included ...,"[evidence-2760, evidence-8828, evidence-5911, ..."
claim-1609,greenland lost tiny fraction ice mass,"[evidence-5928, evidence-4202, evidence-3680, ..."
claim-1020,global reef crisis not necessarily mean extinc...,"[evidence-3210, evidence-8721, evidence-2739, ..."
claim-2599,small amount active substance cause large effect,"[evidence-8207, evidence-8000, evidence-320, e..."


In [41]:
!pip3 install torchtext==0.4.0

Collecting torchtext==0.4.0
  Downloading torchtext-0.4.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchtext
Successfully installed torchtext-0.4.0


In [83]:

# using a classification model to test relevance of each evidence

# assuming train data text formatted as follows:
# trainX = [claim_text + SEPARATION_TOKEN + evidence_text, .....]
# trainY = [RELEVANT, NOT_RELEVANT, ...... ]

# WORKSHOP 8 ------
import torch
import torchtext
from torchtext.data.utils import get_tokenizer

# Define tokenizer
tokenizer = get_tokenizer("basic_english")

# Define Field for text data
TEXT = torchtext.data.Field(tokenize=tokenizer,
                            init_token='<sos>',
                            eos_token='<eos>',
                            lower=True)

# Define Field for label data
LABEL = torchtext.data.LabelField(dtype=torch.float)

# Load IMDb dataset
train_txt, test_txt = torchtext.datasets.IMDB.splits(TEXT, LABEL)

# Split train_txt into train and validation sets
train_txt, valid_txt = train_txt.split(split_ratio=0.8)

# Build vocabulary
TEXT.build_vocab(train_txt)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Batchify function
def batchify(data, bsz):
    data = TEXT.numericalize([data.examples[0].text])
    # Divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

# Batch sizes
batch_size = 20
eval_batch_size = 10

# Process the datasets
train_data = batchify(train_txt, batch_size)
val_data = batchify(valid_txt, eval_batch_size)
test_data = batchify(test_txt, eval_batch_size)






TypeError: only integer scalar arrays can be converted to a scalar index

In [70]:
print(valid_txt[0].text)
print(valid_txt[0].label)

print(valid_txt[1].text)
print(valid_txt[1].label)

print(valid_txt)

# ours can be:
# text =  claim_tokens + [SEP_TOKEN] + evidence_tokens
# label <- one of {'rel', 'irr'} (relevant, irrelevant)

# and use the same batchify method ?

print(val_data)

print(TEXT)

['how', 'can', 'such', 'good', 'actors', 'like', 'jean', 'rochefort', 'and', 'carole', 'bouquet', 'could', 'have', 'been', 'involved', 'in', 'such', 'a', '.', '.', '.', 'a', '.', '.', '.', 'well', ',', 'such', 'a', 'thing', '?', 'i', 'can', "'", 't', 'get', 'it', '.', 'it', 'was', 'awful', ',', 'very', 'baldy', 'played', '(', 'but', 'some', 'of', 'the', 'few', 'leading', 'roles', ')', ',', 'the', 'jokes', 'are', 'dumb', 'and', 'absolutely', 'not', 'funny', '.', '.', '.', 'i', 'won', "'", 't', 'talk', 'more', 'about', 'this', 'movie', ',', 'except', 'for', 'one', 'little', 'piece', 'of', 'advice', 'do', 'not', 'go', 'see', 'it', ',', 'it', 'will', 'be', 'a', 'waste', 'of', 'time', 'and', 'money', '.']
neg
['wow', ',', 'finally', 'jim', 'carrey', 'has', 'returned', 'from', 'the', 'died', '.', 'this', 'movie', 'had', 'me', 'laughing', 'and', 'crying', '.', 'it', 'also', 'sends', 'a', 'message', 'that', 'we', 'should', 'all', 'know', 'and', 'learn', 'from', '.', 'jeniffer', 'aniston', 'was

In [86]:
def reverse_batchify(batched_data):
    # Convert the batched data tensor to a numpy array
    data_array = batched_data.cpu().numpy()
    
    # Convert numerical indices back to tokens using the vocabulary
    original_data = []
    for row in data_array:
        tokens = [TEXT.vocab.itos[idx] for idx in row]
        original_data.append(tokens)
    
    return original_data


reversed_data = reverse_batchify(val_data)
print(reversed_data)

[['how', 'carole', '.', 'such', 'it', '(', ',', '.', 'about', 'of'], ['can', 'bouquet', '.', 'a', '.', 'but', 'the', '.', 'this', 'advice'], ['such', 'could', '.', 'thing', 'it', 'some', 'jokes', '.', 'movie', 'do'], ['good', 'have', 'a', '?', 'was', 'of', 'are', 'i', ',', 'not'], ['actors', 'been', '.', 'i', 'awful', 'the', 'dumb', 'won', 'except', 'go'], ['like', 'involved', '.', 'can', ',', 'few', 'and', "'", 'for', 'see'], ['jean', 'in', '.', "'", 'very', 'leading', 'absolutely', 't', 'one', 'it'], ['rochefort', 'such', 'well', 't', '<unk>', 'roles', 'not', 'talk', 'little', ','], ['and', 'a', ',', 'get', 'played', ')', 'funny', 'more', 'piece', 'it']]


In [48]:
bptt = 35
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target


ntokens = len(TEXT.vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)



In [81]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train():

    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            print(targets)
            output = eval_model(data)
            
            output_flat = output.view(-1, ntokens)
       
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [72]:
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

tensor([[[ 0.2748, -0.5060,  0.5688,  ...,  1.6586, -1.2559,  0.3453],
         [ 0.0743, -0.2062,  1.8589,  ...,  1.2166, -2.8926, -0.1619],
         [-0.0331,  0.1290,  0.2629,  ...,  0.2275, -1.1322,  0.0752],
         ...,
         [-0.0331,  0.1290,  0.2629,  ...,  0.2275, -1.1322,  0.0752],
         [-0.3395, -0.7340,  1.0964,  ...,  1.1425, -0.3426,  0.4701],
         [ 0.4491, -0.5914,  0.5147,  ...,  0.9385, -1.7443, -0.4042]],

        [[ 0.8524, -0.3086,  0.6977,  ...,  1.1315, -1.4706,  0.2938],
         [-0.4178, -0.8994,  1.3850,  ...,  0.8975, -2.2735,  0.1196],
         [-0.0124,  0.1729,  0.2242,  ...,  0.1382, -1.1179,  0.0198],
         ...,
         [-0.0124,  0.1729,  0.2242,  ...,  0.1382, -1.1179,  0.0198],
         [-0.3050, -0.7221,  1.1860,  ...,  0.8514, -0.3012, -0.4128],
         [ 1.7305, -0.4095,  1.6345,  ...,  0.5868, -1.7220, -1.2164]],

        [[ 0.6288, -1.2293,  0.9387,  ...,  0.9099, -1.1320,  0.2057],
         [ 0.6403, -1.0638,  1.2341,  ...,  0

In [82]:
test_loss = evaluate(best_model, test_data)


print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)


print(best_model(val_data[1]))

tensor([   30,  8357,    62,    63,   557,     5,    63,    10,   134,    50,
           43,    21,    49,   105,     8,    15,    76,  1795,    12,    10,
          777,    49, 15095,    85,  1186,    19,  3775,    45,     8,   176,
           73,  1888,    19,    15,    92,  1560,     7,     4,   249,    40,
            6,     5,  2621,     5,     4,   538,  7731,   745,    44,    63,
          304,     8,    14,   247,  4183,     5, 19371,  1339,  6638,    54,
        11287,  3178,     0,     5,    21,  1064,     8,     7,  8211,     8,
          288,    10,     6,  6577,     8,    39,  1339,   360,     6,   317,
            4, 37860,    63,    45,  3350,   612,     6,    10, 18966,   535,
         1663,    21,   105,     4,  3609,  3191,    63,  2334,  8803,     5,
            9,    49,    31,   325,     5,  1949,   225,    99,     6,    16,
           43,   782,   566, 14833,    15,     7,    49,     6, 18931,    12,
          325,   467,     4,     5,   801,    63,  3350,    21, 

In [36]:
#print(train.head())
#print(test_with_evi.head())
# claim: contribution waste heat global climate
print(evidence['evidence-308923'])
print(evidence['evidence-213569'])

#greenland lost tiny fraction ice mass	
print(evidence['evidence-962481'])
print(evidence['evidence-1200633'])



Global forcing from waste heat was 0.028 W/m2 in 2005.
Thus, the waste heat engine may be one of the least expensive components of a complete waste heat recovery system.
Only a tiny fraction of the original chemical energy is used for work:
Land ice sheets in both Antarctica and Greenland have been losing mass since 2002 and have seen an acceleration of ice mass loss since 2009.


In [37]:
from torch import nn, optim

# define hypermeter
sequence_len = 28
input_len = 28
hidden_size = 128
num_layers = 2
num_classes = 4
num_epchos = 5
learning_rate = 0.01


In [38]:
class LSTM(nn.Module):
  def __init__(self, input_length, hidden_size, num_classes, num_layers):
    super(LSTM, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.lstm = nn.LSTM(input_len, hidden_size, num_layers, num_classes, batch_first=True)
    self.output_layer = nn.Linear(hidden_size, num_classes)

  def forward(self, X):
    hidden_states = torch.zeros(self.num_layers, X.size(0), self.hidden_size)
    cell_states = torch.zeros(self.num_layers, X.size(0), self.hidden_size)
    out, _ = self.lstm(X, (hidden_states, cell_states))
    out = self.output_layer(out[:, -1, :])
    return out


In [39]:
model = LSTM(input_len, hidden_size, num_classes, num_layers)
print(model)

LSTM(
  (lstm): LSTM(28, 128, num_layers=2, bias=4, batch_first=True)
  (output_layer): Linear(in_features=128, out_features=4, bias=True)
)


In [40]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*

In [42]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

# model adpated from workshop 8
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output



class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term) #0::2 means starting with index 0, step = 2
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)