In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import json
import re
import nltk
import numpy as np
import string
from collections import Counter, defaultdict
from math import log
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LEE JUNYOUNG\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [2]:
with open('preprocess/trec_split.json', 'r') as f:
    data = json.load(f)

train_data = data['train']
test_data = data['test']

In [64]:
train_data['0']

{'text': 'How did serfdom develop in and then leave Russia ?',
 'coarse_label': 2,
 'fine_label': 26}

to form any kinds of graph, you first need to go through the entire corpus and obtain {idx:node}. we first do this for word, then pos tags

In [4]:

# for cleaning text
import string, re

def expand_contractions(text):
    # Define a list of patterns and their replacements
    patterns = [
        (r"n't\b", " not"),
        (r"'re\b", " are"),
        (r"'ll\b", " will"),
        (r"'ve\b", " have"),
        (r"'m\b", " am"),
        (r"'d\b", " would"),
        (r"'s\b", " is"),
        (r"\bcan't\b", "cannot"),
        (r"\bshan't\b", "shall not"),
        (r"\bwon't\b", "will not"),
        (r"\blet's\b", "let us"),
    ]
    
    # Apply each pattern and replacement in turn
    for pattern, replacement in patterns:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    
    return text

def clean_str(sentence ,use=True):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    if not use: return sentence
    
    sentence = expand_contractions(sentence)

    sentence = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", sentence)
    sentence = re.sub(r"\'s", " \'s", sentence)
    sentence = re.sub(r"\'ve", " \'ve", sentence)
    sentence = re.sub(r"n\'t", " n\'t", sentence)
    sentence = re.sub(r"\'re", " \'re", sentence)
    sentence = re.sub(r"\'d", " \'d", sentence)
    sentence = re.sub(r"\'ll", " \'ll", sentence)
    sentence = re.sub(r",", " , ", sentence)
    sentence = re.sub(r"!", " ! ", sentence)
    sentence = re.sub(r"\(", " \( ", sentence)
    sentence = re.sub(r"\)", " \) ", sentence)
    sentence = re.sub(r"\?", " \? ", sentence)
    sentence = re.sub(r"\s{2,}", " ", sentence)
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    return sentence.strip().lower()

In [5]:

from collections import Counter, defaultdict
from math import log
import numpy as np

def process_corpus(corpus):
    unique_words = set()
    word_count = Counter()
    pair_count = defaultdict(int)
    total_words = 0
    
    for line in corpus:
        line = clean_str(line)
        words = line.split()
        total_words += len(words)
        word_count.update(words)
        for i, word in enumerate(words):
            unique_words.add(word)
            for j in range(i + 1, len(words)):
                pair = tuple(sorted([word, words[j]]))
                pair_count[pair] += 1
    
    word_prob = {word: count / total_words for word, count in word_count.items()}
    pair_prob = {pair: count / total_words for pair, count in pair_count.items()}
    
    return word_prob, pair_prob, unique_words

def calculate_pmi(word_prob, pair_prob, word1, word2):
    pair = tuple(sorted([word1, word2]))
    if pair in pair_prob and word1 in word_prob and word2 in word_prob:
        pmi = log(pair_prob[pair] / (word_prob[word1] * word_prob[word2]))
        return pmi
    return 0.0

def create_pmi_matrix(sentence, word_prob, pair_prob, word_index):
    words = clean_str(sentence).split()
    n = len(words)
    pmi_matrix = np.zeros((n, n))
    node_list = []

    for word in words:
        if word in word_index:
            node_list.append(word_index[word])
        else:
            node_list.append(-1)
        
    for i in range(n):
        for j in range(i + 1, n):
            pmi = calculate_pmi(word_prob, pair_prob, words[i], words[j])
            pmi_matrix[i, j] = pmi
            pmi_matrix[j, i] = pmi  # PMI matrix is symmetric
    
    return pmi_matrix, node_list

# Example usage
corpus = [
    "Hello, world! This is a test.",
    "Another line; with more: punctuation.",
    "Is this working? Yes, it is!"
]

word_prob, pair_prob, unique_words = process_corpus(corpus)
word_index = {word: index for index, word in enumerate(sorted(unique_words))}

sentence = "Hello world, this is a test"
pmi_matrix, node_list = create_pmi_matrix(sentence, word_prob, pair_prob, word_index)
print("PMI Adjacency Matrix:")
print(pmi_matrix)
print(node_list)


PMI Adjacency Matrix:
[[0.         2.83321334 2.14006616 1.73460106 2.83321334 2.83321334]
 [2.83321334 0.         2.14006616 1.73460106 2.83321334 2.83321334]
 [2.14006616 2.14006616 0.         2.14006616 2.14006616 2.14006616]
 [1.73460106 1.73460106 2.14006616 0.         1.73460106 1.73460106]
 [2.83321334 2.83321334 2.14006616 1.73460106 0.         2.83321334]
 [2.83321334 2.83321334 2.14006616 1.73460106 2.83321334 0.        ]]
[2, 12, 9, 3, 0, 8]


In [6]:
def process_corpus_tags(corpus: list[list]):
    unique_pos_tags = set()
    pos_tag_count = Counter()
    pos_tag_pair_count = defaultdict(int)
    total_pos_tags = 0
    
    for line in corpus:
        total_pos_tags += len(line)
        pos_tag_count.update(line)
        
        for i, pos_tag in enumerate(line):
            unique_pos_tags.add(pos_tag)
            
            for j in range(len(line)):
                if i == j: continue
                pair = tuple(sorted([pos_tag, line[j]]))
                pos_tag_count[pair] += 1
    
    pos_tag_prob = {pos_tag: count / total_pos_tags for pos_tag, count in pos_tag_count.items()}
    pos_tag_pair_prob = {pos_tag_pair: count / total_pos_tags for pos_tag_pair, count in pos_tag_pair_count.items()}
    
    return pos_tag_prob, pos_tag_pair_prob, unique_pos_tags

### euclidean distance function

In [1]:
import numpy as np

def euclidean_distance(vector1, vector2):
    """
    Calculate the Euclidean distance between two embedding vectors.
    
    Args:
    vector1 (np.array): First embedding vector.
    vector2 (np.array): Second embedding vector.
    
    Returns:
    float: The Euclidean distance between the two vectors.
    """
    return np.linalg.norm(vector1 - vector2)

# Example usage
embedding1 = np.array([1, 2, 3])
embedding2 = np.array([4, 5, 6])

distance = euclidean_distance(embedding1, embedding2)
print(f"The Euclidean distance between the vectors is: {distance}")


The Euclidean distance between the vectors is: 5.196152422706632


# Phrase extraction

In [16]:
import nltk
import re

# Ensure you have the necessary nltk data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')


[nltk_data] Downloading package punkt to /Users/jylee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jylee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/jylee/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [21]:

def pos_tag_sentence(sentence):
    words = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words, tagset='universal')
    return pos_tags

def identify_phrases(pos_tags, np_pattern, vp_pattern):
    pos_sequence = ' '.join([tag for word, tag in pos_tags])
    
    np_regex = re.compile(np_pattern)
    vp_regex = re.compile(vp_pattern)
    
    np_matches = [(match.start(), match.end()) for match in np_regex.finditer(pos_sequence)]
    vp_matches = [(match.start(), match.end()) for match in vp_regex.finditer(pos_sequence)]
    
    phrases = []
    for start, end in sorted(np_matches + vp_matches):
        phrase = ' '.join([word for word, tag in pos_tags[start:end]])
        phrases.append(phrase)
    
    return phrases

# Define the patterns
np_pattern = r"((DET)?(NUM)*((ADJ)(PUNCT)?(CONJ)?)*(((NOUN)|(PROPN))(PART)?)+)"
vp_pattern = r"((AUX)*(ADV)*(VERB))"

# Example usage
sentence = "The quick brown fox jumps over the lazy dog"
pos_tags = pos_tag_sentence(sentence)
print("POS Tags:", pos_tags)

phrases = identify_phrases(pos_tags, np_pattern, vp_pattern)
print("Phrases:", phrases)


POS Tags: [('The', 'DET'), ('quick', 'ADJ'), ('brown', 'NOUN'), ('fox', 'NOUN'), ('jumps', 'VERB'), ('over', 'ADP'), ('the', 'DET'), ('lazy', 'ADJ'), ('dog', 'NOUN')]
Phrases: ['dog', '', '', '']


## NP with spacy model (using this for now)

In [65]:
import spacy
nlp = spacy.load('en_core_web_lg')
sample_text = 'The quick brown fox jumps over the lazy dog'
sample_doc = nlp(sample_text)
# Extract Noun Phrases
for chunk in sample_doc.noun_chunks:
    print (chunk)

The quick brown fox
the lazy dog


## VP with spacy model (obtained from fyp student) (using this)

In [66]:
import textacy
sample_text = ('The quick brown fox jumps over the lazy dog')
expression = r'(<VERB>?<ADV>*<VERB>+)'
# pattern = [{"TEXT": {"REGEX": '(<VERB>?<ADV>*<VERB>+)'}}]
vp_patterns = [
    [{"POS": "ADV"}, {"POS": "VERB"}],
    [{"POS": "NOUN"}, {"POS": "VERB"}],
    [{"POS": "PRON"}, {"POS": "VERB"}],
    [{"POS": "ADJ"}, {"POS": "VERB"}],
    [{"POS": "VERB"}, {"POS": "PART"}],
    [{"POS": "VERB"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "NOUN"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADV"}],
    [{"POS": "VERB"}, {"POS": "ADJ"}],
    [{"POS": "VERB"}, {"POS": "PRON"}],
    [{"POS": "VERB"}, {"POS": "ADP"}],
    [{"POS": "VERB"}, {"POS": "ADP"}, {"POS": "ADV"}],
    [{"POS": "VERB"}, {"POS": "CONJ"}, {"POS": "VERB"}],
    [{"POS": "VERB"}, {"POS": "DET"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADP"}, {"POS": "ADJ"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADJ"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "DET"}, {"POS": "ADJ"}],
    [{"POS": "VERB"}, {"POS": "PART"}, {"POS": "ADP"}]
]


# get_verb_phrases = textacy.extract.token_matches(sample_text, patterns=patterns)
# verb_phrases = []
# for verb_phrase in get_verb_phrases:
#     verb_phrases.append(verb_phrase)
sample_doc = textacy.make_spacy_doc(sample_text,
                                        lang='en_core_web_lg')
verb_phrases = textacy.extract.token_matches(sample_doc, vp_patterns)
# Print all Verb Phrase
for chunk in verb_phrases:
    print(chunk)

fox jumps
jumps over


In [32]:
for match in re.finditer(expression, sample_doc.text):
    start, end = match.span()
    span = sample_doc.char_span(start, end)
    # This is a Span object or None if match doesn't map to valid token sequence
    if span is not None:
        print("Found match:", span.text)

## VP with spacy using grammar tree

In [35]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_lg")

def extract_verb_phrases(text):
    doc = nlp(text)
    verb_phrases = []
    for token in doc:
        if token.pos_ == "VERB":
            verb_phrase = ' '.join([child.text for child in token.subtree])
            verb_phrases.append(verb_phrase)
    return verb_phrases

sentence = "He is eating an apple while she reads a book."
verb_phrases = extract_verb_phrases(sentence)
print("Verb Phrases:", verb_phrases)

Verb Phrases: ['He is eating an apple while she reads a book .', 'while she reads a book']


## VP using nltk regex

In [36]:
import nltk
from nltk import pos_tag
from nltk.chunk import RegexpParser
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /Users/jylee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jylee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [1]:

def extract_verb_phrases(text):
    words = word_tokenize(text)
    tagged = pos_tag(words)
    chunk_grammar = "VP: {<VB.*><.*>*}"
    chunk_parser = RegexpParser(chunk_grammar)
    tree = chunk_parser.parse(tagged)

    verb_phrases = []
    for subtree in tree.subtrees():
        if subtree.label() == "VP":
            verb_phrase = ' '.join(word for word, pos in subtree.leaves())
            verb_phrases.append(verb_phrase)
    return verb_phrases

sentence = "He is eating an apple while she reads a book."
verb_phrases = extract_verb_phrases(sentence)
print("Verb Phrases:", verb_phrases)

NameError: name 'word_tokenize' is not defined

## combined phrase (NP and VP)

In [None]:
vp_patterns = [
    [{"POS": "ADV"}, {"POS": "VERB"}],
    [{"POS": "NOUN"}, {"POS": "VERB"}],
    [{"POS": "PRON"}, {"POS": "VERB"}],
    [{"POS": "ADJ"}, {"POS": "VERB"}],
    [{"POS": "VERB"}, {"POS": "PART"}],
    [{"POS": "VERB"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "NOUN"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADV"}],
    [{"POS": "VERB"}, {"POS": "ADJ"}],
    [{"POS": "VERB"}, {"POS": "PRON"}],
    [{"POS": "VERB"}, {"POS": "ADP"}],
    [{"POS": "VERB"}, {"POS": "ADP"}, {"POS": "ADV"}],
    [{"POS": "VERB"}, {"POS": "CONJ"}, {"POS": "VERB"}],
    [{"POS": "VERB"}, {"POS": "DET"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADP"}, {"POS": "ADJ"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADJ"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "DET"}, {"POS": "ADJ"}],
    [{"POS": "VERB"}, {"POS": "PART"}, {"POS": "ADP"}]
]

In [15]:
import spacy, textacy
nlp = spacy.load('en_core_web_lg')
sample_text = 'Calculate the Euclidean distance between two embedding vectors.'
sample_doc = nlp(sample_text)
verb_phrases = textacy.extract.token_matches(sample_doc, vp_patterns)

phrase_list = []
tag_list = []

# Extract Noun Phrases and corresponding pos tags
for chunk in sample_doc.noun_chunks:
    phrase_list.append(chunk)
    tag_list.append(' '.join([t.pos_ for t in chunk]))
# Print all Verb Phrase and corresponding pos tags
for chunk in verb_phrases:
    phrase_list.append(chunk)
    tag_list.append(' '.join([t.pos_ for t in chunk]))

print("Phrases: ", phrase_list)
print("POS Tags: ", tag_list)

Phrases:  [the Euclidean distance, two embedding vectors, Calculate the Euclidean, embedding vectors]
POS Tags:  ['DET ADJ NOUN', 'NUM VERB NOUN', 'VERB DET ADJ', 'VERB NOUN']


### one-hot encoding vectors for pos tags

In [2]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Example corpus of sentences transformed into lists of POS tags
corpus = [
    ['VB', 'NN', 'NNP'],
    ['DT', 'NN', 'VBZ', 'VBG'],
    ['NN', 'NN', 'VB'],
]

# Step 1: Collect all unique POS tags to create a vocabulary
all_pos_tags = set(tag for sentence in corpus for tag in sentence)

# Step 2: Create a mapping from each POS tag to a unique index
pos_to_index = {tag: idx for idx, tag in enumerate(all_pos_tags)}

# Step 3: Generate one-hot encoded vectors
def one_hot_encode(pos_list, pos_to_index):
    vector = np.zeros(len(pos_to_index))
    for pos in pos_list:
        if pos in pos_to_index:
            vector[pos_to_index[pos]] = 1
    return vector

# Example usage
encoded_corpus = [one_hot_encode(sentence, pos_to_index) for sentence in corpus]

print("Vocabulary:", pos_to_index)
print("One-hot encoded vectors:")
for vec in encoded_corpus:
    print(vec)


Vocabulary: {'NNP': 0, 'DT': 1, 'VBG': 2, 'NN': 3, 'VB': 4, 'VBZ': 5}
One-hot encoded vectors:
[1. 0. 0. 1. 1. 0.]
[0. 1. 1. 1. 0. 1.]
[0. 0. 0. 1. 1. 0.]


# Phrase level embeddings

In [121]:
from sentence_transformers import SentenceTransformer
phrase_list = ['play an active role', 'participate actively', 'active lifestyle']

model = SentenceTransformer('whaleloops/phrase-bert')
phrase_embs = model.encode(phrase_list)
[p1, p2, p3] = phrase_embs




In [18]:
p1.shape

(768,)

# Dependency parsing

In [15]:
import spacy
import numpy as np

# Load the spacy model
nlp = spacy.load("en_core_web_lg")

def get_dependency_parse(sentence):
    doc = nlp(sentence)
    dependencies = [(token.text, token.head.text, token.dep_) for token in doc]
    return dependencies, [token.text for token in doc]

def create_adjacency_matrix(sentence):
    dependencies, words = get_dependency_parse(sentence)
    word_index = {word: i for i, word in enumerate(words)}
    n = len(words)
    
    adjacency_matrix = np.zeros((n, n), dtype=int)
    
    for word, head, dep in dependencies:
        if word != head:  # Skip self-loops
            adjacency_matrix[word_index[head]][word_index[word]] = 1
    
    return adjacency_matrix, words

# Example usage
sentence = "The quick brown fox jumps over the lazy dog."
adj_matrix, words = create_adjacency_matrix(sentence)

print("Words:", words)
print("Adjacency Matrix:")
print(adj_matrix)


Words: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Adjacency Matrix:
[[0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0]
 [0 0 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 1 0]]


# NER

In [68]:
import json
import pickle as pkl
import numpy as np
from scipy.sparse import coo_matrix

nlp = spacy.load('en_core_web_lg')
ent2id_new = json.load(open('./pretrained_emb/NELL_KG/ent2ids_refined', 'r'))        
ent_mapping = {} 
entity_set = set()
adj_ent_index = []

def get_adj_ent_index(query, ent_mapping, ent2id_new):
    # named entity recognition
    np_list = []
    ent_list = []
    index = []
    
    # extract NP first
    doc = nlp(query)
    for chunk in doc.noun_chunks:
        np_list.append(chunk.text)
    
    # for every word in the NER dictionary
    for key in ent2id_new.keys(): 
        # check if the word is in the text
        if key in np_list: 
            # check if word is already in the mapping dict
            if key not in ent_mapping: 
                # add word to ent_list
                ent_list.append(key)
                # add word to mapping dict as word:idx_in_ent_list
                ent_mapping[key] = len(ent_mapping)
                # update the set
                entity_set.update(ent_list)
            if ent_mapping[key] not in index: 
                index.append(ent_mapping[key])
    # entity adjacency (index) matrix: list[list] of entities present in the sentences
    adj_ent_index.append(index)


sample_query = ['the quick brown fox jumps over the lazy dog', 'who is george washington']
for sent in sample_query:
    get_adj_ent_index(sent, ent_mapping, ent2id_new)
# json.dump([adj_ent_index, ent_mapping],
#           open('./{}_data/index_and_mapping.json'.format(dataset_name), 'w'), ensure_ascii=False)
ent_emb = []
TransE_emb_file = np.loadtxt('./pretrained_emb/NELL_KG/entity2vec.TransE')
TransE_emb = []

for i in range(len(TransE_emb_file)):
    TransE_emb.append(list(TransE_emb_file[i, :]))

rows = []
data = []
columns = []

max_num = len(ent_mapping)
# creating a coo format for matrix of adj_ent_index
for sent_i, indices in enumerate(adj_ent_index):
    for index in indices:
        data.append(1)
        rows.append(sent_i)
        columns.append(index)

# create a matrice of ones and zeros
# ones correspond to (sentence_index, entity_index) i.e. which entities are present in the sentence
adj_ent = coo_matrix((data, (rows, columns)), shape=(len(adj_ent_index), max_num))
# for entity in entity mapping
for key in ent_mapping.keys():
    # add embedding to ent_emb
    ent_emb.append(TransE_emb[ent2id_new[key]])

ent_emb = np.array(ent_emb)
print('ent shape', ent_emb.shape)
ent_emb_normed = ent_emb / np.sqrt(np.square(ent_emb).sum(-1, keepdims=True))
adj_emb = np.matmul(ent_emb_normed, ent_emb_normed.transpose())
print('entity_emb_cos', np.mean(np.mean(adj_emb, -1)))
# pkl.dump(np.array(ent_emb), open('./{}_data/entity_emb.pkl'.format(dataset_name), 'wb'))
# pkl.dump(adj_ent, open('./{}_data/adj_query2entity.pkl'.format(dataset_name), 'wb'))

entity_nodes = list(entity_set)

print('ent', len(entity_nodes))
print('entities', entity_nodes)

# 


ent shape (1, 100)
entity_emb_cos 0.9999999999999999
ent 1
entities ['george washington']


In [43]:
sample_query = 'the quick brown fox jumps over the lazy dog'
get_adj_ent_index(sample_query, ent_mapping, ent2id_new)

ent shape (6, 100)
entity_emb_cos 0.18653110780888263
ent 6
entities ['fox', 'jump', 'row', 'quick', 'dog', 'brown']


# Compile data

In [37]:

import json
dataset_name = 'bloom'
with open(f'preprocess/{dataset_name}_split.json', 'r') as f:
    data = json.load(f)

In [38]:
import pandas as pd

train_data = pd.DataFrame(data['train']).transpose().reset_index(drop=True)
test_data = pd.DataFrame(data['test']).transpose().reset_index(drop=True)

In [39]:
train_data

Unnamed: 0,text,label
0,Analyze the product-market options that are av...,0
1,Define attribution.,4
2,Given three possible approaches to implement t...,3
3,Predict what will happen next in.,3
4,Define “ecosystem services” and describe how t...,2
...,...,...
2012,"Compare and contrast preview questions, clarif...",0
2013,Name the amino acids for the following single ...,4
2014,List and explain THREE factors that are affect...,2
2015,"After designing an experiment, examining the r...",3


In [10]:
import spacy
import numpy as np

# Load the spacy model
nlp = spacy.load("en_core_web_lg")

def get_dependency_parse(sentence):
    doc = nlp(sentence)
    dependencies = [(token.text, token.head.text, token.dep_) for token in doc]
    # return dependencies, [token.text for token in doc]
    return dependencies

def create_adjacency_matrix(sentence):
    # dependencies, words = get_dependency_parse(sentence)
    dependencies = get_dependency_parse(sentence)
    word_index = {word: i for i, word in enumerate(words)}
    n = len(words)
    
    adjacency_matrix = np.zeros((n, n), dtype=int)
    
    for word, head, dep in dependencies:
        if word != head:  # Skip self-loops
            adjacency_matrix[word_index[head]][word_index[word]] = 1
    
    return adjacency_matrix, words

In [11]:
import spacy
vp_patterns = [
    [{"POS": "ADV"}, {"POS": "VERB"}],
    [{"POS": "NOUN"}, {"POS": "VERB"}],
    [{"POS": "PRON"}, {"POS": "VERB"}],
    [{"POS": "ADJ"}, {"POS": "VERB"}],
    [{"POS": "VERB"}, {"POS": "PART"}],
    [{"POS": "VERB"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "NOUN"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADV"}],
    [{"POS": "VERB"}, {"POS": "ADJ"}],
    [{"POS": "VERB"}, {"POS": "PRON"}],
    [{"POS": "VERB"}, {"POS": "ADP"}],
    [{"POS": "VERB"}, {"POS": "ADP"}, {"POS": "ADV"}],
    [{"POS": "VERB"}, {"POS": "CONJ"}, {"POS": "VERB"}],
    [{"POS": "VERB"}, {"POS": "DET"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADP"}, {"POS": "ADJ"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADJ"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "DET"}, {"POS": "ADJ"}],
    [{"POS": "VERB"}, {"POS": "PART"}, {"POS": "ADP"}]
]

# Load the spacy model
nlp = spacy.load("en_core_web_lg")

def extract_phrases(text, return_value = 'phrase'):
    doc = nlp(text)
    verb_phrases = textacy.extract.token_matches(doc, vp_patterns)

    phrase_list = []
    tag_list = []

    # Extract Noun Phrases and corresponding pos tags
    for chunk in doc.noun_chunks:
        phrase_list.append(chunk.text)
        tag_list.append(' '.join([t.pos_ for t in chunk]))
    # Print all Verb Phrase and corresponding pos tags
    for chunk in verb_phrases:
        phrase_list.append(chunk.text)
        tag_list.append(' '.join([t.pos_ for t in chunk]))
    
    if return_value == 'phrase':
        return phrase_list
    elif return_value == 'tag':
        return tag_list

In [12]:
import numpy as np
import torch
from gensim.models import Word2Vec
from collections import defaultdict
from itertools import product
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import pandas as pd
import textacy
import spacy
from tqdm import tqdm
import json
import pickle as pkl
import numpy as np
from scipy.sparse import coo_matrix


  from .autonotebook import tqdm as notebook_tqdm


In [40]:
from torch_geometric import seed_everything

seed = 42
seed_everything(seed)

In [41]:


train_data['cleaned_text'] = train_data['text'].apply(lambda x: clean_str(x))
text = train_data['cleaned_text'].tolist()
test_data['cleaned_text'] = test_data['text'].apply(lambda x: clean_str(x))
test_text = test_data['cleaned_text'].tolist()

# Tokenize sentences into words
train_data['tokenized_text'] = train_data['cleaned_text'].apply(lambda x: x.split())
test_data['tokenized_text'] = test_data['cleaned_text'].apply(lambda x: x.split())
# labels = train_data['coarse_label'].tolist() # only for trec
labels = train_data['label'].tolist()
test_labels = test_data['label'].tolist()


# 1. Collect all unique words in the corpus
tokenized_corpus = train_data['tokenized_text'].tolist()
test_tokenized_corpus = test_data['tokenized_text'].tolist()
# word_set = set(word for sentence in tokenized_corpus for word in sentence)
print('text_tokenized')
word_prob, pair_prob, word_set = process_corpus(text)
test_word_prob, test_pair_prob, test_word_set = process_corpus(test_text)

# 2. Create a mapping from each word to a unique index
word_to_index = {word: idx for idx, word in enumerate(word_set)}
test_word_to_index = {word: idx for idx, word in enumerate(test_word_set)}

# 4. Generate word embeddings using Word2Vec
# ps = PorterStemmer()
model = Word2Vec(sentences=tokenized_corpus+test_tokenized_corpus, vector_size=100, window=5, min_count=1, sg=0)
word_embedding = torch.tensor([model.wv[word] for word in word_set])
test_word_embedding = torch.tensor([model.wv[word] for word in test_word_set])
print('word_embedding generated')

# do the same for phrase level
train_data['phrases'] = train_data['cleaned_text'].apply(lambda x: extract_phrases(x))
test_data['phrases'] = test_data['cleaned_text'].apply(lambda x: extract_phrases(x))
train_data['phrase_tags'] = train_data['cleaned_text'].apply(lambda x: extract_phrases(x, 'tag'))
test_data['phrase_tags'] = test_data['cleaned_text'].apply(lambda x: extract_phrases(x, 'tag'))

phrase_set = set(phrase for phrase_list in train_data['phrases'].tolist() for phrase in phrase_list)
test_phrase_set = set(phrase for phrase_list in test_data['phrases'].tolist() for phrase in phrase_list)
# phrase_tag_set = set(phrase_tag for phrase_tag_list in train_data['phrase_tags'].tolist() for phrase_tag in phrase_tag_list)

phrase_to_index = {phrase: idx for idx, phrase in enumerate(phrase_set)}
test_phrase_to_index = {phrase: idx for idx, phrase in enumerate(test_phrase_set)}
# phrase_tag_to_index = {phrase_tag: idx for idx, phrase_tag in enumerate(phrase_tag_set)}

phrases = train_data['phrases'].tolist()
test_phrases = test_data['phrases'].tolist()
# phrase_tags = train_data['phrase_tags'].tolist()
print('phrases and phrase_tags generated')



text_tokenized
word_embedding generated
phrases and phrase_tags generated


In [42]:
phrase_tags = train_data['phrase_tags'].tolist()
pos_tag_prob, pos_tag_pair_prob, pos_tag_set = process_corpus_tags(phrase_tags)
phrase_tag_to_index = {phrase_tag: idx for idx, phrase_tag in enumerate(pos_tag_set)}

test_phrase_tags = test_data['phrase_tags'].tolist()
test_pos_tag_prob, test_pos_tag_pair_prob, test_pos_tag_set = process_corpus_tags(test_phrase_tags)
test_phrase_tag_to_index = {phrase_tag: idx for idx, phrase_tag in enumerate(test_pos_tag_set)}

# Generate phrase embeddings
from sentence_transformers import SentenceTransformer

phrasebert_model = SentenceTransformer('whaleloops/phrase-bert')
phrase_embedding = torch.tensor([phrasebert_model.encode(phrase) for phrase in phrase_set])
test_phrase_embedding = torch.tensor([phrasebert_model.encode(phrase) for phrase in test_phrase_set])



In [43]:
# cosine sim matrix
phrase_emb_normed = phrase_embedding / np.sqrt(np.square(phrase_embedding).sum(-1, keepdims=True))
cos_sim_matrix = np.matmul(phrase_emb_normed, phrase_emb_normed.transpose(1, 0))

test_phrase_emb_normed = test_phrase_embedding / np.sqrt(np.square(test_phrase_embedding).sum(-1, keepdims=True))
test_cos_sim_matrix = np.matmul(test_phrase_emb_normed, test_phrase_emb_normed.transpose(1, 0))

In [44]:
# Named entity recognition
ent2id_new = json.load(open('./pretrained_emb/NELL_KG/ent2ids_refined', 'r'))
TransE_emb_file = np.loadtxt('./pretrained_emb/NELL_KG/entity2vec.TransE')    
TransE_emb = []
for i in range(len(TransE_emb_file)):
    TransE_emb.append(list(TransE_emb_file[i, :]))
    
def extract_entities(sentence, ent2id_new):
    np_list = []
    ent_list = []
    # extract NP first
    doc = nlp(sentence)
    for chunk in doc.noun_chunks:
        np_list.append(chunk.text)
    
    for ent in ent2id_new.keys():
        if ent in np_list:
            ent_list.append(ent)
            
    return ent_list

In [45]:
# extract entities
train_data['named_entities'] = train_data['cleaned_text'].apply(lambda x: extract_entities(x, ent2id_new))
named_entities = train_data['named_entities'].tolist()
named_entities_set = set(entity for entity_list in named_entities for entity in entity_list)
named_entities_to_index = {entity: idx for idx, entity in enumerate(named_entities_set)}

test_data['named_entities'] = test_data['cleaned_text'].apply(lambda x: extract_entities(x, ent2id_new))
test_named_entities = test_data['named_entities'].tolist()
test_named_entities_set = set(entity for entity_list in test_named_entities for entity in entity_list)
test_named_entities_to_index = {entity: idx for idx, entity in enumerate(test_named_entities_set)}

In [46]:
ent_emb = []
# for entity in entity mapping
for key in named_entities_to_index.keys():
    # add transE embedding to ent_emb
    ent_emb.append(TransE_emb[ent2id_new[key]])
    
ent_emb = torch.tensor(ent_emb, dtype=torch.float32)

test_ent_emb = []
# for entity in entity mapping
for key in test_named_entities_to_index.keys():
    # add transE embedding to ent_emb
    test_ent_emb.append(TransE_emb[ent2id_new[key]])
    
test_ent_emb = torch.tensor(test_ent_emb, dtype=torch.float32)

In [47]:
# one hot encoding for pos tags
pos_emb = torch.eye(len(phrase_tag_to_index))
test_pos_emb = torch.eye(len(test_phrase_tag_to_index))

In [22]:
class GroupedData(Data):
    def __inc__(self, key, value, *args, **kwargs):
        if key == 'ppmi_edge_index':
            return self.ppmi_x.size(0)
        if key == 'dp_edge_index':
            return self.dp_x.size(0)
        if key == 'sem_edge_index':
            return self.sem_x.size(0)
        if key == 'ner_edge_index':
            return self.ner_x.size(0)
        if key == 'pos_edge_index':
            return self.pos_x.size(0)
        return super().__inc__(key, value, *args, **kwargs)

In [48]:

# 5. Create a list of torch_geometric.data.Data graphs for each sentence in the corpus
graphs = []

for idx, sentence in tqdm(enumerate(tokenized_corpus)):
    
    
    # PPMI
    ppmi_edges = []
    ppmi_edge_weights = []
    
    n = len(sentence)

    for i, word1 in enumerate(sentence):
        for j in range(i + 1, n):
            word2 = sentence[j]
            if word1 != word2:
                pmi = calculate_pmi(word_prob, pair_prob, word1, word2)
                if pmi > 0:
                    ppmi_edges.append((word_to_index[word1], word_to_index[word2]))
                    ppmi_edge_weights.append(pmi)
                    ppmi_edges.append((word_to_index[word2], word_to_index[word1]))
                    ppmi_edge_weights.append(pmi) # ppmi is symmetric
    
    if len(ppmi_edges) == 0:
        ppmi_edge_index = torch.empty((2, 0), dtype=torch.long)
        ppmi_edge_attr = torch.empty((0,), dtype=torch.float)
    else:
        ppmi_edge_index = torch.tensor(ppmi_edges, dtype=torch.long).t().contiguous()
        ppmi_edge_attr = torch.tensor(ppmi_edge_weights, dtype=torch.float)

    
    # Dependency Parse
    dp_edges = []
    
    doc = nlp(''.join(sentence))
    
    dependencies = [(token.text, token.head.text, token.dep_) for token in doc]
    # dependencies = get_dependency_parse(text[idx])
    for word, head, dep in dependencies:
        if word != head and word.strip() and head.strip():  # Skip self-loops and empty tokens
            dp_edges.append((word_to_index[head], word_to_index[word]))
    
    if len(dp_edges) == 0:
        dp_edge_index = torch.empty((2, 0), dtype=torch.long)
    else:
        dp_edge_index = torch.tensor(dp_edges, dtype=torch.long).t().contiguous()
    
    
    # Phrase Embeddings
    sem_edges = []
    sem_edge_weights = []
    
    n_phrases = len(phrases[idx])

    for i, phrase1 in enumerate(phrases[idx]):
        for j in range(n_phrases):
            phrase2 = phrases[idx][j]
            if phrase1 != phrase2:
                sem_edges.append((phrase_to_index[phrase1], phrase_to_index[phrase2]))
                sem_edge_weights.append(cos_sim_matrix[phrase_to_index[phrase1], phrase_to_index[phrase2]])
                sem_edges.append((phrase_to_index[phrase2], phrase_to_index[phrase1]))
                sem_edge_weights.append(cos_sim_matrix[phrase_to_index[phrase2], phrase_to_index[phrase1]])
    
    if len(sem_edges) == 0:
        sem_edge_index = torch.empty((2, 0), dtype=torch.long)
        sem_edge_attr = torch.empty((0,), dtype=torch.float)
    else:
        sem_edge_index = torch.tensor(sem_edges, dtype=torch.long).t().contiguous()
        sem_edge_attr = torch.tensor(sem_edge_weights, dtype=torch.float)
    
    
    # Phrase tags
    
    pos_edges = []
    pos_edge_weights = []
    
    n_phrase_tags = len(phrase_tags[idx])
    
    for i, tag1 in enumerate(phrase_tags[idx]):
        for j in range(i + 1, n_phrase_tags):
            tag2 = phrase_tags[idx][j]
            if tag1 != tag2:
                pos_pmi = calculate_pmi(pos_tag_prob, pos_tag_pair_prob, tag1, tag2)
                if pos_pmi > 0:
                    pos_edges.append((phrase_tag_to_index[tag1], phrase_tag_to_index[tag2]))
                    pos_edge_weights.append(pos_pmi)
                    pos_edges.append((phrase_tag_to_index[tag2], phrase_tag_to_index[tag1]))
                    pos_edge_weights.append(pos_pmi) # ppmi is symmetric
    
    if len(pos_edges) == 0:
        pos_edge_index = torch.empty((2, 0), dtype=torch.long)
        pos_edge_attr = torch.empty((0,), dtype=torch.float)
    else:
        pos_edge_index = torch.tensor(pos_edges, dtype=torch.long).t().contiguous()
        pos_edge_attr = torch.tensor(pos_edge_weights, dtype=torch.float)
    
    
    # Named Entity Recognition
    
    ner_edges = []
    
    if named_entities[idx]: # if there are entities
        for i, entity in enumerate(named_entities[idx]):
            ner_edges.append((named_entities_to_index[entity], named_entities_to_index[entity]))
            
    if len(ner_edges) == 0:
        ner_edge_index = torch.empty((2, 0), dtype=torch.long)
    else:
        ner_edge_index = torch.tensor(ner_edges, dtype=torch.long).t().contiguous()
        
    # print(ppmi_edge_index.shape, ppmi_edge_attr.shape, dp_edge_index.shape, sem_edge_index.shape, sem_edge_attr.shape)
    # x = torch.tensor([model.wv[word] for word in sentence], dtype=torch.float)
    y = torch.tensor([labels[idx]])
    
    grouped_data = GroupedData(ppmi_x=word_embedding, 
                               ppmi_edge_index=ppmi_edge_index, 
                               ppmi_edge_attr=ppmi_edge_attr, 
                               dp_x=word_embedding, 
                               dp_edge_index=dp_edge_index,
                               sem_x=phrase_embedding,
                               sem_edge_index=sem_edge_index,
                               sem_edge_attr=sem_edge_attr,
                               pos_x=pos_emb,
                               pos_edge_index=pos_edge_index,
                               pos_edge_attr=pos_edge_attr,
                               ner_x=ent_emb,
                               ner_edge_index=ner_edge_index,
                               y=y)
    # print(grouped_data)
    graphs.append(grouped_data)


2017it [00:08, 247.06it/s]


In [49]:
test_graphs = []

for idx, sentence in tqdm(enumerate(test_tokenized_corpus)):
    
    # PPMI
    ppmi_edges = []
    ppmi_edge_weights = []
    
    n = len(sentence)

    for i, word1 in enumerate(sentence):
        for j in range(i + 1, n):
            word2 = sentence[j]
            if word1 != word2:
                pmi = calculate_pmi(test_word_prob, test_pair_prob, word1, word2)
                if pmi > 0:
                    ppmi_edges.append((test_word_to_index[word1], test_word_to_index[word2]))
                    ppmi_edge_weights.append(pmi)
                    ppmi_edges.append((test_word_to_index[word2], test_word_to_index[word1]))
                    ppmi_edge_weights.append(pmi) # ppmi is symmetric
    
    if len(ppmi_edges) == 0:
        ppmi_edge_index = torch.empty((2, 0), dtype=torch.long)
        ppmi_edge_attr = torch.empty((0,), dtype=torch.float)
    else:
        ppmi_edge_index = torch.tensor(ppmi_edges, dtype=torch.long).t().contiguous()
        ppmi_edge_attr = torch.tensor(ppmi_edge_weights, dtype=torch.float)

    
    # Dependency Parse
    dp_edges = []
    
    doc = nlp(' '.join(sentence))
    dependencies = [(token.text, token.head.text, token.dep_) for token in doc]
    # dependencies = get_dependency_parse(text[idx])
    for word, head, dep in dependencies:
        if word != head and word.strip() and head.strip():  # Skip self-loops and empty tokens
            dp_edges.append((test_word_to_index[head], test_word_to_index[word]))
    
    if len(dp_edges) == 0:
        dp_edge_index = torch.empty((2, 0), dtype=torch.long)
    else:
        dp_edge_index = torch.tensor(dp_edges, dtype=torch.long).t().contiguous()
    
    
    # Phrase Embeddings
    sem_edges = []
    sem_edge_weights = []
    
    n_phrases = len(test_phrases[idx])

    for i, phrase1 in enumerate(test_phrases[idx]):
        for j in range(n_phrases):
            phrase2 = test_phrases[idx][j]
            if phrase1 != phrase2:
                sem_edges.append((test_phrase_to_index[phrase1], test_phrase_to_index[phrase2]))
                sem_edge_weights.append(test_cos_sim_matrix[test_phrase_to_index[phrase1], test_phrase_to_index[phrase2]])
                sem_edges.append((test_phrase_to_index[phrase2], test_phrase_to_index[phrase1]))
                sem_edge_weights.append(test_cos_sim_matrix[test_phrase_to_index[phrase2], test_phrase_to_index[phrase1]])
    
    if len(sem_edges) == 0:
        sem_edge_index = torch.empty((2, 0), dtype=torch.long)
        sem_edge_attr = torch.empty((0,), dtype=torch.float)
    else:
        sem_edge_index = torch.tensor(sem_edges, dtype=torch.long).t().contiguous()
        sem_edge_attr = torch.tensor(sem_edge_weights, dtype=torch.float)
    
    
    # Phrase tags
    
    pos_edges = []
    pos_edge_weights = []
    
    n_phrase_tags = len(test_phrase_tags[idx])
    
    for i, tag1 in enumerate(test_phrase_tags[idx]):
        for j in range(i + 1, n_phrase_tags):
            tag2 = test_phrase_tags[idx][j]
            if tag1 != tag2:
                pos_pmi = calculate_pmi(test_pos_tag_prob, test_pos_tag_pair_prob, tag1, tag2)
                if pos_pmi > 0:
                    pos_edges.append((test_phrase_tag_to_index[tag1], test_phrase_tag_to_index[tag2]))
                    pos_edge_weights.append(pos_pmi)
                    pos_edges.append((test_phrase_tag_to_index[tag2], test_phrase_tag_to_index[tag1]))
                    pos_edge_weights.append(pos_pmi) # ppmi is symmetric
    
    if len(pos_edges) == 0:
        pos_edge_index = torch.empty((2, 0), dtype=torch.long)
        pos_edge_attr = torch.empty((0,), dtype=torch.float)
    else:
        pos_edge_index = torch.tensor(pos_edges, dtype=torch.long).t().contiguous()
        pos_edge_attr = torch.tensor(pos_edge_weights, dtype=torch.float)
    
    
    # Named Entity Recognition
    
    ner_edges = []
    
    if test_named_entities[idx]: # if there are entities
        for i, entity in enumerate(test_named_entities[idx]):
            ner_edges.append((test_named_entities_to_index[entity], test_named_entities_to_index[entity]))
            
    if len(ner_edges) == 0: # empty graph if there are no entities
        ner_edge_index = torch.empty((2, 0), dtype=torch.long)
    else:
        ner_edge_index = torch.tensor(ner_edges, dtype=torch.long).t().contiguous()
        
    # print(ppmi_edge_index.shape, ppmi_edge_attr.shape, dp_edge_index.shape, sem_edge_index.shape, sem_edge_attr.shape)
    # x = torch.tensor([model.wv[word] for word in sentence], dtype=torch.float)
    y = torch.tensor([test_labels[idx]])
    
    grouped_data = GroupedData(ppmi_x=word_embedding, 
                               ppmi_edge_index=ppmi_edge_index, 
                               ppmi_edge_attr=ppmi_edge_attr, 
                               dp_x=word_embedding, 
                               dp_edge_index=dp_edge_index,
                               sem_x=phrase_embedding,
                               sem_edge_index=sem_edge_index,
                               sem_edge_attr=sem_edge_attr,
                               pos_x=pos_emb,
                               pos_edge_index=pos_edge_index,
                               pos_edge_attr=pos_edge_attr,
                               ner_x=ent_emb,
                               ner_edge_index=ner_edge_index,
                               y=y)
    # print(grouped_data)
    test_graphs.append(grouped_data)

505it [00:02, 194.51it/s]


verb 0
adj 1
noun 2

'adj_noun' [0000010000] -> [0110000...]

# Model

In [108]:
# import torch
# import torch.nn.functional as F
# from torch_geometric.nn import GCNConv, SAGPooling, global_max_pool
# from torch_geometric.data import Data, DataLoader

# # Define a single GCN 
# class GCN(torch.nn.Module):
#     def __init__(self, in_channels, hidden_channels, out_channels, out_pooling):
#         super(GCN, self).__init__()
#         self.conv1 = GCNConv(in_channels, hidden_channels)
#         self.conv2 = GCNConv(hidden_channels, hidden_channels)
#         self.conv3 = GCNConv(hidden_channels, out_channels)
#         self.sag_pooling = SAGPooling(out_channels, ratio=out_pooling, GNN=GCNConv)

#     def forward(self, x, edge_index, edge_weight=None, batch=None):
#         x = self.conv1(x, edge_index, edge_weight)
#         x = F.relu(x)
#         x = self.conv2(x, edge_index, edge_weight)
#         x = F.relu(x)
#         x = self.conv3(x, edge_index, edge_weight)
#         # x = self.sag_pooling(x, edge_index, edge_weight, batch)
#         x = self.maxpooling(x, batch)
#         # print(x[0].shape)
#         return x

# # Final model combining multiple GCNs and a classification layer
# class MultiGCNClassifier(torch.nn.Module):
#     def __init__(self, input_emb_sizes, hidden_channels, out_channels, out_pooling, num_gcns, num_classes):
#         super(MultiGCNClassifier, self).__init__()
#         self.out_pooling = out_pooling
#         self.num_gcns = num_gcns
#         self.out_channels = out_channels
        
#         self.ppmi_gcn = GCN(input_emb_sizes['ppmi'], hidden_channels, out_channels, out_pooling)
#         self.dp_gcn = GCN(input_emb_sizes['dp'], hidden_channels, out_channels, out_pooling)
#         self.sem_gcn = GCN(input_emb_sizes['sem'], hidden_channels, out_channels, out_pooling)
#         self.pos_gcn = GCN(input_emb_sizes['pos'], hidden_channels, out_channels, out_pooling)
#         self.ner_gcn = GCN(input_emb_sizes['ner'], hidden_channels, out_channels, out_pooling)
#         self.linear = torch.nn.Linear(out_pooling*out_channels*num_gcns, num_classes)

#     def forward(self, batch):
#         embeddings = []
#         embeddings.append(self.ppmi_gcn(batch.ppmi_x, batch.ppmi_edge_index, batch.ppmi_edge_attr, batch=batch.ppmi_x_batch))
#         embeddings.append(self.dp_gcn(batch.dp_x, batch.dp_edge_index, batch=batch.dp_x_batch))
#         embeddings.append(self.sem_gcn(batch.sem_x, batch.sem_edge_index, batch.sem_edge_attr, batch=batch.sem_x_batch))
#         embeddings.append(self.pos_gcn(batch.pos_x, batch.pos_edge_index, batch.pos_edge_attr, batch=batch.pos_x_batch))
#         embeddings.append(self.ner_gcn(batch.ner_x, batch.ner_edge_index, batch=batch.ner_x_batch))
        
#         # Concatenate the embeddings from each GCN
#         concatenated = torch.cat(embeddings, dim=1)
#         # reshape to (batch_size, out_pooling, num_gcns*out_channels)
#         concatenated = concatenated.reshape(batch.y.shape[0], self.out_pooling*self.num_gcns*self.out_channels)
#         # apply linear layer
#         out = self.linear(concatenated)
#         return F.log_softmax(out, dim=1)


In [None]:
# # combined graph model
# class CombinedGraphGCN(torch.nn.Module):
#     def __init__(self, input_channels, hidden_channels, out_channels, out_pooling, num_gcns, num_classes):
#         super(CombinedGraphGCN, self).__init__()
#         self.conv1 = GCNConv(input_channels, hidden_channels)
#         self.conv2 = GCNConv(hidden_channels, hidden_channels)
#         self.conv3 = GCNConv(hidden_channels, out_channels)
#         self.pooling = SAGPooling(out_channels, ratio=out_pooling, GNN=GCNConv)
#         self.linear = torch.nn.Linear(out_pooling*out_channels, num_classes)

#     def forward(self, batch):
#         x = self.conv1(batch.x, batch.edge_index, batch.edge_attr)
#         x = F.relu(x)
#         x = self.conv2(x, batch.edge_index, batch.edge_attr)
#         x = F.relu(x)
#         x = self.conv3(x, batch.edge_index, batch.edge_attr)
#         x = self.pooling(x, batch.edge_index, batch.edge_attr)
#         return out

# Set-up

#### num_classes

TREC coarse : 6

ARC : 3

ARG: 4

NU : 3

LREC : 3

BLOOM : 6


try different pooling

try another small dataset 
- tried NU

try MLP instead 

try batch size 1



In [50]:
from sklearn.model_selection import train_test_split
train_set, validation_set = train_test_split(graphs, test_size=0.2, random_state=42)

In [54]:
# Initialize the model, optimizer, and loss function
num_gcns = 5
hidden_channels = 64
out_channels = 32
out_pooling = 32
num_classes = 6 # change according to dataset
batch_size = 2
dropout = 0.1

train_loader = DataLoader(train_set, batch_size=batch_size, follow_batch=['ppmi_x', 'dp_x', 'sem_x', 'pos_x', 'ner_x'])
val_loader = DataLoader(validation_set, batch_size=len(validation_set), follow_batch=['ppmi_x', 'dp_x', 'sem_x', 'pos_x', 'ner_x'])
test_loader = DataLoader(test_graphs, batch_size=len(test_graphs), follow_batch=['ppmi_x', 'dp_x', 'sem_x', 'pos_x', 'ner_x'])

# train_loader = DataLoader(train_set, batch_size=batch_size, )
# val_loader = DataLoader(validation_set, batch_size=len(validation_set), )
# test_loader = DataLoader(test_graphs, batch_size=len(test_graphs), )


# obtain embedding sizes from the first batch
input_emb_sizes = {}
batch = next(iter(train_loader))
input_emb_sizes['ppmi'] = batch.ppmi_x.shape[1]
input_emb_sizes['dp'] = batch.dp_x.shape[1]
input_emb_sizes['sem'] = batch.sem_x.shape[1]
input_emb_sizes['pos'] = batch.pos_x.shape[1]
input_emb_sizes['ner'] = batch.ner_x.shape[1]

# model = MultiGCNClassifier(input_emb_sizes=input_emb_sizes, 
#                            hidden_channels=hidden_channels, 
#                            out_channels=out_channels, 
#                            num_gcns=num_gcns,
#                            out_pooling=out_pooling, 
#                            num_classes=num_classes)

# optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [55]:
batch = next(iter(train_loader))
print(batch)

GroupedDataBatch(y=[2], ppmi_x=[9018, 100], ppmi_x_batch=[9018], ppmi_x_ptr=[3], ppmi_edge_index=[2, 206], ppmi_edge_attr=[206], dp_x=[9018, 100], dp_x_batch=[9018], dp_x_ptr=[3], dp_edge_index=[2, 0], sem_x=[17450, 768], sem_x_batch=[17450], sem_x_ptr=[3], sem_edge_index=[2, 80], sem_edge_attr=[80], pos_x=[712, 356], pos_x_batch=[712], pos_x_ptr=[3], pos_edge_index=[2, 0], pos_edge_attr=[0], ner_x=[1140, 100], ner_x_batch=[1140], ner_x_ptr=[3], ner_edge_index=[2, 2])


In [163]:
# import torch
# import torch.nn.functional as F
# from torch_geometric.nn import GCNConv, SAGPooling, global_max_pool
# from torch_geometric.data import Data, DataLoader

# # Define a single GCN 
# class GCN(torch.nn.Module):
#     def __init__(self, in_channels, hidden_channels, out_channels, out_pooling):
#         super(GCN, self).__init__()
#         self.conv1 = GCNConv(in_channels, hidden_channels)
#         self.conv2 = GCNConv(hidden_channels, hidden_channels)
#         self.conv3 = GCNConv(hidden_channels, out_channels)
#         self.sag_pooling = SAGPooling(out_channels, ratio=out_pooling)

#     def forward(self, x, edge_index, edge_weight=None, batch=None):
#         x = self.conv1(x, edge_index, edge_weight)
#         x = F.relu(x)
#         x = self.conv2(x, edge_index, edge_weight)
#         x = F.relu(x)
#         x = self.conv3(x, edge_index, edge_weight)
#         x = F.relu(x)
#         x, edge_index, edge_weight, batch, _, _ = self.sag_pooling(x, edge_index, edge_weight, batch)
#         x = global_max_pool(x, batch)
#         return x

# # Final model combining multiple GCNs and a classification layer
# class MultiGCNClassifier(torch.nn.Module):
#     def __init__(self, input_emb_sizes, hidden_channels, out_channels, out_pooling, num_gcns, num_classes):
#         super(MultiGCNClassifier, self).__init__()
#         self.out_pooling = out_pooling
#         self.num_gcns = num_gcns
#         self.out_channels = out_channels
        
#         self.ppmi_gcn = GCN(input_emb_sizes['ppmi'], hidden_channels, out_channels, out_pooling)
#         self.dp_gcn = GCN(input_emb_sizes['dp'], hidden_channels, out_channels, out_pooling)
#         self.sem_gcn = GCN(input_emb_sizes['sem'], hidden_channels, out_channels, out_pooling)
#         self.pos_gcn = GCN(input_emb_sizes['pos'], hidden_channels, out_channels, out_pooling)
#         self.ner_gcn = GCN(input_emb_sizes['ner'], hidden_channels, out_channels, out_pooling)
#         self.linear = torch.nn.Linear(num_gcns*out_channels, num_classes)

#     def forward(self, batch):
#         embeddings = []
#         embeddings.append(self.ppmi_gcn(batch.ppmi_x, batch.ppmi_edge_index, batch.ppmi_edge_attr, batch=batch.ppmi_x_batch))
#         embeddings.append(self.dp_gcn(batch.dp_x, batch.dp_edge_index, batch=batch.dp_x_batch))
#         embeddings.append(self.sem_gcn(batch.sem_x, batch.sem_edge_index, batch.sem_edge_attr, batch=batch.sem_x_batch))
#         embeddings.append(self.pos_gcn(batch.pos_x, batch.pos_edge_index, batch.pos_edge_attr, batch=batch.pos_x_batch))
#         embeddings.append(self.ner_gcn(batch.ner_x, batch.ner_edge_index, batch=batch.ner_x_batch))
        
#         # Concatenate the embeddings from each GCN
#         concatenated = torch.cat(embeddings, dim=1)
#         concatenated = concatenated.reshape(batch.y.shape[0], self.num_gcns*self.out_channels)
        
#         # Apply linear layer
#         out = self.linear(concatenated)
#         return F.log_softmax(out, dim=1)

# # Example training loop
# def train(model, data_loader, optimizer, criterion, device):
#     model.train()
#     for batch in data_loader:
#         batch = batch.to(device)
#         optimizer.zero_grad()
#         out = model(batch)
#         loss = criterion(out, batch.y)
#         loss.backward()
#         optimizer.step()
#         print('Train Loss:', loss.item())

# # Example data loader and training call
# # Ensure your data is prepared and loaded correctly
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = MultiGCNClassifier(input_emb_sizes, hidden_channels, out_channels, out_pooling, num_gcns, num_classes).to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# criterion = torch.nn.CrossEntropyLoss()
# num_epochs = 10
# train_loss = []

# # # Assuming `train_loader` is your DataLoader
# # for epoch in range(num_epochs):
# #     train(model, train_loader, optimizer, criterion, device)


In [56]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch_geometric.nn import GCNConv, SAGPooling, global_max_pool, global_mean_pool
from torch_geometric.data import Data, DataLoader
from NodeNorm.layers import NodeNorm
import numpy as np

# Define a single GCN 
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, out_pooling, dropout):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, out_channels)
        self.nodenorm = NodeNorm(nn_type='n')
        self.sag_pooling = SAGPooling(out_channels, ratio=out_pooling) # separate attention pooling for words and phrases
        self.dropout = dropout

    def forward(self, x, edge_index, edge_weight=None, batch=None):
        x = self.conv1(x, edge_index, edge_weight)
        # x = self.nodenorm(x)
        x = F.relu(x)
        # x = F.dropout(x, p=self.dropout, training=self.training)
        # x = self.conv2(x, edge_index, edge_weight)
        # x = F.relu(x)
        # x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv3(x, edge_index, edge_weight)
        # x = self.nodenorm(x)
        x = F.relu(x)
        # x = F.dropout(x, p=self.dropout, training=self.training)
        x, edge_index, edge_weight, batch, _, _ = self.sag_pooling(x, edge_index, edge_weight, batch)
        x = global_max_pool(x, batch)
        return x

# Final model combining multiple GCNs and a classification layer
class MultiGCNClassifier(torch.nn.Module):
    def __init__(self, input_emb_sizes, hidden_channels, out_channels, out_pooling, num_gcns, num_classes, dropout):
        super(MultiGCNClassifier, self).__init__()
        self.out_pooling = out_pooling
        self.num_gcns = num_gcns
        self.out_channels = out_channels
        
        self.ppmi_gcn = GCN(input_emb_sizes['ppmi'], hidden_channels, out_channels, out_pooling, dropout)
        self.dp_gcn = GCN(input_emb_sizes['dp'], hidden_channels, out_channels, out_pooling, dropout)
        self.sem_gcn = GCN(input_emb_sizes['sem'], hidden_channels, out_channels, out_pooling, dropout)
        self.pos_gcn = GCN(input_emb_sizes['pos'], hidden_channels, out_channels, out_pooling, dropout)
        self.ner_gcn = GCN(input_emb_sizes['ner'], hidden_channels, out_channels, out_pooling, dropout)

        self.conv2d = nn.Conv1d(num_gcns, 1, kernel_size=3, stride=1)
        self.linear = torch.nn.Linear((out_channels*num_gcns), num_classes)

    def forward(self, batch):
        embeddings = []
        embeddings.append(self.ppmi_gcn(batch.ppmi_x, batch.ppmi_edge_index, batch.ppmi_edge_attr, batch=batch.ppmi_x_batch))
        embeddings.append(self.dp_gcn(batch.dp_x, batch.dp_edge_index, batch=batch.dp_x_batch))
        embeddings.append(self.sem_gcn(batch.sem_x, batch.sem_edge_index, batch.sem_edge_attr, batch=batch.sem_x_batch))
        embeddings.append(self.pos_gcn(batch.pos_x, batch.pos_edge_index, batch.pos_edge_attr, batch=batch.pos_x_batch))
        embeddings.append(self.ner_gcn(batch.ner_x, batch.ner_edge_index, batch=batch.ner_x_batch))
        
        # Concatenate the embeddings from each GCN
        concatenated = torch.cat(embeddings, dim=1)
        concatenated = concatenated.reshape(batch.y.shape[0], self.num_gcns*self.out_channels)
        # concatenated = concatenated.reshape(batch.y.shape[0], self.num_gcns, self.out_channels)

        # # Apply 2D Convolutional layer
        # conv_output = self.conv2d(concatenated)
        # conv_output = conv_output.reshape(batch.y.shape[0], -1)
        # # print(conv_output.shape)
        
        # Apply linear layer
        out = self.linear(concatenated) # convolution instead
        return F.log_softmax(out, dim=1)

# Training loop with validation, learning rate warm-up, and early stopping
def train(model, train_loader, val_loader, optimizer, criterion, scheduler, device, num_epochs, patience):
    best_val_loss = float('inf')
    epochs_no_improve = 0
    
    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            out = model(batch)
            loss = criterion(out, batch.y)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        
        # # Learning rate warm-up
        # scheduler.step()
        
        # Validation step
        model.eval()
        val_losses = []
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(device)
                out = model(batch)
                loss = criterion(out, batch.y)
                val_losses.append(loss.item())
                
                pred = out.argmax(dim=1)
                correct += pred.eq(batch.y).sum().item()
                total += batch.y.size(0)
        
        val_loss = np.mean(val_losses)
        val_accuracy = correct / total
        
        scheduler.step(val_loss)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {np.mean(train_losses):.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')
        
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print('Early stopping!')
                model.load_state_dict(torch.load('best_model.pth'))
                break

# Example data loader and training call
# Ensure your data is prepared and loaded correctly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = MultiGCNClassifier(input_emb_sizes, hidden_channels, out_channels, out_pooling, num_gcns, num_classes, dropout).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

num_epochs = 100
patience = 10

train(model, train_loader, val_loader, optimizer, criterion, scheduler, device, num_epochs, patience)


cpu
Epoch 1/100, Train Loss: 1.6655, Validation Loss: 1.6123, Validation Accuracy: 0.3911
Epoch 2/100, Train Loss: 1.5352, Validation Loss: 1.4895, Validation Accuracy: 0.4406
Epoch 3/100, Train Loss: 1.4110, Validation Loss: 1.4440, Validation Accuracy: 0.4703
Epoch 4/100, Train Loss: 1.2985, Validation Loss: 1.3867, Validation Accuracy: 0.4802
Epoch 5/100, Train Loss: 1.1886, Validation Loss: 1.3533, Validation Accuracy: 0.4926
Epoch 6/100, Train Loss: 1.1043, Validation Loss: 1.3379, Validation Accuracy: 0.4926
Epoch 7/100, Train Loss: 1.0385, Validation Loss: 1.3180, Validation Accuracy: 0.5124
Epoch 8/100, Train Loss: 0.9740, Validation Loss: 1.3211, Validation Accuracy: 0.5000
Epoch 9/100, Train Loss: 0.9144, Validation Loss: 1.3374, Validation Accuracy: 0.5124
Epoch 10/100, Train Loss: 0.8591, Validation Loss: 1.3684, Validation Accuracy: 0.5124
Epoch 11/100, Train Loss: 0.8128, Validation Loss: 1.3882, Validation Accuracy: 0.5025
Epoch 12/100, Train Loss: 0.7551, Validation Los

In [69]:
32*32*5

5120

In [57]:
# testing
from sklearn.metrics import f1_score, precision_score, recall_score

def test(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            batch = batch.to(device)
            out = model(batch)
            pred = out.argmax(dim=1)
            all_preds.append(pred.cpu().numpy())
            all_labels.append(batch.y.cpu().numpy())

    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    # Calculate metrics
    f1_micro = f1_score(all_labels, all_preds, average='micro')
    f1_macro = f1_score(all_labels, all_preds, average='macro')
    precision_micro = precision_score(all_labels, all_preds, average='micro')
    precision_macro = precision_score(all_labels, all_preds, average='macro')
    recall_micro = recall_score(all_labels, all_preds, average='micro')
    recall_macro = recall_score(all_labels, all_preds, average='macro')

    print(f'Test F1 Score (Micro): {f1_micro:.4f}')
    print(f'Test F1 Score (Macro): {f1_macro:.4f}')
    print(f'Test Precision (Micro): {precision_micro:.4f}')
    print(f'Test Precision (Macro): {precision_macro:.4f}')
    print(f'Test Recall (Micro): {recall_micro:.4f}')
    print(f'Test Recall (Macro): {recall_macro:.4f}')

    return {
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "precision_micro": precision_micro,
        "precision_macro": precision_macro,
        "recall_micro": recall_micro,
        "recall_macro": recall_macro
    }

# Example test call
# Assuming `test_loader` is your DataLoader for the test set
test_metrics = test(model, test_loader, device)


Test F1 Score (Micro): 0.2634
Test F1 Score (Macro): 0.1551
Test Precision (Micro): 0.2634
Test Precision (Macro): 0.1693
Test Recall (Micro): 0.2634
Test Recall (Macro): 0.1702


# Training

In [160]:
from torcheval.metrics.functional import multiclass_f1_score, multiclass_recall, multiclass_precision, multiclass_accuracy

In [161]:
# Training loop
for epoch in range(10):
    model.train()
    for train_batch in tqdm(train_loader):
        optimizer.zero_grad()
        
        # Process each batch through the respective GCN
        out = model(train_batch)
        
        # Use the label to calculate loss calculation
        train_loss = criterion(out, train_batch.y)
        train_loss.backward()
        optimizer.step()
        
    # evaluate on validation set
    model.eval()
    correct = 0
    total = 0
    for val_batch in tqdm(val_loader):
        out = model(val_batch)
        pred = out.argmax(dim=1)
        correct += (pred == val_batch.y).sum().item()
        total += val_batch.y.size(0)
    
    val_loss = F.nll_loss(out, val_batch.y)
    val_acc = correct / total
    val_micro_f1 = multiclass_f1_score(out, val_batch.y, average='micro', num_classes=num_classes)
    val_macro_f1 = multiclass_f1_score(out, val_batch.y, average='macro', num_classes=num_classes)
    val_macro_precision = multiclass_precision(out, val_batch.y, average='macro', num_classes=num_classes)
    val_micro_precision = multiclass_precision(out, val_batch.y, average='micro', num_classes=num_classes)
    val_micro_recall = multiclass_recall(out, val_batch.y, average='micro', num_classes=num_classes)
    val_macro_recall = multiclass_recall(out, val_batch.y, average='macro', num_classes=num_classes)
    
    
    
    print(f'Epoch {epoch+1}, Train Loss: {train_loss.item()}')
    print(f'val_acc: {val_acc:.4f}, val_loss: {val_loss:.4f}, val_micro_f1: {val_micro_f1:.4f}, val_macro_f1: {val_macro_f1:.4f}, val_micro_precision: {val_micro_precision:.4f}, val_macro_precision: {val_macro_precision:.4f}, val_micro_recall: {val_micro_recall:.4f}, val_macro_recall: {val_macro_recall:.4f}')



100%|██████████| 84/84 [00:04<00:00, 19.20it/s]
100%|██████████| 1/1 [00:00<00:00,  1.14it/s]


Epoch 1, Train Loss: 0.8622546195983887
val_acc: 0.5000, val_loss: 0.9931, val_micro_f1: 0.5000, val_macro_f1: 0.2222, val_micro_precision: 0.5000, val_macro_precision: 0.1667, val_micro_recall: 0.5000, val_macro_recall: 0.3333


100%|██████████| 84/84 [00:05<00:00, 15.86it/s]
100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


Epoch 2, Train Loss: 1.046687126159668
val_acc: 0.6190, val_loss: 0.8585, val_micro_f1: 0.6190, val_macro_f1: 0.4607, val_micro_precision: 0.6190, val_macro_precision: 0.4114, val_micro_recall: 0.6190, val_macro_recall: 0.5368


100%|██████████| 84/84 [00:05<00:00, 15.88it/s]
100%|██████████| 1/1 [00:00<00:00,  1.46it/s]


Epoch 3, Train Loss: 0.4766135811805725
val_acc: 0.6667, val_loss: 0.7929, val_micro_f1: 0.6667, val_macro_f1: 0.4938, val_micro_precision: 0.6667, val_macro_precision: 0.4431, val_micro_recall: 0.6667, val_macro_recall: 0.5578


100%|██████████| 84/84 [00:05<00:00, 15.59it/s]
100%|██████████| 1/1 [00:00<00:00,  1.38it/s]


Epoch 4, Train Loss: 0.37650415301322937
val_acc: 0.6667, val_loss: 0.7720, val_micro_f1: 0.6667, val_macro_f1: 0.5248, val_micro_precision: 0.6667, val_macro_precision: 0.7721, val_micro_recall: 0.6667, val_macro_recall: 0.5694


100%|██████████| 84/84 [00:05<00:00, 16.46it/s]
100%|██████████| 1/1 [00:00<00:00,  1.34it/s]


Epoch 5, Train Loss: 0.2880127727985382
val_acc: 0.6429, val_loss: 0.7853, val_micro_f1: 0.6429, val_macro_f1: 0.5568, val_micro_precision: 0.6429, val_macro_precision: 0.5998, val_micro_recall: 0.6429, val_macro_recall: 0.5769


100%|██████████| 84/84 [00:06<00:00, 13.11it/s]
100%|██████████| 1/1 [00:00<00:00,  1.08it/s]


Epoch 6, Train Loss: 0.15373949706554413
val_acc: 0.6190, val_loss: 0.7987, val_micro_f1: 0.6190, val_macro_f1: 0.5795, val_micro_precision: 0.6190, val_macro_precision: 0.5741, val_micro_recall: 0.6190, val_macro_recall: 0.5907


100%|██████████| 84/84 [00:05<00:00, 16.26it/s]
100%|██████████| 1/1 [00:00<00:00,  1.23it/s]


Epoch 7, Train Loss: 0.09430237859487534
val_acc: 0.6190, val_loss: 0.8760, val_micro_f1: 0.6190, val_macro_f1: 0.6070, val_micro_precision: 0.6190, val_macro_precision: 0.6123, val_micro_recall: 0.6190, val_macro_recall: 0.6373


100%|██████████| 84/84 [00:05<00:00, 14.93it/s]
100%|██████████| 1/1 [00:00<00:00,  1.19it/s]


Epoch 8, Train Loss: 0.03761705756187439
val_acc: 0.6429, val_loss: 0.8959, val_micro_f1: 0.6429, val_macro_f1: 0.6336, val_micro_precision: 0.6429, val_macro_precision: 0.6412, val_micro_recall: 0.6429, val_macro_recall: 0.6595


100%|██████████| 84/84 [00:07<00:00, 11.70it/s]
100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


Epoch 9, Train Loss: 0.014343492686748505
val_acc: 0.5714, val_loss: 0.9929, val_micro_f1: 0.5714, val_macro_f1: 0.5712, val_micro_precision: 0.5714, val_macro_precision: 0.6004, val_micro_recall: 0.5714, val_macro_recall: 0.6011


100%|██████████| 84/84 [00:06<00:00, 13.02it/s]
100%|██████████| 1/1 [00:00<00:00,  1.32it/s]

Epoch 10, Train Loss: 0.012009811587631702
val_acc: 0.5595, val_loss: 1.0497, val_micro_f1: 0.5595, val_macro_f1: 0.5476, val_micro_precision: 0.5595, val_macro_precision: 0.5624, val_micro_recall: 0.5595, val_macro_recall: 0.5806





In [162]:
# evaluate on test set
model.eval()
correct = 0
total = 0
for test_batch in tqdm(test_loader):
    out = model(test_batch)
    pred = out.argmax(dim=1)
    correct += (pred == test_batch.y).sum().item()
    total += test_batch.y.size(0)

test_acc = correct / total

test_micro_f1 = multiclass_f1_score(out, test_batch.y, average='micro', num_classes=num_classes)
test_macro_f1 = multiclass_f1_score(out, test_batch.y, average='macro', num_classes=num_classes)

test_micro_precision = multiclass_precision(out, test_batch.y, average='micro', num_classes=num_classes)
test_macro_precision = multiclass_precision(out, test_batch.y, average='macro', num_classes=num_classes)

test_micro_recall = multiclass_recall(out, test_batch.y, average='micro', num_classes=num_classes)
test_macro_recall = multiclass_recall(out, test_batch.y, average='macro', num_classes=num_classes)
print(f'test_acc: {test_acc:.4f}, test_micro_f1: {test_micro_f1:.4f}, test_macro_f1: {test_macro_f1:.4f}, test_micro_precision: {test_micro_precision:.4f}, test_macro_precision: {test_macro_precision:.4f}, test_micro_recall: {test_micro_recall:.4f}, test_macro_recall: {test_macro_recall:.4f}')

100%|██████████| 1/1 [00:01<00:00,  1.93s/it]

test_acc: 0.3184, test_micro_f1: 0.3184, test_macro_f1: 0.2980, test_micro_precision: 0.3184, test_macro_precision: 0.3422, test_micro_recall: 0.3184, test_macro_recall: 0.3108



