In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import json
import re
import nltk
import numpy as np
import string
from collections import Counter, defaultdict
from math import log
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jylee/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
with open('preprocess/trec_split.json', 'r') as f:
    data = json.load(f)

train_data = data['train']
test_data = data['test']

In [64]:
train_data['0']

{'text': 'How did serfdom develop in and then leave Russia ?',
 'coarse_label': 2,
 'fine_label': 26}

to form any kinds of graph, you first need to go through the entire corpus and obtain {idx:node}. we first do this for word, then pos tags

In [4]:
# for cleaning text
import string
def clean_str(sentence ,use=True):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    if not use: return sentence

    sentence = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", sentence)
    sentence = re.sub(r"\'s", " \'s", sentence)
    sentence = re.sub(r"\'ve", " \'ve", sentence)
    sentence = re.sub(r"n\'t", " n\'t", sentence)
    sentence = re.sub(r"\'re", " \'re", sentence)
    sentence = re.sub(r"\'d", " \'d", sentence)
    sentence = re.sub(r"\'ll", " \'ll", sentence)
    sentence = re.sub(r",", " , ", sentence)
    sentence = re.sub(r"!", " ! ", sentence)
    sentence = re.sub(r"\(", " \( ", sentence)
    sentence = re.sub(r"\)", " \) ", sentence)
    sentence = re.sub(r"\?", " \? ", sentence)
    sentence = re.sub(r"\s{2,}", " ", sentence)
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    return sentence.strip().lower()

In [5]:


def process_corpus(corpus):
    unique_words = set()
    word_count = Counter()
    pair_count = defaultdict(int)
    total_words = 0
    
    for line in corpus:
        line = clean_str(line)
        words = line.split()
        total_words += len(words)
        word_count.update(words)
        for i, word in enumerate(words):
            unique_words.add(word)
            for j in range(i + 1, len(words)):
                pair = tuple(sorted([word, words[j]]))
                pair_count[pair] += 1
    
    word_prob = {word: count / total_words for word, count in word_count.items()}
    pair_prob = {pair: count / total_words for pair, count in pair_count.items()}
    
    return word_prob, pair_prob, unique_words

def calculate_pmi(word_prob, pair_prob, word1, word2):
    pair = tuple(sorted([word1, word2]))
    if pair in pair_prob and word1 in word_prob and word2 in word_prob:
        pmi = log(pair_prob[pair] / (word_prob[word1] * word_prob[word2]))
        return pmi
    return 0.0

def create_pmi_matrix(sentence, word_prob, pair_prob, word_index):
    words = clean_str(sentence).split()
    n = len(words)
    pmi_matrix = np.zeros((n, n))
    node_list = []

    for word in words:
        if word in word_index:
            node_list.append(word_index[word])
        else:
            node_list.append(-1)
        
    for i in range(n):
        for j in range(i + 1, n):
            pmi = calculate_pmi(word_prob, pair_prob, words[i], words[j])
            pmi_matrix[i, j] = pmi
            pmi_matrix[j, i] = pmi  # PMI matrix is symmetric
    
    return pmi_matrix, node_list

# Example usage
corpus = [
    "Hello, world! This is a test.",
    "Another line; with more: punctuation.",
    "Is this working? Yes, it is!"
]

word_prob, pair_prob, unique_words = process_corpus(corpus)
word_index = {word: index for index, word in enumerate(sorted(unique_words))}

sentence = "Hello world, this is a test"
pmi_matrix, node_list = create_pmi_matrix(sentence, word_prob, pair_prob, word_index)
print("PMI Adjacency Matrix:")
print(pmi_matrix)


PMI Adjacency Matrix:
[[0.         2.83321334 2.14006616 1.73460106 2.83321334 2.83321334]
 [2.83321334 0.         2.14006616 1.73460106 2.83321334 2.83321334]
 [2.14006616 2.14006616 0.         2.14006616 2.14006616 2.14006616]
 [1.73460106 1.73460106 2.14006616 0.         1.73460106 1.73460106]
 [2.83321334 2.83321334 2.14006616 1.73460106 0.         2.83321334]
 [2.83321334 2.83321334 2.14006616 1.73460106 2.83321334 0.        ]]


In [6]:
word_index

{'a': 0,
 'another': 1,
 'hello': 2,
 'is': 3,
 'it': 4,
 'line': 5,
 'more': 6,
 'punctuation': 7,
 'test': 8,
 'this': 9,
 'with': 10,
 'working': 11,
 'world': 12,
 'yes': 13}

In [7]:
def process_corpus_tags(corpus):
    unique_tags = set()
    tag_count = Counter()
    tag_pair_count = defaultdict(int)
    total_tags = 0
    
    for line in corpus:
        line = clean_str(line)
        # get pos tags for words in the query
        tags = [one[1].lower() for one in nltk.pos_tag(nltk.word_tokenize(line))]
        if '' in tags:
            print(line)
        tags = line.split()
        total_tags += len(tags)
        tag_count.update(tags)
        for i, tag in enumerate(tags):
            unique_tags.add(tag)
            for j in range(i + 1, len(tags)):
                pair = tuple(sorted([tag, tags[j]]))
                tag_pair_count[pair] += 1
    
    tag_prob = {tag: count / total_tags for tag, count in tag_count.items()}
    pair_prob = {pair: count / total_tags for pair, count in tag_pair_count.items()}
    
    return tag_prob, tag_pair_prob, unique_tags

def calculate_pmi(word_prob, pair_prob, word1, word2):
    pair = tuple(sorted([word1, word2]))
    if pair in pair_prob and word1 in word_prob and word2 in word_prob:
        pmi = log(pair_prob[pair] / (word_prob[word1] * word_prob[word2]))
        return pmi
    return 0.0

def create_pmi_matrix(sentence, tag_prob, tag_pair_prob, tag_index):
    words = clean_str(sentence).split()
    n = len(words)
    pmi_matrix = np.zeros((n, n))
    node_list = []

    for word in words:
        if word in word_index:
            node_list.append(word_index[word])
        else:
            node_list.append(-1)
        
    for i in range(n):
        for j in range(i + 1, n):
            pmi = calculate_pmi(word_prob, pair_prob, words[i], words[j])
            pmi_matrix[i, j] = pmi
            pmi_matrix[j, i] = pmi  # PMI matrix is symmetric
    
    return pmi_matrix, node_list

# Example usage
corpus = [
    "Hello, world! This is a test.",
    "Another line; with more: punctuation.",
    "Is this working? Yes, it is!"
]

tag_prob, tag_pair_prob, unique_tags = process_corpus_tags(corpus)
tag_index = {tag: index for index, tag in enumerate(sorted(unique_tags))}

sentence = "Hello world, this is a test"
pmi_matrix, node_list = create_pmi_matrix(sentence, word_prob, pair_prob, word_index)
print("PMI Adjacency Matrix:")
print(pmi_matrix)

    

PMI Adjacency Matrix:
[[0.         2.83321334 2.14006616 1.73460106 2.83321334 2.83321334]
 [2.83321334 0.         2.14006616 1.73460106 2.83321334 2.83321334]
 [2.14006616 2.14006616 0.         2.14006616 2.14006616 2.14006616]
 [1.73460106 1.73460106 2.14006616 0.         1.73460106 1.73460106]
 [2.83321334 2.83321334 2.14006616 1.73460106 0.         2.83321334]
 [2.83321334 2.83321334 2.14006616 1.73460106 2.83321334 0.        ]]


### euclidean distance function

In [1]:
import numpy as np

def euclidean_distance(vector1, vector2):
    """
    Calculate the Euclidean distance between two embedding vectors.
    
    Args:
    vector1 (np.array): First embedding vector.
    vector2 (np.array): Second embedding vector.
    
    Returns:
    float: The Euclidean distance between the two vectors.
    """
    return np.linalg.norm(vector1 - vector2)

# Example usage
embedding1 = np.array([1, 2, 3])
embedding2 = np.array([4, 5, 6])

distance = euclidean_distance(embedding1, embedding2)
print(f"The Euclidean distance between the vectors is: {distance}")


The Euclidean distance between the vectors is: 5.196152422706632


# Phrase extraction

In [16]:
import nltk
import re

# Ensure you have the necessary nltk data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')


[nltk_data] Downloading package punkt to /Users/jylee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jylee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/jylee/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [21]:

def pos_tag_sentence(sentence):
    words = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words, tagset='universal')
    return pos_tags

def identify_phrases(pos_tags, np_pattern, vp_pattern):
    pos_sequence = ' '.join([tag for word, tag in pos_tags])
    
    np_regex = re.compile(np_pattern)
    vp_regex = re.compile(vp_pattern)
    
    np_matches = [(match.start(), match.end()) for match in np_regex.finditer(pos_sequence)]
    vp_matches = [(match.start(), match.end()) for match in vp_regex.finditer(pos_sequence)]
    
    phrases = []
    for start, end in sorted(np_matches + vp_matches):
        phrase = ' '.join([word for word, tag in pos_tags[start:end]])
        phrases.append(phrase)
    
    return phrases

# Define the patterns
np_pattern = r"((DET)?(NUM)*((ADJ)(PUNCT)?(CONJ)?)*(((NOUN)|(PROPN))(PART)?)+)"
vp_pattern = r"((AUX)*(ADV)*(VERB))"

# Example usage
sentence = "The quick brown fox jumps over the lazy dog"
pos_tags = pos_tag_sentence(sentence)
print("POS Tags:", pos_tags)

phrases = identify_phrases(pos_tags, np_pattern, vp_pattern)
print("Phrases:", phrases)


POS Tags: [('The', 'DET'), ('quick', 'ADJ'), ('brown', 'NOUN'), ('fox', 'NOUN'), ('jumps', 'VERB'), ('over', 'ADP'), ('the', 'DET'), ('lazy', 'ADJ'), ('dog', 'NOUN')]
Phrases: ['dog', '', '', '']


## NP with spacy model (using this for now)

In [65]:
import spacy
nlp = spacy.load('en_core_web_lg')
sample_text = 'The quick brown fox jumps over the lazy dog'
sample_doc = nlp(sample_text)
# Extract Noun Phrases
for chunk in sample_doc.noun_chunks:
    print (chunk)

The quick brown fox
the lazy dog


## VP with spacy model (obtained from fyp student) (using this)

In [66]:
import textacy
sample_text = ('The quick brown fox jumps over the lazy dog')
expression = r'(<VERB>?<ADV>*<VERB>+)'
# pattern = [{"TEXT": {"REGEX": '(<VERB>?<ADV>*<VERB>+)'}}]
vp_patterns = [
    [{"POS": "ADV"}, {"POS": "VERB"}],
    [{"POS": "NOUN"}, {"POS": "VERB"}],
    [{"POS": "PRON"}, {"POS": "VERB"}],
    [{"POS": "ADJ"}, {"POS": "VERB"}],
    [{"POS": "VERB"}, {"POS": "PART"}],
    [{"POS": "VERB"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "NOUN"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADV"}],
    [{"POS": "VERB"}, {"POS": "ADJ"}],
    [{"POS": "VERB"}, {"POS": "PRON"}],
    [{"POS": "VERB"}, {"POS": "ADP"}],
    [{"POS": "VERB"}, {"POS": "ADP"}, {"POS": "ADV"}],
    [{"POS": "VERB"}, {"POS": "CONJ"}, {"POS": "VERB"}],
    [{"POS": "VERB"}, {"POS": "DET"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADP"}, {"POS": "ADJ"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADJ"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "DET"}, {"POS": "ADJ"}],
    [{"POS": "VERB"}, {"POS": "PART"}, {"POS": "ADP"}]
]


# get_verb_phrases = textacy.extract.token_matches(sample_text, patterns=patterns)
# verb_phrases = []
# for verb_phrase in get_verb_phrases:
#     verb_phrases.append(verb_phrase)
sample_doc = textacy.make_spacy_doc(sample_text,
                                        lang='en_core_web_lg')
verb_phrases = textacy.extract.token_matches(sample_doc, vp_patterns)
# Print all Verb Phrase
for chunk in verb_phrases:
    print(chunk)

fox jumps
jumps over


In [32]:
for match in re.finditer(expression, sample_doc.text):
    start, end = match.span()
    span = sample_doc.char_span(start, end)
    # This is a Span object or None if match doesn't map to valid token sequence
    if span is not None:
        print("Found match:", span.text)

## VP with spacy using grammar tree

In [35]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_lg")

def extract_verb_phrases(text):
    doc = nlp(text)
    verb_phrases = []
    for token in doc:
        if token.pos_ == "VERB":
            verb_phrase = ' '.join([child.text for child in token.subtree])
            verb_phrases.append(verb_phrase)
    return verb_phrases

sentence = "He is eating an apple while she reads a book."
verb_phrases = extract_verb_phrases(sentence)
print("Verb Phrases:", verb_phrases)

Verb Phrases: ['He is eating an apple while she reads a book .', 'while she reads a book']


## VP using nltk regex

In [36]:
import nltk
from nltk import pos_tag
from nltk.chunk import RegexpParser
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /Users/jylee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jylee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [37]:

def extract_verb_phrases(text):
    words = word_tokenize(text)
    tagged = pos_tag(words)
    chunk_grammar = "VP: {<VB.*><.*>*}"
    chunk_parser = RegexpParser(chunk_grammar)
    tree = chunk_parser.parse(tagged)

    verb_phrases = []
    for subtree in tree.subtrees():
        if subtree.label() == "VP":
            verb_phrase = ' '.join(word for word, pos in subtree.leaves())
            verb_phrases.append(verb_phrase)
    return verb_phrases

sentence = "He is eating an apple while she reads a book."
verb_phrases = extract_verb_phrases(sentence)
print("Verb Phrases:", verb_phrases)

Verb Phrases: ['is eating an apple while she reads a book .']


## combined phrase (NP and VP)

In [7]:
vp_patterns = [
    [{"POS": "ADV"}, {"POS": "VERB"}],
    [{"POS": "NOUN"}, {"POS": "VERB"}],
    [{"POS": "PRON"}, {"POS": "VERB"}],
    [{"POS": "ADJ"}, {"POS": "VERB"}],
    [{"POS": "VERB"}, {"POS": "PART"}],
    [{"POS": "VERB"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "NOUN"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADV"}],
    [{"POS": "VERB"}, {"POS": "ADJ"}],
    [{"POS": "VERB"}, {"POS": "PRON"}],
    [{"POS": "VERB"}, {"POS": "ADP"}],
    [{"POS": "VERB"}, {"POS": "ADP"}, {"POS": "ADV"}],
    [{"POS": "VERB"}, {"POS": "CONJ"}, {"POS": "VERB"}],
    [{"POS": "VERB"}, {"POS": "DET"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADP"}, {"POS": "ADJ"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "ADJ"}, {"POS": "NOUN"}],
    [{"POS": "VERB"}, {"POS": "DET"}, {"POS": "ADJ"}],
    [{"POS": "VERB"}, {"POS": "PART"}, {"POS": "ADP"}]
]

In [15]:
import spacy, textacy
nlp = spacy.load('en_core_web_lg')
sample_text = 'Calculate the Euclidean distance between two embedding vectors.'
sample_doc = nlp(sample_text)
verb_phrases = textacy.extract.token_matches(sample_doc, vp_patterns)

phrase_list = []
tag_list = []

# Extract Noun Phrases and corresponding pos tags
for chunk in sample_doc.noun_chunks:
    phrase_list.append(chunk)
    tag_list.append(' '.join([t.pos_ for t in chunk]))
# Print all Verb Phrase and corresponding pos tags
for chunk in verb_phrases:
    phrase_list.append(chunk)
    tag_list.append(' '.join([t.pos_ for t in chunk]))

print("Phrases: ", phrase_list)
print("POS Tags: ", tag_list)

Phrases:  [the Euclidean distance, two embedding vectors, Calculate the Euclidean, embedding vectors]
POS Tags:  ['DET ADJ NOUN', 'NUM VERB NOUN', 'VERB DET ADJ', 'VERB NOUN']


### one-hot encoding vectors for pos tags

In [2]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Example corpus of sentences transformed into lists of POS tags
corpus = [
    ['VB', 'NN', 'NNP'],
    ['DT', 'NN', 'VBZ', 'VBG'],
    ['NN', 'NN', 'VB'],
]

# Step 1: Collect all unique POS tags to create a vocabulary
all_pos_tags = set(tag for sentence in corpus for tag in sentence)

# Step 2: Create a mapping from each POS tag to a unique index
pos_to_index = {tag: idx for idx, tag in enumerate(all_pos_tags)}

# Step 3: Generate one-hot encoded vectors
def one_hot_encode(pos_list, pos_to_index):
    vector = np.zeros(len(pos_to_index))
    for pos in pos_list:
        if pos in pos_to_index:
            vector[pos_to_index[pos]] = 1
    return vector

# Example usage
encoded_corpus = [one_hot_encode(sentence, pos_to_index) for sentence in corpus]

print("Vocabulary:", pos_to_index)
print("One-hot encoded vectors:")
for vec in encoded_corpus:
    print(vec)


Vocabulary: {'NNP': 0, 'DT': 1, 'VBG': 2, 'NN': 3, 'VB': 4, 'VBZ': 5}
One-hot encoded vectors:
[1. 0. 0. 1. 1. 0.]
[0. 1. 1. 1. 0. 1.]
[0. 0. 0. 1. 1. 0.]


# Phrase level embeddings

In [17]:
from sentence_transformers import SentenceTransformer
phrase_list = ['play an active role', 'participate actively', 'active lifestyle']

model = SentenceTransformer('whaleloops/phrase-bert')
phrase_embs = model.encode(phrase_list)
[p1, p2, p3] = phrase_embs




In [18]:
p1.shape

(768,)

# Dependency parsing

In [15]:
import spacy
import numpy as np

# Load the spacy model
nlp = spacy.load("en_core_web_lg")

def get_dependency_parse(sentence):
    doc = nlp(sentence)
    dependencies = [(token.text, token.head.text, token.dep_) for token in doc]
    return dependencies, [token.text for token in doc]

def create_adjacency_matrix(sentence):
    dependencies, words = get_dependency_parse(sentence)
    word_index = {word: i for i, word in enumerate(words)}
    n = len(words)
    
    adjacency_matrix = np.zeros((n, n), dtype=int)
    
    for word, head, dep in dependencies:
        if word != head:  # Skip self-loops
            adjacency_matrix[word_index[head]][word_index[word]] = 1
    
    return adjacency_matrix, words

# Example usage
sentence = "The quick brown fox jumps over the lazy dog."
adj_matrix, words = create_adjacency_matrix(sentence)

print("Words:", words)
print("Adjacency Matrix:")
print(adj_matrix)


Words: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Adjacency Matrix:
[[0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0]
 [0 0 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 1 0]]


# NER

In [68]:
import json
import pickle as pkl
import numpy as np
from scipy.sparse import coo_matrix

nlp = spacy.load('en_core_web_lg')
ent2id_new = json.load(open('./pretrained_emb/NELL_KG/ent2ids_refined', 'r'))        
ent_mapping = {} 
entity_set = set()
adj_ent_index = []

def get_adj_ent_index(query, ent_mapping, ent2id_new):
    # named entity recognition
    np_list = []
    ent_list = []
    index = []
    
    # extract NP first
    doc = nlp(query)
    for chunk in doc.noun_chunks:
        np_list.append(chunk.text)
    
    # for every word in the NER dictionary
    for key in ent2id_new.keys(): 
        # check if the word is in the text
        if key in np_list: 
            # check if word is already in the mapping dict
            if key not in ent_mapping: 
                # add word to ent_list
                ent_list.append(key)
                # add word to mapping dict as word:idx_in_ent_list
                ent_mapping[key] = len(ent_mapping)
                # update the set
                entity_set.update(ent_list)
            if ent_mapping[key] not in index: 
                index.append(ent_mapping[key])
    # entity adjacency (index) matrix: list[list] of entities present in the sentences
    adj_ent_index.append(index)


sample_query = ['the quick brown fox jumps over the lazy dog', 'who is george washington']
for sent in sample_query:
    get_adj_ent_index(sent, ent_mapping, ent2id_new)
# json.dump([adj_ent_index, ent_mapping],
#           open('./{}_data/index_and_mapping.json'.format(dataset_name), 'w'), ensure_ascii=False)
ent_emb = []
TransE_emb_file = np.loadtxt('./pretrained_emb/NELL_KG/entity2vec.TransE')
TransE_emb = []

for i in range(len(TransE_emb_file)):
    TransE_emb.append(list(TransE_emb_file[i, :]))

rows = []
data = []
columns = []

max_num = len(ent_mapping)
# creating a coo format for matrix of adj_ent_index
for sent_i, indices in enumerate(adj_ent_index):
    for index in indices:
        data.append(1)
        rows.append(sent_i)
        columns.append(index)

# create a matrice of ones and zeros
# ones correspond to (sentence_index, entity_index) i.e. which entities are present in the sentence
adj_ent = coo_matrix((data, (rows, columns)), shape=(len(adj_ent_index), max_num))
# for entity in entity mapping
for key in ent_mapping.keys():
    # add embedding to ent_emb
    ent_emb.append(TransE_emb[ent2id_new[key]])

ent_emb = np.array(ent_emb)
print('ent shape', ent_emb.shape)
ent_emb_normed = ent_emb / np.sqrt(np.square(ent_emb).sum(-1, keepdims=True))
adj_emb = np.matmul(ent_emb_normed, ent_emb_normed.transpose())
print('entity_emb_cos', np.mean(np.mean(adj_emb, -1)))
# pkl.dump(np.array(ent_emb), open('./{}_data/entity_emb.pkl'.format(dataset_name), 'wb'))
# pkl.dump(adj_ent, open('./{}_data/adj_query2entity.pkl'.format(dataset_name), 'wb'))

entity_nodes = list(entity_set)

print('ent', len(entity_nodes))
print('entities', entity_nodes)

# 


ent shape (1, 100)
entity_emb_cos 0.9999999999999999
ent 1
entities ['george washington']


In [43]:
sample_query = 'the quick brown fox jumps over the lazy dog'
get_adj_ent_index(sample_query, ent_mapping, ent2id_new)

ent shape (6, 100)
entity_emb_cos 0.18653110780888263
ent 6
entities ['fox', 'jump', 'row', 'quick', 'dog', 'brown']


# Compile data

### PPMI

In [None]:
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import OneHotEncoder
import networkx as nx
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Example corpus of sentences
corpus = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "the dog and cat are friends"
]

# Tokenize sentences into words
tokenized_corpus = [sentence.split() for sentence in corpus]

# 1. Collect all unique words in the corpus
word_set = set(word for sentence in tokenized_corpus for word in sentence)
print("Word Set:", word_set)

In [None]:
# 2. Create a mapping from each word to a unique index
word_to_index = {word: idx for idx, word in enumerate(word_set)}
print("Word to Index Mapping:", word_to_index)

In [None]:
from itertools import product
from collections import Counter
import math

# Compute co-occurrence matrix
co_occurrence_matrix = defaultdict(lambda: defaultdict(int))
window_size = 2

for sentence in tokenized_corpus:
    for i, word in enumerate(sentence):
        start = max(0, i - window_size)
        end = min(len(sentence), i + window_size + 1)
        for j in range(start, end):
            if i != j:
                co_occurrence_matrix[word][sentence[j]] += 1

# Convert co-occurrence matrix to DataFrame
import pandas as pd

co_occurrence_df = pd.DataFrame(co_occurrence_matrix).fillna(0)

# Compute Positive Pointwise Mutual Information (PPMI) values
total_sum = co_occurrence_df.values.sum()
word_counts = co_occurrence_df.sum(axis=1)
ppmi_matrix = np.maximum(
    np.log((co_occurrence_df.values * total_sum) / (word_counts.values[:, None] * word_counts.values[None, :])),
    0
)
np.fill_diagonal(ppmi_matrix, 0)
print("PPMI Matrix:\n", ppmi_matrix)

In [None]:
# 4. Generate word embeddings using Word2Vec
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, sg=0)
word_vectors = np.array([model.wv[word] for word in word_set])
print("Word Embeddings Shape:", word_vectors.shape)

In [23]:
import numpy as np
import torch
import networkx as nx
from gensim.models import Word2Vec
from collections import defaultdict
from itertools import product
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import pandas as pd

# Example corpus of sentences
corpus = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "the dog and cat are friends"
]

# Tokenize sentences into words
tokenized_corpus = [sentence.split() for sentence in corpus]

# 1. Collect all unique words in the corpus
word_set = set(word for sentence in tokenized_corpus for word in sentence)

# 2. Create a mapping from each word to a unique index
word_to_index = {word: idx for idx, word in enumerate(word_set)}

# 3. Compute the co-occurrence matrix
co_occurrence_matrix = defaultdict(lambda: defaultdict(int))
window_size = 2

for sentence in tokenized_corpus:
    for i, word in enumerate(sentence):
        start = max(0, i - window_size)
        end = min(len(sentence), i + window_size + 1)
        for j in range(start, end):
            if i != j:
                co_occurrence_matrix[word][sentence[j]] += 1

# Convert co-occurrence matrix to DataFrame
co_occurrence_df = pd.DataFrame(co_occurrence_matrix).fillna(0)

# Compute Positive Pointwise Mutual Information (PPMI) values
total_sum = co_occurrence_df.values.sum()
word_counts = co_occurrence_df.sum(axis=1)
ppmi_matrix = np.maximum(
    np.log((co_occurrence_df.values * total_sum) / (word_counts.values[:, None] * word_counts.values[None, :])),
    0
)
np.fill_diagonal(ppmi_matrix, 0)

# 4. Generate word embeddings using Word2Vec
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, sg=0)
word_vectors = np.array([model.wv[word] for word in word_set])

# 5. Create a list of torch_geometric.data.Data graphs for each sentence in the corpus
graphs = []

for sentence in tokenized_corpus:
    G = nx.Graph()
    node_indices = [word_to_index[word] for word in sentence]
    G.add_nodes_from(node_indices)
    
    edges = []
    edge_weights = []
    
    for i, word1 in enumerate(sentence):
        for j in range(i + 1, len(sentence)):
            word2 = sentence[j]
            if word1 != word2:
                ppmi_value = ppmi_matrix[word_to_index[word1], word_to_index[word2]]
                if ppmi_value > 0:
                    edges.append((word_to_index[word1], word_to_index[word2]))
                    edge_weights.append(ppmi_value)
    
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_weights, dtype=torch.float)
    
    x = torch.tensor([model.wv[word] for word in sentence], dtype=torch.float)
    
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    graphs.append(data)

# Create DataLoader for torch_geometric
data_loader = DataLoader(graphs, batch_size=2, shuffle=True)

# Example usage of DataLoader
for batch in data_loader:
    print(batch)
    print("Batch x shape:", batch.x.shape)
    print("Batch edge_index shape:", batch.edge_index.shape)
    print("Batch edge_attr shape:", batch.edge_attr.shape)


KeyboardInterrupt: 

issue for data
- structure of node embeddings  (num_nodes, emb_size)???
    - PMI: for word nodes: use glove
    - Entity: use TransE
    - Dependency parsing??


In [53]:
# Generate some random data for demonstration
def generate_random_graphs(num_nodes, num_node_features, num_classes, num_graphs):
    x = torch.rand((num_nodes, num_node_features), dtype=torch.float)  # Node features
    edge_index = torch.randint(0, num_nodes, (2, num_nodes * 2), dtype=torch.long)  # Edges
    y = torch.randint(0, num_classes, (num_nodes,), dtype=torch.long)  # Node labels
    return Data(x=x, edge_index=edge_index, y=y)

# Create datasets for 5 separate graphs
datasets = [generate_random_graphs(num_nodes=10, num_node_features=5, num_classes=3) for _ in range(100)]

dataloaders = [DataLoader(dataset, batch_size=16, shuffle=True) for dataset in datasets]

In [57]:
datasets[1]

[Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20], y=[10]),
 Data(x=[10, 5], edge_index=[2, 20

# Model

In [54]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data, DataLoader

# Define a single GCN
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        return x

# Final model combining multiple GCNs and a classification layer
class MultiGCNClassifier(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_gncs, num_classes):
        super(MultiGCNClassifier, self).__init__()
        self.gcns = torch.nn.ModuleList([GCN(in_channels, hidden_channels, out_channels) for _ in range(num_gncs)])
        self.linear = torch.nn.Linear(out_channels * num_gncs, num_classes)

    def forward(self, graphs):
        embeddings = []
        for i, graph in enumerate(graphs):
            x, edge_index = graph.x, graph.edge_index
            embeddings.append(self.gcns[i](x, edge_index))
        
        # Concatenate the embeddings from each GCN
        concatenated = torch.cat(embeddings, dim=1)
        out = self.linear(concatenated)
        return F.log_softmax(out, dim=1)


# Set-up

In [None]:
# Initialize the model, optimizer, and loss function
num_gncs = 5
model = MultiGCNClassifier(in_channels=5, hidden_channels=16, out_channels=16, num_gncs=num_gncs, num_classes=3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training

In [55]:
# Training loop
for epoch in range(20):
    model.train()
    for batches in zip(*dataloaders):
        optimizer.zero_grad()
        
        # Process each batch through the respective GCN
        out = model(batches)
        
        # Use the labels for first graph of the batch for loss calculation
        loss = F.nll_loss(out, batches[0].y)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Test the model
model.eval()
correct = 0
total = 0
for batches in zip(*dataloaders):
    out = model(batches)
    pred = out.argmax(dim=1)
    correct += (pred == batches[0].y).sum().item()
    total += batches[0].num_nodes

accuracy = correct / total
print(f'Accuracy: {accuracy:.4f}')



Epoch 1, Loss: 1.0893805027008057
Epoch 2, Loss: 1.1288957595825195
Epoch 3, Loss: 1.1174507141113281
Epoch 4, Loss: 1.0951950550079346
Epoch 5, Loss: 1.0923782587051392
Epoch 6, Loss: 1.0871156454086304
Epoch 7, Loss: 1.090480089187622
Epoch 8, Loss: 1.11361563205719
Epoch 9, Loss: 1.0942275524139404
Epoch 10, Loss: 1.1013057231903076
Epoch 11, Loss: 1.0750558376312256
Epoch 12, Loss: 1.0790274143218994
Epoch 13, Loss: 1.1071563959121704
Epoch 14, Loss: 1.0836830139160156
Epoch 15, Loss: 1.0988441705703735
Epoch 16, Loss: 1.1076276302337646
Epoch 17, Loss: 1.1086926460266113
Epoch 18, Loss: 1.1190255880355835
Epoch 19, Loss: 1.0933531522750854
Epoch 20, Loss: 1.0927650928497314
Accuracy: 0.3760
