In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import json
import re
import nltk
import numpy as np
import string
from collections import Counter, defaultdict
from math import log
import ssl

# NOTE(security): this swaps the default HTTPS context for an *unverified* one,
# which turns off TLS certificate checks for every HTTPS request made in this
# kernel session, not only the NLTK downloads below. It is kept as a workaround
# for local certificate errors; remove it once the machine's certificates are fixed.
_create_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# Model used by nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')
# Model used by nltk.word_tokenize, which the POS-tag cell calls; without it a
# fresh environment raises LookupError on the first tokenization.
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jy/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
# Load the pre-split dataset. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of being left to the platform default.
with open('preprocess/trec_split.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# The file holds one top-level entry per split.
train_data = data['train']
test_data = data['test']

In [4]:
# Display the training record stored under key '0' to check the record layout.
train_data['0']

{'text': 'How did serfdom develop in and then leave Russia ?',
 'coarse_label': 2,
 'fine_label': 26}

To form any kind of graph, we first need to go through the entire corpus and assign every node an index ({node: idx}). We do this for words first, then for POS tags.

In [14]:
# for cleaning text
import string
def clean_str(sentence, use=True):
    """
    Normalize a raw sentence into lower-case, single-space-separated tokens.

    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps: replace unsupported characters with spaces, split common English
    contraction suffixes off their host word, pad the remaining punctuation
    with spaces, strip all ASCII punctuation, collapse whitespace, then
    strip and lower-case.

    Args:
        sentence: Raw input string.
        use: When False, the sentence is returned untouched.

    Returns:
        The cleaned string. Tokens are separated by exactly one space and the
        string has no leading or trailing whitespace.
    """
    if not use:
        return sentence

    # Keep only alphanumerics and the punctuation that is handled explicitly below.
    sentence = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", sentence)

    # Detach contraction suffixes so they become separate tokens ("it's" -> "it 's").
    sentence = re.sub(r"'s", " 's", sentence)
    sentence = re.sub(r"'ve", " 've", sentence)
    sentence = re.sub(r"n't", " n't", sentence)
    sentence = re.sub(r"'re", " 're", sentence)
    sentence = re.sub(r"'d", " 'd", sentence)
    sentence = re.sub(r"'ll", " 'll", sentence)

    # Surround punctuation with spaces so it never glues two words together.
    sentence = re.sub(r"([,!()?])", r" \1 ", sentence)

    # Remove every ASCII punctuation character (apostrophes included).
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))

    # Collapse whitespace last: stripping punctuation leaves runs of spaces behind.
    sentence = re.sub(r"\s{2,}", " ", sentence)
    return sentence.strip().lower()

In [16]:


def process_corpus(corpus):
    """
    Gather unigram and co-occurrence statistics over a corpus of raw sentences.

    Every line is normalized with clean_str and split on whitespace. Each
    unordered pair of token positions inside a line counts as one
    co-occurrence; the pair key is the two tokens sorted into a tuple.

    Returns:
        (word_prob, pair_prob, unique_words) where
        word_prob maps token -> occurrences / total token count,
        pair_prob maps (token_a, token_b) -> co-occurrences / total token count,
        unique_words is the set of all tokens seen.
    """
    vocabulary = set()
    token_counts = Counter()
    cooccurrence_counts = Counter()
    token_total = 0

    for raw_line in corpus:
        tokens = clean_str(raw_line).split()
        token_total += len(tokens)
        token_counts.update(tokens)
        vocabulary.update(tokens)
        # Pair every token with each token that follows it in the same line.
        for position, first in enumerate(tokens):
            for second in tokens[position + 1:]:
                cooccurrence_counts[tuple(sorted((first, second)))] += 1

    # Both tables are normalized by the total number of tokens in the corpus.
    word_prob = {token: seen / token_total for token, seen in token_counts.items()}
    pair_prob = {pair: seen / token_total for pair, seen in cooccurrence_counts.items()}
    return word_prob, pair_prob, vocabulary

def calculate_pmi(word_prob, pair_prob, word1, word2):
    """
    Pointwise mutual information of two items: log(P(a, b) / (P(a) * P(b))).

    The pair is looked up under its sorted-tuple key, so argument order does
    not matter. Returns 0.0 when the pair or either item has no entry.
    """
    key = tuple(sorted((word1, word2)))
    if key not in pair_prob or word1 not in word_prob or word2 not in word_prob:
        return 0.0

    joint = pair_prob[key]
    independent = word_prob[word1] * word_prob[word2]
    return log(joint / independent)

def create_pmi_matrix(sentence, word_prob, pair_prob, word_index):
    """
    Build the PMI adjacency matrix and node-id list for a single sentence.

    The sentence is normalized with clean_str and split on whitespace; token i
    becomes row/column i of the matrix.

    Returns:
        (pmi_matrix, node_list) where pmi_matrix is an (n, n) float array with
        a zero diagonal and pmi_matrix[i, j] == pmi_matrix[j, i], and node_list
        holds word_index[token] for each token, or -1 for tokens missing from
        word_index.
    """
    tokens = clean_str(sentence).split()
    size = len(tokens)

    # Tokens absent from the index are marked with -1.
    node_list = [word_index[token] if token in word_index else -1 for token in tokens]

    pmi_matrix = np.zeros((size, size))
    for row in range(size):
        for col in range(row + 1, size):
            score = calculate_pmi(word_prob, pair_prob, tokens[row], tokens[col])
            # Fill both triangles: the score does not depend on token order.
            pmi_matrix[row, col] = pmi_matrix[col, row] = score

    return pmi_matrix, node_list

# Toy corpus for a quick sanity check of the word-level PMI pipeline.
corpus = [
    "Hello, world! This is a test.",
    "Another line; with more: punctuation.",
    "Is this working? Yes, it is!",
]

# Corpus statistics, then a deterministic word -> node id mapping (sorted order).
word_prob, pair_prob, unique_words = process_corpus(corpus)
word_index = dict(zip(sorted(unique_words), range(len(unique_words))))

sentence = "Hello world, this is a test"
pmi_matrix, node_list = create_pmi_matrix(sentence, word_prob, pair_prob, word_index)
print("PMI Adjacency Matrix:", pmi_matrix, sep="\n")


PMI Adjacency Matrix:
[[0.         2.83321334 2.14006616 1.73460106 2.83321334 2.83321334]
 [2.83321334 0.         2.14006616 1.73460106 2.83321334 2.83321334]
 [2.14006616 2.14006616 0.         2.14006616 2.14006616 2.14006616]
 [1.73460106 1.73460106 2.14006616 0.         1.73460106 1.73460106]
 [2.83321334 2.83321334 2.14006616 1.73460106 0.         2.83321334]
 [2.83321334 2.83321334 2.14006616 1.73460106 2.83321334 0.        ]]


In [17]:
# Display the word -> node-index mapping built from the toy corpus.
word_index

{'a': 0,
 'another': 1,
 'hello': 2,
 'is': 3,
 'it': 4,
 'line': 5,
 'more': 6,
 'punctuation': 7,
 'test': 8,
 'this': 9,
 'with': 10,
 'working': 11,
 'world': 12,
 'yes': 13}

Do the same for POS tags.

In [5]:


def _pos_tags(sentence):
    """Return the lower-cased POS tag of every token in the cleaned sentence."""
    tokens = nltk.word_tokenize(clean_str(sentence))
    return [tag.lower() for _, tag in nltk.pos_tag(tokens)]


def process_corpus(corpus):
    """
    Gather POS-tag unigram and co-occurrence statistics over a corpus.

    Every line is cleaned, tokenized and POS-tagged; each unordered pair of
    tag positions inside a line counts as one co-occurrence, keyed by the two
    tags sorted into a tuple.

    NOTE: this cell reuses the name ``process_corpus``; running it replaces
    the word-level function of the same name defined in the earlier cell.

    Args:
        corpus: Iterable of raw sentences.

    Returns:
        (tag_prob, pair_prob, unique_tags) where
        tag_prob maps tag -> occurrences / total tag count,
        pair_prob maps (tag_a, tag_b) -> co-occurrences / total tag count,
        unique_tags is the set of all tags seen.
    """
    unique_tags = set()
    tag_count = Counter()
    tag_pair_count = defaultdict(int)
    total_tags = 0

    for line in corpus:
        # Count the tags themselves; the earlier version replaced them with the
        # sentence's words right after tagging, so no tag was ever counted.
        tags = _pos_tags(line)
        total_tags += len(tags)
        tag_count.update(tags)
        unique_tags.update(tags)
        for i, tag in enumerate(tags):
            for other in tags[i + 1:]:
                tag_pair_count[tuple(sorted((tag, other)))] += 1

    tag_prob = {tag: count / total_tags for tag, count in tag_count.items()}
    pair_prob = {pair: count / total_tags for pair, count in tag_pair_count.items()}

    # Return the statistics computed here, not names from the enclosing notebook scope.
    return tag_prob, pair_prob, unique_tags


def calculate_pmi(word_prob, pair_prob, word1, word2):
    """
    Pointwise mutual information of two items: log(P(a, b) / (P(a) * P(b))).

    Works for any hashable, sortable items (words or POS tags). The pair is
    looked up under its sorted-tuple key, so argument order does not matter.
    Returns 0.0 when the pair or either item has no entry.
    """
    pair = tuple(sorted([word1, word2]))
    if pair in pair_prob and word1 in word_prob and word2 in word_prob:
        return log(pair_prob[pair] / (word_prob[word1] * word_prob[word2]))
    return 0.0


def create_pmi_matrix(sentence, word_prob, pair_prob, word_index, tokenizer=None):
    """
    Build the PMI adjacency matrix and node-id list for a single sentence.

    Args:
        sentence: Raw input sentence.
        word_prob: Item -> probability table (words or POS tags).
        pair_prob: Sorted item pair -> probability table.
        word_index: Item -> node id mapping.
        tokenizer: Optional callable turning the sentence into the list of
            graph nodes. Defaults to whitespace-splitting the cleaned
            sentence (word-level behaviour); pass ``_pos_tags`` to build a
            POS-tag graph.

    Returns:
        (pmi_matrix, node_list) where pmi_matrix is an (n, n) float array with
        a zero diagonal and pmi_matrix[i, j] == pmi_matrix[j, i], and node_list
        holds word_index[item] for each item, or -1 for items missing from
        word_index.
    """
    if tokenizer is None:
        items = clean_str(sentence).split()
    else:
        items = tokenizer(sentence)
    n = len(items)

    # Items absent from the index are marked with -1.
    node_list = [word_index.get(item, -1) for item in items]

    pmi_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            pmi = calculate_pmi(word_prob, pair_prob, items[i], items[j])
            pmi_matrix[i, j] = pmi
            pmi_matrix[j, i] = pmi  # PMI matrix is symmetric

    return pmi_matrix, node_list


# Example usage
corpus = [
    "Hello, world! This is a test.",
    "Another line; with more: punctuation.",
    "Is this working? Yes, it is!"
]

# Tag-level results get their own names so the word-level word_prob / pair_prob /
# word_index from the earlier cell are not overwritten.
tag_prob, tag_pair_prob, unique_tags = process_corpus(corpus)
tag_index = {tag: index for index, tag in enumerate(sorted(unique_tags))}

sentence = "Hello world, this is a test"
pmi_matrix, node_list = create_pmi_matrix(
    sentence, tag_prob, tag_pair_prob, tag_index, tokenizer=_pos_tags
)
print("PMI Adjacency Matrix:")
print(pmi_matrix)

    

How did serfdom develop in and then leave Russia ?
