<ul>
<li>Here is the R8 R52 dataset on kaggle: https://www.kaggle.com/weipengfei/ohr8r52
<li>Here is an example of using GloVe embeddings with the R8 dataset: https://www.kaggle.com/dvircohen0/text-classification-with-word-vectors
<li>Here is where to get the Glove embeddings 6b.300d: https://www.kaggle.com/thanakomsn/glove6b300dtxt
<li>Here is the link to the paper I want to implement: Text Level Graph Neural Network for Text Classification (https://arxiv.org/pdf/1910.02356.pdf)
<li>Here is a link to an implementation of the research paper with pytorch: https://github.com/Cynwell/Text-Level-GNN
<li>Here is a compilation of other papers related to NLP and GNNs
<li>Here is the LinkedIn of the guy whose code I am working off of: https://www.linkedin.com/in/cynwell/
    </ul>

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
from time import time
import argparse
import re
import matplotlib.pyplot as plt
import copy

## GloveTokenizer
Below, I'm using the GloVe tokenizer. It was introduced by Jeffrey Pennington, Richard Socher, and Christopher D. Manning in 2014 in a paper titled <i>GloVe: Global Vectors for Word Representation</i>, which can be found [here](https://nlp.stanford.edu/pubs/glove.pdf). GloVe is a count-based, unsupervised learning model that uses co-occurrence statistics (how frequently two words appear together) at a global level to model the vector representations of words. Since the statistics are captured at a global level directly by the model, it is named the Global Vectors model.

Here, I'm using the glove.6B.300d embeddings, which come as a .txt file where, on every line, the first entry is the word and the following 300 entries are its embedding. I define functions to encode strings, to decode a list of integers, and to get the embedding array from a list of integers. The file can be downloaded [here](https://www.kaggle.com/thanakomsn/glove6b300dtxt). I'm using the 300d GloVe embeddings, since that is what was used in the research paper.

In [2]:
# Load the GloVe text file into three module-level lookup structures:
#   stoi: token -> row index, itos: row index -> token,
#   embedding_matrix: one embedding vector per row, in file order.
filename = 'embeddings/glove.6B.300d.txt'
unk = '<unk>'
pad = '<pad>'
stoi = {}
itos = {}
embedding_matrix = []  # contains the embeddings
with open(filename, 'r', encoding='utf8') as f:  # Read tokenizer file
    for i, line in enumerate(f):
        # each line is "<token> <value> <value> ..."
        values = line.split()
        token, vector = values[0], values[1:]
        stoi[token] = i
        itos[i] = token
        if i == 0:
            # show the raw (still string-typed) vector of the very first token
            print(vector)
        embedding_matrix.append(list(map(float, vector)))

# Append the special tokens after the file vocabulary, continuing the index i
# left over from the loop above: unk gets random values in [0, 1), pad gets
# all zeros.  A token set to None is skipped.
for special_token, make_vector in ((unk, np.random.rand), (pad, np.zeros)):
    if special_token is None:
        continue
    i += 1
    stoi[special_token] = i
    itos[i] = special_token
    embedding_matrix.append(make_vector(len(embedding_matrix[0])))

# Convert from double to float for efficiency
embedding_matrix = np.array(embedding_matrix).astype(np.float32)


['0.04656', '0.21318', '-0.0074364', '-0.45854', '-0.035639', '0.23643', '-0.28836', '0.21521', '-0.13486', '-1.6413', '-0.26091', '0.032434', '0.056621', '-0.043296', '-0.021672', '0.22476', '-0.075129', '-0.067018', '-0.14247', '0.038825', '-0.18951', '0.29977', '0.39305', '0.17887', '-0.17343', '-0.21178', '0.23617', '-0.063681', '-0.42318', '-0.11661', '0.093754', '0.17296', '-0.33073', '0.49112', '-0.68995', '-0.092462', '0.24742', '-0.17991', '0.097908', '0.083118', '0.15299', '-0.27276', '-0.038934', '0.54453', '0.53737', '0.29105', '-0.0073514', '0.04788', '-0.4076', '-0.026759', '0.17919', '0.010977', '-0.10963', '-0.26395', '0.07399', '0.26236', '-0.1508', '0.34623', '0.25758', '0.11971', '-0.037135', '-0.071593', '0.43898', '-0.040764', '0.016425', '-0.4464', '0.17197', '0.046246', '0.058639', '0.041499', '0.53948', '0.52495', '0.11361', '-0.048315', '-0.36385', '0.18704', '0.092761', '-0.11129', '-0.42085', '0.13992', '-0.39338', '-0.067945', '0.12188', '0.16707', '0.075169

In [6]:
# show the six entries of itos ending at index i (the last index assigned while loading)
last_index = i
for i in range(last_index - 5, last_index + 1):
    print(itos[i])

kronik
rolonda
zsombor
sandberger
<unk>
<pad>


In [7]:
# show the tokens stored under indices 0 through 4 of itos
for this_i in range(5):
    print(itos[this_i])

the
,
.
of
to


# Loading the Tokenizer

In [10]:
class GloveTokenizer:
    """Tokenizer and embedding lookup backed by a GloVe text file.

    Every non-blank line of the file is expected to hold a token followed by
    its embedding values, separated by whitespace.  Tokens are numbered in
    file order.  The optional ``unk`` and ``pad`` tokens are appended after
    the file vocabulary: ``unk`` gets random values in [0, 1) and ``pad``
    gets all zeros.
    """

    # a run of word characters/apostrophes, or one of the punctuation marks . , ! ? ;
    _TOKEN_PATTERN = re.compile(r"[\w']+|[.,!?;]")

    def __init__(self, filename, unk='<unk>', pad='<pad>'):
        """Read the embedding file and build the lookup tables.

        Args:
            filename: path of the GloVe text file (read as UTF-8).
            unk: token used for out-of-vocabulary words, or None for no such token.
            pad: padding token, or None for no such token.

        Raises:
            ValueError: if a special token is requested but the file contains
                no embeddings (the vector size cannot be determined).
        """
        self.filename = filename
        self.unk = unk  # unknown token
        self.pad = pad  # pad token
        self.stoi = dict()  # string to int dictionary
        self.itos = dict()  # int to string dictionary
        rows = []  # one embedding vector per index
        with open(filename, 'r', encoding='utf8') as f:  # Read tokenizer file
            for line in f:
                values = line.split()
                if not values:
                    # tolerate blank lines (e.g. a trailing newline at end of file)
                    continue
                # the first value in each line is the string, the rest is the embedding;
                # the index is the row position so it never depends on a leaked loop variable
                index = len(rows)
                self.stoi[values[0]] = index
                self.itos[index] = values[0]
                rows.append([float(v) for v in values[1:]])
        # unk: random numbers between 0-1, pad: all 0's
        for token, make_vector in ((self.unk, np.random.rand), (self.pad, np.zeros)):
            if token is None:
                continue
            if not rows:
                raise ValueError(
                    f'cannot add special token {token!r}: no embeddings were read from {filename!r}')
            index = len(rows)
            self.stoi[token] = index
            self.itos[index] = token
            rows.append(make_vector(len(rows[0])))
        # Convert from double to float for efficiency
        self.embedding_matrix = np.array(rows).astype(np.float32)

    # to encode a string into numbers
    def encode(self, sentence):
        """Convert a sentence into a list of vocabulary indices.

        Args:
            sentence: a str (split into words and the punctuation marks
                ``. , ! ? ;``) or an iterable of already-split words.

        Returns:
            A list of ints, one per word.  Words are lower-cased before the
            lookup; unknown words map to the ``unk`` index.

        Raises:
            TypeError: if ``sentence`` is neither a str nor iterable.
            KeyError: if a word is unknown and the tokenizer has no ``unk`` token.
        """
        if isinstance(sentence, str):
            # split the string into words and punctuation, since GloVe has embeddings for punctuation
            words = self._TOKEN_PATTERN.findall(sentence)
        else:
            try:
                words = list(sentence)
            except TypeError as e:
                raise TypeError('sentence should be either a str or a list of str!') from e
        # resolved once and lazily tolerated: None when no unk token is configured
        unk_index = self.stoi.get(self.unk)
        encoded_sentence = []
        for word in words:
            # the stoi dictionary only has lower case words, so lower-case first;
            # otherwise all capitalized words would just be given the unknown token
            index = self.stoi.get(word.lower(), unk_index)
            if index is None:
                raise KeyError(f'{word!r} is not in the vocabulary and no unk token is configured')
            encoded_sentence.append(index)
        return encoded_sentence

    # to decode numbers into a string
    def decode(self, encoded_sentence):
        """Convert an iterable of vocabulary indices back into a list of tokens.

        Raises:
            TypeError: if ``encoded_sentence`` cannot be converted to a list.
            KeyError: if an index is not in the vocabulary.
        """
        try:
            indices = list(encoded_sentence)
        except TypeError as e:
            raise TypeError(
                'encoded_sentence should be a data type that is convertible to list type!') from e
        return [self.itos[index] for index in indices]

    # takes an encoded sentence and returns the embeddings of shape (len(encoded_sentence), embedding size)
    def embedding(self, encoded_sentence):
        """Return the embedding rows for the given vocabulary indices."""
        indices = np.asarray(encoded_sentence)
        if indices.size == 0:
            # an empty list becomes a float array, which cannot be used as an index
            indices = indices.astype(np.intp)
        return self.embedding_matrix[indices]
    
# example:
# tokenizer = GloveTokenizer('embeddings/glove.6B.300d.txt')

In [11]:
# build the tokenizer; the constructor reads the whole GloVe file, so this cell loads every embedding into memory
tokenizer = GloveTokenizer('embeddings/glove.6B.300d.txt')

## The Dataset
Here, I am using the R8 dataset. This is a subset of the Reuters-21578 dataset, a collection of news articles that is one of the most widely used data collections for text categorization research. It was collected from the Reuters financial newswire service in 1987. The Reuters-21578 dataset has 10,369 documents and a vocabulary of 29,930 words. It can be found [here](https://paperswithcode.com/dataset/reuters-21578). The R8 dataset, which is used in the research paper, has 5,485 training examples, 2,189 test examples, and 8 classes. When the R8 dataset was created, all numbers were removed from the strings.

In [9]:
# The R8 files are tab-separated and have no header row, so pandas names the
# columns 0 (class label) and 1 (document text).
train_filename = 'r8-train-all-terms.txt'
test_filename = 'r8-test-all-terms.txt'
read_options = {'sep': '\t', 'header': None}
train_data = pd.read_csv(train_filename, **read_options)
test_data = pd.read_csv(test_filename, **read_options)
train_data.head()

Unnamed: 0,0,1
0,earn,champion products ch approves stock split cham...
1,acq,computer terminal systems cpml completes sale ...
2,earn,cobanco inc cbco year net shr cts vs dlrs net ...
3,earn,am international inc am nd qtr jan oper shr lo...
4,earn,brown forman inc bfd th qtr net shr one dlr vs...


# How TextLevelGNNDatasetClass.build_vocab() Works

In [12]:
# Map every class label to its one-hot encoded list.
# get_dummies orders the columns by sorted label name, while the rows follow
# the order in which the labels first appear in the training data.
# dtype=int pins the entries to 0/1 integers: newer pandas versions default to
# booleans, which would turn the vectors into True/False lists.
unique_labels = train_data[0].unique()
label_dict = dict(zip(unique_labels, pd.get_dummies(unique_labels, dtype=int).values.tolist()))
label_dict

{'earn': [0, 0, 1, 0, 0, 0, 0, 0],
 'acq': [1, 0, 0, 0, 0, 0, 0, 0],
 'trade': [0, 0, 0, 0, 0, 0, 0, 1],
 'ship': [0, 0, 0, 0, 0, 0, 1, 0],
 'grain': [0, 0, 0, 1, 0, 0, 0, 0],
 'crude': [0, 1, 0, 0, 0, 0, 0, 0],
 'interest': [0, 0, 0, 0, 1, 0, 0, 0],
 'money-fx': [0, 0, 0, 0, 0, 1, 0, 0]}

In [13]:
# randomly split the labelled training rows into a training subset and a validation subset
train_validation_split = 0.8
train_size = int(len(train_data) * train_validation_split)
validation_size = len(train_data) - train_size
train_dataset, validation_dataset = random_split(train_data.to_numpy(), [train_size, validation_size])

# tokenize every training document into words and the punctuation marks . , ! ? ;
vocab_list = []
for _, sentence in train_dataset:
    vocab_list.append(re.findall(r"[\w']+|[.,!?;]", sentence))

In [14]:
# Build the dataset vocabulary from the training documents and collect the
# matching GloVe embeddings.
# tokenize every training document into words and the punctuation marks . , ! ? ;
vocab_list = [re.findall(r"[\w']+|[.,!?;]", sentence) for _, sentence in train_dataset]

stoi = {'<unk>': 0, '<pad>': 1} # Re-index
itos = {0: '<unk>', 1: '<pad>'} # Re-index

# vocab_list is a list of lists, so flatten it into one collection of unique tokens.
# Tokens are lower-cased BEFORE de-duplicating, since the tokenizer does not
# recognize upper case, and then sorted: the iteration order of a set of
# strings can change between interpreter runs, which would otherwise give the
# same word a different id on every run.
unique_vocab = set()
for vocab in vocab_list:
    unique_vocab.update(word.lower() for word in vocab)
unique_vocab = sorted(unique_vocab)

for vocab in unique_vocab:
    # only words the tokenizer has an embedding for, and that are not already
    # registered (e.g. the special tokens), receive a new id
    if vocab in tokenizer.stoi and vocab not in stoi:
        new_index = len(stoi)
        stoi[vocab] = new_index
        itos[new_index] = vocab
vocab_count = len(stoi)

# stoi preserves insertion order, so row k of the matrix belongs to the token with id k
embedding_matrix = tokenizer.embedding(tokenizer.encode(list(stoi)))
print(f'shape of embedding matrix: {embedding_matrix.shape}')

shape of embedding matrix: (15394, 300)


# How create_neighbor_set Works

In [16]:
def create_neighbor_set(node_set, p=2):
    """Return, for every position of ``node_set``, the window of ids around it.

    The window of position ``i`` holds the ids at positions ``i - p`` through
    ``i + p`` (the id at ``i`` itself included), clipped to the bounds of the
    sequence, in their original order.

    Args:
        node_set: 1D sequence of int token ids.
        p: number of neighbors taken on each side; must be >= 0.

    Returns:
        A list with one list of ids per position of ``node_set``; an empty
        list when ``node_set`` is empty.

    Raises:
        ValueError: if the first element of ``node_set`` is not an int, or if
            ``p`` is negative.
    """
    # only inspect the first element when there is one, so an empty document
    # yields an empty result instead of an IndexError
    if len(node_set) > 0 and not isinstance(node_set[0], int):
        raise ValueError('node_set should be a 1D list!')
    if p < 0:
        raise ValueError('p should be an integer >= 0!')
    # max(0, ...) keeps the start from going negative, which would otherwise
    # make the slice count from the end of the sequence
    return [
        list(node_set[max(0, i - p):i + p + 1])
        for i in range(len(node_set))
    ]
# example:
# single_nodeset = [8145, 15469, 446, 6223, 5523, 4492, 8145, 11493, 11578, 9559]
# create_neighbor_set(single_nodeset)

In [17]:
MAX_LENGTH = 10
p = 2
# Encode each training document: keep only its first MAX_LENGTH tokens and map
# them to vocabulary ids, with id 0 ('<unk>') for tokens missing from stoi.
node_sets = []
for _, sentence in train_dataset:
    tokens = re.findall(r"[\w']+|[.,!?;]", sentence)
    node_sets.append([stoi.get(vocab, 0) for vocab in tokens[:MAX_LENGTH]])
# the window of ids around every position of every document
neighbor_sets = [create_neighbor_set(node_set, p=p) for node_set in node_sets]
# one-hot label of every training document, in the same order as node_sets
labels = []
for label, _ in train_dataset:
    labels.append(label_dict[label])

In [18]:
# inspect the vocabulary ids of the first training document (at most MAX_LENGTH of them)
node_sets[0]

[1043, 9855, 11196, 1675, 767, 3160, 4042, 1043, 9855, 6299]

In [19]:
# inspect the windows of the first training document: one list of ids per token position
neighbor_sets[0]

[[1043, 9855, 11196],
 [1043, 9855, 11196, 1675],
 [1043, 9855, 11196, 1675, 767],
 [9855, 11196, 1675, 767, 3160],
 [11196, 1675, 767, 3160, 4042],
 [1675, 767, 3160, 4042, 1043],
 [767, 3160, 4042, 1043, 9855],
 [3160, 4042, 1043, 9855, 6299],
 [4042, 1043, 9855, 6299],
 [1043, 9855, 6299]]

# How build_public_edge_mask Works

In [20]:
# Edge statistics as computed by the reference implementation; the author
# reports a test accuracy of around 0.96 with this variant.
# For every token of every window, each id in the document's whole node_set
# (not only the ids of that window) is used as a source row for that token.
min_freq = 2
edge_stat = torch.zeros(vocab_count, vocab_count)  # dense vocab_count x vocab_count count table

z = 0  # running number of visited (window, token) pairs; only the first two are printed
for node_set, neighbor_set in zip(node_sets, neighbor_sets):
    for neighbor in neighbor_set:
        for to_node in neighbor:
            show_debug = z < 2
            z += 1
            if show_debug:
                print(
                    f'neighbor_set: {neighbor_set}',
                    f'neighbor: {neighbor}',
                    f'node_set: {node_set}',
                    f'to_node: {to_node}',
                    '',
                    sep='\n',
                )
            # NOTE(review): if node_set contains the same id twice, this indexed
            # += presumably still adds only 1 to that (row, column) cell per
            # statement -- confirm against the PyTorch indexing semantics.
            edge_stat[node_set, to_node] += 1
public_edge_mask = edge_stat < min_freq  # True where an edge was counted fewer than min_freq times
#edge_df = pd.DataFrame(edge_stat.numpy())
#edge_df.iloc[[11215, 14384, 10347, 2464, 2370],[11215, 14384, 10347, 2464, 2370]]
#[itos[wrd] for wrd in [11215, 14384, 10347, 2464, 2370]]

neighbor_set: [[1043, 9855, 11196], [1043, 9855, 11196, 1675], [1043, 9855, 11196, 1675, 767], [9855, 11196, 1675, 767, 3160], [11196, 1675, 767, 3160, 4042], [1675, 767, 3160, 4042, 1043], [767, 3160, 4042, 1043, 9855], [3160, 4042, 1043, 9855, 6299], [4042, 1043, 9855, 6299], [1043, 9855, 6299]]
neighbor: [1043, 9855, 11196]
node_set: [1043, 9855, 11196, 1675, 767, 3160, 4042, 1043, 9855, 6299]
to_node: 1043

neighbor_set: [[1043, 9855, 11196], [1043, 9855, 11196, 1675], [1043, 9855, 11196, 1675, 767], [9855, 11196, 1675, 767, 3160], [11196, 1675, 767, 3160, 4042], [1675, 767, 3160, 4042, 1043], [767, 3160, 4042, 1043, 9855], [3160, 4042, 1043, 9855, 6299], [4042, 1043, 9855, 6299], [1043, 9855, 6299]]
neighbor: [1043, 9855, 11196]
node_set: [1043, 9855, 11196, 1675, 767, 3160, 4042, 1043, 9855, 6299]
to_node: 9855



In [21]:
# The author's reading of the research paper: tokens are connected only to the
# other tokens of the same window, in both directions.  The author reports a
# test accuracy of around 0.90 with this variant.
# NOTE(review): a pair (a, b) of distinct ids sharing a window is incremented
# once while to_node is a and once more while to_node is b, so a single
# co-occurrence already appears to reach min_freq = 2 -- confirm that the
# threshold is meant to work this way.
min_freq = 2
edge_stat = torch.zeros(vocab_count, vocab_count)  # dense vocab_count x vocab_count count table

z = 0  # running number of visited (window, token) pairs; only the first two are printed
for neighbor_set in neighbor_sets:
    for neighbor in neighbor_set:
        for to_node in neighbor:
            show_debug = z < 2
            z += 1
            if show_debug:
                print(
                    f'neighbor_set: {neighbor_set}',
                    f'neighbor: {neighbor}',
                    f'to_node: {to_node}',
                    '',
                    sep='\n',
                )
            # copy of the window without the first occurrence of to_node, so a
            # token is only linked to itself when it appears more than once in
            # the window
            first_match = neighbor.index(to_node)
            neighbors_temp = neighbor[:first_match] + neighbor[first_match + 1:]
            # connect all neighbors to the node
            edge_stat[neighbors_temp, to_node] += 1
            # connect the node to all neighbors
            edge_stat[to_node, neighbors_temp] += 1
public_edge_mask = edge_stat < min_freq # mark True at uncommon edges
#edge_df = pd.DataFrame(edge_stat.numpy())
#edge_df.iloc[[11215, 14384, 10347, 2464, 2370],[11215, 14384, 10347, 2464, 2370]]
#[itos[wrd] for wrd in [11215, 14384, 10347, 2464, 2370]]

neighbor_set: [[1043, 9855, 11196], [1043, 9855, 11196, 1675], [1043, 9855, 11196, 1675, 767], [9855, 11196, 1675, 767, 3160], [11196, 1675, 767, 3160, 4042], [1675, 767, 3160, 4042, 1043], [767, 3160, 4042, 1043, 9855], [3160, 4042, 1043, 9855, 6299], [4042, 1043, 9855, 6299], [1043, 9855, 6299]]
neighbor: [1043, 9855, 11196]
to_node: 1043

neighbor_set: [[1043, 9855, 11196], [1043, 9855, 11196, 1675], [1043, 9855, 11196, 1675, 767], [9855, 11196, 1675, 767, 3160], [11196, 1675, 767, 3160, 4042], [1675, 767, 3160, 4042, 1043], [767, 3160, 4042, 1043, 9855], [3160, 4042, 1043, 9855, 6299], [4042, 1043, 9855, 6299], [1043, 9855, 6299]]
neighbor: [1043, 9855, 11196]
to_node: 9855

