<a href="https://colab.research.google.com/github/hazelhkim/Pytorch-Framework/blob/master/HeteroGraph_for_Word_Doc_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data

### Remove Words

In [1]:
from nltk.corpus import stopwords # Stopwords are the words in any language which does not add much meaning to a sentence. They can safely be ignored without sacrificing the meaning of the sentence. For some search engines, these are some of the most common, short function words, such as the, is, at, which, and on.
import nltk
from nltk.wsd import lesk # WSD = Word Sense Disambiguation; lesk(list, word, noun 'n') -> Synset('...')
from nltk.corpus import wordnet as wn 

In [2]:
# There is also a corpus of stopwords, that is, high-frequency words like "the", "to" and "also"
# that we sometimes want to filter out of a document before further processing. 
# Stopwords usually have little lexical content, and their presence in a text fails to distinguish it from other texts.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
{'on', 'i', 'once', 'doesn', 'when', 'theirs', 'will', 'should', 'yourselves', "doesn't", "she's", 'aren', 'into', 'while', 'there', 'with', 'her', 'and', 'had', 'if', 'wouldn', 'she', 'before', 're', "mustn't", 'it', "needn't", 'here', 'hadn', "shan't", 'out', 'during', 'why', 'hers', 'again', 'what', 'which', 'only', 'me', 'more', 'under', 'were', 'this', 'hasn', 'that', 'an', 'll', "you're", "mightn't", 'yours', 'just', 'are', 'doing', 'than', 'the', 'at', 'ain', 'any', "shouldn't", 'do', 'a', 'themselves', 'needn', 'nor', "wasn't", 'because', 'how', 'won', 'couldn', 'was', 'whom', 'being', 'ours', "it's", 'didn', "you've", 'who', "didn't", 'wasn', "haven't", 'm', 'down', 'am', 'too', 'few', 'mightn', "won't", 'over', 'its', 'until', 'to', 'my', 'been', 'about', 'where', 'has', 'herself', 'below', 'did', 'most', 'all', 'no', 'or', 'some', 'then', 'he', 'between', "a

In [3]:
doc_content_list = []
#with open('data/wiki_long_abstracts_en_text.txt', 'r') as f:
with open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/corpus/ohsumed.txt', 'rb') as f:
    for line in f.readlines():
        doc_content_list.append(line.strip().decode('latin1'))

print(len(doc_content_list)) #7400

7400


In [4]:
import re

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [5]:
# to remove rare words

word_freq = {}  

for doc_content in doc_content_list:
  temp = clean_str(doc_content)
  words = temp.split()
  for word in words:
    if word in word_freq:
        word_freq[word] += 1
    else:
        word_freq[word] = 1

print(word_freq)



In [6]:
clean_docs = []
for doc_content in doc_content_list:
  temp = clean_str(doc_content)
  words = temp.split()
  doc_words = []
  for word in words:
    # word not in stop_words and word_freq[word] >= 5
    if word not in stop_words and word_freq[word] >= 5:
      doc_words.append(word)
  doc_str = ' '.join(doc_words).strip()
  clean_docs.append(doc_str)
clean_corpus_str = '\n'.join(clean_docs)

In [7]:
print(clean_corpus_str)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [8]:
#with open('../data/wiki_long_abstracts_en_text.clean.txt', 'w') as f:
with open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/corpus/ohsumed.txt', 'w') as f:
    f.write(clean_corpus_str)

In [9]:
min_len = 10000
aver_len = 0
max_len = 0 

with open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/corpus/ohsumed.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        line = line.strip()
        temp = line.split()
        aver_len = aver_len + len(temp)
        if len(temp) < min_len:
            min_len = len(temp)
        if len(temp) > max_len:
            max_len = len(temp)

aver_len = 1.0 * aver_len / len(lines)
print('Min_len : ' + str(min_len))
print('Max_len : ' + str(max_len))
print('Average_len : ' + str(aver_len))

Min_len : 14
Max_len : 476
Average_len : 135.81837837837838


### Build Graph

In [10]:
# Read Word Vectors
doc_name_list = []
doc_train_list = []
doc_test_list = []

with open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/ohsumed.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        doc_name_list.append(line.strip())
        temp = line.split("\t")
        if temp[1].find('test') != -1:
            doc_test_list.append(line.strip())
        elif temp[1].find('train') != -1:
            doc_train_list.append(line.strip())
print(len(doc_train_list)) #3357
print(len(doc_test_list))  #4043

3357
4043


In [11]:
doc_content_list = []
with open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/corpus/ohsumed.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        doc_content_list.append(line.strip())
print(len(doc_content_list)) #7400


7400


In [12]:
## For Train Dataset
import random

train_ids = []
for train_name in doc_train_list:
    train_id = doc_name_list.index(train_name)
    train_ids.append(train_id)
print(train_ids)
random.shuffle(train_ids)


[4043, 4044, 4045, 4046, 4047, 4048, 4049, 4050, 4051, 4052, 4053, 4054, 4055, 4056, 4057, 4058, 4059, 4060, 4061, 4062, 4063, 4064, 4065, 4066, 4067, 4068, 4069, 4070, 4071, 4072, 4073, 4074, 4075, 4076, 4077, 4078, 4079, 4080, 4081, 4082, 4083, 4084, 4085, 4086, 4087, 4088, 4089, 4090, 4091, 4092, 4093, 4094, 4095, 4096, 4097, 4098, 4099, 4100, 4101, 4102, 4103, 4104, 4105, 4106, 4107, 4108, 4109, 4110, 4111, 4112, 4113, 4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132, 4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145, 4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158, 4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171, 4172, 4173, 4174, 4175, 4176, 4177, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185, 4186, 4187, 4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200, 4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 420

In [13]:
# partially labeled data
# train_ids = 0.2 of the total data

train_ids_str = '\n'.join(str(index) for index in train_ids)
with open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/ohsumed.train.index', 'w') as f:
  f.write(train_ids_str)


In [14]:
## For Test Dataset

test_ids = []
for test_name in doc_test_list:
  test_id = doc_name_list.index(test_name)
  test_ids.append(test_id)

print(test_ids)
random.shuffle(test_ids)

test_ids_str = '\n'.join(str(index) for index in test_ids)
with open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/ohsumed.test.index', 'w') as f:
    f.write(test_ids_str)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [15]:
ids = train_ids + test_ids
print(ids)
print(len(ids))

[5691, 6956, 5080, 7268, 4608, 4904, 4453, 6488, 4536, 5510, 7247, 4503, 5426, 5370, 4934, 4742, 4077, 7379, 4559, 4821, 6480, 6919, 4593, 6797, 5766, 4284, 6275, 5880, 5854, 5151, 4910, 6975, 4127, 6170, 4896, 4990, 5154, 5044, 6479, 6543, 4346, 5494, 4239, 4618, 5171, 7229, 6773, 5738, 4232, 6224, 4554, 6597, 7328, 6934, 7255, 4443, 7358, 6729, 6899, 4286, 5759, 4606, 6306, 4217, 4706, 4641, 6837, 5335, 5029, 6760, 4815, 5778, 5054, 4609, 4835, 5362, 7231, 6847, 6375, 5895, 5327, 4582, 6527, 5261, 5193, 4900, 4636, 7264, 5131, 5737, 4247, 5971, 6563, 4433, 4471, 5078, 4703, 6405, 6340, 4242, 4941, 4890, 5027, 5927, 5166, 5876, 6867, 5326, 4317, 5083, 7394, 6647, 6765, 4490, 6857, 7252, 4132, 4708, 4183, 5299, 4806, 4790, 4846, 6362, 4879, 6848, 5095, 6027, 6142, 7064, 6568, 5186, 6264, 4352, 7244, 4581, 4935, 4829, 5794, 4291, 5515, 4839, 5967, 5573, 7105, 5517, 5522, 5115, 5321, 5838, 5920, 4766, 5403, 4627, 4525, 4494, 5767, 5089, 4221, 6575, 5931, 5844, 6044, 4136, 6616, 7002, 486

In [16]:
shuffle_doc_name_list = []
shuffle_doc_words_list = []

for id in ids:
    shuffle_doc_name_list.append(doc_name_list[int(id)])
    shuffle_doc_words_list.append(doc_content_list[int(id)])
shuffle_doc_name_str = '\n'.join(shuffle_doc_name_list)
shuffle_doc_words_str = '\n'.join(shuffle_doc_words_list)

with open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/ohsumed_shuffle.txt', 'w') as f:
    f.write(shuffle_doc_name_str)

with open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/corpus/ohsumed_shuffle.txt', 'w') as f:
    f.write(shuffle_doc_words_str)

In [17]:
## Build vocab

# Word frequency
word_freq = {}
word_set = set()
for doc_words in shuffle_doc_words_list:
    words = doc_words.split()
    for word in words:
        word_set.add(word)
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1
vocab = list(word_set)
vocab_size = len(vocab)
print(vocab)
print(vocab_size)

14157


In [18]:
# word_doc_list set with document index
word_doc_list = {}

for i in range(len(shuffle_doc_words_list)):
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    appeared = set()
    for word in words:
        if word in appeared:
            continue
        if word in word_doc_list:
            doc_list = word_doc_list[word]
            doc_list.append(i)
            word_doc_list[word] = doc_list
        else:
            word_doc_list[word] = [i]
        appeared.add(word)

In [19]:
word_doc_freq = {}
for word, doc_list in word_doc_list.items():
    word_doc_freq[word] = len(doc_list)

word_id_map = {}
for i in range(vocab_size):
    word_id_map[vocab[i]] = i

vocab_str = '\n'.join(vocab)

with open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/corpus/ohsumed_vocab.txt', 'w') as f:
    f.write(vocab_str)

In [20]:
## Label List
label_set = set()
for doc_meta in shuffle_doc_name_list:
  temp = doc_meta.split('\t')
  label_set.add(temp[2])
label_list = list(label_set)

label_list_str = '\n'.join(label_list)
with open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/corpus/ohsumed_labels.txt', 'w') as f:
  f.write(label_list_str)


In [21]:
# x: feature vectors of training docs, no initial features
# slect 90% training set
train_size = len(train_ids)
val_size = int(0.1 * train_size)
real_train_size = train_size - val_size  # - int(0.5 * train_size)
# different training rates

real_train_doc_names = shuffle_doc_name_list[:real_train_size]
real_train_doc_names_str = '\n'.join(real_train_doc_names)

with open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/ohsumed.real_train.name', 'w') as f:
    f.write(real_train_doc_names_str)


In [22]:
def loadWord2Vec(filename):
    """Read Word Vectors"""
    vocab = []
    embd = []
    word_vector_map = {}
    file = open(filename, 'r')
    for line in file.readlines():
        row = line.strip().split(' ')
        if(len(row) > 2):
            vocab.append(row[0])
            vector = row[1:]
            length = len(vector)
            for i in range(length):
                vector[i] = float(vector[i])
            embd.append(vector)
            word_vector_map[row[0]] = vector
    print('Loaded Word Vectors!')
    file.close()
    return vocab, embd, word_vector_map

In [23]:
## word definitions 
import nltk
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer

definitions = []

for word in vocab:
    word = word.strip()
    synsets = wn.synsets(clean_str(word))
    word_defs = []
    for synset in synsets:
        syn_def = synset.definition()
        word_defs.append(syn_def)
    word_des = ' '.join(word_defs)
    if word_des == '':
        word_des = '<PAD>'
    definitions.append(word_des)

string = '\n'.join(definitions)


f = open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/corpus/ohsumed_vocab_def.txt', 'w')
f.write(string)
f.close()

tfidf_vec = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vec.fit_transform(definitions)
tfidf_matrix_array = tfidf_matrix.toarray()
print(tfidf_matrix_array[0], len(tfidf_matrix_array[0]))

word_vectors = []

for i in range(len(vocab)):
    word = vocab[i]
    vector = tfidf_matrix_array[i]
    str_vector = []
    for j in range(len(vector)):
        str_vector.append(str(vector[j]))
    temp = ' '.join(str_vector)
    word_vector = word + ' ' + temp
    word_vectors.append(word_vector)

string = '\n'.join(word_vectors)

f = open('/content/drive/My Drive/Colab Notebooks/AAAI21/data/corpus/ohsumed_word_vectors.txt', 'w')
f.write(string)
f.close()

word_vector_file = '/content/drive/My Drive/Colab Notebooks/AAAI21/data/corpus/ohsumed_word_vectors.txt'
_, embd, word_vector_map = loadWord2Vec(word_vector_file)
word_embeddings_dim = len(embd[0])



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.32785776 0.         0.39058103
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.    

In [24]:
import numpy as np

row_x = []
col_x = []
data_x = []
for i in range(real_train_size):
    doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            # print(doc_vec)
            # print(np.array(word_vector))
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embeddings_dim):
        row_x.append(i)
        col_x.append(j)
        # np.random.uniform(-0.25, 0.25)
        data_x.append(doc_vec[j] / doc_len)  # doc_vec[j]/ doc_len

In [25]:
import scipy.sparse as sp
# x = sp.csr_matrix((real_train_size, word_embeddings_dim), dtype=np.float32)
x = sp.csr_matrix((data_x, (row_x, col_x)), shape=(
    real_train_size, word_embeddings_dim))

y = []
for i in range(real_train_size):
    doc_meta = shuffle_doc_name_list[i]
    temp = doc_meta.split('\t')
    label = temp[2]
    one_hot = [0 for l in range(len(label_list))]
    label_index = label_list.index(label)
    one_hot[label_index] = 1
    y.append(one_hot)
y = np.array(y)
print(y)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [26]:
# tx: feature vectors of test docs, no initial features
test_size = len(test_ids)

row_tx = []
col_tx = []
data_tx = []
for i in range(test_size):
    doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
    doc_words = shuffle_doc_words_list[i + train_size]
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embeddings_dim):
        row_tx.append(i)
        col_tx.append(j)
        # np.random.uniform(-0.25, 0.25)
        data_tx.append(doc_vec[j] / doc_len)  # doc_vec[j] / doc_len


In [27]:
# tx = sp.csr_matrix((test_size, word_embeddings_dim), dtype=np.float32)
tx = sp.csr_matrix((data_tx, (row_tx, col_tx)),
                   shape=(test_size, word_embeddings_dim))

ty = []
for i in range(test_size):
    doc_meta = shuffle_doc_name_list[i + train_size]
    temp = doc_meta.split('\t')
    label = temp[2]
    one_hot = [0 for l in range(len(label_list))]
    label_index = label_list.index(label)
    one_hot[label_index] = 1
    ty.append(one_hot)
ty = np.array(ty)
print(ty)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [28]:
# allx: the the feature vectors of both labeled and unlabeled training instances
# (a superset of x)
# unlabeled training instances -> words
word_vectors = np.random.uniform(-0.01, 0.01,
                                 (vocab_size, word_embeddings_dim))

for i in range(len(vocab)):
    word = vocab[i]
    if word in word_vector_map:
        vector = word_vector_map[word]
        word_vectors[i] = vector

In [29]:
row_allx = []
col_allx = []
data_allx = []

for i in range(train_size):
    doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embeddings_dim):
        row_allx.append(int(i))
        col_allx.append(j)
        # np.random.uniform(-0.25, 0.25)
        data_allx.append(doc_vec[j] / doc_len)  # doc_vec[j]/doc_len



for i in range(vocab_size):
    for j in range(word_embeddings_dim):
        row_allx.append(int(i + train_size))
        col_allx.append(j)
        data_allx.append(word_vectors.item((i, j)))


row_allx = np.array(row_allx)
col_allx = np.array(col_allx)
data_allx = np.array(data_allx)


In [30]:
allx = sp.csr_matrix(
    (data_allx, (row_allx, col_allx)), shape=(train_size + vocab_size, word_embeddings_dim))

ally = []
for i in range(train_size):
    doc_meta = shuffle_doc_name_list[i]
    temp = doc_meta.split('\t')
    label = temp[2]
    one_hot = [0 for l in range(len(label_list))]
    label_index = label_list.index(label)
    one_hot[label_index] = 1
    ally.append(one_hot)

for i in range(vocab_size):
    one_hot = [0 for l in range(len(label_list))]
    ally.append(one_hot)

ally = np.array(ally)

print(x.shape, y.shape, tx.shape, ty.shape, allx.shape, ally.shape)
#(3022, 1000) (3022, 23) (4043, 1000) (4043, 23) (17514, 1000) (17514, 23)

(3022, 1000) (3022, 23) (4043, 1000) (4043, 23) (17514, 1000) (17514, 23)


In [31]:
'''
Doc word heterogeneous graph
'''

# word co-occurence with context windows
window_size = 20
windows = []

for doc_words in shuffle_doc_words_list:
    words = doc_words.split()
    length = len(words)
    if length <= window_size:
        windows.append(words)
    else:
        # print(length, length - window_size + 1)
        for j in range(length - window_size + 1):
            window = words[j: j + window_size]
            windows.append(window)
            # print(window)


In [32]:
word_window_freq = {}
for window in windows:
    appeared = set()
    for i in range(len(window)):
        if window[i] in appeared:
            continue
        if window[i] in word_window_freq:
            word_window_freq[window[i]] += 1
        else:
            word_window_freq[window[i]] = 1
        appeared.add(window[i])


In [33]:
word_pair_count = {}
for window in windows:
    for i in range(1, len(window)):
        for j in range(0, i):
            word_i = window[i]
            word_i_id = word_id_map[word_i]
            word_j = window[j]
            word_j_id = word_id_map[word_j]
            if word_i_id == word_j_id:
                continue
            word_pair_str = str(word_i_id) + ',' + str(word_j_id)
            if word_pair_str in word_pair_count:
                word_pair_count[word_pair_str] += 1
            else:
                word_pair_count[word_pair_str] = 1
            # two orders
            word_pair_str = str(word_j_id) + ',' + str(word_i_id)
            if word_pair_str in word_pair_count:
                word_pair_count[word_pair_str] += 1
            else:
                word_pair_count[word_pair_str] = 1

In [34]:
row = []
col = []
weight = []

from math import log

# pmi as weights

num_window = len(windows)

for key in word_pair_count:
    temp = key.split(',')
    i = int(temp[0])
    j = int(temp[1])
    count = word_pair_count[key]
    word_freq_i = word_window_freq[vocab[i]]
    word_freq_j = word_window_freq[vocab[j]]
    pmi = log((1.0 * count / num_window) /
              (1.0 * word_freq_i * word_freq_j/(num_window * num_window)))
    if pmi <= 0:
        continue
    row.append(train_size + i)
    col.append(train_size + j)
    weight.append(pmi)


In [35]:
# word vector cosine similarity as weights

'''
for i in range(vocab_size):
    for j in range(vocab_size):
        if vocab[i] in word_vector_map and vocab[j] in word_vector_map:
            vector_i = np.array(word_vector_map[vocab[i]])
            vector_j = np.array(word_vector_map[vocab[j]])
            similarity = 1.0 - cosine(vector_i, vector_j)
            if similarity > 0.9:
                print(vocab[i], vocab[j], similarity)
                row.append(train_size + i)
                col.append(train_size + j)
                weight.append(similarity)
'''
# doc word frequency
doc_word_freq = {}

for doc_id in range(len(shuffle_doc_words_list)):
    doc_words = shuffle_doc_words_list[doc_id]
    words = doc_words.split()
    for word in words:
        word_id = word_id_map[word]
        doc_word_str = str(doc_id) + ',' + str(word_id)
        if doc_word_str in doc_word_freq:
            doc_word_freq[doc_word_str] += 1
        else:
            doc_word_freq[doc_word_str] = 1

for i in range(len(shuffle_doc_words_list)):
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    doc_word_set = set()
    for word in words:
        if word in doc_word_set:
            continue
        j = word_id_map[word]
        key = str(i) + ',' + str(j)
        freq = doc_word_freq[key]
        if i < train_size:
            row.append(i)
        else:
            row.append(i + vocab_size)
        col.append(train_size + j)
        idf = log(1.0 * len(shuffle_doc_words_list) /
                  word_doc_freq[vocab[j]])
        weight.append(freq * idf)
        doc_word_set.add(word)

node_size = train_size + vocab_size + test_size
adj = sp.csr_matrix(
    (weight, (row, col)), shape=(node_size, node_size))

In [36]:
import pickle as pkl

# dump objects
with open("/content/drive/My Drive/Colab Notebooks/AAAI21/data/ind.ohsumed.x", 'wb') as f:
    pkl.dump(x, f)

with open("/content/drive/My Drive/Colab Notebooks/AAAI21/data/ind.ohsumed.y", 'wb') as f:
    pkl.dump(y, f)

with open("/content/drive/My Drive/Colab Notebooks/AAAI21/data/ind.ohsumed.tx", 'wb') as f:
    pkl.dump(tx, f)

with open("/content/drive/My Drive/Colab Notebooks/AAAI21/data/ind.ohsumed.ty", 'wb') as f:
    pkl.dump(ty, f)

with open("/content/drive/My Drive/Colab Notebooks/AAAI21/data/ind.ohsumed.allx", 'wb') as f:
    pkl.dump(allx, f)

with open("/content/drive/My Drive/Colab Notebooks/AAAI21/data/ind.ohsumed.ally", 'wb') as f:
    pkl.dump(ally, f)

with open("/content/drive/My Drive/Colab Notebooks/AAAI21/data/ind.ohsumed.adj", 'wb') as f:
    pkl.dump(adj, f)
