##### Imports:

In [None]:
import os
import pandas as pds
import random
import torch
import torch.nn as nn
import numpy as np
from numpy import save
from google.colab import drive

##### Download the Kaggle common nouns dataset and a large corpus of Wikipedia sentences:

In [None]:
# Install the command-line tools used below (Colab shell commands).
!pip install kaggle --upgrade # for kaggle download
!pip install -U sentence-transformers # for SBERT pre-trained download

# Download the Wikipedia sentence corpus and the noun list from Kaggle.
# The Kaggle CLI reads its credentials from these two environment variables;
# they are intentionally left blank here and must be filled in before running.
os.environ['KAGGLE_USERNAME'] = ''
os.environ['KAGGLE_KEY'] = ''
!kaggle datasets download -d mikeortman/wikipedia-sentences # https://www.kaggle.com/datasets/mikeortman/wikipedia-sentences
!kaggle datasets download -d leite0407/list-of-nouns # https://www.kaggle.com/datasets/leite0407/list-of-nouns
# Unpack both archives into the working directory.
!unzip wikipedia-sentences.zip
!unzip list-of-nouns.zip

##### Load and clean the data:

In [None]:
# Load the sentence corpus as a 2-D array; downstream cells read column 0 of each row.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is its replacement and likewise drops rows that fail to parse.
# NOTE(review): read_csv uses its default comma delimiter and header handling here,
# so column 0 may hold only the text before a sentence's first comma -- confirm this is intended.
sentences = pds.read_csv('wikisent2.txt', on_bad_lines='skip').to_numpy()

# Flatten the single-column noun table into a plain list of nouns.
nouns = [row[0] for row in pds.read_csv('nounlist.csv').values.tolist()]

##### Create a words dict to map words to indices in the co-occurrence matrix:

In [None]:
# Lookup table: lower-cased noun -> its position in `nouns`, which is also its
# row/column in the co-occurrence matrix. If two nouns share a lower-cased
# spelling, the later position overwrites the earlier one.
words_dict = {noun.lower(): position for position, noun in enumerate(nouns)}

##### Create the comatrix (v 1.0, only look at next word):

In [None]:
def update_co_mat(co_mat, wrd_list):
    """Count each pair of neighbouring words in `wrd_list` once in both directions.

    co_mat   -- square matrix indexed as co_mat[row][col]; modified in place.
    wrd_list -- ordered words of one sentence; each must be a key of `words_dict`.
    """
    # Walk over consecutive (left, right) neighbours only (window size of one).
    for left_word, right_word in zip(wrd_list, wrd_list[1:]):
        left_idx = words_dict[left_word]
        right_idx = words_dict[right_word]
        # Keep the matrix symmetric by updating both orientations.
        co_mat[left_idx][right_idx] += 1
        co_mat[right_idx][left_idx] += 1


# make the matrix
# Build the co-occurrence matrix: one row/column per noun.
co_mat = np.zeros((len(nouns), len(nouns)))

# A set gives constant-time membership checks; scanning the `nouns` list for
# every word of every sentence is needlessly slow. Matching stays case-sensitive.
noun_set = set(nouns)

total_sentences = len(sentences)
progress_step = max(1, total_sentences // 10)  # report roughly every 10%

for counter, s in enumerate(sentences):
  # Report progress for every milestone, not only when the sentence at that
  # position happens to contain nouns.
  if counter % progress_step == 0:
    print(f"{(counter / total_sentences) * 100:.1f}%")

  s2 = s[0]
  # Skip rows that are not text and sentences that start with a digit.
  if not isinstance(s2, str) or s2[0:1].isdigit():
    continue

  # Keep only the known nouns, lower-cased, in sentence order.
  wrd_list = [word.lower() for word in s2.split() if word in noun_set]
  if wrd_list:
    update_co_mat(co_mat, wrd_list)

##### Create comatrix v2.0: look at all nouns in the sentence and scale relevance by distance, using (1/2)^n for two nouns that are n positions apart in the sentence's list of nouns

In [None]:
def update_co_mat(co_mat, wrd_list):
    """Add distance-weighted co-occurrence counts for every pair of words in `wrd_list`.

    Two words that are n positions apart in `wrd_list` contribute (1/2)**n to
    both co_mat[a][b] and co_mat[b][a].

    co_mat   -- square matrix indexed as co_mat[row][col]; modified in place.
    wrd_list -- ordered words of one sentence; each must be a key of `words_dict`
                (a missing word raises KeyError before anything is modified).
    """
    indices = [words_dict[word] for word in wrd_list]
    for i in range(len(indices) - 1):
        # The inner range must run to len(indices) so the last word is paired too;
        # stopping one short silently dropped every pair involving the final word.
        for j in range(i + 1, len(indices)):
            weight = (1/2) ** (j - i)
            co_mat[indices[i]][indices[j]] += weight
            co_mat[indices[j]][indices[i]] += weight



# make the matrix
# Build the co-occurrence matrix: one row/column per noun.
co_mat = np.zeros((len(nouns), len(nouns)))

# A set gives constant-time membership checks; scanning the `nouns` list for
# every word of every sentence is needlessly slow. Matching stays case-sensitive.
noun_set = set(nouns)

total_sentences = len(sentences)
progress_step = max(1, total_sentences // 10)  # report roughly every 10%

for counter, s in enumerate(sentences):
  # Report progress for every milestone, not only when the sentence at that
  # position happens to contain nouns.
  if counter % progress_step == 0:
    print(f"{(counter / total_sentences) * 100:.1f}%")

  s2 = s[0]
  # Skip rows that are not text and sentences that start with a digit.
  if not isinstance(s2, str) or s2[0:1].isdigit():
    continue

  # Keep only the known nouns, lower-cased, in sentence order.
  wrd_list = [word.lower() for word in s2.split() if word in noun_set]
  if wrd_list:
    update_co_mat(co_mat, wrd_list)

20.0%
50.0%
70.0%
80.0%
90.0%


##### Create comatrix v3.0: look at all nouns in the sentence and scale relevance by distance, using (1/n) for two nouns that are n positions apart in the sentence's list of nouns

In [None]:
def update_co_mat(co_mat, wrd_list):
    """Add distance-weighted co-occurrence counts for every pair of words in `wrd_list`.

    Two words that are n positions apart in `wrd_list` contribute 1/n to both
    co_mat[a][b] and co_mat[b][a].

    co_mat   -- square matrix indexed as co_mat[row][col]; modified in place.
    wrd_list -- ordered words of one sentence; each must be a key of `words_dict`
                (a missing word raises KeyError before anything is modified).
    """
    indices = [words_dict[word] for word in wrd_list]
    for i in range(len(indices) - 1):
        # The inner range must run to len(indices) so the last word is paired too;
        # stopping one short silently dropped every pair involving the final word.
        for j in range(i + 1, len(indices)):
            weight = 1 / (j - i)
            co_mat[indices[i]][indices[j]] += weight
            co_mat[indices[j]][indices[i]] += weight



# make the matrix
# Build the co-occurrence matrix: one row/column per noun.
co_mat = np.zeros((len(nouns), len(nouns)))

# A set gives constant-time membership checks; scanning the `nouns` list for
# every word of every sentence is needlessly slow. Matching stays case-sensitive.
noun_set = set(nouns)

total_sentences = len(sentences)
progress_step = max(1, total_sentences // 10)  # report roughly every 10%

for counter, s in enumerate(sentences):
  # Report progress for every milestone, not only when the sentence at that
  # position happens to contain nouns.
  if counter % progress_step == 0:
    print(f"{(counter / total_sentences) * 100:.1f}%")

  s2 = s[0]
  # Skip rows that are not text and sentences that start with a digit.
  if not isinstance(s2, str) or s2[0:1].isdigit():
    continue

  # Keep only the known nouns, lower-cased, in sentence order.
  wrd_list = [word.lower() for word in s2.split() if word in noun_set]
  if wrd_list:
    update_co_mat(co_mat, wrd_list)

Test the comatrix:

In [None]:
print("Co-occurrence Matrix: ")
print(co_mat.shape)

# Spot-check two groups of three words: every pair inside a group is printed,
# with a blank line separating the groups.
words = ["water", "bottle", "river", "dog", "park", "tree"]
pair_order = [(0, 1), (0, 2), (1, 2)]
for group_offset in (0, 3):
  if group_offset:
    print()
  for first, second in pair_order:
    row = words_dict[words[group_offset + first]]
    col = words_dict[words[group_offset + second]]
    print(co_mat[row][col])

Save the comatrix to Google Drive:

In [None]:
# Attach Google Drive so the matrix outlives the Colab session.
drive.mount('/content/drive')

# Write the matrix in NumPy's binary .npy format.
# NOTE(review): the cell that reloads the matrix reads from a "J-Term 2023"
# sub-folder, not from this location -- confirm which path is intended.
comatrix_path = '/content/drive/My Drive/comatrix3.npy'
with open(comatrix_path, 'wb') as out_file:
  np.save(out_file, co_mat)

In [None]:
# Reload the saved co-occurrence matrix from Drive.
with open('/content/drive/My Drive/J-Term 2023/comatrix3.npy', 'rb') as f:
  # NOTE(review): allow_pickle=True can execute arbitrary code from a tampered
  # file; a plain numeric array does not need it -- consider allow_pickle=False.
  co_mat = np.load(f, allow_pickle=True)

# First ten columns of every row, as a float array.
reduced = np.array(co_mat[:, :10], dtype=float)

# Reverse lookup: matrix position -> lower-cased noun. This is kept in its own
# dict; writing these integer keys into `words_dict` would pollute the
# word -> index table that the rest of the notebook relies on.
words_dict_rev = {position: noun.lower() for position, noun in enumerate(nouns)}

# Label rows and columns with the nouns for a readable display.
df = pds.DataFrame(co_mat).rename(columns=words_dict_rev, index=words_dict_rev)
print(df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                   cd       suv          tv  aardvark  abacus  abbey  \
cd         113.074607  0.000000    2.833333       0.0     0.0    0.0   
suv          0.000000  0.666667    0.000000       0.0     0.0    0.0   
tv           2.833333  0.000000  374.113051       0.0     0.0    0.0   
aardvark     0.000000  0.000000    0.000000       0.0     0.0    0.0   
abacus       0.000000  0.000000    0.000000       0.0     0.0    0.0   
...               ...       ...         ...       ...     ...    ...   
zoo          0.000000  0.000000    0.333333       0.0     0.0    0.0   
zoologist    0.000000  0.000000    0.000000       0.0     0.0    0.0   
zoology      0.000000  0.000000    0.000000       0.0     0.0    0.0   
zoot-suit    0.000000  0.000000    0.000000       0.0     0.0    0.0   
zucchini     0.000000  0.000000    0.000000       0.0     0.0    0.0   

      