# Natural Language Processing

# Calculating term-term co-occurrence matrix with a context window ±3.

    Text:
    "It is going to rain today"
    "Today I am not going outside"
    "NLP is an interesting topic"
    "NLP includes ML DL topics too"
    "I am going to complete NLP homework today"

In [2]:
from collections import defaultdict
import pandas as pd
import numpy as np

def co_occurrence(sentences, window_size):
    d = defaultdict(int)
    vocab = set()
    for text in sentences:
        # preprocessing (use tokenizer instead)
        text = text.lower().split()
        # iterate over sentences
        for i in range(len(text)):
            token = text[i]
            vocab.add(token)  # add to vocab
            next_token = text[i+1 : i+1+window_size]
            for t in next_token:
                key = tuple( sorted([t, token]) )
                d[key] += 1
    
    # formulate the dictionary into dataframe
    vocab = sorted(vocab) # sort vocab
    df = pd.DataFrame(data=np.zeros((len(vocab), len(vocab)), dtype=np.int16),
                      index=vocab,
                      columns=vocab)
    for key, value in d.items():
        df.at[key[0], key[1]] = value
        df.at[key[1], key[0]] = value
    return df

text = ["It is going to rain today", "Today I am not going outside", "NLP is an interesting topic", "NLP includes ML DL topics too", "I am going to complete NLP homework today"] 
df = co_occurrence(text, 3)
# df.to_csv("term-term_Co-occurrenceMatrix.csv")
df

Unnamed: 0,am,an,complete,dl,going,homework,i,includes,interesting,is,...,ml,nlp,not,outside,rain,to,today,too,topic,topics
am,0,0,1,0,2,0,2,0,0,0,...,0,0,1,1,0,1,1,0,0,0
an,0,0,0,0,0,0,0,0,1,1,...,0,1,0,0,0,0,0,0,1,0
complete,1,0,0,0,1,1,0,0,0,0,...,0,1,0,0,0,1,1,0,0,0
dl,0,0,0,0,0,0,0,1,0,0,...,1,1,0,0,0,0,0,1,0,1
going,2,0,1,0,0,0,2,0,0,1,...,0,1,1,1,1,2,1,0,0,0
homework,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0,0
i,2,0,0,0,2,0,0,0,0,0,...,0,0,1,0,0,1,1,0,0,0
includes,0,0,0,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
interesting,0,1,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
is,0,1,0,0,1,0,0,0,1,0,...,0,1,0,0,1,1,0,0,1,0


In [3]:
columns = df.columns.to_list()
col = [df[i].to_list() for i in columns]
matrix = col[1:]
print("Term-Term Co-occurrence Matrix: ")
matrix

Term-Term Co-occurrence Matrix: 


[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0],
 [1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1],
 [2, 0, 1, 0, 0, 0, 2, 0, 0, 1, 1, 0, 1, 1, 1, 1, 2, 1, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0],
 [2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1],
 [0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0],
 [1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
 [1, 0, 1,