In [41]:
import numpy as np
import pandas as pd
import re
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

def tokenize(text):
    """Split ``text`` into lower-case word tokens.

    The input is lower-cased, every character that the regex treats as
    neither a word character nor whitespace is deleted (so punctuation
    vanishes without leaving a gap), and the remainder is split on runs
    of whitespace.

    Returns a list of strings; an empty or whitespace-only input yields
    an empty list.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return without_punctuation.split()

# Example corpus. It is assigned here, before its first use, so that this cell
# also runs on a fresh kernel; previously `text` only existed once a later
# cell had been executed. The same string is assigned again further down.
text = "I love natural language processing and I love machine learning I enjoy data science and explore deep learning techniques"

tokens = tokenize(text)

# Sorting the unique tokens gives every word a stable, reproducible index.
vocab = sorted(set(tokens))

# Forward (word -> index) and reverse (index -> word) lookup tables.
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for word, i in word_to_idx.items()}


In [42]:
# Small example corpus: a single unpunctuated string of space-separated words.
text = "I love natural language processing and I love machine learning I enjoy data science and explore deep learning techniques"


In [43]:
def generate_cooccurrence_matrix(tokens, window_size=2, word_index=None):
    """Count word co-occurrences inside a symmetric sliding window.

    For every position in ``tokens`` the tokens at most ``window_size``
    positions to its left and right (the position itself excluded) are
    treated as context, and ``matrix[center, context]`` is incremented once
    for each such pairing.

    Parameters
    ----------
    tokens : sequence of str
        The tokenized corpus, in reading order.
    window_size : int, default 2
        Number of neighbours considered on each side of the centre word.
    word_index : dict of str to int, optional
        Maps each word to its row/column. Indices are expected to be
        ``0 .. len(word_index) - 1``. When omitted, the notebook-level
        ``word_to_idx`` and ``vocab`` are used, which is the behaviour this
        function had before the parameter existed.

    Returns
    -------
    numpy.ndarray
        Square integer matrix with one row and one column per vocabulary
        entry.

    Raises
    ------
    KeyError
        If a token is missing from the word-to-index mapping.
    """
    if word_index is None:
        # Backward-compatible default: rely on the notebook globals.
        word_index = word_to_idx
        vocab_size = len(vocab)
    else:
        vocab_size = len(word_index)

    matrix = np.zeros((vocab_size, vocab_size), dtype=int)

    # Resolve every token to its index once, rather than on each window visit.
    token_ids = [word_index[token] for token in tokens]
    n_tokens = len(token_ids)

    for pos, center_idx in enumerate(token_ids):
        # Clamp the window to the bounds of the token sequence.
        start = max(0, pos - window_size)
        end = min(n_tokens, pos + window_size + 1)

        for ctx_pos in range(start, end):
            if ctx_pos != pos:
                matrix[center_idx, token_ids[ctx_pos]] += 1

    return matrix


In [44]:
# Count co-occurrences with two neighbours on each side of every word.
window_size = 2
co_matrix = generate_cooccurrence_matrix(tokens, window_size=window_size)

# Wrap the raw array in a DataFrame purely for a readable printout;
# rows and columns are labelled with integer vocabulary indices.
co_df = pd.DataFrame(data=co_matrix)
print(co_df)

# Legend translating those integer indices back to words.
print("\nWord Index:")
for i, word in enumerate(vocab):
    print(f"{i}: {word}")


    0   1   2   3   4   5   6   7   8   9   10  11  12  13
0    0   1   1   0   1   1   1   0   1   0   0   1   1   0
1    1   0   0   1   0   1   0   0   0   0   0   0   1   0
2    1   0   0   0   1   0   0   1   0   0   0   0   0   1
3    0   1   0   0   0   1   0   1   0   0   0   0   1   0
4    1   0   1   0   0   0   0   1   0   0   0   0   1   0
5    1   1   0   1   0   0   0   1   2   2   1   1   0   0
6    1   0   0   0   0   0   0   0   1   0   1   1   0   0
7    0   0   1   1   1   1   0   0   1   1   0   0   0   1
8    1   0   0   0   0   2   1   1   0   1   1   0   0   0
9    0   0   0   0   0   2   0   1   1   0   0   0   0   0
10   0   0   0   0   0   1   1   0   1   0   0   1   0   0
11   1   0   0   0   0   1   1   0   0   0   1   0   0   0
12   1   1   0   1   1   0   0   0   0   0   0   0   0   0
13   0   0   1   0   0   0   0   1   0   0   0   0   0   0

Word Index:
0: and
1: data
2: deep
3: enjoy
4: explore
5: i
6: language
7: learning
8: love
9: machine
10: natural
11: processing
12: science
13: techniques

In [45]:
# Each row of the co-occurrence matrix is used as that word's vector.
word_vecs = co_matrix
target_word = 'love'
target_idx = word_to_idx[target_word]

# cosine_similarity expects 2-D inputs, so the target row is wrapped in a
# list; [0] then unwraps the single row of results.
similarities = cosine_similarity([word_vecs[target_idx]], word_vecs)[0]

# Build the column label from target_word so it stays correct when the
# target is changed (it used to be hard-coded to 'similarity_to_love').
similarity_col = f'similarity_to_{target_word}'
sim_df = pd.DataFrame({'word': vocab, similarity_col: similarities})
print(sim_df.sort_values(by=similarity_col, ascending=False))


          word  similarity_to_love
8         love            1.000000
11  processing            0.833333
9      machine            0.680414
1         data            0.500000
3        enjoy            0.500000
10     natural            0.500000
5            i            0.445435
7     learning            0.377964
0          and            0.353553
2         deep            0.333333
4      explore            0.333333
6     language            0.333333
13  techniques            0.235702
12     science            0.166667
