<a href="https://colab.research.google.com/github/jasmis1229/task3-1/blob/main/task3_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from collections import defaultdict
from scipy import sparse

def build_cooccurrence_matrix(tokenized_corpus, word_to_id, window_size=2):
    """Build a distance-weighted word co-occurrence matrix.

    For every in-vocabulary token (the "center"), each in-vocabulary token
    at most ``window_size`` positions away in the same sentence (the
    "context") adds ``1 / distance`` to the entry ``[center_id, context_id]``.
    Tokens missing from ``word_to_id`` are skipped but still occupy a
    position, so they count towards the distance between their neighbours.

    Args:
        tokenized_corpus: Iterable of sentences, each a sequence of tokens.
        word_to_id: Mapping from token to its integer row/column index.
            NOTE(review): ids are assumed to lie in ``range(len(word_to_id))``;
            the matrix shape is derived from the mapping size only.
        window_size: Maximum distance between center and context positions.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape ``(vocab_size, vocab_size)``
        holding the accumulated weights. If no co-occurrence is found, an
        all-zero matrix of that shape is returned.
    """
    vocab_size = len(word_to_id)
    cooccurrence_dict = defaultdict(float)

    for sentence in tokenized_corpus:
        # Resolve every token once; None marks an out-of-vocabulary token.
        token_ids = [word_to_id.get(token) for token in sentence]
        sentence_length = len(token_ids)

        for i, center_id in enumerate(token_ids):
            if center_id is None:
                continue
            start = max(0, i - window_size)
            stop = min(sentence_length, i + window_size + 1)
            for j in range(start, stop):
                if j == i:
                    continue
                context_id = token_ids[j]
                if context_id is None:
                    continue
                # Closer neighbours contribute more: weight is 1 / distance.
                cooccurrence_dict[(center_id, context_id)] += 1.0 / abs(j - i)

    # Unpacking zip(*...) of an empty collection fails, so return an empty
    # matrix directly when nothing co-occurred.
    if not cooccurrence_dict:
        return sparse.csr_matrix((vocab_size, vocab_size), dtype=float)

    # Keys and values of a dict iterate in the same order.
    row, col = zip(*cooccurrence_dict)
    data = list(cooccurrence_dict.values())
    return sparse.csr_matrix((data, (row, col)), shape=(vocab_size, vocab_size))

In [None]:
# Context-window sizes to compare.
window_sizes = [1, 3, 5]

# One co-occurrence matrix per window size, keyed by that size.
matrices = {
    size: build_cooccurrence_matrix(
        tokenized_corpus,
        word_to_id,
        window_size=size,
    )
    for size in window_sizes
}

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Take the first 20 vocabulary entries (in the mapping's iteration order)
# and look up their matrix indices.
top_words = list(word_to_id)[:20]
indices = [word_to_id[w] for w in top_words]

# One heatmap per window size. The column count follows window_sizes so the
# grid stays in sync if that list changes; squeeze=False keeps `axes` an
# array even when there is a single window size.
fig, axes = plt.subplots(1, len(window_sizes), figsize=(20, 6), squeeze=False)
axes = axes.ravel()
for ax, w in zip(axes, window_sizes):
    # Restrict rows, then columns, to the selected words and densify.
    # toarray() yields a plain ndarray rather than the legacy np.matrix.
    mat = matrices[w][indices, :][:, indices].toarray()
    sns.heatmap(mat, xticklabels=top_words, yticklabels=top_words, ax=ax, cmap="Blues")
    ax.set_title(f"윈도우 크기: {w}")
plt.tight_layout()
plt.show()
