# Exercises

## Task 1. `Реалізація вагування PPMI на основі співвходження у тому ж абзаці`

In [10]:
from collections import Counter
import numpy as np

In [7]:
def calculate_ppmi_paragraph(corpus):
    paragraphs = corpus.split('\n\n')

    word_cooccurrences = {}
    total_words = 0
    words_count = Counter()

    for paragraph in paragraphs:
        words = paragraph.split()
        total_words += len(words)
        words_count.update(words)

        for i, word1 in enumerate(words):
            for j, word2 in enumerate(words):
                if i != j:
                    key = (word1, word2)
                    word_cooccurrences[key] = word_cooccurrences.get(key, 0) + 1

    ppmi_scores = {}
    for pair, count in word_cooccurrences.items():
        word1, word2 = pair
        p_wc = count / total_words
        p_w = words_count[word1] / total_words
        p_c = words_count[word2] / total_words
        pmi = max(np.log2(p_wc / (p_w * p_c)), 0)
        ppmi_scores[pair] = pmi

    return ppmi_scores

In [12]:
corpus = "Sample text corpus with multiple paragraphs.\n\nEach paragraph contains some words."
ppmi_paragraph = calculate_ppmi_paragraph(corpus)
print(ppmi_paragraph)

{('Sample', 'text'): 3.4594316186372973, ('Sample', 'corpus'): 3.4594316186372973, ('Sample', 'with'): 3.4594316186372973, ('Sample', 'multiple'): 3.4594316186372973, ('Sample', 'paragraphs.'): 3.4594316186372973, ('text', 'Sample'): 3.4594316186372973, ('text', 'corpus'): 3.4594316186372973, ('text', 'with'): 3.4594316186372973, ('text', 'multiple'): 3.4594316186372973, ('text', 'paragraphs.'): 3.4594316186372973, ('corpus', 'Sample'): 3.4594316186372973, ('corpus', 'text'): 3.4594316186372973, ('corpus', 'with'): 3.4594316186372973, ('corpus', 'multiple'): 3.4594316186372973, ('corpus', 'paragraphs.'): 3.4594316186372973, ('with', 'Sample'): 3.4594316186372973, ('with', 'text'): 3.4594316186372973, ('with', 'corpus'): 3.4594316186372973, ('with', 'multiple'): 3.4594316186372973, ('with', 'paragraphs.'): 3.4594316186372973, ('multiple', 'Sample'): 3.4594316186372973, ('multiple', 'text'): 3.4594316186372973, ('multiple', 'corpus'): 3.4594316186372973, ('multiple', 'with'): 3.459431618

---

## Task 2. `Реалізація вагування PPMI на основі співвходження у вікні сусідніх слів`

In [16]:
def calculate_ppmi_window(corpus, window_size=5):
    words = corpus.split()

    word_cooccurrences = {}
    total_words = len(words)
    words_count = Counter(words)

    for i in range(total_words):
        word1 = words[i]

        for j in range(max(i - window_size, 0), min(i + window_size + 1, total_words)):
            if j != i:
                word2 = words[j]
                key = (word1, word2)
                word_cooccurrences[key] = word_cooccurrences.get(key, 0) + 1

    ppmi_scores = {}
    for pair, count in word_cooccurrences.items():
        word1, word2 = pair
        p_wc = count / total_words
        p_w = words_count[word1] / total_words
        p_c = words_count[word2] / total_words
        pmi = max(np.log2(p_wc / (p_w * p_c)), 0)
        ppmi_scores[pair] = pmi

    return ppmi_scores

In [17]:
corpus = "Sample text corpus with multiple words."
ppmi_window = calculate_ppmi_window(corpus, window_size=5)
print(ppmi_window)

{('Sample', 'text'): 2.584962500721156, ('Sample', 'corpus'): 2.584962500721156, ('Sample', 'with'): 2.584962500721156, ('Sample', 'multiple'): 2.584962500721156, ('Sample', 'words.'): 2.584962500721156, ('text', 'Sample'): 2.584962500721156, ('text', 'corpus'): 2.584962500721156, ('text', 'with'): 2.584962500721156, ('text', 'multiple'): 2.584962500721156, ('text', 'words.'): 2.584962500721156, ('corpus', 'Sample'): 2.584962500721156, ('corpus', 'text'): 2.584962500721156, ('corpus', 'with'): 2.584962500721156, ('corpus', 'multiple'): 2.584962500721156, ('corpus', 'words.'): 2.584962500721156, ('with', 'Sample'): 2.584962500721156, ('with', 'text'): 2.584962500721156, ('with', 'corpus'): 2.584962500721156, ('with', 'multiple'): 2.584962500721156, ('with', 'words.'): 2.584962500721156, ('multiple', 'Sample'): 2.584962500721156, ('multiple', 'text'): 2.584962500721156, ('multiple', 'corpus'): 2.584962500721156, ('multiple', 'with'): 2.584962500721156, ('multiple', 'words.'): 2.584962500

---

## Task 3. `Вирішення проблеми високих значень PMI для дуже рідкі слів`

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
def calculate_ppmi_with_tfidf(corpus, word_cooccurrences):
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([corpus])

    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = dict(zip(feature_names, tfidf_matrix.toarray()[0]))

    ppmi_scores = {}
    total_words = sum(word_cooccurrences.values())
    for word_pair, count in word_cooccurrences.items():
        word1, word2 = word_pair
        p_wc = count / total_words
        p_w = tfidf_scores.get(word1, 0)
        p_c = tfidf_scores.get(word2, 0)
        pmi = max(np.log2(p_wc / (p_w * p_c)), 0)
        ppmi_scores[word_pair] = pmi

    return ppmi_scores

In [23]:
corpus = "This is the first sentence. This is the second sentence."

word_cooccurrences = {('This', 'is'): 2, ('This', 'the'): 1, ('first', 'sentence'): 1, ('is', 'the'): 1, ('the', 'first'): 1, ('the', 'second'): 1, ('second', 'sentence'): 1}

In [24]:
ppmi_scores = calculate_ppmi_with_tfidf(corpus, word_cooccurrences)
print("PPMI з використанням TF-IDF:")
for pair, ppmi in ppmi_scores.items():
    print(pair, ppmi)

PPMI з використанням TF-IDF:
('This', 'is') inf
('This', 'the') inf
('first', 'sentence') 0.16992500144231207
('is', 'the') 0
('the', 'first') 0.16992500144231207
('the', 'second') 0.16992500144231207
('second', 'sentence') 0.16992500144231207


  pmi = max(np.log2(p_wc / (p_w * p_c)), 0)


---

## Task 4.

In [28]:
ppmi_scores = {
    ('happy', 'joyful'): 0.8,
    ('happy', 'cheerful'): 0.7,
    ('happy', 'sad'): 0.3,
    ('sad', 'unhappy'): 0.6,
    ('sad', 'miserable'): 0.5,
    ('sad', 'cheerful'): 0.2,
    ('beautiful', 'pretty'): 0.9,
    ('beautiful', 'gorgeous'): 0.8,
    ('beautiful', 'ugly'): 0.1,
    ('ugly', 'unattractive'): 0.7,
} # випадкові значення як приклад

In [29]:
def create_shaded_table(ppmi_scores):
    words = set()
    for pair in ppmi_scores:
        words.update(pair)
    words = sorted(words)
    
    table = np.zeros((len(words), len(words)))
    
    for i, word1 in enumerate(words):
        for j, word2 in enumerate(words):
            if (word1, word2) in ppmi_scores:
                table[i, j] = ppmi_scores[(word1, word2)]
            elif (word2, word1) in ppmi_scores:
                table[i, j] = ppmi_scores[(word2, word1)]
    
    for i, word1 in enumerate(words):
        for j, word2 in enumerate(words):
            shade = int(table[i, j] * 255)  # Відтінок від 0 до 255
            print(f'<td style="background-color: rgb({shade}, {shade}, {shade});">')
            print(f'({word1}, {word2})<br>{table[i, j]:.2f}')
            print('</td>')
        print()

In [30]:
create_shaded_table(ppmi_scores)

<td style="background-color: rgb(0, 0, 0);">
(beautiful, beautiful)<br>0.00
</td>
<td style="background-color: rgb(0, 0, 0);">
(beautiful, cheerful)<br>0.00
</td>
<td style="background-color: rgb(204, 204, 204);">
(beautiful, gorgeous)<br>0.80
</td>
<td style="background-color: rgb(0, 0, 0);">
(beautiful, happy)<br>0.00
</td>
<td style="background-color: rgb(0, 0, 0);">
(beautiful, joyful)<br>0.00
</td>
<td style="background-color: rgb(0, 0, 0);">
(beautiful, miserable)<br>0.00
</td>
<td style="background-color: rgb(229, 229, 229);">
(beautiful, pretty)<br>0.90
</td>
<td style="background-color: rgb(0, 0, 0);">
(beautiful, sad)<br>0.00
</td>
<td style="background-color: rgb(25, 25, 25);">
(beautiful, ugly)<br>0.10
</td>
<td style="background-color: rgb(0, 0, 0);">
(beautiful, unattractive)<br>0.00
</td>
<td style="background-color: rgb(0, 0, 0);">
(beautiful, unhappy)<br>0.00
</td>

<td style="background-color: rgb(0, 0, 0);">
(cheerful, beautiful)<br>0.00
</td>
<td style="background-c