# Task 1 
Use SpaCy tokenizer API to tokenize the text from the PiQA corpus.

In [1]:
import spacy
from spacy.tokenizer import Tokenizer

import pandas as pd

In [2]:
corpus_df = pd.read_json("./data/corpus.jsonl", lines=True)
corpus_df = corpus_df.set_index('_id').sort_index()
corpus_df.head()

Unnamed: 0_level_0,title,text,metadata
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,,"Nie mówię, że nie podoba mi się też pomysł szk...",{}
31,,Tak więc nic nie zapobiega fałszywym ocenom po...,{}
56,,Nigdy nie możesz korzystać z FSA dla indywidua...,{}
59,,Samsung stworzył LCD i inne technologie płaski...,{}
63,,Oto wymagania SEC: Federalne przepisy dotycząc...,{}


In [3]:
from typing import Sequence


Corpus = Sequence[str]

corpus = [row["text"] for _, row in corpus_df.iterrows()]

In [4]:
polish = spacy.load("pl_core_news_sm")

# Task 2 
Compute bigram counts of downcased tokens.

In [5]:
NGrams = dict[str, int]


class NGramsEngine:

    def __init__(self, nlp) -> None:
        self._nlp = nlp
        self._tokenizer = Tokenizer(nlp.vocab)


    def count_ngrams(
            self, corpus: Corpus, n: int, lower: bool = True, lemma_tag: bool = False
        ) -> NGrams:
        from_token = (
            # lemmatization sometimes contains many elements
            (lambda t: f"{''.join(t.lemma_.split())}:{t.tag_}") if lemma_tag else 
            (lambda t: t.text)
            
        )
        maybe_lower = (lambda t: t.lower()) if lower else (lambda t: t)
        preproc = lambda t: maybe_lower(from_token(t))

        nlp = lambda doc: self._nlp(doc) if lemma_tag else doc

        ngrams: NGrams = {}
        
        for doc in self._tokenizer.pipe(corpus):
            if len(doc) < n:
                break

            buffer = []

            for token in nlp(doc):
                buffer.append(preproc(token))

                if len(buffer) < n:
                    continue

                ngram = " ".join(buffer)
                ngrams[ngram] = ngrams.get(ngram, 0) + 1
                buffer.pop(0)

        return ngrams
    

ngrams_engine = NGramsEngine(polish)

In [6]:
ngrams_engine.count_ngrams(["The quick brown fox jumps over the lazy dog."], n=3)

{'the quick brown': 1,
 'quick brown fox': 1,
 'brown fox jumps': 1,
 'fox jumps over': 1,
 'jumps over the': 1,
 'over the lazy': 1,
 'the lazy dog.': 1}

In [7]:
bigrams = ngrams_engine.count_ngrams(corpus, n=2)

# Task 3
Discard bigrams containing characters other than letters.

In [8]:
def filter(ngrams: NGrams, predicate) -> NGrams:
    return {
        ngram: count 
        for ngram, count in ngrams.items()
        if predicate(ngram, count)
    }

In [9]:
import regex as re



not_only_letters = r"[^\p{L}\u0100-\u017F\s]+"


def filter_only_letters(ngrams: NGrams) -> NGrams:
    predicate = lambda ngram, _: re.search(not_only_letters, ngram) is None
    return filter(ngrams, predicate)

In [10]:
bigrams = filter_only_letters(bigrams)

# Task 4
Compute the pointwise mutual information for all pairs of words.

In [11]:
import numpy as np


def pmi(bigrams: NGrams, unigrams: NGrams) -> dict[str, float]:
    result = {}

    total_bigrams = sum(bigrams.values())
    total_unigrams = sum(unigrams.values())

    for bigram, count in bigrams.items():
        x, y = bigram.split()
        
        ratio = (count * total_unigrams**2) / (total_bigrams * unigrams[x] * unigrams[y])

        result[bigram] = np.log2(ratio)

    return result

In [12]:
unigrams = ngrams_engine.count_ngrams(corpus, n=1)
unigrams = filter_only_letters(unigrams)

In [13]:
bigrams_pmi = pmi(bigrams, unigrams)

# Task 5
Sort the word pairs according to that measure in the descending order and determine top 10 entries.

In [14]:
def order_by_values(d: dict) -> list[tuple]:
    return [(k, v) for k, v in sorted(d.items(), key=lambda p: p[1], reverse=True)]

In [15]:
bigrams_pmi_ordered = order_by_values(bigrams_pmi)
bigrams_pmi_ordered[:10]

[('zadłużonych studentów', 16.413533185693815),
 ('uniemożliwiają instytucjom', 16.413533185693815),
 ('należytej staranności', 16.413533185693815),
 ('technologie płaskiego', 16.413533185693815),
 ('fabryk samsunga', 16.413533185693815),
 ('dzieła randa', 16.413533185693815),
 ('strip john', 16.413533185693815),
 ('john galt', 16.413533185693815),
 ('sucho wpychanie', 16.413533185693815),
 ('miliarderów polityków', 16.413533185693815)]

# Task 6
Filter bigrams with number of occurrences lower than 5. Determine top 10 entries for the remaining dataset.

In [16]:
bigrams_filtered = filter(bigrams, lambda _, count: count >= 5)

bigrams_filtered_pmi = pmi(bigrams_filtered, unigrams)

bigrams_filtered_pmi_ordered = order_by_values(bigrams_filtered_pmi)
bigrams_filtered_pmi_ordered[:10]

[('konstruktywne opinie', 16.033649435185765),
 ('dyrektor generalny', 16.033649435185765),
 ('paliwa ciekłego', 15.99300745068842),
 ('agencje ratingowe', 15.84100435724337),
 ('zbiorniku magazynowym', 15.519076262356009),
 ('trend wzrostowy', 15.519076262356009),
 ('kredytów hipotecznych', 15.381572738606074),
 ('sygnałów elektrycznych', 15.381572738606074),
 ('of america', 15.256041856522215),
 ('kredyty hipoteczne', 15.256041856522215)]

# Task 7-9
Use SpaCy to lemmatize and tag the sentences in the corpus.
Using the tagged corpus compute bigram statistic for the tokens containing:
- lemmatized, downcased word.
- morphosyntactic category of the word (subst, fin, adj, etc.).

In [17]:
ngrams_engine.count_ngrams(["Ala ma kota."], n=2, lemma_tag=True)

{'ala:subst mieć:fin': 1, 'mieć:fin kota.:qub': 1}

In [18]:
bigrams_lemma = ngrams_engine.count_ngrams(corpus, n=2, lemma_tag=True)

In [19]:
def filter_only_letters_lemma(ngrams: NGrams) -> NGrams:
    predicate = lambda ngram, _: all(
        [re.search(not_only_letters, gram.split(":")[0]) is None for gram in ngram.split()]
    )
    return filter(ngrams, predicate)


bigrams_lemma = filter_only_letters_lemma(bigrams_lemma)

# Task 10
Compute the same statistics as for the non-lemmatized words.

In [20]:
unigrams_lemma = ngrams_engine.count_ngrams(corpus, n=1, lemma_tag=True)
unigrams_lemma = filter_only_letters_lemma(unigrams_lemma)

In [21]:
bigrams_lemma_pmi = pmi(bigrams_lemma, unigrams_lemma)

In [22]:
bigrams_lemma_pmi_ordered = order_by_values(bigrams_lemma_pmi)
bigrams_lemma_pmi_ordered[:10]

[('należyty:adj staranność:subst', 16.413413983248002),
 ('sam::subst strip:subst', 16.413413983248002),
 ('strip:subst john:subst', 16.413413983248002),
 ('john:subst galt:subst', 16.413413983248002),
 ('sucho:adv wpychanie:ger', 16.413413983248002),
 ('miliarder:subst polityk:subst', 16.413413983248002),
 ('kursow:adj wynikającego:pact', 16.413413983248002),
 ('toronto:subst star:subst', 16.413413983248002),
 ('sądach:subst okręgowy:adj', 16.413413983248002),
 ('zobaczyć:subst poniższ:fin', 16.413413983248002)]

In [23]:
bigrams_lemma_filtered = filter(bigrams_lemma, lambda _, count: count >= 5)

bigrams_lemma_filtered_pmi = pmi(bigrams_lemma_filtered, unigrams_lemma)

bigrams_lemma_filtered_pmi_ordered = order_by_values(bigrams_lemma_filtered_pmi)
bigrams_lemma_filtered_pmi_ordered[:10]

[('paliwo:subst ciekły:adj', 15.234722440568252),
 ('of:subst america:subst', 14.997683243267403),
 ('konstruktywny:adj opinia:subst', 14.872152361183543),
 ('zbiornik:subst magazynowy:adj', 14.872152361183543),
 ('dyrektor:subst generalny:adj', 14.4571148619047),
 ('trend:subst wzrostowy:adj', 14.287189860462387),
 ('opieka:subst zdrowotny:adj', 14.177006942711964),
 ('działalność:subst gospodarczy:adj', 14.024155454628595),
 ('gospodarstwo:subst domowy:adj', 13.779042956792063),
 ('sygnał:subst elektryczny:adj', 13.708176626072419)]

# Task 11-12
Group the bigrams by morphosyntactic tag.
Print top-10 categories and print top-5 pairs for each category.

In [24]:
def group_by_tag(ngrams: NGrams) -> dict[str, NGrams]:
    groups = {}

    get_tags = lambda ngram: " ".join([lemma_tag.split(":")[1] for lemma_tag in ngram.split()])

    for ngram, count in ngrams.items():
        tags = get_tags(ngram)
        if tags not in groups:
            groups[tags] = {}
        groups[tags][ngram] = count

    return groups

In [25]:
bigrams_lemma_grouped = group_by_tag(bigrams_lemma)

In [26]:
groups_total_count = {tags: sum(group.values()) for tags, group in bigrams_lemma_grouped.items()}
groups_total_count_ordered = order_by_values(groups_total_count)

In [27]:
for i, (tags, total_count) in enumerate(groups_total_count_ordered[:10]):
    print(f"[{i+1}] category: ({tags}), total count: {total_count}")
    group_ordered = order_by_values(bigrams_lemma_grouped[tags])
    print(" | ".join([f"({bigram}) {count}" for bigram, count in group_ordered[:5]]))


[1] category: (adj subst), total count: 3736
(ten:adj sposób:subst) 16 | (drugi:adj strona:subst) 15 | (taki:adj przypadek:subst) 13 | (twój:adj firma:subst) 12 | (ten:adj wszystko:subst) 10
[2] category: (prep subst), total count: 3521
(w:prep przypadek:subst) 73 | (w:prep ciąg:subst) 45 | (w:prep stan:subst) 42 | (na:prep przykład:subst) 39 | (w:prep zależność:subst) 35
[3] category: (subst prep), total count: 3317
(zależność:subst od:prep) 37 | (pieniądz:subst na:prep) 37 | (wzgląd:subst na:prep) 35 | (podatek:subst od:prep) 32 | (opłata:subst za:prep) 24
[4] category: (subst subst), total count: 2606
(cena:subst akcja:subst) 21 | (wartość:subst firma:subst) 9 | (miejsce:subst praca:subst) 8 | (punkt:subst widzenie:subst) 8 | (spłata:subst kredyt:subst) 8
[5] category: (prep adj), total count: 2095
(w:prep ten:adj) 96 | (w:prep który:adj) 95 | (z:prep ten:adj) 51 | (na:prep ten:adj) 39 | (dla:prep który:adj) 30
[6] category: (subst fin), total count: 2010
(co:subst być:fin) 32 | (to

# Task 13
Table comparing the results for copora without and with tagging and lemmatization.

In [28]:
table = pd.DataFrame(
    {
        "Top PMI no lemma no filter": bigrams_pmi_ordered[:10],
        "Top PMI no lemma with filter": bigrams_filtered_pmi_ordered[:10],
        "Top PMI with lemma no filter": bigrams_lemma_pmi_ordered[:10],
        "Top PMI with lemma with filter": bigrams_lemma_filtered_pmi_ordered[:10]
    }
)

table

Unnamed: 0,Top PMI no lemma no filter,Top PMI no lemma with filter,Top PMI with lemma no filter,Top PMI with lemma with filter
0,"(zadłużonych studentów, 16.413533185693815)","(konstruktywne opinie, 16.033649435185765)","(należyty:adj staranność:subst, 16.41341398324...","(paliwo:subst ciekły:adj, 15.234722440568252)"
1,"(uniemożliwiają instytucjom, 16.413533185693815)","(dyrektor generalny, 16.033649435185765)","(sam::subst strip:subst, 16.413413983248002)","(of:subst america:subst, 14.997683243267403)"
2,"(należytej staranności, 16.413533185693815)","(paliwa ciekłego, 15.99300745068842)","(strip:subst john:subst, 16.413413983248002)","(konstruktywny:adj opinia:subst, 14.8721523611..."
3,"(technologie płaskiego, 16.413533185693815)","(agencje ratingowe, 15.84100435724337)","(john:subst galt:subst, 16.413413983248002)","(zbiornik:subst magazynowy:adj, 14.87215236118..."
4,"(fabryk samsunga, 16.413533185693815)","(zbiorniku magazynowym, 15.519076262356009)","(sucho:adv wpychanie:ger, 16.413413983248002)","(dyrektor:subst generalny:adj, 14.4571148619047)"
5,"(dzieła randa, 16.413533185693815)","(trend wzrostowy, 15.519076262356009)","(miliarder:subst polityk:subst, 16.41341398324...","(trend:subst wzrostowy:adj, 14.287189860462387)"
6,"(strip john, 16.413533185693815)","(kredytów hipotecznych, 15.381572738606074)","(kursow:adj wynikającego:pact, 16.413413983248...","(opieka:subst zdrowotny:adj, 14.177006942711964)"
7,"(john galt, 16.413533185693815)","(sygnałów elektrycznych, 15.381572738606074)","(toronto:subst star:subst, 16.413413983248002)","(działalność:subst gospodarczy:adj, 14.0241554..."
8,"(sucho wpychanie, 16.413533185693815)","(of america, 15.256041856522215)","(sądach:subst okręgowy:adj, 16.413413983248002)","(gospodarstwo:subst domowy:adj, 13.77904295679..."
9,"(miliarderów polityków, 16.413533185693815)","(kredyty hipoteczne, 15.256041856522215)","(zobaczyć:subst poniższ:fin, 16.413413983248002)","(sygnał:subst elektryczny:adj, 13.708176626072..."


# Questions

## Why do we have to filter the bigrams, rather than the token sequence?

If we filter tokens before building bigrams, tokens that were not previously adjacent are next to each other and can be used to build a bigram that does not actually exist.

## What types of expressions are discovered by the methods?

It seems that PMI with filtering is pretty good at discovering bigrams with words that are commonly used together, like "konstruktywne opinie" and on the other hand, are pretty popular. 
At the same time, if we don't use filtering, we get bigrams that most probably occured only once.

## Can you devise a different type of filtering that would yield better results?

Probably we can remove documents that are not in specific language.