In [1]:
import pandas as pd
import os
import re


In [2]:
def read_text_file(file_path):
    with open(file_path, 'r') as f:
        return f.read().replace('\n', ' ')

articles_list = []
articles_name = []

path = "ustawy"
for file in os.listdir(path):
    # Check whether file is in text format or not
    file_path = f"{path}/{file}"
    temp = read_text_file(file_path).replace('\n',' ')
    temp = temp.replace('\xa0', ' ')
    temp = re.sub('\s+',' ', temp)
    articles_list.append(temp.replace('\xa0', ' '))
    articles_name.append(file)

import re

articles_list = [re.sub('<[^<]+?>', '', text) for text in articles_list]

1. Use SpaCy tokenizer API to tokenize the text from the law corpus.

In [52]:
from spacy.lang.pl import Polish
#from spacy.tokenizer import Tokenizer
nlp = Polish()
tokenizer = nlp.tokenizer
#tokenizer = Tokenizer(nlp.vocab)

In [53]:
doc = [tokenizer(art) for art in articles_list]
doc_text = [[token.text for token in document if token.text != " "] for document in doc]


2. Compute bigram counts of downcased tokens. Given the sentence: "The quick brown fox jumps over the lazy dog.", the bigram counts are as follows:

Wyznaczemy najpierw bigramy uwzględniajac kropki oraz inne znaki jak w przykładzie. Nastepnie zliczane sa ilosci wystapien

In [55]:
bigrams = [[f'{doc[i].lower()} {doc[i+1].lower()}' for i in range(len(doc)) if i != len(doc)-1] #laczenie w pary
           for doc in doc_text]

In [56]:
bigrams[1][:5]

['dz .', '. u', 'u .', '. z', 'z 1996']

In [57]:
from collections import Counter
frequency_bigram_list = [Counter(big) for big in bigrams]

In [58]:
global_frequency_bigram = sum(frequency_bigram_list, Counter())

In [59]:
global_frequency_bigram.most_common(5)

[('art .', 83778),
 ('ust .', 53552),
 ('poz .', 45198),
 (', poz', 43188),
 ('. 1', 39953)]

3. Discard bigrams containing characters other than letters. Make sure that you discard the invalid entries after computing the bigram counts.

sprawdzenie czy bigram zawiera tylko litery poprzez isalpha()

In [60]:
for e in list(global_frequency_bigram):
    temp = e.replace(" ", "")
    if not temp.isalpha():
        del global_frequency_bigram[e]

In [61]:
global_frequency_bigram.most_common(5)


[('w art', 32042),
 ('mowa w', 28471),
 ('w ust', 23557),
 ('o których', 13884),
 ('których mowa', 13857)]

4. Use pointwise mutual information to compute the measure for all pairs of words.


Funkcja Pmi wyznacza pomiary w bigramie

In [88]:
doc_words_only = [[token.text.lower() for token in document if token.is_alpha and token.text != " "] for document in doc]
frequency_list_words = [Counter(doc) for doc in doc_words_only]
global_frequency_list_words = sum(frequency_list_words, Counter())
global_frequency_list_words.most_common(5)

[('w', 201224), ('i', 90009), ('art', 83804), ('z', 82443), ('o', 64776)]

In [90]:

total = sum(global_frequency_list_words.values())

In [95]:
from numpy import log
def pmi(bi, total, global_frequency_list_words):
    w1, w2 = bi[0].split(" ")
    val_bi =bi[1]
    val_w1 = global_frequency_list_words.__getitem__(w1)
    vaw_w2 = global_frequency_list_words.__getitem__(w2)
    if ((val_w1/total)*(vaw_w2/total)) == 0:
        print(f'{w1}: {val_w1}, {w2}: {vaw_w2}')
    return (w1, w2, log((val_bi/total)/((val_w1/total)*(vaw_w2/total))))

In [96]:
pmi(global_frequency_bigram.most_common(5)[1], total, global_frequency_list_words)

('mowa', 'w', 2.867627254917188)

In [97]:
pmi_tab = [pmi(bi, total, global_frequency_list_words) for bi in global_frequency_bigram.most_common()]

In [103]:
pmi_tab[:5]

[('w', 'art', 1.9170929678118054),
 ('mowa', 'w', 2.867627254917188),
 ('w', 'ust', 2.0557286964261223),
 ('o', 'których', 3.7561599873615954),
 ('których', 'mowa', 4.5653636387808385)]

5.Sort the word pairs according to that measure in the descending order and determine top 10 entries.

In [111]:
pmi_tab.sort(key=lambda x: x[2], reverse= True)


In [114]:
pmi_tab[:10]

[('acetanilid', 'acetylometadol', 15.090700159021198),
 ('admi', 'nistracji', 15.090700159021198),
 ('adminis', 'tracji', 15.090700159021198),
 ('aegroti', 'suprema', 15.090700159021198),
 ('aerodynamicznej', 'szorstkości', 15.090700159021198),
 ('aethina', 'tumida', 15.090700159021198),
 ('agenci', 'ubezpieczeniowi', 15.090700159021198),
 ('agregatach', 'pralniczych', 15.090700159021198),
 ('agricoltura', 'biologica', 15.090700159021198),
 ('agriculture', 'biologique', 15.090700159021198)]

6. Filter bigrams with number of occurrences <strong>higher</strong> than 5. Determine top 10 entries for the remaining dataset (>=5 occurrences).

In [115]:
pmi_tab_filtered = [pmi(bi, total, global_frequency_list_words) for bi in global_frequency_bigram.most_common() if bi[1] > 5]


In [117]:
pmi_tab_filtered.sort(key=lambda x: x[2], reverse= True)
pmi_tab_filtered[:10]

[('adama', 'mickiewicza', 13.298940689793143),
 ('odczynów', 'poszczepiennych', 13.298940689793143),
 ('diagności', 'laboratoryjni', 13.298940689793143),
 ('słomę', 'makową', 13.298940689793143),
 ('lambrekiny', 'okienne', 13.144790009965886),
 ('zatoki', 'gdańskiej', 13.144790009965886),
 ('poddanymi', 'aromatyzacji', 13.144790009965886),
 ('piotrków', 'trybunalski', 13.144790009965885),
 ('zapieczętowanej', 'kopercie', 13.144790009965885),
 ('papierem', 'wartościowym', 13.144790009965885)]

7. Use KRNNT or Clarin-PL API(https://ws.clarin-pl.eu/tager.shtml) to tag and lemmatize the corpus.

Jest uzyte Clarin-PL, nastepnie regex zeby wyciagnac z dok. odpowiednie slowo oraz oznaczenie

In [121]:
regex_pattern = '\<base\>(\w+)\<\/base\>\<ctag\>(\w+)'
words_7 = []
path = "ustawy_lab4"
for file in os.listdir(path):
    # Check whether file is in text format or not
    file_path = f"{path}/{file}"
    with open(file_path, 'r') as f:
        text =  f.read()
        res = re.findall(r"\<base\>(\w+)\<\/base\>\<ctag\>(\w+)", text)
        words_7.append(res)


8. Using the tagged corpus compute bigram statistic for the tokens containing: a. lemmatized, downcased word b. morphosyntactic category of the word (subst, fin, adj, etc.)

In [122]:
bigrams_7 = [[f'{doc[i][0].lower()}:{doc[i][1]} {doc[i+1][0].lower()}:{doc[i+1][1].lower()}' for i in range(len(doc)) if i != len(doc)-1] #laczenie w pary
            for doc in words_7]

9. For example: "Ala ma kota", which is tagged as:

Przykładowo wyciągnięte bigramy

In [132]:
bigrams_7[15][:10]

['u:prep z:prep',
 'z:prep 2001:num',
 '2001:num r:ign',
 'r:ign nr:subst',
 'nr:subst 63:num',
 '63:num poz:ign',
 'poz:ign 639:num',
 '639:num ustawa:subst',
 'ustawa:subst z:prep',
 'z:prep dzień:subst']

10. Compute the same statistics as for the non-lemmatized words (i.e. PMI) and print top-10 entries with at least 5 occurrences.

In [133]:
frequency_bigram_7_list = [Counter(big) for big in bigrams_7]
global_frequency_bigram_7 = sum(frequency_bigram_7_list, Counter())
global_frequency_bigram_7.most_common(5)

[('w:prep art:ign', 32045),
 ('o:prep który:adj', 28656),
 ('który:adj mowa:subst', 28538),
 ('mowa:subst w:prep', 28473),
 ('w:prep usta:subst', 23557)]

In [137]:
doc_words_only_7 = [[f'{document[i][0].lower()}:{document[i][1]}' for i in range(len(document)) if document[i] != " "] for document in words_7]
frequency_list_words_7 = [Counter(doc) for doc in doc_words_only_7]
global_frequency_list_words_7 = sum(frequency_list_words_7, Counter())
global_frequency_list_words_7.most_common(5)

[('w:prep', 202951),
 ('i:conj', 90044),
 ('z:prep', 87991),
 ('art:ign', 83805),
 ('1:num', 74573)]

In [138]:
total_7 = sum(global_frequency_list_words_7.values())

In [142]:
pmi_tab_filtered_7 = [pmi(bi, total_7, global_frequency_list_words_7) for bi in global_frequency_bigram_7.most_common() if bi[1]> 5]


In [143]:
pmi_tab_filtered_7[:5]

[('w:prep', 'art:ign', 2.060368419895352),
 ('o:prep', 'który:adj', 3.619016874504095),
 ('który:adj', 'mowa:subst', 4.425786061112725),
 ('mowa:subst', 'w:prep', 3.010127213150502),
 ('w:prep', 'usta:subst', 2.198829241685272)]

In [144]:
pmi_tab_filtered_7.sort(key=lambda x: x[2], reverse=True)
pmi_tab_filtered_7[:10]

[('postępować:fin', 'postępywać:fin', 13.450680306867234),
 ('ląg:subst', 'lęg:subst', 13.450680306867234),
 ('adam:subst', 'mickiewicz:subst', 13.450680306867234),
 ('obrząd:subst', 'obrzęd:subst', 13.450680306867234),
 ('pielić:fin', 'pleć:fin', 13.450680306867234),
 ('warmińsko:adv', 'mazurski:adj', 13.296529627039975),
 ('ciesać:pact', 'cieszyć:pact', 13.296529627039975),
 ('piotrek:subst', 'trybunalski:adj', 13.296529627039975),
 ('media:subst', 'medium:subst', 13.296529627039975),
 ('teologiczno:ign', 'pastoralny:adj', 13.296529627039975)]

11. Compute trigram counts for both corpora and perform the same filtering.

In [147]:
trigrams = [[f'{doc[i].lower()} {doc[i+1].lower()} {doc[i+2].lower()}' for i in range(len(doc)) if i < len(doc)-2] #laczenie w pary
           for doc in doc_text]

In [None]:
frequency_trigram_list = [Counter(big) for big in trigrams]
global_frequency_trigram = sum(frequency_trigram_list, Counter())
global_frequency_trigram.most_common(5)

In [150]:
for e in list(global_frequency_trigram):
    temp = e.replace(" ", "")
    if not temp.isalpha() or global_frequency_trigram[e] <= 5:
        del global_frequency_trigram[e]
global_frequency_trigram.most_common(5)

[('o których mowa', 13856),
 ('których mowa w', 13806),
 ('mowa w ust', 13474),
 ('mowa w art', 12311),
 ('o którym mowa', 9169)]

In [151]:
trigrams_7 = [[f'{doc[i][0].lower()}:{doc[i][1]} {doc[i+1][0].lower()}:{doc[i+1][1].lower()} {doc[i+2][0].lower()}:{doc[i+2][1].lower()}' for i in range(len(doc)) if i < len(doc)-2] #laczenie w pary
            for doc in words_7]
frequency_trigram_7_list = [Counter(big) for big in trigrams_7]
global_frequency_trigram_7 = sum(frequency_trigram_7_list, Counter())
global_frequency_trigram_7.most_common(5)

[('o:prep który:adj mowa:subst', 28535),
 ('który:adj mowa:subst w:prep', 28442),
 ('_:interp _:interp _:interp', 16213),
 ('mowa:subst w:prep usta:subst', 13474),
 ('w:prep usta:subst 1:num', 12842)]

In [152]:
for e in list(global_frequency_trigram_7):
    temp = e.replace(" ", "")
    temp = temp.replace(":", "")
    if not temp.isalpha() or global_frequency_trigram_7[e] <= 5:
        del global_frequency_trigram_7[e]
global_frequency_trigram_7.most_common(5)

[('o:prep który:adj mowa:subst', 28535),
 ('który:adj mowa:subst w:prep', 28442),
 ('mowa:subst w:prep usta:subst', 13474),
 ('mowa:subst w:prep art:ign', 12311),
 ('dwa:num dwa:num dwa:num', 8732)]

12. Use PMI (with 5 occurrence threshold) to compute top 10 results for the trigrams. Devise a method for computing the values, based on the results for bigrams.

In [155]:
def pmi_tri(tri, total, global_frequency_list_words):
    w1, w2, w3 = tri[0].split(" ")
    val_tri =tri[1]
    val_w1 = global_frequency_list_words.__getitem__(w1)
    vaw_w2 = global_frequency_list_words.__getitem__(w2)
    vaw_w3 = global_frequency_list_words.__getitem__(w3)

    return (w1, w2, w3, log((val_tri/total)/((val_w1/total)*(vaw_w2/total))*(vaw_w3/total)))

In [156]:
pmi_tri_tab = [pmi_tri(tri, total, global_frequency_list_words) for tri in global_frequency_trigram.most_common()]

In [160]:
pmi_tri_tab.sort(key=lambda x: x[3], reverse= True)
pmi_tri_tab[:10]

[('adama', 'mickiewicza', 'w', 10.420414525160096),
 ('chrześcijan', 'baptystów', 'w', 10.112113165505578),
 ('hugona', 'kołłątaja', 'w', 9.909588901394104),
 ('tadeusza', 'kotarbińskiego', 'w', 9.909588901394104),
 ('wyznaniowa', 'żydowska', 'w', 9.791805865737722),
 ('michała', 'oczapowskiego', 'w', 9.727267344600149),
 ('lambrekiny', 'okienne', 'i', 9.461754795257422),
 ('przymusowo', 'zatrudnianym', 'w', 9.321802236491985),
 ('mistrzostw', 'europy', 'w', 9.28365047052761),
 ('starokatolickiego', 'mariawitów', 'w', 9.13948067969803)]

In [157]:
pmi_tri__7 = [pmi_tri(tri, total_7, global_frequency_list_words_7) for tri in global_frequency_trigram_7.most_common() if tri[1]> 5]

In [161]:
pmi_tri__7.sort(key=lambda x: x[3], reverse= True)
pmi_tri__7[:10]

[('adam:subst', 'mickiewicz:subst', 'w:prep', 10.42896038034885),
 ('teologiczno:ign', 'pastoralny:adj', 'w:prep', 10.274809700521592),
 ('curie:subst', 'skłodowska:subst', 'w:prep', 9.918134756582859),
 ('hugon:subst', 'kołłątaj:subst', 'w:prep', 9.918134756582859),
 ('tadeusz:subst', 'kotarbiński:subst', 'w:prep', 9.918134756582859),
 ('michał:subst', 'oczapowskiego:ign', 'w:prep', 9.735813199788904),
 ('leż:subst', 'leża:subst', 'w:prep', 9.202514720170855),
 ('linek:subst', 'link:subst', 'i:conj', 8.95951471311693),
 ('lambrekin:subst', 'okienny:adj', 'i:conj', 8.95131794591275),
 ('chłodnica:subst', 'odmulina:subst', 'i:conj', 8.923147068946054)]

13. Create a table comparing the results for copora without and with tagging and lemmatization (separate table for bigrams and trigrams).


In [163]:
import pandas as pd
dt_bigram = pd.DataFrame(list(zip(
    [item[0] for item in pmi_tab_filtered]
    , [item[1] for item in pmi_tab_filtered]
    , [item[2] for item in pmi_tab_filtered]
    ,[item[0] for item in pmi_tab_filtered_7]
    , [item[1] for item in pmi_tab_filtered_7]
    , [item[2] for item in pmi_tab_filtered_7]
    )), columns=['1 word', '2 word', 'scoore', '1 word', '2 word', 'scoore'])

In [168]:
dt_bigram

Unnamed: 0,1 word,2 word,scoore,1 word.1,2 word.1,scoore.1
0,adama,mickiewicza,13.298941,postępować:fin,postępywać:fin,13.450680
1,odczynów,poszczepiennych,13.298941,ląg:subst,lęg:subst,13.450680
2,diagności,laboratoryjni,13.298941,adam:subst,mickiewicz:subst,13.450680
3,słomę,makową,13.298941,obrząd:subst,obrzęd:subst,13.450680
4,lambrekiny,okienne,13.144790,pielić:fin,pleć:fin,13.450680
...,...,...,...,...,...,...
64750,o,z,-5.516093,dyscyplinarny:adj,5:num,0.662937
64751,w,i,-5.584700,towarowy:subst,i:conj,0.662933
64752,z,i,-5.690917,113:num,1:num,0.662820
64753,o,w,-6.002940,za:qub,pracownik:subst,0.662699


In [165]:
dt_trigram = pd.DataFrame(list(zip(
    [item[0] for item in pmi_tri_tab]
    , [item[1] for item in pmi_tri_tab]
    , [item[2] for item in pmi_tri_tab]
    , [item[3] for item in pmi_tri_tab]
    ,[item[0] for item in pmi_tri__7]
    , [item[1] for item in pmi_tri__7]
    , [item[2] for item in pmi_tri__7]
    , [item[3] for item in pmi_tri__7]
    )), columns=['1 word', '2 word', '3 word', 'scoore', '1 word', '2 word', '3 word', 'scoore'])

In [166]:
dt_trigram

Unnamed: 0,1 word,2 word,3 word,scoore,1 word.1,2 word.1,3 word.1,scoore.1
0,adama,mickiewicza,w,10.420415,adam:subst,mickiewicz:subst,w:prep,10.428960
1,chrześcijan,baptystów,w,10.112113,teologiczno:ign,pastoralny:adj,w:prep,10.274810
2,hugona,kołłątaja,w,9.909589,curie:subst,skłodowska:subst,w:prep,9.918135
3,tadeusza,kotarbińskiego,w,9.909589,hugon:subst,kołłątaj:subst,w:prep,9.918135
4,wyznaniowa,żydowska,w,9.791806,tadeusz:subst,kotarbiński:subst,w:prep,9.918135
...,...,...,...,...,...,...,...,...
44875,się,w,trudnej,-17.870916,żywność:subst,woda:subst,pitny:adj,-8.381750
44876,się,w,skonsolidowanym,-17.896233,oraz:conj,wzór:subst,stosowany:adj,-8.381865
44877,w,których,występowało,-17.981164,albo:conj,jednostka:subst,policja:subst,-8.381919
44878,się,w,niebezpieczeństwie,-18.109807,dopuszczać:fin,się:qub,czynny:adj,-8.382031


14. Answer the following questions:

Why do we have to filter the bigrams, rather than the token sequence?

Ponieważ powstawałyby błędne bigramy. Np ostanie słowo w zdaniu oraz pierwsze w kolejnym zostałyby połączone w bigram co tak normalnie nie ma miejsca. Ponadto w korpusie nad którym pracujemy często sa numery ustaw co również prowadziłoby do postawania błędnych bigramow

Which method works better for the bigrams and which for the trigrams?

Lepiej działa Pmi z filtrowaniem. Pozwala odfiltrować rzadkie wystąpienia które często mogą być literówkami. Zarówno działa to dla bigramow jak i trigramow. Jeśli chodzi o Clarin-PL lepiej działa dla bigramow ponieważ 3 wyraz to często spójnik.

What types of expressions are discovered by the methods.

Są okrywane wyrażenia typowe dla języka oraz typowe dla przerabianego korpusu. Mogą też być wykrywane błędy w tekście

Can you devise a different type of filtering that would yield better results?

Można pozbyć się spójników poprzez odfiltrowanie najkrótszych wyrazów. Występują często w jeżyku i maja małe znaczenie w większości przypadków