# Word frequencies

In [243]:
import os
import csv
from bs4 import BeautifulSoup
import datetime

In [244]:
path = './assets/xml'

In [248]:
def getFiles(path):
    """Return the sorted paths of every ``.xml`` file found below ``path``.

    The directory tree is walked recursively and each match is reported as
    ``<containing directory>/<file name>``.
    """
    found = [
        folder + '/' + filename
        for folder, _subdirs, filenames in os.walk(path)
        for filename in filenames
        if filename.endswith('.xml')
    ]
    found.sort()
    return found

In [249]:
xmls = getFiles(path)
len(xmls)

3068

In [250]:
xmls[0]

'./assets/xml/1.xml'

In [251]:
def getRaw(path, encoding='utf-8'):
    """Read a text file and return its whole content as one string.

    Parameters
    ----------
    path : str
        Location of the file to read.
    encoding : str, optional
        Encoding used to decode the file. Defaults to UTF-8 so the result
        does not depend on the platform's locale encoding.
        NOTE(review): assumes the corpus files are UTF-8 - pass another
        value if they are not.

    Returns
    -------
    str
        The decoded file content.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.read()

## Collect tags from XMLs

In [252]:
def allWords(filelist):
    """Collect one comma-joined annotation string per ``<w>`` element.

    Every file in ``filelist`` is read with ``getRaw`` and parsed with
    BeautifulSoup's ``'xml'`` parser. For each ``<w>`` element the lower-cased
    text and the ``lemma``, ``pos``, ``ppos`` and ``msd`` attributes are joined
    as ``token,lemma,pos,ppos,msd``; any missing value is stored as a single
    space.

    A timestamp is printed once before and once after the whole run.

    NOTE(review): the fields are joined with a bare comma, so a token or
    attribute value that itself contains a comma cannot be split back
    reliably by later steps.

    Returns
    -------
    list of str
        One joined string per ``<w>`` element, in file and document order.
    """
    print(datetime.datetime.now())

    wordList = []

    for file in filelist:
        soup = BeautifulSoup(getRaw(file), 'xml')
        for w in soup.find_all('w'):
            # Fall back to the blank placeholder when the element has no usable string
            token = w.string.lower() if w.string else ' '
            fields = [token]
            for attr in ('lemma', 'pos', 'ppos', 'msd'):
                fields.append(w.attrs.get(attr, ' '))
            wordList.append(','.join(fields))

    # Report the finish time once, not after every single word
    print(datetime.datetime.now())

    return wordList

In [253]:
allForms = allWords(xmls)

In [255]:
len(allForms)

520075

In [256]:
allForms[:20]

['ведае,ведаць,VERB,VBC,Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act',
 'вёска,вёска,NOUN,NN,Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing',
 'так,так,ADV,RB,Degree=Pos',
 'шчыра,шчыра,ADV,RB,Degree=Pos',
 'марылю,марыль,PROPN,NNP,Animacy=Anim|Case=Dat|Gender=Masc|Number=Sing',
 'ясь,ясь,PROPN,NNP,Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing',
 'маладую,малады,ADJ,JJL,Case=Acc|Degree=Sup|Gender=Fem|Number=Sing',
 'кахае,кахаць,VERB,VBC,Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act',
 'з,з,ADP,IN, ',
 'полацка,полацк,PROPN,NNP,Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing',
 'возіць,возіць,VERB,VBC,Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act',
 'ёй,яна,PRON,PRP,Case=Dat|Gender=Fem|Number=Sing|Person=3|PronType=Prs',
 'стужкі,стужка,NOUN,NN,Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing',
 'штохвілю,штохвіль,NOUN,NN,Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing',
 'брошкі,брошка,PROPN,NNP,Animacy=Inan

In [222]:
allForms[0]

'ведае,ведаць,VERB,VBC,Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act'

## Count ungrouped forms

In [257]:
def uniqueForms(wlist):
    """Count how often each entry of ``wlist`` occurs.

    Returns a plain dict mapping every distinct entry to its number of
    occurrences, keyed in order of first appearance. A timestamp is printed
    before and after counting.
    """
    print(datetime.datetime.now())

    tally = {}
    for form in wlist:
        tally[form] = tally.get(form, 0) + 1

    print(datetime.datetime.now())

    return tally

In [327]:
u = uniqueForms(allForms)

2022-06-04 19:16:56.878368
2022-06-04 19:16:57.379239


In [328]:
len(u)

120241

In [329]:
u['ведае,ведаць,VERB,VBC,Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act']

46

In [330]:
type(u)

dict

In [348]:
def formCounts(fdict):
    """Turn a ``{joined form: count}`` dict into ranked rows sorted by count.

    Each key is split on commas and its count is appended as the last column.
    Rows are ordered by descending count (ties keep the dict's order) and each
    row is prefixed with its 0-based position in that ordering. A timestamp is
    printed before and after.

    NOTE(review): a key with extra commas yields a longer row, so columns
    after the first comma-containing field shift to the right.
    """
    print(datetime.datetime.now())

    rows = [key.split(',') + [n] for key, n in fdict.items()]
    rows.sort(key=lambda row: row[-1], reverse=True)
    ranked = [[rank] + row for rank, row in enumerate(rows)]

    print(datetime.datetime.now())

    return ranked

In [350]:
fc = formCounts(u)

2022-06-04 19:20:36.118627
2022-06-04 19:20:36.814821


In [299]:
# Number of ranked form rows (one per distinct annotated form).
# `fc` is the list returned by formCounts(u) above.
len(fc)

120241

In [498]:
# Keep only the rows whose count (last column) is 5 or less.
# NOTE(review): despite "multiple" in the name, this selects the low-frequency forms.
multipleUnique = [x for x in fc if x[-1] <= 5]

In [499]:
len(multipleUnique)

110760

In [417]:
for f in multipleUnique[:10]:
    print(f)

[0, 'і', 'і', 'CCONJ', 'CC', ' ', 16851]
[1, 'не', 'не', 'PART', 'UH', 'Polarity=Neg', 10581]
[2, 'на', 'на', 'ADP', 'IN', ' ', 9032]
[3, 'ў', 'у', 'ADP', 'IN', ' ', 9022]
[4, 'з', 'з', 'ADP', 'IN', ' ', 8442]
[5, 'у', 'у', 'ADP', 'IN', ' ', 5944]
[6, 'а', 'а', 'CCONJ', 'CC', ' ', 5225]
[7, 'я', 'я', 'PRON', 'PRP', 'Case=Nom|Number=Sing|Person=1|PronType=Prs', 4729]
[8, 'ты', 'ты', 'PRON', 'PRP', 'Case=Nom|Number=Sing|Person=2|PronType=Prs', 3559]
[9, 'як', 'як', 'ADV', 'WRB', 'Degree=Pos', 3418]


In [500]:
# Export the filtered form rows to CSV: a header line, then one row per form.
# No loop variable named `u` is used here: `u` holds the form-count dict built
# by uniqueForms() and must stay intact so earlier cells can be re-run.
with open('BPCfreq_sorted_not-grouped_5-and-less.csv', 'w', newline='', encoding='utf-8') as fu:
    writer = csv.writer(fu, delimiter = ',')
    writer.writerow(['order','form', 'lemma', 'pos', 'ppos', 'msd', 'count'])
    writer.writerows(multipleUnique)

In [357]:
# Rows whose form is the blank placeholder ' '. The leading rank column of
# `fc` is dropped, so each row reads form, lemma, pos, ppos, msd, count.
[row[1:] for row in fc if row[1] == ' ']

[[' ', '', 'NUM', 'CD', 'NumType=Card', 207], [' ', '', 'ADJ', 'ORD', ' ', 10]]

----- 

## Group by lemma

In [387]:
# Reduce every ranked row to [rank, "lemma,pos", count].
# NOTE(review): the positions assume exactly five comma-separated fields per form.
recomb = [
    [ranked_row[0], ','.join(ranked_row[2:4]), ranked_row[6]]
    for ranked_row in fc
]
len(recomb)

120241

In [388]:
recomb[:10]

[[0, 'і,CCONJ', 16851],
 [1, 'не,PART', 10581],
 [2, 'на,ADP', 9032],
 [3, 'у,ADP', 9022],
 [4, 'з,ADP', 8442],
 [5, 'у,ADP', 5944],
 [6, 'а,CCONJ', 5225],
 [7, 'я,PRON', 4729],
 [8, 'ты,PRON', 3559],
 [9, 'як,ADV', 3418]]

In [389]:
# Distinct "lemma,pos" keys found in recomb
rep = list({entry[1] for entry in recomb})
len(rep)

62448

In [420]:
# Group the recomb rows by their "lemma,pos" key.
# Every key of `rep` starts with an empty list; each row is then appended
# straight to its own key's list (a single dict lookup per row instead of
# scanning all keys for every row).
repeats = {key: [] for key in rep}

for entry in recomb:
    repeats[entry[1]].append(entry)

In [458]:
# Show only the first few groups; the full dict is too large to display
dict(list(repeats.items())[:15])

{'дняпро,PROPN': [[1839, 'дняпро,PROPN', 26],
  [20907, 'дняпро,PROPN', 3],
  [35791, 'дняпро,PROPN', 2]],
 'стыд,NOUN': [[6170, 'стыд,NOUN', 8],
  [23177, 'стыд,NOUN', 2],
  [28916, 'стыд,NOUN', 2],
  [39962, 'стыд,NOUN', 1],
  [43967, 'стыд,NOUN', 1],
  [102075, 'стыд,NOUN', 1]],
 'пакорліва,ADV': [[18898, 'пакорліва,ADV', 3]],
 'яры,NUM': [[32653, 'яры,NUM', 2]],
 'сябрук,NOUN': [[48648, 'сябрук,NOUN', 1]],
 'тэраса,NOUN': [[78292, 'тэраса,NOUN', 1], [96743, 'тэраса,NOUN', 1]],
 'трапеча,NOUN': [[95050, 'трапеча,NOUN', 1]],
 'прасвята,NOUN': [[77668, 'прасвята,NOUN', 1]],
 'сынаў,PROPN': [[81034, 'сынаў,PROPN', 1]],
 'ян,PRON': [[33582, 'ян,PRON', 2]],
 'рудзька,PROPN': [[43841, 'рудзька,PROPN', 1], [43848, 'рудзька,PROPN', 1]],
 'ўмяшацца,VERB': [[45247, 'ўмяшацца,VERB', 1]],
 'згніць,ADJ': [[65304, 'згніць,ADJ', 1]],
 'эзель,PROPN': [[53418, 'эзель,PROPN', 1],
  [53428, 'эзель,PROPN', 1],
  [53429, 'эзель,PROPN', 1]],
 'шчасьце,PROPN': [[60974, 'шчасьце,PROPN', 1], [61754, 'шчасьц

In [438]:
# Start every "lemma,pos" key with a zero count and no row ids.
# NOTE(review): the aggregation cell further down repeats this initialisation,
# so this cell looks redundant.
lemmas = {k: {'count': 0, 'ids': []} for k in repeats.keys()}

In [461]:
repeats['дняпро,PROPN']

[[1839, 'дняпро,PROPN', 26],
 [20907, 'дняпро,PROPN', 3],
 [35791, 'дняпро,PROPN', 2]]

In [474]:
# Sum the counts and collect the row ids of every "lemma,pos" group.
# Rows whose third field is not an integer count are printed and left out
# of both the total and the id list.
lemmas = {}

for key, rows in repeats.items():
    total = 0
    ids = []
    for row in rows:
        if not isinstance(row[2], int):
            print(row)
            continue
        total += row[2]
        ids.append(row[0])
    lemmas[key] = {'count': total, 'ids': ids}

[112832, '-,малое', 'SYM']
[102611, '-,гулі', 'NN']
[57325, 'і,узяуся', 'NN']
[110178, '—,ж', 'SYM']
[58106, 'бы,паляцеў', 'NN']


In [475]:
# Show only the first few lemma entries; the full dict is too large to display
dict(list(lemmas.items())[:15])

{'дняпро,PROPN': {'count': 31, 'ids': [1839, 20907, 35791]},
 'стыд,NOUN': {'count': 15, 'ids': [6170, 23177, 28916, 39962, 43967, 102075]},
 'пакорліва,ADV': {'count': 3, 'ids': [18898]},
 'яры,NUM': {'count': 2, 'ids': [32653]},
 'сябрук,NOUN': {'count': 1, 'ids': [48648]},
 'тэраса,NOUN': {'count': 2, 'ids': [78292, 96743]},
 'трапеча,NOUN': {'count': 1, 'ids': [95050]},
 'прасвята,NOUN': {'count': 1, 'ids': [77668]},
 'сынаў,PROPN': {'count': 1, 'ids': [81034]},
 'ян,PRON': {'count': 2, 'ids': [33582]},
 'рудзька,PROPN': {'count': 2, 'ids': [43841, 43848]},
 'ўмяшацца,VERB': {'count': 1, 'ids': [45247]},
 'згніць,ADJ': {'count': 1, 'ids': [65304]},
 'эзель,PROPN': {'count': 3, 'ids': [53418, 53428, 53429]},
 'шчасьце,PROPN': {'count': 2, 'ids': [60974, 61754]},
 'крыжавая,NOUN': {'count': 1, 'ids': [101334]},
 'беларус,PRON': {'count': 1, 'ids': [115001]},
 'Квотнікаў,NUM': {'count': 1, 'ids': [57106]},
 'зімовы,ADJ': {'count': 28,
  'ids': [13522,
   16727,
   19169,
   20242,
   

In [476]:
# One row per lemma key: the key split on commas, followed by its summed count,
# ordered from the most to the least frequent (ties keep dict order)
lCounts = [key.split(',') + [info['count']] for key, info in lemmas.items()]
lCounts.sort(key=lambda row: row[-1], reverse=True)

In [482]:
lCounts[:10]

[['і', 'CCONJ', 19404],
 ['у', 'ADP', 14978],
 ['не', 'PART', 10595],
 ['на', 'ADP', 9032],
 ['з', 'ADP', 8525],
 ['я', 'PRON', 7872],
 ['а', 'CCONJ', 5228],
 ['ты', 'PRON', 5136],
 ['мы', 'PRON', 4095],
 ['ён', 'PRON', 4005]]

In [483]:
len(lCounts)

62448

In [487]:
# Prefix every lemma row with its 0-based position in the sorted list
numLCounts = [[rank] + row for rank, row in enumerate(lCounts)]

In [489]:
numLCounts[:10]

[[0, 'і', 'CCONJ', 19404],
 [1, 'у', 'ADP', 14978],
 [2, 'не', 'PART', 10595],
 [3, 'на', 'ADP', 9032],
 [4, 'з', 'ADP', 8525],
 [5, 'я', 'PRON', 7872],
 [6, 'а', 'CCONJ', 5228],
 [7, 'ты', 'PRON', 5136],
 [8, 'мы', 'PRON', 4095],
 [9, 'ён', 'PRON', 4005]]

In [495]:
# Keep only the lemma rows whose summed count (last column) is 5 or less.
# NOTE(review): despite "multiple" in the name, this selects the low-frequency lemmas.
multipleLemma = [x for x in numLCounts if x[-1] <= 5]

In [496]:
len(multipleLemma)

54380

In [497]:
# Export the filtered lemma rows to CSV: a header line, then one row per lemma.
# No loop variable named `u` is used here: `u` holds the form-count dict built
# by uniqueForms() and must stay intact so earlier cells can be re-run.
with open('BPCfreq_lemma_5-and-less.csv', 'w', newline='', encoding='utf-8') as fu:
    writer = csv.writer(fu, delimiter = ',')
    writer.writerow(['id','lemma', 'pos', 'count'])
    writer.writerows(multipleLemma)