In [7]:
import mysql.connector
import csv
import pandas as pd
import os

In [8]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

In [9]:
con = mysql.connector.connect(user='andr',
                              password='rstq!2Ro',
                              host='127.0.0.1',
                              database='cat_db',
                              auth_plugin='mysql_native_password'
                             )

In [10]:
curA = con.cursor(dictionary=True, buffered=True)
curB = con.cursor(dictionary=True, buffered=True)
curC = con.cursor(dictionary=True, buffered=True)
curD = con.cursor(dictionary=True, buffered=True)

## First experiments
curA and curB shows the first few experiments on morphology, where we were not keeping track of the POS.

In [8]:
curA.execute("SELECT id_unigram, unigram, freq_all, morph FROM unigrams;")
rows = curA.fetchall()

In [9]:
with open('morph_tags.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['id_unigram', 'unigram', 'freq_all', 'morph'])
    for dictionary in rows:
        writer.writerow([dictionary['id_unigram'], dictionary['unigram'], dictionary['freq_all'], dictionary['morph']])

df = pd.read_csv('morph_tags.csv')

df.head()

Unnamed: 0,id_unigram,unigram,freq_all,morph
0,36215,NUM,94888,_
1,47683,<URL>,1307,_
2,381939,А,2,Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing
3,381940,В,803,Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing
4,381941,Малько,45,Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing


In [10]:
curB.execute("SELECT morph, freq_all*count AS 'par_count' FROM (SELECT morph, freq_all, COUNT(*) AS 'count' FROM unigrams GROUP BY morph, freq_all) AS tab1")

In [11]:
rowsB = curB.fetchall()

In [12]:
with open('stats.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['morph', 'count'])
    for dictionary in rowsB:
        writer.writerow([dictionary['morph'], dictionary['par_count']])

In [13]:
df_par_stats = pd.read_csv('stats.csv')

df_par_stats[:10]

Unnamed: 0,morph,count
0,_,94888
1,_,1307
2,Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing,536
3,Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing,803
4,Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing,90
5,_,175057
6,Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing,107
7,Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing,70
8,_,137292
9,Case=Nom|Degree=Pos|Number=Plur,49


## MySQL --> POS/tagset counts

With this MySQL search we extract from the database the frequency of use of each udpipe morphological tagset, preserving information about POS for each tagset. The aim is keeping a distinction between the different tagsets also based on the part of speech, because we assume POS behavior as disinct one from the other. 

In [14]:
curC.execute("""SELECT morph, pos, freq_all FROM
(SELECT morph, lemma, freq_all FROM unigrams) AS a JOIN
(SELECT id_lemmas, id_pos FROM lemmas) AS b ON lemma = id_lemmas JOIN pos ON b.id_pos = pos.id_pos;""")

In [15]:
rowsC = curC.fetchall()

In [16]:
with open('statistica.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['morph', 'POS', 'count'])
    for dictionary in rowsC:
        writer.writerow([dictionary['morph'], dictionary['pos'], dictionary['freq_all']])

In [17]:
df_statistica = pd.read_csv('statistica.csv')

df_statistica[:5]

Unnamed: 0,morph,POS,count
0,Case=Nom|Degree=Pos|Number=Plur,ADJ,49
1,Case=Nom|Degree=Pos|Gender=Masc|Number=Sing,ADJ,6
2,Case=Gen|Degree=Pos|Number=Plur,ADJ,40
3,Case=Gen|Degree=Pos|Gender=Masc|Number=Sing,ADJ,5
4,Case=Loc|Degree=Pos|Number=Plur,ADJ,10


In [18]:
len(df_statistica)

156637

In [19]:
df_statistica = df_statistica[df_statistica.morph != '_']

In [20]:
len(df_statistica)

151025

In [21]:
keys = zip(df_statistica['morph'], df_statistica['POS'], df_statistica['count'])
# freq = df_statistica['count']

In [22]:
dictn = {}
for key in keys:
    if (key[0], key[1]) in dictn:
        dictn[(key[0], key[1])] += key[2]
    else:
        dictn[(key[0], key[1])] = key[2]

In [23]:
len(dictn)

610

In [24]:
df_statistica = pd.Series(dictn).reset_index()
df_statistica.columns = ['tagset', 'POS', 'count'] 
df_statistica[:5]

Unnamed: 0,tagset,POS,count
0,Case=Nom|Degree=Pos|Number=Plur,ADJ,17467
1,Case=Nom|Degree=Pos|Gender=Masc|Number=Sing,ADJ,13906
2,Case=Gen|Degree=Pos|Number=Plur,ADJ,47265
3,Case=Gen|Degree=Pos|Gender=Masc|Number=Sing,ADJ,26364
4,Case=Loc|Degree=Pos|Number=Plur,ADJ,7511


In [25]:
df_statistica['part_freq'] = df_statistica['count'].apply(lambda x: x/df_statistica['count'].sum(axis=0))

In [26]:
df_statistica[:5]

Unnamed: 0,tagset,POS,count,part_freq
0,Case=Nom|Degree=Pos|Number=Plur,ADJ,17467,0.010666
1,Case=Nom|Degree=Pos|Gender=Masc|Number=Sing,ADJ,13906,0.008491
2,Case=Gen|Degree=Pos|Number=Plur,ADJ,47265,0.028861
3,Case=Gen|Degree=Pos|Gender=Masc|Number=Sing,ADJ,26364,0.016098
4,Case=Loc|Degree=Pos|Number=Plur,ADJ,7511,0.004586


In [27]:
df_statistica.to_csv(os.path.join(r"C:\Users\Andrea\desktop\part_freq.csv"))

In [28]:
if "morph_tags.csv" in os.getcwd():
    !del "morph_tags.csv" 

In [29]:
if "statistica.csv" in os.getcwd():
    !del "statistica.csv"

## Student texts
Now let's take some example student texts and extract tagsets and POS tagging using conllu API.

In [107]:
from conllu import parse, parse_tree

In [108]:
def parser(filename):
    """
    Yields a sentence from conllu tree with its tags

    """
    """
    >>> for i in parser('/content/gdrive/My Drive/Новые conll по доменам/NewVers/CleanedPsyEdu.conllu'):
      print(i)   
    TokenList<Музыка, звучит, отовсюду, независимо, от, нашего, желания, или, нежелания, слушать, ее, .>
    """
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.read()
    tree = parse(data)
    for token in tree:
        yield token

In [109]:
def get_words(tree):
    """
    tree - generator of sentences (TokenLists) from conllu tree

    words, list is a list of all tokens we need from the tree
    size, int is a number of all words in the domain
    """
    words = []
    for sentence in tree:
        for token in sentence:
#             print(token)
            if token['form'] != '_' and token['upostag'] != '_' and token['upostag']!='NONLEX' and token['form'] not in r'[]\/':
                for wordform in token['form'].lower().split():
                    words.append((wordform, token['lemma'], token['feats'], token['upostag']))
    size = len(words)
    return words, size

#### Папки с студентическими текстами

In [135]:
stud_dir = r'C:\Users\Andrea\Desktop\stud_textVSscie_text\Student_texts_for_experiments\stud_txt'
low_lvl = os.path.join(stud_dir, 'Low Level')
reg_lvl = os.path.join(stud_dir, 'Regular Level')
low_prsd = os.path.join(stud_dir, 'Low Level Parsed')
reg_prsd = os.path.join(stud_dir, 'Regular Level Parsed')

In [136]:
def tagset_lemma(words):
    print('tagset being created...')
    with open('tagset.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['token', 'lemma', 'tagset', 'POS'])

        for word in words:
            if word[2]:
                tag_lst = []
                for tag in list(word[2].items()):
                    tag = '{}={}|'.format(tag[0], tag[1])
                    tag_lst.append(tag)

                tag_str = ''.join([str(elem) for elem in tag_lst])
                tag_str = tag_str[:-1]

                writer.writerow([word[0], word[1], tag_str, word[3]])
            # print(tag_str)
            else:
                tag_str = 'None'
                writer.writerow([word[0], word[1], tag_str, word[3]])
                
    df = pd.read_csv('tagset.csv')
    word_list = df.values.tolist()
    return word_list

In [137]:
def morph_error_catcher(words):
    mistakes = {}
    corrects = {}
    for i, word in enumerate(words):
        
        curD.execute("""SELECT unigram, lemm, morph, pos FROM
                    (SELECT unigram, morph, lemma FROM unigrams) AS a JOIN
                    (SELECT id_lemmas, id_pos, lemma AS lemm FROM lemmas) AS b ON lemma = id_lemmas JOIN pos ON b.id_pos = pos.id_pos
                    WHERE unigram='{}' &&
                    lemm='{}' &&
                    morph='{}' && 
                    pos='{}';""".format(word[0], word[1], word[2], word[3]))
        rowsD = curD.fetchall()
        if not rowsD:
            mistakes[i] = [word[0], word[1], word[2], word[3]]
        else:
            corrects[i] = [word[0], word[1], word[2], word[3]]
    return mistakes, corrects

In [138]:
from string import punctuation
punctuation += '«»—…“”'
from nltk.corpus import stopwords
stops = stopwords.words('russian')

In [139]:
def correctionA(words):    
    tagset = tagset_lemma(words)
    correction = []
    mistakes = morph_error_catcher(tagset)[0]
    # for idx in mistakes:
    for i, word in enumerate(tagset):
        if i in mistakes:
            correction.append('\033[31m' + word[0] + '\033[39m')
        else:
            correction.append(word[0])
    correction = ' '.join(correction)
    return print(correction)

In [183]:
def correctionX(words):
    tagset = tagset_lemma(words)
    correction = []
    mistakes = morph_error_catcher(tagset)[0]
    # for idx in mistakes:
    for i, word in enumerate(tagset):
        if i in mistakes and word[0] not in punctuation and word[0] not in stops:
            correction.append('\033[31m' + word[0] + '\033[39m')
        else:
            correction.append(word[0])
    correction = ' '.join(correction)
    return print(correction)

### Пример 1

In [184]:
tree = parser(os.path.join(low_prsd, 'prs_EC12_B1_2421.conllu'))

In [185]:
words, size = get_words(tree)

In [189]:
del tree

In [191]:
correctionX(words)

tagset being created...
[31m<b1[39m [31m2421[39m > [31m<russian[39m [31m“н”=n[39m > [31m<о[39m вас = у вас , [31mнето[39m = [31mнету[39m > я хочу вам рассказать о [31mдружба[39m . по [31mмоему[39m , это одних из самых важных вещи в жизни . [31mнада[39m найти друзья с кем вы [31mрассчитаивайте[39m , ну быть [31mрассчитавайная[39m тоже [31mглавnое[39m . я [31mсогласин[39m , что есть [31mчитири[39m [31mкачествы[39m нужно в друзья [31m–[39m честность , быть [31mвежливый[39m , [31mюмор[39m и общие интересы . во - первых , друзья [31mверют[39m друг - другом . например , это страшно , если вы [31mсказайте[39m [31mсекрет[39m и ваш друг его [31mсказает[39m с них или неё друзья . как муж и жена никогда [31mсделаю[39m чем то они будет [31mвиновать[39m [31mдалшее[39m , друзья [31mвирит[39m , что они будет [31m<ъuдет[39m > помогать если [31mбеда[39m [31mприэсходится[39m . самый [31mхuчший[39m друзья всегда открывают дверь - это называетс

### Пример 2

In [196]:
tree = parser(os.path.join(low_prsd, 'prs_EC12-B1-0404.conllu'))

In [197]:
words, size = get_words(tree)

In [198]:
del tree

In [199]:
correctionX(words)

tagset being created...
﻿what [31mis[39m a [31mfriend[39m ? что такое друг ? что такое друг ? [31mкажде[39m человек что такое друг подругому . ест [31mлуди[39m кто читает друг человек кого они [31mвидет[39m [31mкажде[39m день и разговаривает [31mснеме[39m . но на другое [31mстаране[39m ест то кто читает друг как семья и не [31mзнаит[39m получается разговаривает [31mкажде[39m день . я [31mсегда[39m читал что [31mлуча[39m [31mимет[39m [31mдве[39m или три [31mхороши[39m [31mблиски[39m друзья чем [31mимет[39m [31mдвадцат[39m друзья но [31mскем[39m нет [31mблиски[39m [31mотношени[39m . я пришёл в такую [31mэдеа[39m потому что когда я был [31mриёнок[39m моя [31mсемя[39m [31mнога[39m раз [31mпережали[39m в [31mновие[39m дом и мне была [31mнада[39m [31mучитца[39m в [31mновие[39m [31mшкола[39m . в [31mновие[39m школа мне была [31mнада[39m найти новые [31mдрузя[39m . потому что я был новые студент , меня [31mник-[39m то не зна

-----------------------------------------------------------------

In [39]:
dictn = {}

In [40]:
keys = zip(df['tagset'], df['POS'])
for couple in keys:
    if couple in dictn:
        dictn[couple] += 1
    else:
        dictn[couple] = 1

In [41]:
df_stat = pd.Series(dictn).reset_index()
df_stat.columns = ['tagset', 'POS', 'count'] 
df_stat[:5]

Unnamed: 0,tagset,POS,count
0,,NUM,2
1,,SYM,6
2,Foreign=Yes,PROPN,2
3,Degree=Pos,ADV,18
4,Case=Acc|Number=Plur|Person=2,PRON,1


In [42]:
df_stat['part_freq'] = df_stat['count'].apply(lambda x: x/df_stat['count'].sum(axis=0))

In [43]:
df_stat

Unnamed: 0,tagset,POS,count,part_freq
0,,NUM,2,0.004396
1,,SYM,6,0.013187
2,Foreign=Yes,PROPN,2,0.004396
3,Degree=Pos,ADV,18,0.03956
4,Case=Acc|Number=Plur|Person=2,PRON,1,0.002198
5,,ADP,33,0.072527
6,Case=Gen|Number=Plur|Person=2,PRON,3,0.006593
7,,PUNCT,79,0.173626
8,Case=Nom,PRON,4,0.008791
9,Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense...,VERB,11,0.024176


In [44]:
df_statistica[:10]

Unnamed: 0,tagset,POS,count,part_freq
0,Case=Nom|Degree=Pos|Number=Plur,ADJ,17467,0.010666
1,Case=Nom|Degree=Pos|Gender=Masc|Number=Sing,ADJ,13906,0.008491
2,Case=Gen|Degree=Pos|Number=Plur,ADJ,47265,0.028861
3,Case=Gen|Degree=Pos|Gender=Masc|Number=Sing,ADJ,26364,0.016098
4,Case=Loc|Degree=Pos|Number=Plur,ADJ,7511,0.004586
5,Case=Acc|Degree=Pos|Gender=Fem|Number=Sing,ADJ,7714,0.00471
6,Case=Nom|Degree=Pos|Gender=Fem|Number=Sing,ADJ,11949,0.007296
7,Animacy=Inan|Case=Acc|Degree=Pos|Number=Plur,ADJ,10441,0.006375
8,Case=Gen|Degree=Pos|Gender=Fem|Number=Sing,ADJ,33116,0.020221
9,Case=Ins|Degree=Pos|Number=Plur,ADJ,8915,0.005444


In [45]:
df_stat['count'].sum(axis=0)

455

In [46]:
df_stat['x1000_freq'] = df_stat['count'].apply(lambda x: (x*1000)/df_stat['count'].sum(axis=0))

In [47]:
df_stat = df_stat.sort_values(by='POS').reset_index(drop=True)

In [48]:
df_stat[:10]

Unnamed: 0,tagset,POS,count,part_freq,x1000_freq
0,Case=Nom|Degree=Pos|Gender=Masc|Number=Sing,ADJ,12,0.026374,26.373626
1,Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|N...,ADJ,4,0.008791,8.791209
2,Case=Nom|Degree=Pos|Gender=Neut|Number=Sing,ADJ,1,0.002198,2.197802
3,Degree=Pos|Gender=Fem|Number=Sing|Variant=Short,ADJ,1,0.002198,2.197802
4,Case=Acc|Degree=Pos|Gender=Neut|Number=Sing,ADJ,1,0.002198,2.197802
5,Degree=Pos|Gender=Masc|Number=Sing|Variant=Short,ADJ,1,0.002198,2.197802
6,Animacy=Inan|Case=Acc|Degree=Pos|Number=Plur,ADJ,1,0.002198,2.197802
7,Case=Gen|Degree=Pos|Number=Plur,ADJ,6,0.013187,13.186813
8,Case=Dat|Degree=Pos|Number=Plur,ADJ,1,0.002198,2.197802
9,Case=Loc|Degree=Pos|Number=Plur,ADJ,2,0.004396,4.395604


In [49]:
df_statistica['x1000_freq'] = df_statistica['count'].apply(lambda x: (x*1000)/df_statistica['count'].sum(axis=0))

In [50]:
df_statistica[:10]

Unnamed: 0,tagset,POS,count,part_freq,x1000_freq
0,Case=Nom|Degree=Pos|Number=Plur,ADJ,17467,0.010666,10.665607
1,Case=Nom|Degree=Pos|Gender=Masc|Number=Sing,ADJ,13906,0.008491,8.491208
2,Case=Gen|Degree=Pos|Number=Plur,ADJ,47265,0.028861,28.860703
3,Case=Gen|Degree=Pos|Gender=Masc|Number=Sing,ADJ,26364,0.016098,16.098245
4,Case=Loc|Degree=Pos|Number=Plur,ADJ,7511,0.004586,4.586327
5,Case=Acc|Degree=Pos|Gender=Fem|Number=Sing,ADJ,7714,0.00471,4.710282
6,Case=Nom|Degree=Pos|Gender=Fem|Number=Sing,ADJ,11949,0.007296,7.296235
7,Animacy=Inan|Case=Acc|Degree=Pos|Number=Plur,ADJ,10441,0.006375,6.375428
8,Case=Gen|Degree=Pos|Gender=Fem|Number=Sing,ADJ,33116,0.020221,20.221116
9,Case=Ins|Degree=Pos|Number=Plur,ADJ,8915,0.005444,5.44363


## Comparing statistics
Let's now try to compare the tagset frequency for our corpus and for the student text we selected.

In [79]:
corpus_kfreq = zip(df_statistica['tagset'], df_statistica['POS'], df_statistica['part_freq'])
corpus_dict = {(elem[0], elem[1]) : (elem[2],) for elem in corpus_kfreq}

In [80]:
corpus_dict

{('Case=Nom|Degree=Pos|Number=Plur', 'ADJ'): (0.0106656066395798,),
 ('Case=Nom|Degree=Pos|Gender=Masc|Number=Sing',
  'ADJ'): (0.00849120775920288,),
 ('Case=Gen|Degree=Pos|Number=Plur', 'ADJ'): (0.02886070291519661,),
 ('Case=Gen|Degree=Pos|Gender=Masc|Number=Sing',
  'ADJ'): (0.016098245459774535,),
 ('Case=Loc|Degree=Pos|Number=Plur', 'ADJ'): (0.004586326871808775,),
 ('Case=Acc|Degree=Pos|Gender=Fem|Number=Sing',
  'ADJ'): (0.004710281652127931,),
 ('Case=Nom|Degree=Pos|Gender=Fem|Number=Sing',
  'ADJ'): (0.007296234827751704,),
 ('Animacy=Inan|Case=Acc|Degree=Pos|Number=Plur',
  'ADJ'): (0.006375427888237974,),
 ('Case=Gen|Degree=Pos|Gender=Fem|Number=Sing', 'ADJ'): (0.02022111578841957,),
 ('Case=Ins|Degree=Pos|Number=Plur', 'ADJ'): (0.00544362988445949,),
 ('Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing',
  'ADJ'): (0.004288346907297701,),
 ('Case=Dat|Degree=Pos|Number=Plur', 'ADJ'): (0.002947437066997864,),
 ('Case=Gen|Degree=Pos|Gender=Neut|Number=Sing',
  'ADJ'): 

In [81]:
stud_kfreq = zip(df_stat['tagset'], df_stat['POS'], df_stat['part_freq'])
stud_dict = {(elem[0], elem[1]) : (elem[2],) for elem in stud_kfreq}

In [82]:
stud_dict

{('Case=Nom|Degree=Pos|Gender=Masc|Number=Sing',
  'ADJ'): (0.026373626373626374,),
 ('Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing',
  'ADJ'): (0.008791208791208791,),
 ('Case=Nom|Degree=Pos|Gender=Neut|Number=Sing',
  'ADJ'): (0.002197802197802198,),
 ('Degree=Pos|Gender=Fem|Number=Sing|Variant=Short',
  'ADJ'): (0.002197802197802198,),
 ('Case=Acc|Degree=Pos|Gender=Neut|Number=Sing',
  'ADJ'): (0.002197802197802198,),
 ('Degree=Pos|Gender=Masc|Number=Sing|Variant=Short',
  'ADJ'): (0.002197802197802198,),
 ('Animacy=Inan|Case=Acc|Degree=Pos|Number=Plur',
  'ADJ'): (0.002197802197802198,),
 ('Case=Gen|Degree=Pos|Number=Plur', 'ADJ'): (0.013186813186813187,),
 ('Case=Dat|Degree=Pos|Number=Plur', 'ADJ'): (0.002197802197802198,),
 ('Case=Loc|Degree=Pos|Number=Plur', 'ADJ'): (0.004395604395604396,),
 ('Degree=Pos|Gender=Neut|Number=Sing|Variant=Short',
  'ADJ'): (0.01978021978021978,),
 ('Case=Ins|Degree=Pos|Number=Plur', 'ADJ'): (0.002197802197802198,),
 ('Case=Nom|Degree=Po

In [83]:
stat_dict = {}
for entry in stud_dict:
    if entry in corpus_dict:
        stat_dict[entry] = corpus_dict[entry] + (stud_dict[entry])

In [84]:
stat_dict

{('Case=Nom|Degree=Pos|Gender=Masc|Number=Sing', 'ADJ'): (0.00849120775920288,
  0.026373626373626374),
 ('Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing',
  'ADJ'): (0.004288346907297701, 0.008791208791208791),
 ('Case=Nom|Degree=Pos|Gender=Neut|Number=Sing', 'ADJ'): (0.010016523233278011,
  0.002197802197802198),
 ('Degree=Pos|Gender=Fem|Number=Sing|Variant=Short',
  'ADJ'): (0.00115650420652454, 0.002197802197802198),
 ('Case=Acc|Degree=Pos|Gender=Neut|Number=Sing', 'ADJ'): (0.004342080999258714,
  0.002197802197802198),
 ('Degree=Pos|Gender=Masc|Number=Sing|Variant=Short',
  'ADJ'): (0.0014532629416728643, 0.002197802197802198),
 ('Animacy=Inan|Case=Acc|Degree=Pos|Number=Plur',
  'ADJ'): (0.006375427888237974, 0.002197802197802198),
 ('Case=Gen|Degree=Pos|Number=Plur', 'ADJ'): (0.02886070291519661,
  0.013186813186813187),
 ('Case=Dat|Degree=Pos|Number=Plur', 'ADJ'): (0.002947437066997864,
  0.002197802197802198),
 ('Case=Loc|Degree=Pos|Number=Plur', 'ADJ'): (0.0045863268

In [None]:
# d = pd.DataFrame.from_dict(stud_dict) 

In [None]:
df_diff = pd.Series(stat_dict).reset_index()
df_diff.columns = ['tagset', 'POS', 'freq']
df_diff[['corpus_freq', 'stud_txt_freq']] = pd.DataFrame(df_diff['freq'].values.tolist(), index=df_diff.index)
del df_diff['freq']
df_diff

In [None]:
df_diff['corpus_freq'].mean()

In [None]:
ttest_ind(df_diff['corpus_freq'], df_diff['stud_txt_freq'])

In [None]:
from scipy.stats import ttest_ind

In [85]:
# diff_dict = {}
# for elem in stud_dict:
#     if elem in corpus_dict:
#         diff = corpus_dict[elem] - stud_dict[elem]
#     diff_dict[elem] = diff

In [None]:
df_diff = pd.Series(stat_dict).reset_index()
df_diff.columns = ['tagset', 'POS', 'freq']
df_diff[['corpus_freq', 'stud_txt_freq']] = pd.DataFrame(df_diff['freq'].values.tolist(), index=df_diff.index)
del df_diff['freq']
df_diff

In [None]:
df_diff['corpus_freq'].mean()

In [None]:
ttest_ind(df_diff['corpus_freq'], df_diff['stud_txt_freq'])

In [None]:
from scipy.stats import ttest_ind

In [None]:
# diff_dict = {}
# for elem in stud_dict:
#     if elem in corpus_dict:
#         diff = corpus_dict[elem] - stud_dict[elem]
#     diff_dict[elem] = diff

In [None]:
len(diff_dict)

In [None]:
df_diff

In [None]:
!del 'tagset.csv'

In [None]:
# def tagset(words):
#     with open('tagset.csv', 'w', newline='', encoding='utf-8') as f:
#         writer = csv.writer(f)
#         writer.writerow(['token', 'tagset', 'POS'])

#         for word in words:
#             if word[1]:
#                 tag_lst = []
#                 for tag in list(word[1].items()):
#                     tag = '{}={}|'.format(tag[0], tag[1])
#                     tag_lst.append(tag)

#                 tag_str = ''.join([str(elem) for elem in tag_lst])
#                 tag_str = tag_str[:-1]

#                 writer.writerow([word[0], tag_str, word[2]])
#             # print(tag_str)
#             else:
#                 tag_str = 'None'
#                 writer.writerow([word[0], tag_str, word[2]]) 

In [86]:
# def tagset(words):
#     with open('tagset.csv', 'w', newline='', encoding='utf-8') as f:
#         writer = csv.writer(f)
#         writer.writerow(['token', 'tagset', 'POS'])

#         for word in words:
#             if word[1]:
#                 tag_lst = []
#                 for tag in list(word[1].items()):
#                     tag = '{}={}|'.format(tag[0], tag[1])
#                     tag_lst.append(tag)

#                 tag_str = ''.join([str(elem) for elem in tag_lst])
#                 tag_str = tag_str[:-1]

#                 writer.writerow([word[0], tag_str, word[2]])
#             # print(tag_str)
#             else:
#                 tag_str = 'None'
#                 writer.writerow([word[0], tag_str, word[2]]) 

Unnamed: 0,tagset,POS,corpus_freq,stud_txt_freq
0,Case=Nom|Degree=Pos|Gender=Masc|Number=Sing,ADJ,0.008491,0.026374
1,Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|N...,ADJ,0.004288,0.008791
2,Case=Nom|Degree=Pos|Gender=Neut|Number=Sing,ADJ,0.010017,0.002198
3,Degree=Pos|Gender=Fem|Number=Sing|Variant=Short,ADJ,0.001157,0.002198
4,Case=Acc|Degree=Pos|Gender=Neut|Number=Sing,ADJ,0.004342,0.002198
5,Degree=Pos|Gender=Masc|Number=Sing|Variant=Short,ADJ,0.001453,0.002198
6,Animacy=Inan|Case=Acc|Degree=Pos|Number=Plur,ADJ,0.006375,0.002198
7,Case=Gen|Degree=Pos|Number=Plur,ADJ,0.028861,0.013187
8,Case=Dat|Degree=Pos|Number=Plur,ADJ,0.002947,0.002198
9,Case=Loc|Degree=Pos|Number=Plur,ADJ,0.004586,0.004396


In [87]:
df_diff['corpus_freq'].mean()

0.006529302787944514

In [88]:
ttest_ind(df_diff['corpus_freq'], df_diff['stud_txt_freq'])

Ttest_indResult(statistic=-0.6499397174902122, pvalue=0.5166517276692948)

In [57]:
from scipy.stats import ttest_ind

In [135]:
# diff_dict = {}
# for elem in stud_dict:
#     if elem in corpus_dict:
#         diff = corpus_dict[elem] - stud_dict[elem]
#     diff_dict[elem] = diff

In [136]:
len(diff_dict)

0

In [137]:
df_diff

Unnamed: 0,tagset,POS,freq,corpus_freq,stud_txt_freq
0,Case=Nom|Degree=Pos|Gender=Masc|Number=Sing,ADJ,"(0.026373626373626374, 0.00849120775920288)",0.026374,0.008491
1,Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|N...,ADJ,"(0.008791208791208791, 0.004288346907297701)",0.008791,0.004288
2,Case=Nom|Degree=Pos|Gender=Neut|Number=Sing,ADJ,"(0.002197802197802198, 0.010016523233278011)",0.002198,0.010017
3,Degree=Pos|Gender=Fem|Number=Sing|Variant=Short,ADJ,"(0.002197802197802198, 0.00115650420652454)",0.002198,0.001157
4,Case=Acc|Degree=Pos|Gender=Neut|Number=Sing,ADJ,"(0.002197802197802198, 0.004342080999258714)",0.002198,0.004342
5,Degree=Pos|Gender=Masc|Number=Sing|Variant=Short,ADJ,"(0.002197802197802198, 0.0014532629416728643)",0.002198,0.001453
6,Animacy=Inan|Case=Acc|Degree=Pos|Number=Plur,ADJ,"(0.002197802197802198, 0.006375427888237974)",0.002198,0.006375
7,Case=Gen|Degree=Pos|Number=Plur,ADJ,"(0.013186813186813187, 0.02886070291519661)",0.013187,0.028861
8,Case=Dat|Degree=Pos|Number=Plur,ADJ,"(0.002197802197802198, 0.002947437066997864)",0.002198,0.002947
9,Case=Loc|Degree=Pos|Number=Plur,ADJ,"(0.004395604395604396, 0.004586326871808775)",0.004396,0.004586


In [294]:
!del 'tagset.csv'

Impossibile trovare C:\Users\Andrea\CATandkittens_2019-2021\morphology\'tagset.csv'


In [118]:
# def tagset(words):
#     with open('tagset.csv', 'w', newline='', encoding='utf-8') as f:
#         writer = csv.writer(f)
#         writer.writerow(['token', 'tagset', 'POS'])

#         for word in words:
#             if word[1]:
#                 tag_lst = []
#                 for tag in list(word[1].items()):
#                     tag = '{}={}|'.format(tag[0], tag[1])
#                     tag_lst.append(tag)

#                 tag_str = ''.join([str(elem) for elem in tag_lst])
#                 tag_str = tag_str[:-1]

#                 writer.writerow([word[0], tag_str, word[2]])
#             # print(tag_str)
#             else:
#                 tag_str = 'None'
#                 writer.writerow([word[0], tag_str, word[2]]) 