In [1]:
import mysql.connector
import csv
import pandas as pd
import os
import tempfile
import re

In [2]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

In [3]:
con = mysql.connector.connect(user='andr',
                              password='rstq!2Ro',
                              host='127.0.0.1',
                              database='cat_db',
                              auth_plugin='mysql_native_password'
                             )

In [4]:
cur = con.cursor(dictionary=True, buffered=True)

In [5]:
cur.execute("""
SELECT id_text, morph, pos FROM
(SELECT id_text, id_unigram FROM words) AS a JOIN
(SELECT morph, id_unigram, lemma FROM unigrams) AS b ON a.id_unigram = b.id_unigram JOIN
(SELECT id_lemmas, id_pos FROM lemmas) AS c ON lemma = id_lemmas JOIN
(SELECT pos, id_pos FROM pos) AS d ON c.id_pos = d.id_pos""")

In [6]:
rows = cur.fetchall()

In [7]:
with open('regtab.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['id_text', 'morph', 'pos'])
    for dictionary in rows:
        writer.writerow([dictionary['id_text'], dictionary['morph'], dictionary['pos']])

In [8]:
df = pd.read_csv('regtab.csv')

In [9]:
def factor(dictionary):
    factor = 1.0/sum(dictionary.values())
    for k in dictionary:
        dictionary[k] = dictionary[k]*factor
    return dictionary

In [10]:
class Morphodict:
    def __init__(self):
        self.impVERB = 0
        self.perVERB = 0
        self.prsVERB = 0
        self.pstVERB = 0
        self.futVERB = 0
        self.finVERB = 0
        self.partVERB = 0
        self.passVOICE = 0
        self.midVOICE = 0
        self.actVOICE = 0
        self.inADJ = 0
        self.anADJ = 0
        self.inNOUN = 0
        self.anNOUN = 0
        self.AUX = 0
        
    def toDict(self):
        res={"impVERB":self.impVERB, "perVERB":self.perVERB, "prsVERB":self.prsVERB, "pstVERB":self.pstVERB,\
             "futVERB":self.futVERB, "finVERB":self.finVERB, "partVERB":self.partVERB, "passVOICE":self.passVOICE, \
             "midVOICE":self.midVOICE, "actVOICE":self.actVOICE, "inADJ":self.inADJ, "anADJ":self.anADJ, \
             "inNOUN":self.inNOUN, "anNOUN":self.anNOUN, "AUX":self.AUX}
        return res

In [11]:
def tagcounter(id_text, df):
    morcounter = Morphodict()
    current = df.loc[df['id_text'] == id_text]
    for i, row in current.iterrows():
        morph_split = row['morph'].split('|')
#         print(morph_split)
        for chunk in morph_split:
            if row['pos'] == 'VERB':
                if 'Aspect=Imp' in chunk:
                    morcounter.impVERB = morcounter.impVERB + 1
                elif 'Aspect=Perf' in chunk:
                    morcounter.perVERB = morcounter.perVERB + 1
                elif 'Tense=Pres' in chunk:
                    morcounter.prsVERB = morcounter.prsVERB + 1
                elif 'Tense=Past' in chunk:
                    morcounter.pstVERB = morcounter.pstVERB + 1
                elif 'Tense=Futr' in chunk:
                    morcounter.futVERB = morcounter.futVERB + 1
                elif 'VerbForm=Fin' in chunk:
                    morcounter.finVERB = morcounter.finVERB + 1
                elif 'VerbForm=Part' in chunk:
                    morcounter.partVERB = morcounter.partVERB + 1
                elif 'Voice=Pass' in chunk:
                    morcounter.passVOICE = morcounter.passVOICE + 1
                elif 'Voice=Mid' in chunk:
                    morcounter.midVOICE = morcounter.midVOICE + 1
                elif 'Voice=Act' in chunk:
                    morcounter.actVOICE = morcounter.actVOICE + 1
                else: 
                    continue
                        
            elif row['pos'] == 'ADJ':
                if 'Animacy=Inan' in chunk:
                    morcounter.inADJ = morcounter.inADJ + 1
                elif 'Animacy=Anim' in chunk:
                    morcounter.anADJ = morcounter.anADJ + 1
                else:
                    continue
                    
            elif row['pos'] == 'NOUN':
                if 'Animacy=Inan' in chunk:
                    morcounter.inNOUN = morcounter.inNOUN + 1
                elif 'Animacy=Anim' in chunk:
                    morcounter.anNOUN = morcounter.anNOUN + 1
                else:
                    continue

            elif row['pos'] == 'AUX':
                morcounter.AUX = morcounter.AUX + 1
                
            else: 
                continue
    morcounter = morcounter.toDict()
    return factor(morcounter)

In [12]:
tagcounter(691, df)

{'impVERB': 0.07820383451059536,
 'perVERB': 0.042885973763874874,
 'prsVERB': 0.06685166498486378,
 'pstVERB': 0.03229061553985873,
 'futVERB': 0.0,
 'finVERB': 0.05676084762865792,
 'partVERB': 0.04112008072653885,
 'passVOICE': 0.034308779011099896,
 'midVOICE': 0.014631685166498487,
 'actVOICE': 0.07214934409687185,
 'inADJ': 0.018415741675075682,
 'anADJ': 0.0,
 'inNOUN': 0.5035317860746721,
 'anNOUN': 0.01160443995963673,
 'AUX': 0.027245206861755803}

In [13]:
def morphlibrary(text_ids, df):
    textholder = []
    for text_id in text_ids:
        textholder.append(tagcounter(text_id, df))
    return textholder

### Student texts for negatives

In [14]:
from conllu import parse, parse_tree

In [15]:
def parser(filename):
    """
    Yields a sentence from conllu tree with its tags

    """
    """
    >>> for i in parser('/content/gdrive/My Drive/Новые conll по доменам/NewVers/CleanedPsyEdu.conllu'):
      print(i)   
    TokenList<Музыка, звучит, отовсюду, независимо, от, нашего, желания, или, нежелания, слушать, ее, .>
    """
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.read()
    tree = parse(data)
    for token in tree:
        yield token

In [16]:
def get_words(tree):
    """
    tree - generator of sentences (TokenLists) from conllu tree

    words, list is a list of all tokens we need from the tree
    size, int is a number of all words in the domain
    """
    words = []
    for sentence in tree:
        for token in sentence:
#             print(token)
            if token['form'] != '_' and token['upostag'] != '_' and token['upostag']!='NONLEX' and token['form'] not in r'[]\/':
                for wordform in token['form'].lower().split():
                    words.append((wordform, token['lemma'], token['feats'], token['upostag']))
    size = len(words)
    return words, size

In [17]:
filepath = r'C:\Users\Andrea\Desktop\stud_textVSscie_text\RULEC_PARSED\Max_FINISHED_PARSED\Max_HL_AM_2011-2012_Week_10_1_paragraph+_expository_non-timed.conllu'

In [18]:
# tree = parser(filepath)

In [19]:
# words, size = get_words(tree)
# del tree 

In [20]:
# tagcounter4studtexts2(words)

In [21]:
def detagger(words):
    tag_dict = {}
    for word in words:
        if word[2]:
            for tag in list(word[2].items()):
                full_tag = '{}={}'.format(tag[0], tag[1])
                if word[3] == 'VERB':
                    if 'VERB' in tag_dict:
                        tag_dict['VERB'].append(full_tag)
                    else:
                        tag_dict['VERB'] = [full_tag]
                elif word[3] == 'ADJ':
                    if 'ADJ' in tag_dict:
                        tag_dict['ADJ'].append(full_tag)
                    else:
                        tag_dict['ADJ'] = [full_tag]
                elif word[3] == 'NOUN':
                    if 'NOUN' in tag_dict:
                        tag_dict['NOUN'].append(full_tag)
                    else:
                        tag_dict['NOUN'] = [full_tag]
                elif word[3] == 'AUX':
                    tag_dict['AUX'] = tag_dict['AUX'] + 1 if 'AUX' in tag_dict else 1
                else:
                    continue
    return tag_dict

In [22]:
def tagcounter4studtexts(words):
    morcounter = Morphodict()
    tagdict = detagger(words)
    if 'VERB' in tagdict:
        for value in tagdict['VERB']:
            if 'Aspect=Imp' in value:
                morcounter.impVERB = morcounter.impVERB + 1
            elif 'Aspect=Perf' in value:
                morcounter.perVERB = morcounter.perVERB + 1
            elif 'Tense=Pres' in value:
                morcounter.prsVERB = morcounter.prsVERB + 1
            elif 'Tense=Past' in value:
                morcounter.pstVERB = morcounter.pstVERB + 1
            elif 'Tense=Futr' in value:
                morcounter.futVERB = morcounter.futVERB + 1
            elif 'VerbForm=Fin' in value:
                morcounter.finVERB = morcounter.finVERB + 1
            elif 'VerbForm=Part' in value:
                morcounter.partVERB = morcounter.partVERB + 1
            elif 'Voice=Pass' in value:
                morcounter.passVOICE = morcounter.passVOICE + 1
            elif 'Voice=Mid' in value:
                morcounter.midVOICE = morcounter.midVOICE + 1
            elif 'Voice=Act' in value:
                morcounter.actVOICE = morcounter.actVOICE + 1
            else: 
                continue
                        
    if 'ADJ' in tagdict:
        for value in tagdict['ADJ']:
            if 'Animacy=Inan' in value:
                morcounter.inADJ = morcounter.inADJ + 1
            elif 'Animacy=Anim' in value:
                morcounter.anADJ = morcounter.anADJ + 1
            else:
                continue
            
    if 'NOUN' in tagdict:
        for value in tagdict['NOUN']:
            if 'Animacy=Inan' in value:
                morcounter.inNOUN = morcounter.inNOUN + 1
            elif 'Animacy=Anim' in value:
                morcounter.anNOUN = morcounter.anNOUN + 1
            else:
                continue
            
    if 'AUX' in tagdict:
        morcounter.AUX = tagdict['AUX']
    else:
        morcounter.AUX = 0
    morcounter = morcounter.toDict()
    if sum(morcounter.values()) > 150:
        return factor(morcounter)
    else:
        print('The text was too short')

In [23]:
PATH_TO_STUDTEXTS_RULEC = r'C:\Users\Andrea\Desktop\stud_textVSscie_text\RULEC_PARSED'
PATH_TO_STUDTEXT = r'C:\Users\Andrea\Desktop\stud_textVSscie_text\Student_texts_for_experiments\stud_txt\conllu'

In [24]:
PATHS = [PATH_TO_STUDTEXTS_RULEC, PATH_TO_STUDTEXT]

In [25]:
def morphlibrary4students(paths_to_student_text_dir):
    textholder = []
    for path in paths_to_student_text_dir:
        for dir_ in os.listdir(path):
            for filepath in os.listdir(os.path.join(path, dir_)):
                tree = parser(os.path.join(path, dir_, filepath))
                words, size = get_words(tree)
                del tree
                textholder.append(tagcounter4studtexts(words))
    return textholder

In [26]:
len(list(df.id_text.unique()))

600

In [27]:
def buildthetab(df, paths_to_student_text_dir):
    
    # list of dictionaries with morphological parameters for academic texts
    text_ids = list(df.id_text.unique())
    academic_txt_datalibrary = morphlibrary(text_ids, df)
    
    # list of dictionaries for student texts
    student_txt_datalibrary = morphlibrary4students(paths_to_student_text_dir)
    
    with open('datab.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['impVERB', 'perVERB', 'prsVERB', 'pstVERB', 'futVERB', 'finVERB', 'partVERB',
                        'passVOICE', 'midVOICE', 'actVOICE', 'inADJ', 'anADJ', 'inNOUN', 'anNOUN',
                        'AUX', 'academicity'])
        for _dict in academic_txt_datalibrary:
            writer.writerow([_dict['impVERB'], _dict['perVERB'], _dict['prsVERB'], _dict['pstVERB'],
                            _dict['futVERB'], _dict['finVERB'], _dict['partVERB'], _dict['passVOICE'],
                            _dict['midVOICE'], _dict['actVOICE'], _dict['inADJ'], _dict['anADJ'],
                            _dict['inNOUN'], _dict['anNOUN'], _dict['AUX'], 1])
        for _dict in student_txt_datalibrary:
            try:
                writer.writerow([_dict['impVERB'], _dict['perVERB'], _dict['prsVERB'], _dict['pstVERB'],
                            _dict['futVERB'], _dict['finVERB'], _dict['partVERB'], _dict['passVOICE'],
                            _dict['midVOICE'], _dict['actVOICE'], _dict['inADJ'], _dict['anADJ'],
                            _dict['inNOUN'], _dict['anNOUN'], _dict['AUX'], 0])
            except TypeError:
                print(_dict)
                continue

In [29]:
buildthetab(df, PATHS)

The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text was too short
The text wa

In [30]:
morpho_df = pd.read_csv('datab.csv')

In [31]:
morpho_df.shape

(920, 16)

In [32]:
morpho_df.head()

Unnamed: 0,impVERB,perVERB,prsVERB,pstVERB,futVERB,finVERB,partVERB,passVOICE,midVOICE,actVOICE,inADJ,anADJ,inNOUN,anNOUN,AUX,academicity
0,0.078204,0.042886,0.066852,0.032291,0.0,0.056761,0.04112,0.034309,0.014632,0.072149,0.018416,0.0,0.503532,0.011604,0.027245,1
1,0.101195,0.032431,0.092416,0.027067,0.0,0.081687,0.037064,0.029993,0.032431,0.071202,0.009998,0.000244,0.452573,0.019507,0.012192,1
2,0.077165,0.030377,0.055168,0.022346,0.0,0.060405,0.024092,0.023743,0.015712,0.068087,0.00838,0.0,0.594972,0.013617,0.005936,1
3,0.09171,0.047824,0.071643,0.03976,0.0,0.072018,0.040885,0.032633,0.020818,0.086084,0.011065,0.000188,0.428357,0.015004,0.042011,1
4,0.094278,0.037057,0.076294,0.024523,0.0,0.064305,0.036512,0.032153,0.017439,0.081744,0.014714,0.0,0.46049,0.014714,0.045777,1


In [33]:
morpho_df.loc[morpho_df['academicity'] == 0]

Unnamed: 0,impVERB,perVERB,prsVERB,pstVERB,futVERB,finVERB,partVERB,passVOICE,midVOICE,actVOICE,inADJ,anADJ,inNOUN,anNOUN,AUX,academicity
600,0.106061,0.025253,0.101010,0.020202,0.0,0.095960,0.030303,0.035354,0.040404,0.055556,0.000000,0.000000,0.343434,0.095960,0.050505,0
601,0.109091,0.042424,0.084848,0.030303,0.0,0.096970,0.018182,0.018182,0.000000,0.133333,0.018182,0.000000,0.327273,0.078788,0.042424,0
602,0.090323,0.064516,0.077419,0.051613,0.0,0.090323,0.045161,0.045161,0.019355,0.090323,0.025806,0.000000,0.335484,0.019355,0.045161,0
603,0.085714,0.028571,0.039286,0.067857,0.0,0.096429,0.007143,0.010714,0.035714,0.067857,0.007143,0.000000,0.364286,0.092857,0.096429,0
604,0.110526,0.057895,0.047368,0.078947,0.0,0.126316,0.010526,0.005263,0.042105,0.121053,0.015789,0.000000,0.252632,0.068421,0.063158,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0.152778,0.041667,0.106481,0.027778,0.0,0.138889,0.004630,0.004630,0.032407,0.157407,0.004630,0.000000,0.138889,0.092593,0.097222,0
916,0.101399,0.059441,0.090909,0.024476,0.0,0.136364,0.000000,0.000000,0.013986,0.146853,0.006993,0.000000,0.244755,0.101399,0.073427,0
917,0.111111,0.033333,0.094444,0.016667,0.0,0.105556,0.005556,0.005556,0.016667,0.122222,0.000000,0.000000,0.177778,0.155556,0.155556,0
918,0.119658,0.051282,0.082621,0.042735,0.0,0.136752,0.002849,0.002849,0.048433,0.119658,0.002849,0.002849,0.145299,0.122507,0.119658,0
