In [1]:
import csv
import re
import math
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from scipy.stats import t
import os
import warnings
warnings.filterwarnings('ignore')

In [12]:
def divide_by_genre(table_name):
    with open(table_name) as table:
        lines = list(csv.reader(table))
    print(lines)
    data = lines[1:]
    data_1 = []
    data_2 = []
    for line in data:
        if line[0].endswith('2.txt') or line[0].endswith('2(full).txt'):
            data_2.append(line)
        else:
            data_1.append(line)
    return data_1, data_2, lines[0]

In [3]:
def dispersion(f_data, means):
    disps = []
    for line in enumerate(f_data):
        alts = [(abs(means[line[0]] - x)) ** 2 for x in line[1]]
        disp = sum(alts) / (len(alts) - 1)
        disps.append(disp)
    return disps

In [4]:
def t_test(data, columns):
    f_data_1 = []
    f_data_5 = []
    for _ in range(len(data[0]) - 2):
        f_data_1.append([])
        f_data_5.append([])
    for i in range(len(data)):
        if int(data[i][1]) == 1:
            for j in range(2, len(data[i])):
                f_data_1[j - 2].append(float(data[i][j]))
        else:
            for j in range(2, len(data[i])):
                f_data_5[j - 2].append(float(data[i][j]))

#     print(f_data_1)            
    
    means_1 = [sum(x) / len(x) for x in f_data_1]
    means_5 = [sum(x) / len(x) for x in f_data_5]

#     print(means_1)
#     print(means_5)
    
    disps_1 = dispersion(f_data_1, means_1)
    disps_5 = dispersion(f_data_5, means_5)
    
#     print(disps_1)
    
    t_tests = []
    for line in enumerate(means_1):
        i = line[0]
        t_test = abs(line[1] - means_5[i]) / math.sqrt(
            disps_1[i] ** 2 / len(f_data_1[0]) + disps_5[i] ** 2 / len(f_data_5[0])
        )
#         df = (disps_1[i] ** 2 / len(f_data_1[0]) + disps_5[i] ** 2 / len(f_data_5[0])) ** 2
#         df /= (disps_1[i] ** 2 / len(f_data_1[0])) ** 2 / (len(f_data_1[0]) - 1) + (disps_5[i] ** 2 / len(f_data_5[0])) ** 2 / (len(f_data_5[0]) - 1)
#         print(df)
#         print(len(f_data_1[0]) + len(f_data_5[0]))
        t_tests.append((1 - t.pdf(t_test, df=len(f_data_1[0]) + len(f_data_5[0]))) * 100)
    
    tt = []
    tt.append(('feature', 't_test'))
    for line in enumerate(t_tests):
        i = line[0]
        t_test = line[1]
        tt.append((columns[i + 2], t_test))
    return tt

In [5]:
def create_table_for_lr(table_name, data):
    data = [','.join(x[1:]) for x in data]
    with open(table_name, 'w') as f:
        f.write('\n'.join(data))

In [6]:
def log_reg(table_name, columns):
    spam_data = pd.read_csv(table_name, header=None)
    X, y = spam_data.values[:, 1:], spam_data.values[:, 0]

    normalizer = MinMaxScaler()
    X_real_norm_np = normalizer.fit_transform(X)


    # splitting data
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=5, random_state=123, shuffle=True)
    lr = LogisticRegression()
    quals = []

    dats = kf.split(X)


    all_coefs = []
    errs = []
    for data in dats:
        tr_inds = data[0]
        test_inds = data[1]
        X_tr = np.array([X_real_norm_np[i] for i in tr_inds])
        X_test = np.array([X_real_norm_np[i] for i in test_inds])
        y_tr = np.array([y[i] for i in tr_inds])
        y_test = np.array([y[i] for i in test_inds])
        lr = LogisticRegression(max_iter=1000)
        lr = lr.fit(X_tr, y_tr)

        preds = lr.predict(X_test)
        error = 1 - metrics.accuracy_score(y_test, preds)

        coefs = enumerate(lr.coef_[0])
        all_coefs.append(coefs)
        errs.append(error)
#         print('Ошибка =', error)

    print('Средняя ошибка =', sum(errs) / len(errs))

    from collections import defaultdict as dd
    dick = dd(list)

    for coefs in all_coefs:
        for coef in coefs:
            dick[coef[0]].append(coef[1])

    weights = []
    weights.append(('feature', 'coef'))
    for key, value in dick.items():
        ave = sum([math.fabs(i) for i in value]) / len(value)
        weights.append((columns[key + 2], ave))

    # weights = sorted(weights, key=lambda x: x[1], reverse=True)
    # s = sum([y[1] for y in weights])
    # weights = [(x[0], round(x[1] / s * 100, 2)) for x in weights]
    return weights

In [7]:
def normalizer(t_test, log_reg, columns):
    data = []
    for line in enumerate(t_test[1:]):
        i = line[0]
        t_test = line[1]
        data.append((t_test[1], log_reg[i + 1][1]))
    df = pd.DataFrame(data)
    normalizer = MinMaxScaler()
    norm = normalizer.fit_transform(df)
    wn = []
    for line in enumerate(norm):
        wn.append([columns[line[0] + 2], line[1][0], line[1][1]])
    norm = [f'{x[0]},{str(x[1])},{str(x[2])},{str(x[1] + x[2])}' for x in wn]
    return norm

In [240]:
def main():
    tables = [
        'dataset_prepositions.csv',
        'dataset_rhetorical.csv',
        'dataset_derivation.csv',
        'dataset_writing_conventions.csv',
        'dataset_verb_morphology.csv',
        'dataset_syntax.csv',
        'dataset_lexical_choice_verb_pattern.csv'
    ]
    for table in tables:
        data_1, data_2, columns = divide_by_genre(table)
        t_t_1 = t_test(data_1, columns)
        t_t_2 = t_test(data_2, columns)
        create_table_for_lr(f'{table[:-4]}_lr_1.csv', data_1)
        create_table_for_lr(f'{table[:-4]}_lr_2.csv', data_2)
        lr_1 = log_reg(f'{table[:-4]}_lr_1.csv', columns)
        lr_2 = log_reg(f'{table[:-4]}_lr_2.csv', columns)
        norm_1 = normalizer(t_t_1, lr_1, columns)
        norm_2 = normalizer(t_t_2, lr_2, columns)
        with open(f'{table[:-4]}_1.csv', 'w') as f:
            f.write('\n'.join(norm_1))
        with open(f'{table[:-4]}_2.csv', 'w') as f:
            f.write('\n'.join(norm_2))
#         pd.DataFrame(norm_1).to_csv(f'{table[:-4]}_1.csv')
#         pd.DataFrame(norm_2).to_csv(f'{table[:-4]}_2.csv')

In [14]:
def main_1():
    tables = [
        'dataset_prepositions.csv',
        'dataset_rhetorical.csv',
        'dataset_derivation.csv',
        'dataset_writing_conventions.csv',
        'dataset_verb_morphology.csv',
        'dataset_syntax.csv',
        'dataset_lexical_choice_verb_pattern.csv'
    ]
    for table in tables:
        print(table)
        data, data_2, columns = divide_by_genre(table)
        data.extend(data_2)
        t_t = t_test(data, columns)
        create_table_for_lr(f'{table[:-4]}_lr.csv', data)
        lr = log_reg(f'{table[:-4]}_lr.csv', columns)
        norm = normalizer(t_t, lr, columns)
        with open(f'{table[:-4]}.csv', 'w') as f:
            f.write('\n'.join(norm))

In [214]:
data_1, data_2, columns = divide_by_genre('dataset_syntax.csv')
t_t = t_test(data_1, columns)
lr = log_reg('synt_1.csv', columns)
pd.DataFrame(normalizer(t_t, lr, columns)).to_csv('Hey.csv')

Средняя ошибка = 0.27097112075415236




In [16]:
main_1()

dataset_prepositions.csv
[['name', 'errors', 'num_prep'], ['exam2014/AAl_13_1.txt', '1', '11'], ['exam2014/AAl_14_2.txt', '1', '21'], ['exam2014/AAl_15_1.txt', '1', '18'], ['exam2014/AAl_15_2.txt', '1', '16'], ['exam2014/AAl_16_1.txt', '1', '8'], ['exam2014/AAl_16_2.txt', '1', '9'], ['exam2014/AAl_17_1.txt', '1', '13'], ['exam2014/AAl_19_1.txt', '1', '11'], ['exam2014/AAl_19_2.txt', '1', '16'], ['exam2014/AAl_23_2.txt', '1', '11'], ['exam2014/AAl_24_1.txt', '1', '9'], ['exam2014/AAl_28_2.txt', '1', '13'], ['exam2014/AAl_29_1.txt', '1', '14'], ['exam2014/AAl_30_1.txt', '1', '14'], ['exam2014/AAl_31_2.txt', '1', '14'], ['exam2014/AAl_33_1.txt', '1', '12'], ['exam2014/AAl_34_2.txt', '1', '12'], ['exam2014/AAl_35_1.txt', '1', '12'], ['exam2014/AAl_35_2.txt', '1', '16'], ['exam2014/AAl_5_1.txt', '1', '9'], ['exam2014/AAl_5_2.txt', '1', '15'], ['exam2014/AAl_7_1.txt', '1', '6'], ['exam2014/AAl_8_2.txt', '1', '12'], ['exam2014/ADe_12_2.txt', '1', '16'], ['exam2014/ADe_16_2.txt', '1', '14'], [

Средняя ошибка = 0.09235602094240838
dataset_derivation.csv
[['name', 'errors', 'der_level3', 'der_level4', 'der_level5', 'der_level6', 'mci', 'num_inf', 'num_gerunds'], ['exam2014/AAl_11_2.txt', '1', '0.030303030303030304', '0.04040404040404041', '0.0707070707070707', '0.12121212121212122', '3.75', '10', '5'], ['exam2014/AAl_12_2.txt', '1', '0.06896551724137931', '0.1724137931034483', '0.12643678160919541', '0.06896551724137931', '4.0', '23', '2'], ['exam2014/AAl_14_1.txt', '1', '0.023809523809523808', '0.14285714285714285', '0.07142857142857142', '0.07142857142857142', '3.5', '2', '0'], ['exam2014/AAl_17_2.txt', '1', '0.015625', '0.15625', '0.046875', '0.078125', '3.25', '25', '0'], ['exam2014/AAl_2_2.txt', '1', '0.033707865168539325', '0.12359550561797752', '0.033707865168539325', '0.056179775280898875', '3.0', '24', '1'], ['exam2014/AAl_35_2.txt', '1', '0.03225806451612903', '0.1774193548387097', '0.03225806451612903', '0.03225806451612903', '3.5', '13', '0'], ['exam2014/AAl_9_2.tx

Средняя ошибка = 0.8817635492468334
dataset_verb_morphology.csv
[['name', 'errors', 'num_inf', 'num_gerunds', 'num_pres_sing', 'num_pres_plur', 'num_past_part', 'num_past_simple', 'num_noun_inf'], ['exam2014/AAl_11_1.txt', '1', '3', '0', '6', '0', '8', '2', '0'], ['exam2014/AAl_11_2.txt', '1', '10', '5', '12', '0', '4', '9', '0'], ['exam2014/AAl_13_1.txt', '1', '7', '0', '10', '0', '8', '6', '1'], ['exam2014/AAl_15_1.txt', '1', '1', '3', '4', '0', '7', '5', '0'], ['exam2014/AAl_16_1.txt', '1', '8', '0', '15', '0', '2', '3', '0'], ['exam2014/AAl_17_2.txt', '1', '25', '0', '15', '0', '2', '7', '7'], ['exam2014/AAl_20_2.txt', '1', '16', '2', '8', '0', '3', '9', '0'], ['exam2014/AAl_22_1.txt', '1', '2', '1', '9', '0', '10', '1', '0'], ['exam2014/AAl_25_1.txt', '1', '9', '0', '11', '0', '4', '1', '0'], ['exam2014/AAl_26_2.txt', '1', '9', '7', '20', '0', '3', '2', '3'], ['exam2014/AAl_29_1.txt', '1', '1', '1', '4', '0', '4', '3', '0'], ['exam2014/AAl_2_1.txt', '1', '8', '1', '5', '0', '5', '

Средняя ошибка = 0.3725901169730246
dataset_syntax.csv
[['name', 'errors', 'av_depth', 'max_depth', 'min_depth', 'num_acl', 'num_advcl', 'num_sent', 'num_tok', 'av_tok_before_root', 'av_len_sent', 'num_cl', 'num_tu', 'num_compl_tu', 'num_coord', 'num_adj_noun', 'num_part_noun', 'num_noun_inf', 'pos_sim_nei', 'lemma_sim_nei', 'pos_sim_all', 'lemma_sim_all'], ['exam2014/AAl_10_2.txt', '1', '4.454545454545454', '7', '0', '6', '8', '11', '331', '9.909090909090908', '30.09090909090909', '31', '21', '10', '12', '20', '2', '4', '25.4', '33.4', '24.67186507936508', '32.74916666666667'], ['exam2014/AAl_11_1.txt', '1', '4.7272727272727275', '9', '2', '5', '0', '11', '210', '5.090909090909091', '19.09090909090909', '15', '12', '3', '7', '7', '5', '0', '16.4', '20.9', '16.054523809523808', '20.554960317460317'], ['exam2014/AAl_11_2.txt', '1', '4.190476190476191', '8', '1', '7', '7', '21', '359', '5.428571428571429', '17.095238095238095', '38', '26', '12', '14', '28', '1', '0', '14.75', '19.15', '1

Средняя ошибка = 0.28951203881717474
dataset_lexical_choice_verb_pattern.csv
[['name', 'errors', 'density', 'ls', 'vs', 'corrected_vs', 'squared_vs', 'lfp_1000', 'lfp_2000', 'lfp_uwl', 'lfp_rest', 'ndw', 'ttr', 'corrected_ttr', 'root_ttr', 'log_ttr', 'uber_ttr', 'lv', 'vvi', 'squared_vv', 'corrected_vv', 'vvii', 'nv', 'adjv', 'advv', 'modv'], ['exam2014/AAl_15_1.txt', '1', '0.3611111111111111', '0.34065934065934067', '0.125', '0.35355339059327373', '0.25', '0.5753968253968254', '0.03968253968253968', '0.047619047619047616', '0.33730158730158744', '92', '0.36507936507936506', '4.09800570932384', '5.795455252808151', '0.8177677126309107', '278.82472341204095', '0.36507936507936506', '0.6875', '7.5625', '1.9445436482630056', '0.12087912087912088', '0.0873015873015873', '0.07692307692307693', '0.06593406593406594', '0.14285714285714285'], ['exam2014/AAl_17_1.txt', '1', '0.36199095022624433', '0.1875', '0.0', '0.0', '0.0', '0.7285067873303167', '0.06787330316742081', '0.03167420814479638', 

Средняя ошибка = 0.6614800759013282


In [222]:
a = 'table.csv'
f'{a[:-4]}_1.csv'

'table_1.csv'