In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats 
import seaborn as sns
import statsmodels.formula.api as smf
import scikit_posthocs as sp
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [2]:
plt.rc("font", size=14)
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import BoundaryNorm, ListedColormap

In [3]:
columns = ['noun', 'adj', 'propn', 'adv', 'intj', 'cconj', 'sconj', 'adp', 'det', 'num', 'punct', 'symbol', 'pron', 'abbr', 'TTR', 'avg_word_len', 'avr_sent_len', 'hapax_legomena', 'coref', 'see_pron', 'see_det', '1st_pron', '2nd_pron', '3rd_pron', 'active_voice', 'passive_voice', '1st_prs_verb', '2nd_prs_verb', '3rd_prs_verb', 'core_verb', 'verbtype_ratio', 'da_inf', 'gerund', 'supine', 'verb_particle', 'discourse', 'pres_tense', 'past_tense', 'ind_mood', 'cond_mood', 'imp_mood', 'quot_mood', 'neg_polarity', 'nom_case', 'gen_case', 'part_case', 'ill_case', 'ine_case', 'ela_case', 'alla_case', 'ade_case', 'abl_case', 'tra_case', 'ter_case', 'ess_case', 'abe_case', 'com_case', 'nsubj', 'nsubj_cop', 'modal', 'acl:relc', 'csubj', 'csubj_cop', 'obj', 'ccomp', 'xcomp', 'obl', 'nmod', 'appos', 'nummod', 'amod', 'advcl', 'voc', 'cop', 'conj', 'cc', 'yneemid', 'emoticons']

def get_data(f):
    ls = []
    with open(f, 'r') as fid:
        reader = csv.DictReader(fid, delimiter=';')
        data = [line for line in reader]
    return data


### FUNKTSIOONIDE SEKTSIOON

In [4]:
def create_array(data):
    # KW sisendi saamiseks on vaja iga rea (=faili) tunnuste väärtused ehk
    # [[tunnus11, tunnus12,.., tunnus1N], [tunnus21, tunnus22,.., tunnus2N], ..]
    d = [list(line.items()) for line in data]
    arrays = []
    for f in d:
        arrays.append([element[1] for element in f[1:]])
    return np.array(arrays)

In [5]:
def calculate_hb(rank):
    # holm-bonferroni meetodil p väärtuse korrigeerimine
    adjusted_p = 0.05 / (78 - rank + 1)
    return adjusted_p

In [6]:
def get_relevant_features(f, p):
    # tagastab listide listi, kus igas listis on [tunnus, järjekorra nr, KW p, korrigeeritud p, saatus]
    features = [[feature, round(p, 5)] for feature, f, p in zip(columns, f.tolist(), p.tolist())]          
    p_values = [feature[1] for feature in features]
    ranks = stats.rankdata(p_values)
    features = [feature[0] for feature in features]
    m = [s for s in zip(features, p_values, ranks)]
    positive_features = [] 
    for el in m:
        kW_p = el[1]
        adjusted_p = calculate_hb(el[2])
        if adjusted_p < kW_p:
            state = 'YES'
        else:
            state = 'NO'
        positive_features.append([el[0], el[2], kW_p, adjusted_p, state])

    return positive_features  

In [7]:
def calculate_dunn_test2(feature, strong_pd, weak_pd, not_pres_pd):
    data = [strong_pd[feature], weak_pd[feature], notpres_pd[feature]]
    df = pd.DataFrame(data).transpose()

    # gruppide määramine ehk tugev/mõõdukas; nõrk; not_present
    group1 = df.iloc[:, 0].dropna()
    group2 = df.iloc[:, 1].dropna()
    group3 = df.iloc[:, 2].dropna()
    
    dunn = sp.posthoc_dunn([group1, group2, group3], p_adjust='holm')
#         dunn = sp.posthoc_dunn([group1, group2, group3])

    return feature.upper(), dunn


In [8]:
def generate_df(strong_array, weak_array, notpres_array):
    # Kruskal Wallise väljund
    f_statistic, p_value = stats.f_oneway(strong_array, weak_array, notpres_array)
    feature_data = get_relevant_features(f_statistic, p_value)

    return feature_data, pd.DataFrame(feature_data, columns =['feature', 'rank', 'KW p', 'adjusted p', 'state'])

    

In [9]:
def display_posthoc_results(feats, strong, weak, notpres):
#     results = []
    strong_w = []
    strong_not = []
    weak_not = []
    for feat in feats:
        out = calculate_dunn_test2(feat, strong, weak, notpres)
        strong_weak = [out[1].iloc[1, 0], 'tugev/mõõdukas vs nõrk']
        strong_notpres = [out[1].iloc[2, 0], 'tugev/mõõdukas vs puudu']
        weak_notpres = [out[1].iloc[2, 1], 'nõrk vs puudu']
        for score in [strong_weak, strong_notpres, weak_notpres]:
            if score[0] <= 0.05:
                if score[1] == 'tugev/mõõdukas vs nõrk':
                    strong_w.append([feat, round(score[0], 4)])
                if score[1] == 'tugev/mõõdukas vs puudu':
                    strong_not.append([feat, round(score[0], 4)])
                if score[1] == 'nõrk vs puudu':
                    weak_not.append([feat, round(score[0], 4)])

                print(f'{feat.upper()} =>\n {score[1]}, {round(score[0], 4)}')
    return strong_w, strong_not, weak_not

### INSTRUEERIVUS

In [10]:
f1 = 'dimensioonide_grupid/inst/inst_strong.csv'
f2 = 'dimensioonide_grupid/inst/inst_weak.csv'
f3 = 'dimensioonide_grupid/inst/inst_not_present.csv'

In [11]:
# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)

15
18
58


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,21.0,0.04293,0.000862,YES
1,adj,59.0,0.43347,0.0025,YES
2,propn,17.0,0.03378,0.000806,YES
3,adv,31.0,0.12192,0.001042,YES
4,intj,49.5,0.38327,0.001695,YES
5,cconj,29.0,0.11363,0.001,YES
6,sconj,53.0,0.40782,0.001923,YES
7,adp,69.0,0.69323,0.005,YES
8,det,77.0,0.95225,0.025,YES
9,num,68.0,0.68711,0.004545,YES


In [12]:
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

NOUN =>
 nõrk vs puudu, 0.0474
PROPN =>
 tugev/mõõdukas vs puudu, 0.0043
AVG_WORD_LEN =>
 nõrk vs puudu, 0.0398
AVR_SENT_LEN =>
 tugev/mõõdukas vs nõrk, 0.0316
AVR_SENT_LEN =>
 tugev/mõõdukas vs puudu, 0.0007
VERBTYPE_RATIO =>
 nõrk vs puudu, 0.0026
DA_INF =>
 tugev/mõõdukas vs puudu, 0.037
VERB_PARTICLE =>
 tugev/mõõdukas vs nõrk, 0.0487
VERB_PARTICLE =>
 nõrk vs puudu, 0.0487
PAST_TENSE =>
 tugev/mõõdukas vs puudu, 0.0037
COND_MOOD =>
 tugev/mõõdukas vs puudu, 0.0208
GEN_CASE =>
 tugev/mõõdukas vs puudu, 0.045
NSUBJ =>
 tugev/mõõdukas vs puudu, 0.0024
CSUBJ_COP =>
 tugev/mõõdukas vs nõrk, 0.0168
CSUBJ_COP =>
 tugev/mõõdukas vs puudu, 0.0029
OBJ =>
 nõrk vs puudu, 0.0115
APPOS =>
 tugev/mõõdukas vs nõrk, 0.0317
APPOS =>
 tugev/mõõdukas vs puudu, 0.0245


In [13]:

print(f'TUGEV vs NÕRK: {res1}')
print('==========')
print(f'TUGEV VS NOT PRESENT: {res2}')
print('==========')
print(f'NÕRK VS NOT PRESENT: {res3}')

TUGEV vs NÕRK: [['avr_sent_len', 0.0316], ['verb_particle', 0.0487], ['csubj_cop', 0.0168], ['appos', 0.0317]]
TUGEV VS NOT PRESENT: [['propn', 0.0043], ['avr_sent_len', 0.0007], ['da_inf', 0.037], ['past_tense', 0.0037], ['cond_mood', 0.0208], ['gen_case', 0.045], ['nsubj', 0.0024], ['csubj_cop', 0.0029], ['appos', 0.0245]]
NÕRK VS NOT PRESENT: [['noun', 0.0474], ['avg_word_len', 0.0398], ['verbtype_ratio', 0.0026], ['verb_particle', 0.0487], ['obj', 0.0115]]


### ABSTRAKTSUS

In [14]:
f1 = 'dimensioonide_grupid/abs/abs_strong.csv'
f2 = 'dimensioonide_grupid/abs/abs_weak.csv'
f3 = 'dimensioonide_grupid/abs/abs_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

6
35
56


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,15.0,0.03351,0.000781,YES
1,adj,30.0,0.20497,0.00102,YES
2,propn,23.0,0.10331,0.000893,YES
3,adv,11.0,0.02556,0.000735,YES
4,intj,72.5,0.79882,0.007692,YES
5,cconj,56.0,0.59215,0.002174,YES
6,sconj,32.0,0.20877,0.001064,YES
7,adp,45.0,0.42781,0.001471,YES
8,det,29.0,0.1891,0.001,YES
9,num,22.0,0.10116,0.000877,YES


PUNCT =>
 tugev/mõõdukas vs puudu, 0.0381
PRON =>
 nõrk vs puudu, 0.0109
HAPAX_LEGOMENA =>
 nõrk vs puudu, 0.039
COREF =>
 nõrk vs puudu, 0.0094
2ND_PRON =>
 nõrk vs puudu, 0.0494
ADE_CASE =>
 nõrk vs puudu, 0.0115
MODAL =>
 nõrk vs puudu, 0.0419
CSUBJ_COP =>
 tugev/mõõdukas vs nõrk, 0.0346
CSUBJ_COP =>
 tugev/mõõdukas vs puudu, 0.0161
NUMMOD =>
 tugev/mõõdukas vs puudu, 0.0282
NUMMOD =>
 nõrk vs puudu, 0.0487
AMOD =>
 tugev/mõõdukas vs nõrk, 0.02
AMOD =>
 tugev/mõõdukas vs puudu, 0.0192
COP =>
 tugev/mõõdukas vs puudu, 0.0338


In [15]:
print(f'TUGEV vs NÕRK: {res1}')
print('==========')
print(f'TUGEV VS NOT PRESENT: {res2}')
print('==========')
print(f'NÕRK VS NOT PRESENT: {res3}')

TUGEV vs NÕRK: [['csubj_cop', 0.0346], ['amod', 0.02]]
TUGEV VS NOT PRESENT: [['punct', 0.0381], ['csubj_cop', 0.0161], ['nummod', 0.0282], ['amod', 0.0192], ['cop', 0.0338]]
NÕRK VS NOT PRESENT: [['pron', 0.0109], ['hapax_legomena', 0.039], ['coref', 0.0094], ['2nd_pron', 0.0494], ['ade_case', 0.0115], ['modal', 0.0419], ['nummod', 0.0487]]


### AFEKTIIVSUS

In [16]:
f1 = 'dimensioonide_grupid/afek/afek_strong.csv'
f2 = 'dimensioonide_grupid/afek/afek_weak.csv'
f3 = 'dimensioonide_grupid/afek/afek_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

28
18
45


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,5.5,0.0,0.00068,NO
1,adj,28.0,0.00208,0.00098,YES
2,propn,15.0,5e-05,0.000781,NO
3,adv,5.5,0.0,0.00068,NO
4,intj,24.5,0.00073,0.000917,NO
5,cconj,55.0,0.24936,0.002083,YES
6,sconj,5.5,0.0,0.00068,NO
7,adp,57.0,0.28412,0.002273,YES
8,det,34.0,0.00917,0.001111,YES
9,num,33.0,0.00788,0.001087,YES


ADJ =>
 tugev/mõõdukas vs puudu, 0.0004
DET =>
 tugev/mõõdukas vs puudu, 0.0074
DET =>
 nõrk vs puudu, 0.0124
PUNCT =>
 tugev/mõõdukas vs puudu, 0.0302
ABBR =>
 tugev/mõõdukas vs puudu, 0.0443
ABBR =>
 nõrk vs puudu, 0.0168
TTR =>
 tugev/mõõdukas vs puudu, 0.0206
AVR_SENT_LEN =>
 tugev/mõõdukas vs puudu, 0.0024
HAPAX_LEGOMENA =>
 tugev/mõõdukas vs puudu, 0.0236
SEE_DET =>
 nõrk vs puudu, 0.0446
1ST_PRS_VERB =>
 tugev/mõõdukas vs puudu, 0.0
1ST_PRS_VERB =>
 nõrk vs puudu, 0.0001
2ND_PRS_VERB =>
 tugev/mõõdukas vs nõrk, 0.0006
2ND_PRS_VERB =>
 tugev/mõõdukas vs puudu, 0.0
DA_INF =>
 tugev/mõõdukas vs puudu, 0.0019
VERB_PARTICLE =>
 tugev/mõõdukas vs nõrk, 0.0451
VERB_PARTICLE =>
 tugev/mõõdukas vs puudu, 0.0072
COND_MOOD =>
 tugev/mõõdukas vs puudu, 0.0029
IMP_MOOD =>
 tugev/mõõdukas vs nõrk, 0.0028
IMP_MOOD =>
 tugev/mõõdukas vs puudu, 0.0
ILL_CASE =>
 tugev/mõõdukas vs puudu, 0.0474
NSUBJ_COP =>
 tugev/mõõdukas vs puudu, 0.002
CSUBJ =>
 nõrk vs puudu, 0.0118
NUMMOD =>
 tugev/mõõdukas v

In [17]:
print(f'TUGEV vs NÕRK: {res1}')
print('==========')
print(f'TUGEV VS NOT PRESENT: {res2}')
print('==========')
print(f'NÕRK VS NOT PRESENT: {res3}')

TUGEV vs NÕRK: [['2nd_prs_verb', 0.0006], ['verb_particle', 0.0451], ['imp_mood', 0.0028]]
TUGEV VS NOT PRESENT: [['adj', 0.0004], ['det', 0.0074], ['punct', 0.0302], ['abbr', 0.0443], ['TTR', 0.0206], ['avr_sent_len', 0.0024], ['hapax_legomena', 0.0236], ['1st_prs_verb', 0.0], ['2nd_prs_verb', 0.0], ['da_inf', 0.0019], ['verb_particle', 0.0072], ['cond_mood', 0.0029], ['imp_mood', 0.0], ['ill_case', 0.0474], ['nsubj_cop', 0.002], ['nummod', 0.0399], ['amod', 0.0051], ['advcl', 0.0254], ['cop', 0.0008]]
NÕRK VS NOT PRESENT: [['det', 0.0124], ['abbr', 0.0168], ['see_det', 0.0446], ['1st_prs_verb', 0.0001], ['csubj', 0.0118], ['conj', 0.0144]]


### AEG

In [18]:
f1 = 'dimensioonide_grupid/aeg/aeg_strong.csv'
f2 = 'dimensioonide_grupid/aeg/aeg_weak.csv'
f3 = 'dimensioonide_grupid/aeg/aeg_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

16
39
19


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,77.0,0.93681,0.025,YES
1,adj,52.0,0.51276,0.001852,YES
2,propn,9.0,0.0293,0.000714,YES
3,adv,31.0,0.23926,0.001042,YES
4,intj,37.5,0.31072,0.001205,YES
5,cconj,24.5,0.14757,0.000917,YES
6,sconj,16.0,0.07406,0.000794,YES
7,adp,49.0,0.48615,0.001667,YES
8,det,39.0,0.31511,0.00125,YES
9,num,4.0,0.00022,0.000667,NO


PROPN =>
 tugev/mõõdukas vs puudu, 0.0107
TTR =>
 tugev/mõõdukas vs nõrk, 0.0213
VERBTYPE_RATIO =>
 tugev/mõõdukas vs nõrk, 0.0051
VERBTYPE_RATIO =>
 tugev/mõõdukas vs puudu, 0.0018
DA_INF =>
 tugev/mõõdukas vs nõrk, 0.0115
DA_INF =>
 tugev/mõõdukas vs puudu, 0.0003
PART_CASE =>
 tugev/mõõdukas vs puudu, 0.0243
ABL_CASE =>
 tugev/mõõdukas vs nõrk, 0.0399
ADVCL =>
 tugev/mõõdukas vs nõrk, 0.035
ADVCL =>
 tugev/mõõdukas vs puudu, 0.0042


In [19]:
print(f'TUGEV vs NÕRK: {res1}')
print('==========')
print(f'TUGEV VS NOT PRESENT: {res2}')
print('==========')
print(f'NÕRK VS NOT PRESENT: {res3}')

TUGEV vs NÕRK: [['TTR', 0.0213], ['verbtype_ratio', 0.0051], ['da_inf', 0.0115], ['abl_case', 0.0399], ['advcl', 0.035]]
TUGEV VS NOT PRESENT: [['propn', 0.0107], ['verbtype_ratio', 0.0018], ['da_inf', 0.0003], ['part_case', 0.0243], ['advcl', 0.0042]]
NÕRK VS NOT PRESENT: []


### ARGUMENTATIIVSUS

In [20]:
f1 = 'dimensioonide_grupid/arg/arg_strong.csv'
f2 = 'dimensioonide_grupid/arg/arg_weak.csv'
f3 = 'dimensioonide_grupid/arg/arg_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

21
27
31




Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,38.0,0.17783,0.00122,YES
1,adj,41.0,0.23607,0.001316,YES
2,propn,3.0,0.00027,0.000658,NO
3,adv,34.0,0.15361,0.001111,YES
4,intj,63.5,0.63369,0.003226,YES
5,cconj,47.0,0.33466,0.001563,YES
6,sconj,22.0,0.06019,0.000877,YES
7,adp,66.0,0.75156,0.003846,YES
8,det,42.0,0.2695,0.001351,YES
9,num,30.0,0.13857,0.00102,YES


SEE_PRON =>
 tugev/mõõdukas vs puudu, 0.0285
SEE_PRON =>
 nõrk vs puudu, 0.0057
VERBTYPE_RATIO =>
 tugev/mõõdukas vs puudu, 0.0262
PAST_TENSE =>
 tugev/mõõdukas vs puudu, 0.0024
PAST_TENSE =>
 nõrk vs puudu, 0.0104
COND_MOOD =>
 tugev/mõõdukas vs puudu, 0.0135
COND_MOOD =>
 nõrk vs puudu, 0.0085
ABE_CASE =>
 tugev/mõõdukas vs nõrk, 0.0092
ABE_CASE =>
 tugev/mõõdukas vs puudu, 0.0489
MODAL =>
 tugev/mõõdukas vs puudu, 0.0103
MODAL =>
 nõrk vs puudu, 0.0103
OBL =>
 tugev/mõõdukas vs puudu, 0.0159
OBL =>
 nõrk vs puudu, 0.0085
APPOS =>
 tugev/mõõdukas vs puudu, 0.0446
NUMMOD =>
 tugev/mõõdukas vs puudu, 0.0408
ADVCL =>
 tugev/mõõdukas vs puudu, 0.0139


In [21]:
print(f'TUGEV vs NÕRK: {res1}')
print('==========')
print(f'TUGEV VS NOT PRESENT: {res2}')
print('==========')
print(f'NÕRK VS NOT PRESENT: {res3}')

TUGEV vs NÕRK: [['abe_case', 0.0092]]
TUGEV VS NOT PRESENT: [['see_pron', 0.0285], ['verbtype_ratio', 0.0262], ['past_tense', 0.0024], ['cond_mood', 0.0135], ['abe_case', 0.0489], ['modal', 0.0103], ['obl', 0.0159], ['appos', 0.0446], ['nummod', 0.0408], ['advcl', 0.0139]]
NÕRK VS NOT PRESENT: [['see_pron', 0.0057], ['past_tense', 0.0104], ['cond_mood', 0.0085], ['modal', 0.0103], ['obl', 0.0085]]


### FORMAALSUS

In [22]:
f1 = 'dimensioonide_grupid/form/form_strong.csv'
f2 = 'dimensioonide_grupid/form/form_weak.csv'
f3 = 'dimensioonide_grupid/form/form_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

14
28
49




Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,3.5,0.0,0.000662,NO
1,adj,9.0,5e-05,0.000714,NO
2,propn,13.0,0.00013,0.000758,NO
3,adv,3.5,0.0,0.000662,NO
4,intj,45.5,0.1229,0.001493,YES
5,cconj,72.0,0.68308,0.007143,YES
6,sconj,27.0,0.00345,0.000962,YES
7,adp,67.0,0.5359,0.004167,YES
8,det,18.0,0.0007,0.00082,NO
9,num,20.0,0.00161,0.000847,YES


SCONJ =>
 tugev/mõõdukas vs puudu, 0.0055
SCONJ =>
 nõrk vs puudu, 0.0363
NUM =>
 nõrk vs puudu, 0.042
SYMBOL =>
 tugev/mõõdukas vs puudu, 0.0126
SEE_PRON =>
 tugev/mõõdukas vs puudu, 0.0084
SEE_DET =>
 tugev/mõõdukas vs nõrk, 0.0215
SEE_DET =>
 tugev/mõõdukas vs puudu, 0.0096
1ST_PRON =>
 tugev/mõõdukas vs puudu, 0.0001
1ST_PRON =>
 nõrk vs puudu, 0.0001
2ND_PRON =>
 nõrk vs puudu, 0.0109
3RD_PRON =>
 nõrk vs puudu, 0.0167
2ND_PRS_VERB =>
 tugev/mõõdukas vs puudu, 0.0133
2ND_PRS_VERB =>
 nõrk vs puudu, 0.0005
VERBTYPE_RATIO =>
 tugev/mõõdukas vs nõrk, 0.0023
VERBTYPE_RATIO =>
 tugev/mõõdukas vs puudu, 0.0006
DA_INF =>
 tugev/mõõdukas vs nõrk, 0.0158
DA_INF =>
 nõrk vs puudu, 0.0085
IND_MOOD =>
 tugev/mõõdukas vs nõrk, 0.015
IND_MOOD =>
 tugev/mõõdukas vs puudu, 0.0005
IMP_MOOD =>
 tugev/mõõdukas vs puudu, 0.0103
IMP_MOOD =>
 nõrk vs puudu, 0.001
NEG_POLARITY =>
 tugev/mõõdukas vs puudu, 0.0137
NEG_POLARITY =>
 nõrk vs puudu, 0.0332
NOM_CASE =>
 tugev/mõõdukas vs nõrk, 0.0015
NOM_CASE 

In [23]:
print(f'TUGEV vs NÕRK: {res1}')
print('==========')
print(f'TUGEV VS NOT PRESENT: {res2}')
print('==========')
print(f'NÕRK VS NOT PRESENT: {res3}')

TUGEV vs NÕRK: [['see_det', 0.0215], ['verbtype_ratio', 0.0023], ['da_inf', 0.0158], ['ind_mood', 0.015], ['nom_case', 0.0015], ['nsubj_cop', 0.0065], ['cop', 0.0121]]
TUGEV VS NOT PRESENT: [['sconj', 0.0055], ['symbol', 0.0126], ['see_pron', 0.0084], ['see_det', 0.0096], ['1st_pron', 0.0001], ['2nd_prs_verb', 0.0133], ['verbtype_ratio', 0.0006], ['ind_mood', 0.0005], ['imp_mood', 0.0103], ['neg_polarity', 0.0137], ['nom_case', 0.0228], ['nsubj_cop', 0.0001], ['amod', 0.0351], ['advcl', 0.0097], ['cop', 0.0002]]
NÕRK VS NOT PRESENT: [['sconj', 0.0363], ['num', 0.042], ['1st_pron', 0.0001], ['2nd_pron', 0.0109], ['3rd_pron', 0.0167], ['2nd_prs_verb', 0.0005], ['da_inf', 0.0085], ['imp_mood', 0.001], ['neg_polarity', 0.0332], ['xcomp', 0.001], ['advcl', 0.0153]]


### IMPERSONAALSUS

In [24]:
f1 = 'dimensioonide_grupid/imp/imp_strong.csv'
f2 = 'dimensioonide_grupid/imp/imp_weak.csv'
f3 = 'dimensioonide_grupid/imp/imp_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)


print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

37
21
28


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,5.0,0.0,0.000676,NO
1,adj,15.0,0.00031,0.000781,NO
2,propn,34.0,0.04077,0.001111,YES
3,adv,5.0,0.0,0.000676,NO
4,intj,26.5,0.01449,0.000952,YES
5,cconj,69.0,0.64792,0.005,YES
6,sconj,18.0,0.00111,0.00082,YES
7,adp,60.0,0.41389,0.002632,YES
8,det,20.0,0.00221,0.000847,YES
9,num,41.0,0.08903,0.001316,YES


INTJ =>
 tugev/mõõdukas vs puudu, 0.0026
INTJ =>
 nõrk vs puudu, 0.0076
SCONJ =>
 tugev/mõõdukas vs puudu, 0.0118
SCONJ =>
 nõrk vs puudu, 0.0392
DET =>
 tugev/mõõdukas vs puudu, 0.0007
DET =>
 nõrk vs puudu, 0.0222
PUNCT =>
 tugev/mõõdukas vs puudu, 0.0132
SEE_DET =>
 tugev/mõõdukas vs puudu, 0.0277
2ND_PRON =>
 tugev/mõõdukas vs puudu, 0.0016
2ND_PRON =>
 nõrk vs puudu, 0.0225
PASSIVE_VOICE =>
 tugev/mõõdukas vs puudu, 0.0036
PASSIVE_VOICE =>
 nõrk vs puudu, 0.0063
2ND_PRS_VERB =>
 tugev/mõõdukas vs puudu, 0.0002
2ND_PRS_VERB =>
 nõrk vs puudu, 0.0223
VERBTYPE_RATIO =>
 tugev/mõõdukas vs nõrk, 0.0006
VERBTYPE_RATIO =>
 tugev/mõõdukas vs puudu, 0.014
VERB_PARTICLE =>
 tugev/mõõdukas vs puudu, 0.0095
DISCOURSE =>
 tugev/mõõdukas vs puudu, 0.0026
DISCOURSE =>
 nõrk vs puudu, 0.0076
IMP_MOOD =>
 tugev/mõõdukas vs puudu, 0.0002
IMP_MOOD =>
 nõrk vs puudu, 0.0347
NEG_POLARITY =>
 tugev/mõõdukas vs puudu, 0.0014
ILL_CASE =>
 tugev/mõõdukas vs puudu, 0.0029
ILL_CASE =>
 nõrk vs puudu, 0.0216

In [25]:
print(f'TUGEV vs NÕRK: {res1}')
print('==========')
print(f'TUGEV VS NOT PRESENT: {res2}')
print('==========')
print(f'NÕRK VS NOT PRESENT: {res3}')

TUGEV vs NÕRK: [['verbtype_ratio', 0.0006]]
TUGEV VS NOT PRESENT: [['intj', 0.0026], ['sconj', 0.0118], ['det', 0.0007], ['punct', 0.0132], ['see_det', 0.0277], ['2nd_pron', 0.0016], ['passive_voice', 0.0036], ['2nd_prs_verb', 0.0002], ['verbtype_ratio', 0.014], ['verb_particle', 0.0095], ['discourse', 0.0026], ['imp_mood', 0.0002], ['neg_polarity', 0.0014], ['ill_case', 0.0029], ['amod', 0.0448]]
NÕRK VS NOT PRESENT: [['intj', 0.0076], ['sconj', 0.0392], ['det', 0.0222], ['2nd_pron', 0.0225], ['passive_voice', 0.0063], ['2nd_prs_verb', 0.0223], ['discourse', 0.0076], ['imp_mood', 0.0347], ['ill_case', 0.0216]]


### INFOTIHEDUS

In [27]:
f1 = 'dimensioonide_grupid/info/info_strong.csv'
f2 = 'dimensioonide_grupid/info/info_weak.csv'
f3 = 'dimensioonide_grupid/info/info_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

45
26
5


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,11.0,0.00065,0.000735,NO
1,adj,29.0,0.04901,0.001,YES
2,propn,28.0,0.04007,0.00098,YES
3,adv,9.0,0.00042,0.000714,NO
4,intj,24.5,0.03012,0.000917,YES
5,cconj,70.0,0.73196,0.005556,YES
6,sconj,13.0,0.00293,0.000758,YES
7,adp,36.0,0.08649,0.001163,YES
8,det,14.0,0.00875,0.000769,YES
9,num,6.0,0.00015,0.000685,NO


ADJ =>
 tugev/mõõdukas vs nõrk, 0.0407
INTJ =>
 tugev/mõõdukas vs nõrk, 0.0108
SCONJ =>
 tugev/mõõdukas vs nõrk, 0.0041
DET =>
 tugev/mõõdukas vs nõrk, 0.0336
DET =>
 tugev/mõõdukas vs puudu, 0.0336
PUNCT =>
 tugev/mõõdukas vs nõrk, 0.0496
SEE_DET =>
 tugev/mõõdukas vs puudu, 0.043
2ND_PRON =>
 tugev/mõõdukas vs nõrk, 0.0016
2ND_PRON =>
 tugev/mõõdukas vs puudu, 0.0044
ACTIVE_VOICE =>
 tugev/mõõdukas vs nõrk, 0.0255
PASSIVE_VOICE =>
 tugev/mõõdukas vs puudu, 0.0357
2ND_PRS_VERB =>
 tugev/mõõdukas vs nõrk, 0.0018
2ND_PRS_VERB =>
 tugev/mõõdukas vs puudu, 0.0002
2ND_PRS_VERB =>
 nõrk vs puudu, 0.0326
SUPINE =>
 tugev/mõõdukas vs nõrk, 0.0182
DISCOURSE =>
 tugev/mõõdukas vs nõrk, 0.0108
IND_MOOD =>
 tugev/mõõdukas vs nõrk, 0.006
IMP_MOOD =>
 tugev/mõõdukas vs nõrk, 0.0166
IMP_MOOD =>
 tugev/mõõdukas vs puudu, 0.0001
IMP_MOOD =>
 nõrk vs puudu, 0.0166
GEN_CASE =>
 tugev/mõõdukas vs nõrk, 0.0219
GEN_CASE =>
 tugev/mõõdukas vs puudu, 0.0415
ILL_CASE =>
 tugev/mõõdukas vs nõrk, 0.0333
NSUBJ =

In [28]:
print(f'TUGEV vs NÕRK: {res1}')
print('==========')
print(f'TUGEV VS NOT PRESENT: {res2}')
print('==========')
print(f'NÕRK VS NOT PRESENT: {res3}')

TUGEV vs NÕRK: [['adj', 0.0407], ['intj', 0.0108], ['sconj', 0.0041], ['det', 0.0336], ['punct', 0.0496], ['2nd_pron', 0.0016], ['active_voice', 0.0255], ['2nd_prs_verb', 0.0018], ['supine', 0.0182], ['discourse', 0.0108], ['ind_mood', 0.006], ['imp_mood', 0.0166], ['gen_case', 0.0219], ['ill_case', 0.0333], ['nsubj', 0.0125], ['nmod', 0.0284]]
TUGEV VS NOT PRESENT: [['det', 0.0336], ['see_det', 0.043], ['2nd_pron', 0.0044], ['passive_voice', 0.0357], ['2nd_prs_verb', 0.0002], ['imp_mood', 0.0001], ['gen_case', 0.0415], ['nmod', 0.0294]]
NÕRK VS NOT PRESENT: [['2nd_prs_verb', 0.0326], ['imp_mood', 0.0166]]


### INTERAKTIIVSUS

In [29]:
f1 = 'dimensioonide_grupid/inter/inter_strong.csv'
f2 = 'dimensioonide_grupid/inter/inter_weak.csv'
f3 = 'dimensioonide_grupid/inter/inter_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

10
15
75


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,3.0,0.0,0.000658,NO
1,adj,24.0,0.00997,0.000909,YES
2,propn,18.0,0.00425,0.00082,YES
3,adv,13.0,0.00161,0.000758,YES
4,intj,15.5,0.00316,0.000787,YES
5,cconj,77.0,0.99285,0.025,YES
6,sconj,20.0,0.00809,0.000847,YES
7,adp,49.0,0.20706,0.001667,YES
8,det,46.0,0.15321,0.001515,YES
9,num,30.0,0.02414,0.00102,YES


ADJ =>
 tugev/mõõdukas vs puudu, 0.0078
PROPN =>
 tugev/mõõdukas vs puudu, 0.0353
PROPN =>
 nõrk vs puudu, 0.0317
ADV =>
 tugev/mõõdukas vs puudu, 0.0151
ADV =>
 nõrk vs puudu, 0.0208
INTJ =>
 tugev/mõõdukas vs nõrk, 0.0266
INTJ =>
 tugev/mõõdukas vs puudu, 0.0
SCONJ =>
 nõrk vs puudu, 0.0266
NUM =>
 tugev/mõõdukas vs nõrk, 0.0496
NUM =>
 nõrk vs puudu, 0.0496
PUNCT =>
 tugev/mõõdukas vs puudu, 0.0023
TTR =>
 tugev/mõõdukas vs puudu, 0.0086
TTR =>
 nõrk vs puudu, 0.0238
AVR_SENT_LEN =>
 tugev/mõõdukas vs puudu, 0.021
3RD_PRON =>
 nõrk vs puudu, 0.0038
PASSIVE_VOICE =>
 tugev/mõõdukas vs puudu, 0.0113
1ST_PRS_VERB =>
 tugev/mõõdukas vs puudu, 0.0061
1ST_PRS_VERB =>
 nõrk vs puudu, 0.0006
2ND_PRS_VERB =>
 tugev/mõõdukas vs nõrk, 0.0059
2ND_PRS_VERB =>
 tugev/mõõdukas vs puudu, 0.0
2ND_PRS_VERB =>
 nõrk vs puudu, 0.0005
CORE_VERB =>
 tugev/mõõdukas vs puudu, 0.0463
CORE_VERB =>
 nõrk vs puudu, 0.0129
VERB_PARTICLE =>
 tugev/mõõdukas vs puudu, 0.0234
DISCOURSE =>
 tugev/mõõdukas vs nõrk, 0

In [30]:
print(f'TUGEV vs NÕRK: {res1}')
print('==========')
print(f'TUGEV VS NOT PRESENT: {res2}')
print('==========')
print(f'NÕRK VS NOT PRESENT: {res3}')

TUGEV vs NÕRK: [['intj', 0.0266], ['num', 0.0496], ['2nd_prs_verb', 0.0059], ['discourse', 0.0266], ['imp_mood', 0.0037], ['acl:relc', 0.0399], ['emoticons', 0.0286]]
TUGEV VS NOT PRESENT: [['adj', 0.0078], ['propn', 0.0353], ['adv', 0.0151], ['intj', 0.0], ['punct', 0.0023], ['TTR', 0.0086], ['avr_sent_len', 0.021], ['passive_voice', 0.0113], ['1st_prs_verb', 0.0061], ['2nd_prs_verb', 0.0], ['core_verb', 0.0463], ['verb_particle', 0.0234], ['discourse', 0.0], ['cond_mood', 0.0282], ['imp_mood', 0.0], ['neg_polarity', 0.0047], ['xcomp', 0.0159], ['obl', 0.0057], ['amod', 0.008], ['emoticons', 0.0089]]
NÕRK VS NOT PRESENT: [['propn', 0.0317], ['adv', 0.0208], ['sconj', 0.0266], ['num', 0.0496], ['TTR', 0.0238], ['3rd_pron', 0.0038], ['1st_prs_verb', 0.0006], ['2nd_prs_verb', 0.0005], ['core_verb', 0.0129], ['cond_mood', 0.0282], ['imp_mood', 0.0014], ['xcomp', 0.038], ['nummod', 0.0307]]


### KEERULISUS

In [31]:
f1 = 'dimensioonide_grupid/keer/keer_strong.csv'
f2 = 'dimensioonide_grupid/keer/keer_weak.csv'
f3 = 'dimensioonide_grupid/keer/keer_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

9
25
53




Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,12.0,0.0057,0.000746,YES
1,adj,20.0,0.02119,0.000847,YES
2,propn,30.0,0.05097,0.00102,YES
3,adv,28.0,0.04249,0.00098,YES
4,intj,39.5,0.14832,0.001266,YES
5,cconj,68.0,0.61344,0.004545,YES
6,sconj,42.0,0.16474,0.001351,YES
7,adp,51.0,0.33563,0.001786,YES
8,det,35.0,0.1139,0.001136,YES
9,num,32.0,0.10573,0.001064,YES


NOUN =>
 tugev/mõõdukas vs puudu, 0.0397
ADJ =>
 tugev/mõõdukas vs puudu, 0.0164
PROPN =>
 nõrk vs puudu, 0.046
ADV =>
 nõrk vs puudu, 0.0343
PUNCT =>
 tugev/mõõdukas vs puudu, 0.0163
PUNCT =>
 nõrk vs puudu, 0.0045
PRON =>
 nõrk vs puudu, 0.0158
COREF =>
 nõrk vs puudu, 0.0085
1ST_PRON =>
 tugev/mõõdukas vs puudu, 0.0058
1ST_PRON =>
 nõrk vs puudu, 0.003
ACTIVE_VOICE =>
 tugev/mõõdukas vs puudu, 0.0055
1ST_PRS_VERB =>
 nõrk vs puudu, 0.0005
2ND_PRS_VERB =>
 nõrk vs puudu, 0.0121
3RD_PRS_VERB =>
 tugev/mõõdukas vs nõrk, 0.0428
GERUND =>
 tugev/mõõdukas vs puudu, 0.0253
SUPINE =>
 tugev/mõõdukas vs puudu, 0.0085
IND_MOOD =>
 tugev/mõõdukas vs nõrk, 0.0414
IND_MOOD =>
 tugev/mõõdukas vs puudu, 0.0073
COND_MOOD =>
 nõrk vs puudu, 0.0195
IMP_MOOD =>
 nõrk vs puudu, 0.0481
GEN_CASE =>
 tugev/mõõdukas vs puudu, 0.0149
GEN_CASE =>
 nõrk vs puudu, 0.0149
NSUBJ =>
 tugev/mõõdukas vs nõrk, 0.0204
NSUBJ =>
 tugev/mõõdukas vs puudu, 0.0397
NSUBJ_COP =>
 nõrk vs puudu, 0.0149
OBL =>
 tugev/mõõdukas

In [32]:
print(f'TUGEV vs NÕRK: {res1}')
print('==========')
print(f'TUGEV VS NOT PRESENT: {res2}')
print('==========')
print(f'NÕRK VS NOT PRESENT: {res3}')

TUGEV vs NÕRK: [['3rd_prs_verb', 0.0428], ['ind_mood', 0.0414], ['nsubj', 0.0204], ['obl', 0.0428], ['yneemid', 0.0153]]
TUGEV VS NOT PRESENT: [['noun', 0.0397], ['adj', 0.0164], ['punct', 0.0163], ['1st_pron', 0.0058], ['active_voice', 0.0055], ['gerund', 0.0253], ['supine', 0.0085], ['ind_mood', 0.0073], ['gen_case', 0.0149], ['nsubj', 0.0397], ['advcl', 0.0286], ['yneemid', 0.0121]]
NÕRK VS NOT PRESENT: [['propn', 0.046], ['adv', 0.0343], ['punct', 0.0045], ['pron', 0.0158], ['coref', 0.0085], ['1st_pron', 0.003], ['1st_prs_verb', 0.0005], ['2nd_prs_verb', 0.0121], ['cond_mood', 0.0195], ['imp_mood', 0.0481], ['gen_case', 0.0149], ['nsubj_cop', 0.0149], ['cop', 0.0059]]


### SPONTAANSUS

In [33]:
f1 = 'dimensioonide_grupid/spont/spont_strong.csv'
f2 = 'dimensioonide_grupid/spont/spont_weak.csv'
f3 = 'dimensioonide_grupid/spont/spont_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)


print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

12
6
79


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,4.5,0.0,0.000671,NO
1,adj,36.0,0.05827,0.001163,YES
2,propn,14.5,0.00091,0.000775,YES
3,adv,4.5,0.0,0.000671,NO
4,intj,4.5,0.0,0.000671,NO
5,cconj,73.0,0.76276,0.008333,YES
6,sconj,41.0,0.10961,0.001316,YES
7,adp,53.0,0.34204,0.001923,YES
8,det,50.0,0.28438,0.001724,YES
9,num,47.0,0.18364,0.001563,YES


PROPN =>
 tugev/mõõdukas vs puudu, 0.0013
TTR =>
 tugev/mõõdukas vs puudu, 0.0163
AVR_SENT_LEN =>
 tugev/mõõdukas vs puudu, 0.0185
ACTIVE_VOICE =>
 tugev/mõõdukas vs puudu, 0.0164
PASSIVE_VOICE =>
 tugev/mõõdukas vs puudu, 0.0017
1ST_PRS_VERB =>
 tugev/mõõdukas vs puudu, 0.0003
2ND_PRS_VERB =>
 tugev/mõõdukas vs puudu, 0.0
2ND_PRS_VERB =>
 nõrk vs puudu, 0.0053
SUPINE =>
 tugev/mõõdukas vs puudu, 0.0095
VERB_PARTICLE =>
 tugev/mõõdukas vs puudu, 0.0024
IMP_MOOD =>
 tugev/mõõdukas vs nõrk, 0.0095
IMP_MOOD =>
 tugev/mõõdukas vs puudu, 0.0
NEG_POLARITY =>
 tugev/mõõdukas vs puudu, 0.0004
INE_CASE =>
 tugev/mõõdukas vs puudu, 0.0129
ADE_CASE =>
 tugev/mõõdukas vs puudu, 0.0035
NSUBJ_COP =>
 tugev/mõõdukas vs puudu, 0.0025
NSUBJ_COP =>
 nõrk vs puudu, 0.0338
CSUBJ_COP =>
 nõrk vs puudu, 0.0391
XCOMP =>
 tugev/mõõdukas vs puudu, 0.0166
AMOD =>
 tugev/mõõdukas vs puudu, 0.0341
VOC =>
 nõrk vs puudu, 0.0192
COP =>
 tugev/mõõdukas vs puudu, 0.0011
EMOTICONS =>
 tugev/mõõdukas vs puudu, 0.0242


In [34]:
print(f'TUGEV vs NÕRK: {res1}')
print('==========')
print(f'TUGEV VS NOT PRESENT: {res2}')
print('==========')
print(f'NÕRK VS NOT PRESENT: {res3}')

TUGEV vs NÕRK: [['imp_mood', 0.0095]]
TUGEV VS NOT PRESENT: [['propn', 0.0013], ['TTR', 0.0163], ['avr_sent_len', 0.0185], ['active_voice', 0.0164], ['passive_voice', 0.0017], ['1st_prs_verb', 0.0003], ['2nd_prs_verb', 0.0], ['supine', 0.0095], ['verb_particle', 0.0024], ['imp_mood', 0.0], ['neg_polarity', 0.0004], ['ine_case', 0.0129], ['ade_case', 0.0035], ['nsubj_cop', 0.0025], ['xcomp', 0.0166], ['amod', 0.0341], ['cop', 0.0011], ['emoticons', 0.0242]]
NÕRK VS NOT PRESENT: [['2nd_prs_verb', 0.0053], ['nsubj_cop', 0.0338], ['csubj_cop', 0.0391], ['voc', 0.0192]]


### SUBJEKTIIVSUS

In [35]:
f1 = 'dimensioonide_grupid/subj/subj_strong.csv'
f2 = 'dimensioonide_grupid/subj/subj_weak.csv'
f3 = 'dimensioonide_grupid/subj/subj_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

30
15
52


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,6.5,0.0,0.00069,NO
1,adj,19.5,0.00075,0.00084,NO
2,propn,6.5,0.0,0.00069,NO
3,adv,6.5,0.0,0.00069,NO
4,intj,37.5,0.01565,0.001205,YES
5,cconj,68.0,0.65019,0.004545,YES
6,sconj,24.0,0.00155,0.000909,YES
7,adp,63.0,0.53655,0.003125,YES
8,det,35.0,0.01161,0.001136,YES
9,num,45.0,0.06728,0.001471,YES


INTJ =>
 tugev/mõõdukas vs puudu, 0.0002
SCONJ =>
 tugev/mõõdukas vs puudu, 0.0013
DET =>
 tugev/mõõdukas vs puudu, 0.0041
PUNCT =>
 tugev/mõõdukas vs puudu, 0.0051
PUNCT =>
 nõrk vs puudu, 0.012
SYMBOL =>
 tugev/mõõdukas vs nõrk, 0.0309
ABBR =>
 tugev/mõõdukas vs puudu, 0.0064
TTR =>
 tugev/mõõdukas vs puudu, 0.003
AVR_SENT_LEN =>
 tugev/mõõdukas vs puudu, 0.0016
AVR_SENT_LEN =>
 nõrk vs puudu, 0.0355
HAPAX_LEGOMENA =>
 tugev/mõõdukas vs puudu, 0.004
SEE_DET =>
 tugev/mõõdukas vs puudu, 0.0351
SEE_DET =>
 nõrk vs puudu, 0.0268
3RD_PRON =>
 tugev/mõõdukas vs puudu, 0.0049
2ND_PRS_VERB =>
 tugev/mõõdukas vs nõrk, 0.0006
2ND_PRS_VERB =>
 tugev/mõõdukas vs puudu, 0.0
DA_INF =>
 tugev/mõõdukas vs puudu, 0.0156
SUPINE =>
 tugev/mõõdukas vs puudu, 0.0009
VERB_PARTICLE =>
 tugev/mõõdukas vs puudu, 0.0107
DISCOURSE =>
 tugev/mõõdukas vs puudu, 0.0002
COND_MOOD =>
 tugev/mõõdukas vs puudu, 0.0015
IMP_MOOD =>
 tugev/mõõdukas vs nõrk, 0.0
IMP_MOOD =>
 tugev/mõõdukas vs puudu, 0.0
QUOT_MOOD =>
 tu

In [36]:
print(f'TUGEV vs NÕRK: {res1}')
print('==========')
print(f'TUGEV VS NOT PRESENT: {res2}')
print('==========')
print(f'NÕRK VS NOT PRESENT: {res3}')

TUGEV vs NÕRK: [['symbol', 0.0309], ['2nd_prs_verb', 0.0006], ['imp_mood', 0.0], ['quot_mood', 0.0326]]
TUGEV VS NOT PRESENT: [['intj', 0.0002], ['sconj', 0.0013], ['det', 0.0041], ['punct', 0.0051], ['abbr', 0.0064], ['TTR', 0.003], ['avr_sent_len', 0.0016], ['hapax_legomena', 0.004], ['see_det', 0.0351], ['3rd_pron', 0.0049], ['2nd_prs_verb', 0.0], ['da_inf', 0.0156], ['supine', 0.0009], ['verb_particle', 0.0107], ['discourse', 0.0002], ['cond_mood', 0.0015], ['imp_mood', 0.0], ['nsubj_cop', 0.0002], ['modal', 0.0022], ['csubj_cop', 0.0015], ['xcomp', 0.0076], ['amod', 0.0013], ['advcl', 0.009], ['voc', 0.0366]]
NÕRK VS NOT PRESENT: [['punct', 0.012], ['avr_sent_len', 0.0355], ['see_det', 0.0268], ['nsubj_cop', 0.0402], ['csubj_cop', 0.029]]


## KORRELATSIOONID

In [None]:
input_f = 'limesurvey_feature_results_w_dims_uus.csv'

data = []

with open(input_f, 'r') as fid:
    csv_reader = csv.DictReader(fid, delimiter=',')
    fieldnames = csv_reader.fieldnames
    for line in csv_reader:
        data.append(line)

In [None]:
initial_df = pd.DataFrame(data).iloc[:, 1:].astype(float)

In [None]:
initial_df

In [None]:
feature_names = initial_df.columns[12:]
feature_names

In [None]:
def save_csv(prefix, df):
    df.to_csv(f'korrelatsiooni_csvd/{prefix}.csv')

In [None]:
def plot_important_features(model, dimname):
    
    print(model.coef_)

    feature_importance = abs(model.coef_[0])
    
    
    feature_importance = 1.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5

    featfig = plt.figure(figsize=(15,20))
    featax = featfig.add_subplot(1, 1, 1)

    featax.barh(pos, feature_importance[sorted_idx], align='center')
    featax.set_yticks(pos)
    featax.set_yticklabels(np.array(X.columns)[sorted_idx], fontsize=10)
    featax.set_xlabel(f'Relative Feature Importance {str.upper(dimname)}')

    # plt.tight_layout()   
    plt.show()

In [None]:
def get_corr_pairs(corr_df):

    columns = [line for line in corr_df]
    matches = []

    for column, rows in corr_df.items():
        for m, score in rows.items():
            if (score > 0.75 or score < -0.75) and column != m:
                matches.append([(column, m), score])
            continue

    without_duplicates = []
    for ws, i in matches:
        ls = (sorted(ws), i)
        if ls not in without_duplicates:
            without_duplicates.append(ls)       

    return without_duplicates

In [None]:
def generate_heatmap(dim, corr_data):

    my_colors = ['black', 'lightgrey', 'white', 'red']
    my_cmap = ListedColormap(my_colors)
    bounds = [-1.0, -0.75, 0.75, 1.0]
    my_norm = BoundaryNorm(bounds, ncolors=len(my_colors))



    mask = np.triu(np.ones_like(corr_data))

    fig, ax = plt.subplots(1, 1, figsize=(20,20))
    hmap = sns.heatmap(corr_data,
                yticklabels=1, 
                ax=ax,
                linewidths=1.0,
                cmap=my_cmap,
                norm=my_norm,
                mask=mask,
               cbar_kws = dict(use_gridspec=False,location="top")
               )

    colorbar = ax.collections[0].colorbar
    hmap.figure.savefig(f'heatmapid/{dim}_heatmap.png', format='png', dpi=150)
    plt.show()


In [None]:
all_df = initial_df.iloc[:, 12:]
all_corr = all_df.corr(method='pearson').round(2)

# save_csv('dimensions', all_corr)

In [None]:
generate_heatmap('all_dims', all_corr)

In [None]:
pairs = get_corr_pairs(all_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')

## ABSTRAKTNE

In [None]:
abs_df = initial_df['abs']
abs_df
# initial_df[initial_df['abs']<=1]

In [None]:
pos, neg = make_new_df('abs')

print(len(pos), len(neg))

In [None]:
pos

In [None]:
abs_corr = pos.corr(method='pearson').round(2)
save_csv('abs', abs_corr)
abs_corr


In [None]:
generate_heatmap('abs', abs_corr)

In [None]:
pairs = get_corr_pairs(abs_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')


### NORMAALJAOTUS

In [None]:
def generate_plot(dim, df1, df2, features):
    fig, axs = plt.subplots(ncols=2, figsize=(20, 200))
    plt.subplots_adjust(hspace=0.5)

    for i, feature in enumerate(features):
        df1_mean = np.mean(df1[str(feature)])
        df1_std = np.std(df1[str(feature)])
        pdf1 = stats.norm.pdf(df1[str(feature)].sort_values(), df1_mean, df1_std)
        
        df2_mean = np.mean(df2[str(feature)])
        df2_std = np.std(df2[str(feature)])
        pdf2 = stats.norm.pdf(df2[str(feature)].sort_values(), df2_mean, df2_std)
    
        ax = plt.subplot(40, 2, i+1)
        plt.plot(df1[str(feature)].sort_values(), pdf1, label='>=2')
        plt.plot(df2[str(feature)].sort_values(), pdf2, label='<2')
        plt.legend(loc=1, prop={'size': 20})
        plt.xlabel(str(feature.upper()))

#         plt.savefig(f'normdist_plots/{dim}.pdf')

In [None]:
generate_plot('abs', pos, neg, feature_names)

### LOGISTILINE REGRESSIOON

In [None]:
feats = [*feature_names]

In [None]:
only_abs = initial_df.loc[:, feats + ['abs']].astype('float64')
only_abs['state'] = [1 if score >= 1 else 0 for score in only_abs['abs']]
# print(only_abs)

final_features = only_abs.columns.values.tolist()[:-2]


X = only_abs[final_features]
y = only_abs.state

model = LogisticRegression().fit(X,y)

plot_important_features(model, 'abstraktsus')


In [None]:

log_reg = smf.logit("state ~ coref + hapax_legomena + noun + nom_case + avg_word_len + past_tense + pres_tense + obl + adv + num + ade_case + nummod + part_case", data=only_abs).fit()
log_reg.summary()

# AFEKTIIVNE

In [None]:
dim = 'afek'

In [None]:
pos, neg = make_new_df(dim)
print(len(pos), len(neg))

In [None]:


dim_df_corr = pos.corr(method='pearson').round(2)
save_csv(dim, dim_df_corr)

generate_heatmap(dim, dim_df_corr)


pairs = get_corr_pairs(dim_df_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')


generate_plot(dim, pos, neg, feature_names)

In [None]:
df = initial_df.loc[:, feats + ['afek']].astype('float64')
df['state'] = [1 if score >= 1.5 else 0 for score in df['afek']]

final_features = df.columns.values.tolist()[:-2]


X = df[final_features]
y = df.state

model = LogisticRegression().fit(X,y)
plot_important_features(model, 'afektiivsus')

In [None]:
log_reg = smf.logit("state ~ avg_word_len + coref + noun + adv + pron + verbtype_ratio + TTR + punct + active_voice + adj + conj + avr_sent_len", data=df).fit()
log_reg.summary()

## AEG

In [None]:
dim = 'aeg'
pos, neg = make_new_df(dim)
print(len(pos), len(neg))

In [None]:
dim_df_corr = pos.corr(method='pearson').round(2)
save_csv(dim, dim_df_corr)

generate_heatmap(dim, dim_df_corr)

pairs = get_corr_pairs(dim_df_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')


generate_plot(dim, pos, neg, feature_names)


In [None]:
df = initial_df.loc[:, feats + ['aeg']].astype('float64')
df['state'] = [1 if score >= 1.5 else 0 for score in df['aeg']]

final_features = df.columns.values.tolist()[:-2]


X = df[final_features]
y = df.state

model = LogisticRegression().fit(X,y)
plot_important_features(model, 'aeg')

In [None]:
log_reg = smf.logit("state ~ coref + past_tense + pres_tense + obl + verbtype_ratio + adv + num + avg_word_len + gen_case + nummod + noun + core_verb + ade_case + propn + da_inf + cop", data=df).fit()
log_reg.summary()

## ARGUMENTATIIVNE

In [None]:
dim = 'arg'

pos, neg = make_new_df(dim)
print(len(pos), len(neg))

In [None]:
dim_df_corr = pos.corr(method='pearson').round(2)
save_csv(dim, dim_df_corr)

generate_heatmap(dim, dim_df_corr)


pairs = get_corr_pairs(dim_df_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')

generate_plot(dim, pos, neg, feature_names)

In [None]:
df = initial_df.loc[:, feats + ['arg']].astype('float64')
df['state'] = [1 if score >= 1.5 else 0 for score in df['arg']]

final_features = df.columns.values.tolist()[:-2]


X = df[final_features]
y = df.state

model = LogisticRegression().fit(X,y)
plot_important_features(model, 'arg')



In [None]:
log_reg = smf.logit("state ~ coref + hapax_legomena + propn + verbtype_ratio + past_tense + pres_tense + avg_word_len + gen_case + conj + TTR + adj + neg_polarity + obl + part_case", data=df).fit()
log_reg.summary()

## FORMAALNE

In [None]:
dim = 'form'

pos, neg = make_new_df(dim)
print(len(pos), len(neg))

In [None]:
dim_df_corr = pos.corr(method='pearson').round(2)
save_csv(dim, dim_df_corr)

generate_heatmap(dim, dim_df_corr)

pairs = get_corr_pairs(dim_df_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')

generate_plot(dim, pos, neg, feature_names)

In [None]:
df = initial_df.loc[:, feats + ['form']].astype('float64')
df['state'] = [1 if score >= 1.5 else 0 for score in df['form']]

final_features = df.columns.values.tolist()[:-2]


X = df[final_features]
y = df.state

model = LogisticRegression().fit(X,y)
plot_important_features(model, 'formaalsus')




In [None]:

log_reg = smf.logit("state ~ avg_word_len + coref + gen_case + noun + obl + adv + nmod + num", data=df).fit()
log_reg.summary()

## IMPERSONAALNE

In [None]:
dim = 'imp'

pos, neg = make_new_df(dim)
print(len(pos), len(neg))

In [None]:
dim_df_corr = pos.corr(method='pearson').round(2)
save_csv(dim, dim_df_corr)

generate_heatmap(dim, dim_df_corr)


pairs = get_corr_pairs(dim_df_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')

generate_plot(dim, pos, neg, feature_names)

In [None]:
df = initial_df.loc[:, feats + ['imp']].astype('float64')
df['state'] = [1 if score >= 1.5 else 0 for score in df['imp']]

final_features = df.columns.values.tolist()[:-2]


X = df[final_features]
y = df.state

model = LogisticRegression().fit(X,y)
plot_important_features(model, 'impersonaalsus')





In [None]:
log_reg = smf.logit("state ~ avg_word_len + coref + noun + past_tense + pres_tense + conj + pron + num", data=df).fit()
log_reg.summary()

## INFO

In [None]:
dim = 'info'

pos, neg = make_new_df(dim)
print(len(pos), len(neg))

In [None]:
dim_df_corr = pos.corr(method='pearson').round(2)
save_csv(dim, dim_df_corr)

generate_heatmap(dim, dim_df_corr)


pairs = get_corr_pairs(dim_df_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')


generate_plot(dim, pos, neg, feature_names)

In [None]:
df = initial_df.loc[:, feats + ['info']].astype('float64')
df['state'] = [1 if score >= 1.5 else 0 for score in df['info']]

final_features = df.columns.values.tolist()[:-2]

X = df[final_features]
y = df.state

model = LogisticRegression().fit(X,y)
plot_important_features(model, 'info')


In [None]:

log_reg = smf.logit("state ~ coref + num + avg_word_len + active_voice + propn + nummod + pron + ind_mood + hapax_legomena + nsubj", data=df).fit()
log_reg.summary()

## INSTRUEERIV

In [None]:
dim = 'inst'

pos, neg = make_new_df(dim)
print(len(pos), len(neg))

In [None]:
dim_df_corr = pos.corr(method='pearson').round(2)
save_csv(dim, dim_df_corr)

generate_heatmap(dim, dim_df_corr)


pairs = get_corr_pairs(dim_df_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')

generate_plot(dim, pos, neg, feature_names)

In [None]:
df = initial_df.loc[:, feats + ['inst']].astype('float64')
df['state'] = [1 if score >= 1.5 else 0 for score in df['inst']]

final_features = df.columns.values.tolist()[:-2]


X = df[final_features]
y = df.state

model = LogisticRegression().fit(X,y)
plot_important_features(model, 'instrueerivus')


In [None]:
log_reg = smf.logit("state ~ coref + pres_tense + nom_case + pron + hapax_legomena + past_tense + verbtype_ratio + noun + third_prs_verb + second_prs_verb + imp_mood + pron", data=df).fit()
log_reg.summary()

## INTERAKTIIVNE

In [None]:
dim = 'inter'

pos, neg = make_new_df(dim)
print(len(pos), len(neg))

In [None]:
dim_df_corr = pos.corr(method='pearson').round(2)
save_csv(dim, dim_df_corr)

generate_heatmap(dim, dim_df_corr)


pairs = get_corr_pairs(dim_df_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')

generate_plot(dim, pos, neg, feature_names)

In [None]:
df = initial_df.loc[:, feats + ['inter']].astype('float64')
df['state'] = [1 if score >= 1.5 else 0 for score in df['inter']]

final_features = df.columns.values.tolist()[:-2]


X = df[final_features]
y = df.state

model = LogisticRegression().fit(X,y)
plot_important_features(model, 'inter')

In [None]:
log_reg = smf.logit("state ~ avg_word_len + hapax_legomena + TTR + gen_case + noun + verbtype_ratio + avr_sent_len", data=df).fit()
log_reg.summary()

## KEER

In [None]:
dim = 'keer'

pos, neg = make_new_df(dim)
print(len(pos), len(neg))

In [None]:
dim_df_corr = pos.corr(method='pearson').round(2)
save_csv(dim, dim_df_corr)

generate_heatmap(dim, dim_df_corr)


pairs = get_corr_pairs(dim_df_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')


generate_plot(dim, pos, neg, feature_names)

In [None]:
df = initial_df.loc[:, feats + ['keer']].astype('float64')
df['state'] = [1 if score >= 1.5 else 0 for score in df['keer']]

final_features = df.columns.values.tolist()[:-2]


X = df[final_features]
y = df.state

model = LogisticRegression().fit(X,y)
plot_important_features(model, 'keerulisus')

In [None]:
log_reg = smf.logit("state ~ avg_word_len + coref + hapax_legomena + verbtype_ratio + abbr + nmod + past_tense + obl + active_voice", data=df).fit()
log_reg.summary()

## SPONT

In [None]:
dim = 'spont'

pos, neg = make_new_df(dim)
print(len(pos), len(neg))

In [None]:
dim_df_corr = pos.corr(method='pearson').round(2)
save_csv(dim, dim_df_corr)

generate_heatmap(dim, dim_df_corr)

pairs = get_corr_pairs(dim_df_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')


generate_plot(dim, pos, neg, feature_names)

In [None]:
df = initial_df.loc[:, feats + ['spont']].astype('float64')
df['state'] = [1 if score >= 1.5 else 0 for score in df['spont']]

final_features = df.columns.values.tolist()[:-2]


X = df[final_features]
y = df.state

model = LogisticRegression().fit(X,y)
plot_important_features(model, 'spontaansus')





In [None]:
log_reg = smf.logit("state ~ avg_word_len + coref + noun + gen_case + nom_case + avr_sent_len + obl + propn", data=df).fit()
log_reg.summary()

## SUBJ

In [None]:
dim = 'subj'

pos, neg = make_new_df(dim)
print(len(pos), len(neg))

In [None]:
dim_df_corr = pos.corr(method='pearson').round(2)
save_csv(dim, dim_df_corr)

generate_heatmap(dim, dim_df_corr)


pairs = get_corr_pairs(dim_df_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')


generate_plot(dim, pos, neg, feature_names)

In [None]:
df = initial_df.loc[:, feats + ['subj']].astype('float64')
df['state'] = [1 if score >= 1.5 else 0 for score in df['subj']]

final_features = df.columns.values.tolist()[:-2]


X = df[final_features]
y = df.state

model = LogisticRegression().fit(X,y)
plot_important_features(model, 'subjektiivsus')





In [None]:
log_reg = smf.logit("state ~ avg_word_len + coref + verbtype_ratio + adv + propn + noun", data=df).fit()
log_reg.summary()