In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats 
import seaborn as sns
import statsmodels.formula.api as smf
import scikit_posthocs as sp
import seaborn as sns
import itertools
from matplotlib.backends.backend_pdf import PdfPages


sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [2]:
plt.rc("font", size=14)
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import BoundaryNorm, ListedColormap

In [4]:
# columns = ['noun', 'adj', 'propn', 'adv', 'intj', 'cconj', 'sconj', 'adp', 'det', 'num', 'punct', 'symbol', 'pron', 'abbr', 'TTR', 'avg_word_len', 'avr_sent_len', 'hapax_legomena', 'coref', 'see_pron', 'see_det', '1st_pron', '2nd_pron', '3rd_pron', 'active_voice', 'passive_voice', '1st_prs_verb', '2nd_prs_verb', '3rd_prs_verb', 'core_verb', 'verbtype_ratio', 'da_inf', 'gerund', 'supine', 'verb_particle', 'discourse', 'pres_tense', 'past_tense', 'ind_mood', 'cond_mood', 'imp_mood', 'quot_mood', 'neg_polarity', 'nom_case', 'gen_case', 'part_case', 'ill_case', 'ine_case', 'ela_case', 'alla_case', 'ade_case', 'abl_case', 'tra_case', 'ter_case', 'ess_case', 'abe_case', 'com_case', 'nsubj', 'nsubj_cop', 'modal', 'acl:relc', 'csubj', 'csubj_cop', 'obj', 'ccomp', 'xcomp', 'obl', 'nmod', 'appos', 'nummod', 'amod', 'advcl', 'voc', 'cop', 'conj', 'cc', 'yneemid', 'emoticons']
columns = ['noun', 'adj', 'propn', 'adv', 'intj', 'cconj', 'sconj', 'adp', 'det', 'num', 'punct', 'symbol', 'pron', 'abbr', 'nominals', 'TTR', 'avg_word_len', 'avr_sent_len', 'hapax_legomena', 'pron/noun_ratio', 'see_pron', 'see_det', '1st_pron', '2nd_pron', '3rd_pron', 'nominalisation', 'active_voice', 'passive_voice', '1st_prs_verb', '2nd_prs_verb', '3rd_prs_verb', 'core_verb', 'verbtype_ratio', 'da_inf', 'inf_verb', 'finite_verb', 'gerund', 'supine', 'verb_particle', 'discourse', 'pres_tense', 'past_tense', 'ind_mood', 'cond_mood', 'imp_mood', 'quot_mood', 'neg_polarity', 'nom_case', 'gen_case', 'part_case', 'ill_case', 'ine_case', 'ela_case', 'alla_case', 'ade_case', 'abl_case', 'tra_case', 'ter_case', 'ess_case', 'abe_case', 'com_case', 'nsubj', 'nsubj_cop', 'modal', 'acl:relc', 'csubj', 'csubj_cop', 'obj', 'ccomp', 'xcomp', 'obl', 'nmod', 'appos', 'nummod', 'amod', 'advcl', 'voc', 'cop', 'conj', 'cc', 'yneemid']


def get_data(f):
    ls = []
    with open(f, 'r') as fid:
        reader = csv.DictReader(fid, delimiter=';')
        data = [line for line in reader]
        
    return data


### FUNKTSIOONIDE SEKTSIOON

In [5]:
def create_array(data):
    # KW sisendi saamiseks on vaja iga rea (=faili) tunnuste väärtused ehk
    # [[tunnus11, tunnus12,.., tunnus1N], [tunnus21, tunnus22,.., tunnus2N], ..]
    d = [list(line.items()) for line in data]
    arrays = []
    for f in d:
        arrays.append([element[1] for element in f[1:]])
    return np.array(arrays)

In [6]:
def calculate_hb(rank):
    # holm-bonferroni meetodil p väärtuse korrigeerimine, tavalaks on 0.05, aga siin oleme liberaalsed ja panime 0.1
    adjusted_p = 0.1 / (80 - rank + 1)
    return adjusted_p

In [7]:
def get_relevant_features(f, p):
    # tagastab listide listi, kus igas listis on [tunnus, järjekorra nr, KW p, korrigeeritud p, saatus]
    features = [[feature, round(p, 5), f] for feature, f, p in zip(columns, f.tolist(), p.tolist())]       
    H_statistics = [feature[2] for feature in features]
    p_values = [feature[1] for feature in features]
    ranks = stats.rankdata(p_values)
    features = [feature[0] for feature in features]
    m = [s for s in zip(features, p_values, H_statistics, ranks)]
    positive_features = [] 
    for el in m:
        kW_p = el[1]
        adjusted_p = calculate_hb(el[3])
        if adjusted_p > kW_p:
            state = 'YES'
        else:
            state = 'NO'
        positive_features.append([el[0], el[3], el[2], kW_p, adjusted_p, state])

    return positive_features  

In [8]:
def calculate_dunn_test2(feature, strong_pd, weak_pd, not_pres_pd):
    data = [strong_pd[feature], weak_pd[feature], notpres_pd[feature]]
    df = pd.DataFrame(data).transpose()

    # gruppide määramine ehk tugev/mõõdukas; nõrk; not_present
    group1 = df.iloc[:, 0].dropna()
    group2 = df.iloc[:, 1].dropna()
    group3 = df.iloc[:, 2].dropna()
    
    dunn = sp.posthoc_dunn([group1, group2, group3], p_adjust='holm')
#         dunn = sp.posthoc_dunn([group1, group2, group3])

    return feature.upper(), dunn


In [9]:
def generate_df(strong_array, weak_array, notpres_array):
    # Kruskal Wallise väljund
    f_statistic, p_value = stats.kruskal(strong_array, weak_array, notpres_array)
    feature_data = get_relevant_features(f_statistic, p_value)

    return feature_data, pd.DataFrame(feature_data, columns =['feature', 'rank', 'H statistic', 'KW p', 'adjusted p', 'state'])

    

In [10]:
def display_posthoc_results(feats, strong, weak, notpres):
    """sisend: tunnused, tugev-mõõdukas, nõrk, puudu dataframe'id.
    siin funktsioonis kasutatakse calculate_dunn_test2 funktsiooni, kus siis tunnus tunnuse haaval vaadatakse konkreetselt ühe tunnuse 
    mõõtmisi kolmes grupis ja arvutatakse, milliste gruppide vahel on erinevusi.
    väljund: list -> tugev-mõõdukas, tugev-puudu, nõrk-puudu grupid per tunnus (kui relevantne tunnuse p on dunni testis <=0.05) 
    """
    
    strong_w = []
    strong_not = []
    weak_not = []
    for feat in feats:
        out = calculate_dunn_test2(feat, strong, weak, notpres)
        print(f'\n{out[0]}\n{out[1]}')
        print('Kui p>0.05, siis need jäetakse välja!\n')
        strong_weak = [out[1].iloc[1, 0], 'tugev/mõõdukas vs nõrk']
        strong_notpres = [out[1].iloc[2, 0], 'tugev/mõõdukas vs puudu']
        weak_notpres = [out[1].iloc[2, 1], 'nõrk vs puudu']
        for score in [strong_weak, strong_notpres, weak_notpres]:
            if score[0] < 0.05:
                if score[1] == 'tugev/mõõdukas vs nõrk':
                    print('GRUPP 1 (tugev/mõõdukas vs nõrk)')
                    print(f'{feat}, {round(score[0], 10)} \n')
                    strong_w.append([feat, round(score[0], 10)])
                    
                if score[1] == 'tugev/mõõdukas vs puudu':
                    print('GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)')
                    print(f'{feat}, {round(score[0], 10)} \n')
                    strong_not.append([feat, round(score[0], 10)])
                if score[1] == 'nõrk vs puudu':
                    print('GRUPP 3 (nõrk vs MITTEEKSISTEERIV)')
                    print(f'{feat}, {round(score[0], 10)} \n')
                    weak_not.append([feat, round(score[0], 10)])
                print(f'TUGEV/MÕÕDUKAS -> mediaan/keskmine: {round(strong[feat].median(), 3)}; {round(strong[feat].mean(), 3)}')
                print(f'NÕRK -> mediaan/keskmine: {round(weak[feat].median(), 3)}; {round(weak[feat].mean(), 3)}')
                print(f'MITTEEKSISTEERIV -> mediaan/keskmine: {round(notpres[feat].median(), 3)}; {round(notpres[feat].mean(), 3)}')


                # print(f'{feat.upper()} =>\n {score[1]}, {round(score[0], 4)}')
            print('================')
    return strong_w, strong_not, weak_not

### INSTRUEERIVUS

In [11]:
f1 = 'dimensioonide_grupid/inst/inst_strong.csv'
f2 = 'dimensioonide_grupid/inst/inst_weak.csv'
f3 = 'dimensioonide_grupid/inst/inst_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)


print(f'"tugev/mõõdukas" kokku -> {len(strong)}')
print(f'"nõrk" kokku -> {len(weak)}')
print(f'"puudu" kokku -> {len(notpres)}')


feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)

"tugev/mõõdukas" kokku -> 15
"nõrk" kokku -> 18
"puudu" kokku -> 58


  adjusted_p = 0.1 / (80 - rank + 1)


Unnamed: 0,feature,rank,H statistic,KW p,adjusted p,state
0,noun,22.0,5.99721,0.04986,0.001695,NO
1,adj,63.0,1.372091,0.50356,0.005556,NO
2,propn,11.0,10.179264,0.00616,0.001429,NO
3,adv,29.0,5.319493,0.06997,0.001923,NO
4,intj,78.5,0.199667,0.90499,0.04,NO
5,cconj,35.0,4.484371,0.10623,0.002174,NO
6,sconj,58.0,1.579174,0.45403,0.004348,NO
7,adp,54.0,1.924492,0.38203,0.003704,NO
8,det,80.0,0.086687,0.95758,0.1,NO
9,num,44.0,2.714762,0.25733,0.002703,NO


In [12]:
relevant_feats = [feat[0] for feat in feature_data if feat[5] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

for r in relevant_feats:
    print(r)

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

pres_tense
amod

PRES_TENSE
          1         2         3
1  1.000000  0.684228  0.001032
2  0.684228  1.000000  0.001824
3  0.001032  0.001824  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
pres_tense, 0.0010315064 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.116; 0.115
NÕRK -> mediaan/keskmine: 0.108; 0.106
MITTEEKSISTEERIV -> mediaan/keskmine: 0.071; 0.071
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
pres_tense, 0.0018241219 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.116; 0.115
NÕRK -> mediaan/keskmine: 0.108; 0.106
MITTEEKSISTEERIV -> mediaan/keskmine: 0.071; 0.071

AMOD
     1    2    3
1  1.0  1.0  1.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
Kui p>0.05, siis need jäetakse välja!



### ABSTRAKTSUS

In [13]:
f1 = 'dimensioonide_grupid/abs/abs_strong.csv'
f2 = 'dimensioonide_grupid/abs/abs_weak.csv'
f3 = 'dimensioonide_grupid/abs/abs_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(f'"tugev/mõõdukas" kokku -> {len(strong)}')
print(f'"nõrk" kokku -> {len(weak)}')
print(f'"puudu" kokku -> {len(notpres)}')

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[5] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

print(relevant_feats)

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

"tugev/mõõdukas" kokku -> 6
"nõrk" kokku -> 36
"puudu" kokku -> 56


  adjusted_p = 0.1 / (80 - rank + 1)


Unnamed: 0,feature,rank,H statistic,KW p,adjusted p,state
0,noun,20.0,6.12672,0.04673,0.001639,NO
1,adj,31.0,3.917161,0.14106,0.002,NO
2,propn,30.0,4.026985,0.13352,0.001961,NO
3,adv,16.0,6.532639,0.03815,0.001538,NO
4,intj,66.5,0.741086,0.69036,0.006897,NO
5,cconj,59.0,1.166809,0.558,0.004545,NO
6,sconj,41.0,2.506764,0.28554,0.0025,NO
7,adp,42.0,2.474983,0.29011,0.002564,NO
8,det,15.0,6.563836,0.03756,0.001515,NO
9,num,28.0,4.13329,0.12661,0.001887,NO


['obl', 'appos']

OBL
          1         2         3
1  1.000000  0.011901  0.000284
2  0.011901  1.000000  0.029788
3  0.000284  0.029788  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
obl, 0.0119008024 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.054; 0.058
NÕRK -> mediaan/keskmine: 0.09; 0.093
MITTEEKSISTEERIV -> mediaan/keskmine: 0.105; 0.106
GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
obl, 0.0002837739 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.054; 0.058
NÕRK -> mediaan/keskmine: 0.09; 0.093
MITTEEKSISTEERIV -> mediaan/keskmine: 0.105; 0.106
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
obl, 0.0297882814 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.054; 0.058
NÕRK -> mediaan/keskmine: 0.09; 0.093
MITTEEKSISTEERIV -> mediaan/keskmine: 0.105; 0.106

APPOS
     1    2    3
1  1.0  1.0  1.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
Kui p>0.05, siis need jäetakse välja!



### AFEKTIIVSUS

In [14]:
f1 = 'dimensioonide_grupid/afek/afek_strong.csv'
f2 = 'dimensioonide_grupid/afek/afek_weak.csv'
f3 = 'dimensioonide_grupid/afek/afek_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)


print(f'"tugev/mõõdukas" kokku -> {len(strong)}')
print(f'"nõrk" kokku -> {len(weak)}')
print(f'"puudu" kokku -> {len(notpres)}')

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)


relevant_feats = [feat[0] for feat in feature_data if feat[5] == 'YES']
print(relevant_feats)

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')


res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

"tugev/mõõdukas" kokku -> 28
"nõrk" kokku -> 18
"puudu" kokku -> 45


  adjusted_p = 0.1 / (80 - rank + 1)


Unnamed: 0,feature,rank,H statistic,KW p,adjusted p,state
0,noun,9.0,45.329066,0.0,0.001389,YES
1,adj,30.0,14.991911,0.00056,0.001961,YES
2,propn,26.0,17.490151,0.00016,0.001818,YES
3,adv,9.0,42.370908,0.0,0.001389,YES
4,intj,23.0,18.676198,9e-05,0.001724,YES
5,cconj,66.0,2.010511,0.36595,0.006667,NO
6,sconj,9.0,29.072826,0.0,0.001389,YES
7,adp,59.0,2.800776,0.2465,0.004545,NO
8,det,32.0,12.541814,0.00189,0.002041,YES
9,num,47.0,7.097505,0.02876,0.002941,NO


['noun', 'adj', 'propn', 'adv', 'intj', 'sconj', 'det', 'pron', 'nominals', 'avg_word_len', 'pron/noun_ratio', 'see_pron', '1st_pron', '2nd_pron', '3rd_pron', 'active_voice', 'passive_voice', '1st_prs_verb', '2nd_prs_verb', '3rd_prs_verb', 'core_verb', 'finite_verb', 'supine', 'discourse', 'ind_mood', 'imp_mood', 'neg_polarity', 'gen_case', 'ade_case', 'modal', 'xcomp', 'nmod', 'cop']

NOUN
              1         2             3
1  1.000000e+00  0.077622  2.021318e-10
2  7.762157e-02  1.000000  3.966194e-04
3  2.021318e-10  0.000397  1.000000e+00
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 2e-10 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.22; 0.209
NÕRK -> mediaan/keskmine: 0.241; 0.246
MITTEEKSISTEERIV -> mediaan/keskmine: 0.308; 0.316
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
noun, 0.0003966194 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.22; 0.209
NÕRK -> mediaan/keskmine: 0.241; 0.246
MITTEEKSISTEERIV -> mediaan/keskmine: 0.308; 0.316

ADJ
          1

MITTEEKSISTEERIV -> mediaan/keskmine: 0.0; 0.001

2ND_PRON
          1         2         3
1  1.000000  0.001411  0.000007
2  0.001411  1.000000  0.681755
3  0.000007  0.681755  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
2nd_pron, 0.0014109175 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.004; 0.006
NÕRK -> mediaan/keskmine: 0.0; 0.001
MITTEEKSISTEERIV -> mediaan/keskmine: 0.0; 0.0
GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
2nd_pron, 6.8425e-06 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.004; 0.006
NÕRK -> mediaan/keskmine: 0.0; 0.001
MITTEEKSISTEERIV -> mediaan/keskmine: 0.0; 0.0

3RD_PRON
          1         2         3
1  1.000000  0.016394  0.000015
2  0.016394  1.000000  0.282211
3  0.000015  0.282211  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
3rd_pron, 0.0163939123 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.015; 0.017
NÕRK -> mediaan/keskmine: 0.003; 0.007
MITTEEKSISTEERIV -> mediaan/keskmine: 0.0; 0.005
GRUPP 2 (t


MODAL
              1         2             3
1  1.000000e+00  0.047784  5.079403e-07
2  4.778393e-02  1.000000  3.557906e-02
3  5.079403e-07  0.035579  1.000000e+00
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
modal, 0.047783929 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.027; 0.028
NÕRK -> mediaan/keskmine: 0.019; 0.02
MITTEEKSISTEERIV -> mediaan/keskmine: 0.007; 0.01
GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
modal, 5.079e-07 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.027; 0.028
NÕRK -> mediaan/keskmine: 0.019; 0.02
MITTEEKSISTEERIV -> mediaan/keskmine: 0.007; 0.01
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
modal, 0.0355790598 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.027; 0.028
NÕRK -> mediaan/keskmine: 0.019; 0.02
MITTEEKSISTEERIV -> mediaan/keskmine: 0.007; 0.01

XCOMP
          1         2         3
1  1.000000  0.118558  0.000120
2  0.118558  1.000000  0.133277
3  0.000120  0.133277  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEK

### AEG

In [15]:
f1 = 'dimensioonide_grupid/aeg/aeg_strong.csv'
f2 = 'dimensioonide_grupid/aeg/aeg_weak.csv'
f3 = 'dimensioonide_grupid/aeg/aeg_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[5] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

print(relevant_feats)

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

16
39
19


  adjusted_p = 0.1 / (80 - rank + 1)


Unnamed: 0,feature,rank,H statistic,KW p,adjusted p,state
0,noun,74.0,0.362775,0.83411,0.014286,NO
1,adj,64.0,0.873345,0.64618,0.005882,NO
2,propn,10.0,8.804216,0.01225,0.001408,NO
3,adv,45.0,2.462256,0.29196,0.002778,NO
4,intj,27.5,3.695353,0.1576,0.001869,NO
5,cconj,37.5,2.80163,0.2464,0.002299,NO
6,sconj,18.0,4.806154,0.09044,0.001587,NO
7,adp,70.0,0.814837,0.66537,0.009091,NO
8,det,31.0,3.354441,0.18689,0.002,NO
9,num,6.0,11.363839,0.00341,0.001333,NO


['da_inf', 'inf_verb', 'pres_tense', 'past_tense', 'ine_case', 'nummod']

DA_INF
          1         2         3
1  1.000000  0.011486  0.000284
2  0.011486  1.000000  0.071271
3  0.000284  0.071271  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
da_inf, 0.0114856217 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.009; 0.009
NÕRK -> mediaan/keskmine: 0.017; 0.02
MITTEEKSISTEERIV -> mediaan/keskmine: 0.026; 0.025
GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
da_inf, 0.0002838366 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.009; 0.009
NÕRK -> mediaan/keskmine: 0.017; 0.02
MITTEEKSISTEERIV -> mediaan/keskmine: 0.026; 0.025

INF_VERB
          1         2         3
1  1.000000  0.002599  0.000409
2  0.002599  1.000000  0.224727
3  0.000409  0.224727  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
inf_verb, 0.0025993051 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.041; 0.041
NÕRK -> mediaan/keskmine: 0.058; 0.067
MITTEEKSISTEERIV -> mediaan/k

### ARGUMENTATIIVSUS

In [16]:
f1 = 'dimensioonide_grupid/arg/arg_strong.csv'
f2 = 'dimensioonide_grupid/arg/arg_weak.csv'
f3 = 'dimensioonide_grupid/arg/arg_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)


# print(len(strong))
# print(len(weak))
# print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)

    
relevant_feats = [feat[0] for feat in feature_data if feat[5] == 'YES']

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

print(relevant_feats)

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

  adjusted_p = 0.1 / (80 - rank + 1)


Unnamed: 0,feature,rank,H statistic,KW p,adjusted p,state
0,noun,48.0,2.592282,0.27359,0.00303,NO
1,adj,42.0,2.911416,0.23324,0.002564,NO
2,propn,4.0,15.48652,0.00043,0.001299,YES
3,adv,40.0,3.227402,0.19915,0.002439,NO
4,intj,77.5,0.401698,0.81804,0.028571,NO
5,cconj,50.0,2.485287,0.28862,0.003226,NO
6,sconj,21.0,5.617425,0.06028,0.001667,NO
7,adp,65.0,1.037452,0.59528,0.00625,NO
8,det,37.0,3.318292,0.1903,0.002273,NO
9,num,27.0,4.855024,0.08826,0.001852,NO


['propn', 'symbol', 'da_inf', 'pres_tense', 'past_tense', 'neg_polarity', 'modal']

PROPN
          1         2         3
1  1.000000  0.231327  0.000532
2  0.231327  1.000000  0.013760
3  0.000532  0.013760  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
propn, 0.000531673 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.02; 0.032
NÕRK -> mediaan/keskmine: 0.038; 0.047
MITTEEKSISTEERIV -> mediaan/keskmine: 0.076; 0.09
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
propn, 0.0137603004 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.02; 0.032
NÕRK -> mediaan/keskmine: 0.038; 0.047
MITTEEKSISTEERIV -> mediaan/keskmine: 0.076; 0.09

SYMBOL
     1    2    3
1  1.0  1.0  1.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
Kui p>0.05, siis need jäetakse välja!


DA_INF
          1         2         3
1  1.000000  0.999051  0.003872
2  0.999051  1.000000  0.002610
3  0.003872  0.002610  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
da_inf, 0.0

### FORMAALSUS

In [17]:
f1 = 'dimensioonide_grupid/form/form_strong.csv'
f2 = 'dimensioonide_grupid/form/form_weak.csv'
f3 = 'dimensioonide_grupid/form/form_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[5] == 'YES']
print(relevant_feats)


strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

14
28
49


  adjusted_p = 0.1 / (80 - rank + 1)


Unnamed: 0,feature,rank,H statistic,KW p,adjusted p,state
0,noun,6.5,29.979714,0.0,0.001342,YES
1,adj,24.0,14.089414,0.00087,0.001754,YES
2,propn,15.0,17.6915,0.00014,0.001515,YES
3,adv,6.5,35.72759,0.0,0.001342,YES
4,intj,48.5,5.057513,0.07976,0.003077,NO
5,cconj,56.0,4.22642,0.12085,0.004,NO
6,sconj,29.0,12.078198,0.00238,0.001923,NO
7,adp,77.0,0.740396,0.6906,0.025,NO
8,det,13.5,18.014834,0.00012,0.001481,YES
9,num,42.0,6.669272,0.03563,0.002564,NO


['noun', 'adj', 'propn', 'adv', 'det', 'punct', 'pron', 'nominals', 'avg_word_len', 'avr_sent_len', 'pron/noun_ratio', '1st_pron', 'active_voice', 'passive_voice', '1st_prs_verb', '2nd_prs_verb', 'core_verb', 'verbtype_ratio', 'finite_verb', 'past_tense', 'ind_mood', 'imp_mood', 'gen_case', 'nsubj_cop', 'xcomp', 'nmod', 'cop']

NOUN
          1         2         3
1  1.000000  0.114095  0.000005
2  0.114095  1.000000  0.000173
3  0.000005  0.000173  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 5.3796e-06 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.335; 0.347
NÕRK -> mediaan/keskmine: 0.299; 0.302
MITTEEKSISTEERIV -> mediaan/keskmine: 0.245; 0.242
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
noun, 0.0001730261 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.335; 0.347
NÕRK -> mediaan/keskmine: 0.299; 0.302
MITTEEKSISTEERIV -> mediaan/keskmine: 0.245; 0.242

ADJ
          1         2         3
1  1.000000  0.008966  0.000535
2  0.008966  1.000000  0.385642


PASSIVE_VOICE
          1         2         3
1  1.000000  0.207484  0.002348
2  0.207484  1.000000  0.021240
3  0.002348  0.021240  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
passive_voice, 0.0023483223 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.027; 0.029
NÕRK -> mediaan/keskmine: 0.015; 0.021
MITTEEKSISTEERIV -> mediaan/keskmine: 0.011; 0.012
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
passive_voice, 0.0212398773 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.027; 0.029
NÕRK -> mediaan/keskmine: 0.015; 0.021
MITTEEKSISTEERIV -> mediaan/keskmine: 0.011; 0.012

1ST_PRS_VERB
          1         2         3
1  1.000000  0.525818  0.000044
2  0.525818  1.000000  0.000016
3  0.000044  0.000016  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
1st_prs_verb, 4.37987e-05 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.0; 0.0
NÕRK -> mediaan/keskmine: 0.0; 0.002
MITTEEKSISTEERIV -> mediaan/keskmine: 0.007; 0.013
GRUPP 3 (nõrk

### IMPERSONAALSUS

In [18]:
f1 = 'dimensioonide_grupid/imp/imp_strong.csv'
f2 = 'dimensioonide_grupid/imp/imp_weak.csv'
f3 = 'dimensioonide_grupid/imp/imp_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)


print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[5] == 'YES']
print(relevant_feats)

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

37
21
28


  adjusted_p = 0.1 / (80 - rank + 1)


Unnamed: 0,feature,rank,H statistic,KW p,adjusted p,state
0,noun,6.0,41.10848,0.0,0.001333,YES
1,adj,19.0,15.142822,0.00051,0.001613,YES
2,propn,41.0,5.113597,0.07755,0.0025,NO
3,adv,12.5,23.83863,1e-05,0.00146,YES
4,intj,23.5,13.183002,0.00137,0.001739,YES
5,cconj,63.0,2.258884,0.32321,0.005556,NO
6,sconj,31.0,9.408624,0.00906,0.002,NO
7,adp,70.0,1.485944,0.4757,0.009091,NO
8,det,21.0,14.350126,0.00077,0.001667,YES
9,num,52.0,4.010251,0.13464,0.003448,NO


['noun', 'adj', 'adv', 'intj', 'det', 'pron', 'abbr', 'nominals', 'avg_word_len', 'pron/noun_ratio', '1st_pron', '2nd_pron', '3rd_pron', 'active_voice', 'passive_voice', '1st_prs_verb', '2nd_prs_verb', 'core_verb', 'verbtype_ratio', 'finite_verb', 'supine', 'discourse', 'ind_mood', 'imp_mood', 'gen_case', 'tra_case', 'xcomp', 'nmod']

NOUN
              1         2             3
1  1.000000e+00  0.000986  7.810205e-10
2  9.864760e-04  1.000000  2.876591e-02
3  7.810205e-10  0.028766  1.000000e+00
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
noun, 0.000986476 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.321; 0.328
NÕRK -> mediaan/keskmine: 0.262; 0.263
MITTEEKSISTEERIV -> mediaan/keskmine: 0.216; 0.214
GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 8e-10 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.321; 0.328
NÕRK -> mediaan/keskmine: 0.262; 0.263
MITTEEKSISTEERIV -> mediaan/keskmine: 0.216; 0.214
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
noun, 0.0287659082 

TUGEV/MÕÕDUKA


ACTIVE_VOICE
          1         2         3
1  1.000000  0.202867  0.000013
2  0.202867  1.000000  0.010884
3  0.000013  0.010884  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
active_voice, 1.31407e-05 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.111; 0.115
NÕRK -> mediaan/keskmine: 0.123; 0.127
MITTEEKSISTEERIV -> mediaan/keskmine: 0.152; 0.152
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
active_voice, 0.0108836314 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.111; 0.115
NÕRK -> mediaan/keskmine: 0.123; 0.127
MITTEEKSISTEERIV -> mediaan/keskmine: 0.152; 0.152

PASSIVE_VOICE
          1         2         3
1  1.000000  0.878455  0.003611
2  0.878455  1.000000  0.006267
3  0.003611  0.006267  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
passive_voice, 0.0036113212 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.018; 0.025
NÕRK -> mediaan/keskmine: 0.019; 0.021
MITTEEKSISTEERIV -> mediaan/keskmine: 0.008; 0.011
GRUPP 3 


NMOD
              1         2             3
1  1.000000e+00  0.004888  1.500235e-07
2  4.887706e-03  1.000000  6.258373e-02
3  1.500235e-07  0.062584  1.000000e+00
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
nmod, 0.0048877059 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.121; 0.127
NÕRK -> mediaan/keskmine: 0.083; 0.082
MITTEEKSISTEERIV -> mediaan/keskmine: 0.052; 0.061
GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
nmod, 1.5e-07 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.121; 0.127
NÕRK -> mediaan/keskmine: 0.083; 0.082
MITTEEKSISTEERIV -> mediaan/keskmine: 0.052; 0.061


### INFOTIHEDUS

In [19]:
f1 = 'dimensioonide_grupid/info/info_strong.csv'
f2 = 'dimensioonide_grupid/info/info_weak.csv'
f3 = 'dimensioonide_grupid/info/info_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[5] == 'YES']
print(relevant_feats)

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

45
26
5


  adjusted_p = 0.1 / (80 - rank + 1)


Unnamed: 0,feature,rank,H statistic,KW p,adjusted p,state
0,noun,16.0,14.593942,0.00068,0.001538,YES
1,adj,36.0,6.760937,0.03403,0.002222,NO
2,propn,28.0,8.954491,0.01136,0.001887,NO
3,adv,14.0,15.668664,0.0004,0.001493,YES
4,intj,26.5,9.315008,0.00949,0.001835,NO
5,cconj,66.0,0.852564,0.65293,0.006667,NO
6,sconj,18.0,10.8518,0.0044,0.001587,NO
7,adp,53.0,2.967836,0.22675,0.003571,NO
8,det,19.0,10.765474,0.0046,0.001613,NO
9,num,10.5,19.460002,6e-05,0.001418,YES


['noun', 'adv', 'num', 'pron', 'nominals', 'avg_word_len', 'pron/noun_ratio', '1st_pron', '2nd_pron', '1st_prs_verb', '2nd_prs_verb', 'finite_verb', 'imp_mood', 'neg_polarity', 'ter_case', 'xcomp', 'nummod']

NOUN
          1         2         3
1  1.000000  0.002513  0.027851
2  0.002513  1.000000  0.490774
3  0.027851  0.490774  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
noun, 0.0025134747 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.302; 0.3
NÕRK -> mediaan/keskmine: 0.236; 0.247
MITTEEKSISTEERIV -> mediaan/keskmine: 0.219; 0.23
GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 0.0278508216 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.302; 0.3
NÕRK -> mediaan/keskmine: 0.236; 0.247
MITTEEKSISTEERIV -> mediaan/keskmine: 0.219; 0.23

ADV
          1         2         3
1  1.000000  0.000531  0.099573
2  0.000531  1.000000  0.998293
3  0.099573  0.998293  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
adv, 0.0005307072 

TU

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.0; 0.003
NÕRK -> mediaan/keskmine: 0.0; 0.002
MITTEEKSISTEERIV -> mediaan/keskmine: 0.014; 0.015

NEG_POLARITY
          1         2         3
1  1.000000  0.000025  0.076949
2  0.000025  1.000000  0.803371
3  0.076949  0.803371  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
neg_polarity, 2.52475e-05 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.0; 0.007
NÕRK -> mediaan/keskmine: 0.016; 0.021
MITTEEKSISTEERIV -> mediaan/keskmine: 0.018; 0.015

TER_CASE
     1    2    3
1  1.0  1.0  1.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
Kui p>0.05, siis need jäetakse välja!


XCOMP
          1         2         3
1  1.000000  0.000117  0.022626
2  0.000117  1.000000  0.710998
3  0.022626  0.710998  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
xcomp, 0.000117423 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.008; 0.01
NÕRK -> mediaan/keskmine: 0.018; 0.019
MITTEEKSISTEERIV -> mediaan/keskmine: 0.018; 0.021
GRU

### INTERAKTIIVSUS

In [20]:
f1 = 'dimensioonide_grupid/inter/inter_strong.csv'
f2 = 'dimensioonide_grupid/inter/inter_weak.csv'
f3 = 'dimensioonide_grupid/inter/inter_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[5] == 'YES']
print(relevant_feats)

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

10
15
75


  adjusted_p = 0.1 / (80 - rank + 1)


Unnamed: 0,feature,rank,H statistic,KW p,adjusted p,state
0,noun,7.0,23.334374,1e-05,0.001351,YES
1,adj,28.0,10.943733,0.0042,0.001887,NO
2,propn,29.0,10.678939,0.0048,0.001923,NO
3,adv,22.0,12.668687,0.00177,0.001695,NO
4,intj,8.5,20.476598,4e-05,0.001379,YES
5,cconj,81.0,0.073968,0.96369,inf,YES
6,sconj,37.0,7.137616,0.02819,0.002273,NO
7,adp,50.5,3.616788,0.16392,0.003279,NO
8,det,49.0,3.726473,0.15517,0.003125,NO
9,num,36.0,7.203906,0.02727,0.002222,NO


['noun', 'intj', 'cconj', 'pron', 'nominals', 'TTR', 'avg_word_len', 'hapax_legomena', 'pron/noun_ratio', '1st_pron', '2nd_pron', '1st_prs_verb', '2nd_prs_verb', 'core_verb', 'finite_verb', 'discourse', 'imp_mood', 'neg_polarity', 'gen_case', 'modal', 'nmod', 'voc']

NOUN
          1         2         3
1  1.000000  0.522908  0.000532
2  0.522908  1.000000  0.000798
3  0.000532  0.000798  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 0.0005316103 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.212; 0.205
NÕRK -> mediaan/keskmine: 0.227; 0.222
MITTEEKSISTEERIV -> mediaan/keskmine: 0.286; 0.289
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
noun, 0.0007983047 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.212; 0.205
NÕRK -> mediaan/keskmine: 0.227; 0.222
MITTEEKSISTEERIV -> mediaan/keskmine: 0.286; 0.289

INTJ
          1         2         3
1  1.000000  0.026642  0.000033
2  0.026642  1.000000  0.096095
3  0.000033  0.096095  1.000000
Kui p>0.05, siis need jäet

MITTEEKSISTEERIV -> mediaan/keskmine: 0.0; 0.002
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
imp_mood, 0.0014093658 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.009; 0.012
NÕRK -> mediaan/keskmine: 0.0; 0.004
MITTEEKSISTEERIV -> mediaan/keskmine: 0.0; 0.002

NEG_POLARITY
          1         2         3
1  1.000000  0.265824  0.004694
2  0.265824  1.000000  0.061867
3  0.004694  0.061867  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
neg_polarity, 0.0046941461 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.019; 0.022
NÕRK -> mediaan/keskmine: 0.016; 0.019
MITTEEKSISTEERIV -> mediaan/keskmine: 0.008; 0.011

GEN_CASE
          1         2         3
1  1.000000  0.071281  0.000134
2  0.071281  1.000000  0.071281
3  0.000134  0.071281  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
gen_case, 0.0001335929 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.063; 0.062
NÕRK -> mediaan/keskmine: 0.097; 0.105
MITTEEKSISTEERIV -> mediaan

### KEERULISUS

In [21]:
f1 = 'dimensioonide_grupid/keer/keer_strong.csv'
f2 = 'dimensioonide_grupid/keer/keer_weak.csv'
f3 = 'dimensioonide_grupid/keer/keer_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[5] == 'YES']
print(relevant_feats)

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

9
25
53


  adjusted_p = 0.1 / (80 - rank + 1)


Unnamed: 0,feature,rank,H statistic,KW p,adjusted p,state
0,noun,17.0,9.083589,0.01065,0.001563,NO
1,adj,23.0,7.830421,0.01994,0.001724,NO
2,propn,26.0,7.123718,0.02839,0.001818,NO
3,adv,28.0,6.994662,0.03028,0.001887,NO
4,intj,41.5,4.079898,0.13004,0.002532,NO
5,cconj,77.0,0.387453,0.82388,0.025,NO
6,sconj,49.0,3.581159,0.16686,0.003125,NO
7,adp,64.0,1.27501,0.52861,0.005882,NO
8,det,40.0,4.124993,0.12714,0.002439,NO
9,num,50.0,3.533189,0.17091,0.003226,NO


['punct', 'abbr', '1st_pron', '1st_prs_verb', 'finite_verb', 'acl:relc', 'nmod']

PUNCT
          1         2         3
1  1.000000  0.637972  0.016343
2  0.637972  1.000000  0.004476
3  0.016343  0.004476  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
punct, 0.0163434264 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.118; 0.127
NÕRK -> mediaan/keskmine: 0.125; 0.126
MITTEEKSISTEERIV -> mediaan/keskmine: 0.148; 0.148
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
punct, 0.0044760158 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.118; 0.127
NÕRK -> mediaan/keskmine: 0.125; 0.126
MITTEEKSISTEERIV -> mediaan/keskmine: 0.148; 0.148

ABBR
          1         2         3
1  1.000000  0.054014  0.000676
2  0.054014  1.000000  0.054014
3  0.000676  0.054014  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
abbr, 0.0006755665 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.054; 0.045
NÕRK -> mediaan/keskmine: 0.008; 0.017
MITTEEKSISTEERIV

### SPONTAANSUS

In [22]:
f1 = 'dimensioonide_grupid/spont/spont_strong.csv'
f2 = 'dimensioonide_grupid/spont/spont_weak.csv'
f3 = 'dimensioonide_grupid/spont/spont_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)


print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[5] == 'YES']
print(relevant_feats)

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

12
6
79


  adjusted_p = 0.1 / (80 - rank + 1)


Unnamed: 0,feature,rank,H statistic,KW p,adjusted p,state
0,noun,4.0,26.278573,0.0,0.001299,YES
1,adj,38.0,6.994455,0.03028,0.002326,NO
2,propn,19.0,15.042598,0.00054,0.001613,YES
3,adv,9.5,22.9276,1e-05,0.001399,YES
4,intj,4.0,38.165935,0.0,0.001299,YES
5,cconj,66.0,1.597142,0.44997,0.006667,NO
6,sconj,47.0,4.272349,0.11811,0.002941,NO
7,adp,59.0,2.331758,0.31165,0.004545,NO
8,det,53.0,3.378742,0.18464,0.003571,NO
9,num,41.0,5.243742,0.07267,0.0025,NO


['noun', 'propn', 'adv', 'intj', 'pron', 'nominals', 'avg_word_len', 'pron/noun_ratio', '1st_pron', '2nd_pron', '1st_prs_verb', '2nd_prs_verb', 'core_verb', 'finite_verb', 'discourse', 'imp_mood', 'neg_polarity', 'gen_case', 'abe_case', 'nsubj_cop', 'modal', 'obl', 'nmod', 'cop']

NOUN
          1         2         3
1  1.000000  0.473671  0.000009
2  0.473671  1.000000  0.020539
3  0.000009  0.020539  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 9.2864e-06 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.212; 0.198
NÕRK -> mediaan/keskmine: 0.231; 0.237
MITTEEKSISTEERIV -> mediaan/keskmine: 0.296; 0.296
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
noun, 0.0205386879 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.212; 0.198
NÕRK -> mediaan/keskmine: 0.231; 0.237
MITTEEKSISTEERIV -> mediaan/keskmine: 0.296; 0.296

PROPN
          1         2         3
1  1.000000  0.618473  0.001343
2  0.618473  1.000000  0.095346
3  0.001343  0.095346  1.000000
Kui p>0.05, s


NEG_POLARITY
          1         2         3
1  1.000000  0.388269  0.000446
2  0.388269  1.000000  0.388269
3  0.000446  0.388269  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
neg_polarity, 0.0004462431 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.021; 0.025
NÕRK -> mediaan/keskmine: 0.015; 0.017
MITTEEKSISTEERIV -> mediaan/keskmine: 0.007; 0.01

GEN_CASE
          1         2         3
1  1.000000  0.153599  0.000006
2  0.153599  1.000000  0.153599
3  0.000006  0.153599  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
gen_case, 6.2354e-06 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.063; 0.057
NÕRK -> mediaan/keskmine: 0.086; 0.099
MITTEEKSISTEERIV -> mediaan/keskmine: 0.15; 0.146

ABE_CASE
     1    2    3
1  1.0  1.0  1.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
Kui p>0.05, siis need jäetakse välja!


NSUBJ_COP
          1         2         3
1  1.000000  0.959818  0.002451
2  0.959818  1.000000  0.033759


### SUBJEKTIIVSUS

In [23]:
f1 = 'dimensioonide_grupid/subj/subj_strong.csv'
f2 = 'dimensioonide_grupid/subj/subj_weak.csv'
f3 = 'dimensioonide_grupid/subj/subj_not_present.csv'

# kruskall Wallise sisendid (kolm gruppi)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

print(len(strong))
print(len(weak))
print(len(notpres))

feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
relevant_feats = [feat[0] for feat in feature_data if feat[5] == 'YES']
print(relevant_feats)

strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

30
15
52


  adjusted_p = 0.1 / (80 - rank + 1)


Unnamed: 0,feature,rank,H statistic,KW p,adjusted p,state
0,noun,8.0,38.407955,0.0,0.00137,YES
1,adj,26.0,15.569052,0.00042,0.001818,YES
2,propn,8.0,25.072703,0.0,0.00137,YES
3,adv,8.0,51.636507,0.0,0.00137,YES
4,intj,24.5,15.712987,0.00039,0.00177,YES
5,cconj,78.0,0.462533,0.79353,0.033333,NO
6,sconj,33.0,12.410101,0.00202,0.002083,YES
7,adp,70.0,0.66341,0.7177,0.009091,NO
8,det,37.0,11.807713,0.00273,0.002273,NO
9,num,56.0,2.656925,0.26488,0.004,NO


['noun', 'adj', 'propn', 'adv', 'intj', 'sconj', 'punct', 'pron', 'nominals', 'avg_word_len', 'pron/noun_ratio', 'see_pron', '1st_pron', '2nd_pron', 'active_voice', 'passive_voice', '1st_prs_verb', '2nd_prs_verb', 'core_verb', 'finite_verb', 'supine', 'discourse', 'ind_mood', 'imp_mood', 'neg_polarity', 'nom_case', 'gen_case', 'ade_case', 'nsubj_cop', 'modal', 'csubj_cop', 'obl', 'nmod', 'amod', 'cop']

NOUN
              1         2             3
1  1.000000e+00  0.032324  2.339419e-09
2  3.232403e-02  1.000000  2.478731e-02
3  2.339419e-09  0.024787  1.000000e+00
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
noun, 0.0323240309 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.224; 0.217
NÕRK -> mediaan/keskmine: 0.263; 0.263
MITTEEKSISTEERIV -> mediaan/keskmine: 0.306; 0.311
GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 2.3e-09 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.224; 0.217
NÕRK -> mediaan/keskmine: 0.263; 0.263
MITTEEKSISTEERIV -> mediaan/keskmine: 0.306; 0.

MITTEEKSISTEERIV -> mediaan/keskmine: 0.0; 0.0

2ND_PRON
          1         2         3
1  1.000000  0.088997  0.000007
2  0.088997  1.000000  0.127552
3  0.000007  0.127552  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
2nd_pron, 7.0942e-06 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.0; 0.005
NÕRK -> mediaan/keskmine: 0.0; 0.002
MITTEEKSISTEERIV -> mediaan/keskmine: 0.0; 0.0

ACTIVE_VOICE
          1         2         3
1  1.000000  0.195696  0.000429
2  0.195696  1.000000  0.234541
3  0.000429  0.234541  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
active_voice, 0.0004291099 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.148; 0.148
NÕRK -> mediaan/keskmine: 0.132; 0.132
MITTEEKSISTEERIV -> mediaan/keskmine: 0.116; 0.121

PASSIVE_VOICE
          1         2         3
1  1.000000  0.249007  0.000628
2  0.249007  1.000000  0.249007
3  0.000628  0.249007  1.000000
Kui p>0.05, siis need jäetakse välja!

G


MODAL
          1         2         3
1  1.000000  0.278768  0.000119
2  0.278768  1.000000  0.081392
3  0.000119  0.081392  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
modal, 0.0001186161 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.026; 0.025
NÕRK -> mediaan/keskmine: 0.019; 0.021
MITTEEKSISTEERIV -> mediaan/keskmine: 0.008; 0.012

CSUBJ_COP
          1         2         3
1  1.000000  0.797835  0.001518
2  0.797835  1.000000  0.029047
3  0.001518  0.029047  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
csubj_cop, 0.0015176267 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.0; 0.002
NÕRK -> mediaan/keskmine: 0.0; 0.002
MITTEEKSISTEERIV -> mediaan/keskmine: 0.0; 0.0
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
csubj_cop, 0.0290467739 

TUGEV/MÕÕDUKAS -> mediaan/keskmine: 0.0; 0.002
NÕRK -> mediaan/keskmine: 0.0; 0.002
MITTEEKSISTEERIV -> mediaan/keskmine: 0.0; 0.0

OBL
          1         2         3
1  1.000000

## KORRELATSIOONID DIMENSIOONIDEGA

In [None]:
# dimensioonide vahel spearman!

f = 'limesurvey_loplikud_skoorid_koond.csv'
# f = 'limesurvey_loplikud_skoorid_ordinal.csv'

data = []
with open(f, 'r') as fid:
    csv_reader = csv.DictReader(fid, delimiter=',')
    fieldnames = csv_reader.fieldnames
    for line in csv_reader:
        data.append(line)

col_list1 = ['abs_avg1', 'info_avg1', 'aeg_avg1', 'afek_avg1', 'inter_avg1', 'inst_avg1', 'form_avg1', 'keer_avg1', 'subj_avg1', 'spont_avg1', 'imp_avg1', 'arg_avg1']
col_list2 = ['abs_avg2', 'info_avg2', 'aeg_avg2', 'afek_avg2', 'inter_avg2', 'inst_avg2', 'form_avg2', 'keer_avg2', 'subj_avg2', 'spont_avg2', 'imp_avg2', 'arg_avg2']

# mis on vahet nt abs_avg1 ja abs_avg2 vahel?  et oli kaks gruppi hindajaid. nt ühelt grupilt tuleb keskmiseks 1.5, teiselt grupilt keskmiseks 1.4, siis
# keskmine tuleb 1.45. kui aga juhtub, et ühelt grupilt saab 1 ja teise grupi hinnangut ei tea üldse (st saab ##)
# siis DIM_avg1 korral tehakse (1 + ##)2 = 1
# ja DIM_avg2 korral saab automaatselt -1 (ehk ei tea midagi selle teksti kohta)
# ja kui mõlemad grupid annavad ## ja ##, siis mõlemad saavad keskmiseks ka -1
# valituks osutus: avg1 veerg

dim_df = pd.DataFrame(data).iloc[:, 1:].astype(float)
dim_ver1 = dim_df[col_list1]
dim_ver1




In [None]:
def remove_elements(ls1: list[int], ls2: list[int], equal_to: int = -1) -> tuple[list[int], list[int]]:
    indices = {i for i, el in enumerate(ls1) if el == equal_to} | {i for i, el in enumerate(ls2) if el == equal_to}
    return [el for i, el in enumerate(ls1) if i not in indices], [el for i, el in enumerate(ls2) if i not in indices]

In [None]:
dim_pairs = [pair for pair in itertools.combinations(col_list1,2)]
for p in dim_pairs:
#     print(len(dim1_arr), len(dim2_arr))
    arr1, arr2 = remove_elements(dim_df[p[0]], dim_df[p[1]])
#     print(len(arr1), len(arr2))
    
    sp = stats.spearmanr(arr1, arr2)
    correlation = sp.correlation
    pval = sp.pvalue
#     if pval >= 0.05:
#         print(p)
#         print(round(pval, 5))

## KORRELATSIOONID TUNNUSTEGA

In [None]:
input_f = 'limesurvey_results4.csv'

data = []

with open(input_f, 'r') as fid:
    csv_reader = csv.DictReader(fid, delimiter=',')
    fieldnames = csv_reader.fieldnames
    for line in csv_reader:
        data.append(line)

feature_df = pd.DataFrame(data).iloc[:, 1:].astype(float)
feature_df

In [None]:
feature_names = feature_df.columns
feature_names

In [None]:
def save_csv(prefix, df):
    df.to_csv(f'korrelatsiooni_csvd/{prefix}.csv')

In [None]:
def plot_important_features(model, dimname):
    
    print(model.coef_)

    feature_importance = abs(model.coef_[0])
    
    
    feature_importance = 1.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5

    featfig = plt.figure(figsize=(15,20))
    featax = featfig.add_subplot(1, 1, 1)

    featax.barh(pos, feature_importance[sorted_idx], align='center')
    featax.set_yticks(pos)
    featax.set_yticklabels(np.array(X.columns)[sorted_idx], fontsize=10)
    featax.set_xlabel(f'Relative Feature Importance {str.upper(dimname)}')

    # plt.tight_layout()   
    plt.show()

In [None]:
def get_corr_pairs(corr_df):

    columns = [line for line in corr_df]
    matches = []

    for column, rows in corr_df.items():
        for m, score in rows.items():
            if (score > 0.6 or score < -0.6) and column != m:
                matches.append([(column, m), score])
            continue

    without_duplicates = []
    for ws, i in matches:
        ls = (sorted(ws), i)
        if ls not in without_duplicates:
            without_duplicates.append(ls)       

    return without_duplicates

In [None]:
def generate_heatmap(dim, corr_data):

    my_colors = ['black', 'lightgrey', 'white', 'red']
    my_cmap = ListedColormap(my_colors)
    bounds = [-1.0, -0.6, 0.6, 1.0]
    my_norm = BoundaryNorm(bounds, ncolors=len(my_colors))



    mask = np.triu(np.ones_like(corr_data))

    fig, ax = plt.subplots(1, 1, figsize=(20,20))
    hmap = sns.heatmap(corr_data,
                yticklabels=1, 
                ax=ax,
                linewidths=1.0,
                cmap=my_cmap,
                norm=my_norm,
                mask=mask,
               cbar_kws = dict(use_gridspec=False,location="top")
               )

    colorbar = ax.collections[0].colorbar
    hmap.figure.savefig(f'heatmapid/{dim}_heatmap.png', format='png', dpi=150)
    plt.show()


In [None]:

all_corr = feature_df.corr(method='spearman').round(2)

# save_csv('dimensions', all_corr)

In [None]:
generate_heatmap('all_dims', all_corr)

In [None]:
pairs = get_corr_pairs(all_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')

# # Lineaarsuse eeldus

Pearsoni jaoks

In [None]:
feature_pairs = [pair for pair in itertools.combinations(feature_names,2)]

In [None]:
# with PdfPages('foo.pdf') as pdf:
#     for x, y in feature_pairs:

#         figure = plt.figure()
#         ax = plt.gca()
#         ax.scatter(feature_df[x], feature_df[y])
#         ax.set_xlabel(x)
#         ax.set_ylabel(y)
#     #     ax.set_title("{} vs {}".format(x, y))
#         plt.show()
#         pdf.savefig(figure)
    