In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats 
import seaborn as sns
import statsmodels.formula.api as smf
import scikit_posthocs as sp
import seaborn as sns
# Seaborn theme setup. Both calls set the style; the later call ("whitegrid")
# is the one left in effect for the plots below.
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [2]:
# Default matplotlib font size for every figure in the notebook.
plt.rc("font", size=14)
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import BoundaryNorm, ListedColormap

In [3]:
# Names of the 78 linguistic features. get_relevant_features() pairs these
# positionally with the Kruskal-Wallis results, so the order is assumed to match
# the CSV column order after the first column — TODO confirm against the files.
columns = ['noun', 'adj', 'propn', 'adv', 'intj', 'cconj', 'sconj', 'adp', 'det', 'num', 'punct', 'symbol', 'pron', 'abbr', 'TTR', 'avg_word_len', 'avr_sent_len', 'hapax_legomena', 'coref', 'see_pron', 'see_det', '1st_pron', '2nd_pron', '3rd_pron', 'active_voice', 'passive_voice', '1st_prs_verb', '2nd_prs_verb', '3rd_prs_verb', 'core_verb', 'verbtype_ratio', 'da_inf', 'gerund', 'supine', 'verb_particle', 'discourse', 'pres_tense', 'past_tense', 'ind_mood', 'cond_mood', 'imp_mood', 'quot_mood', 'neg_polarity', 'nom_case', 'gen_case', 'part_case', 'ill_case', 'ine_case', 'ela_case', 'alla_case', 'ade_case', 'abl_case', 'tra_case', 'ter_case', 'ess_case', 'abe_case', 'com_case', 'nsubj', 'nsubj_cop', 'modal', 'acl:relc', 'csubj', 'csubj_cop', 'obj', 'ccomp', 'xcomp', 'obl', 'nmod', 'appos', 'nummod', 'amod', 'advcl', 'voc', 'cop', 'conj', 'cc', 'yneemid', 'emoticons']

def get_data(f):
    """Read a semicolon-delimited CSV file into a list of row dictionaries.

    Parameters
    ----------
    f : str or path-like
        Path of the CSV file; the first line is used as the header.

    Returns
    -------
    list of dict
        One dictionary per data row, keyed by the header names. All values
        are strings, exactly as ``csv.DictReader`` yields them.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks inside quoted fields
    # are silently rewritten.
    with open(f, 'r', newline='') as fid:
        return list(csv.DictReader(fid, delimiter=';'))


### FUNKTSIOONIDE SEKTSIOON

In [4]:
def create_array(data):
    """Turn CSV rows into a numeric 2-D array for the Kruskal-Wallis test.

    The Kruskal-Wallis input needs the feature values of every row (= file):
    [[feat11, feat12, .., feat1N], [feat21, feat22, .., feat2N], ..]

    Parameters
    ----------
    data : list of dict
        Rows as returned by ``get_data``. The first column of every row is
        skipped; every remaining value must be parseable as a float.

    Returns
    -------
    numpy.ndarray
        Float array with one row per input row and one column per feature.

    Raises
    ------
    ValueError
        If a value after the first column is not numeric.
    """
    # Drop the leading column by position and keep the remaining values.
    rows = [list(record.values())[1:] for record in data]
    # csv.DictReader yields strings; convert explicitly so that rank-based
    # statistics compare numbers rather than text.
    return np.array(rows, dtype=float)

In [5]:
def calculate_hb(rank, alpha=0.1, n_tests=78):
    """Return the Holm-Bonferroni threshold for the hypothesis at ``rank``.

    Parameters
    ----------
    rank : int or float
        1-based rank of the p-value among all tests, smallest p first
        (fractional ranks from ties are accepted).
    alpha : float, optional
        Family-wise significance level. The convention is 0.05; this
        analysis is deliberately liberal and defaults to 0.1.
    n_tests : int, optional
        Total number of hypotheses tested; defaults to 78.

    Returns
    -------
    float
        ``alpha / (n_tests - rank + 1)``.
    """
    return alpha / (n_tests - rank + 1)

In [6]:
def get_relevant_features(f, p):
    """Flag the features whose Kruskal-Wallis p-value survives Holm-Bonferroni.

    Parameters
    ----------
    f : numpy.ndarray
        Per-feature test statistics. Only its length matters here: it bounds
        how many features are reported.
    p : numpy.ndarray
        Per-feature p-values, in the same order as the module-level ``columns``.

    Returns
    -------
    list of list
        One ``[feature, rank, KW p, adjusted p, state]`` entry per feature, in
        the original feature order. ``KW p`` is rounded to 5 decimals, ``rank``
        is the rank of that rounded value (ties share the average rank),
        ``adjusted p`` is the threshold from ``calculate_hb`` and ``state`` is
        'YES' or 'NO'.
    """
    # zip() stops at the shortest input, so a shorter result array simply
    # yields fewer reported features.
    named_p = [(name, round(p_value, 5))
               for name, _statistic, p_value in zip(columns, f.tolist(), p.tolist())]
    names = [name for name, _ in named_p]
    p_values = [p_value for _, p_value in named_p]
    ranks = stats.rankdata(p_values)
    thresholds = [calculate_hb(rank) for rank in ranks]

    # Holm's method is a step-down procedure: walk the hypotheses from the
    # smallest p-value upwards and stop rejecting at the first one that misses
    # its threshold. Everything ranked after that point stays 'NO', even if it
    # would pass its own (larger) threshold.
    states = ['NO'] * len(names)
    still_rejecting = True
    for idx in np.argsort(ranks, kind='stable'):
        if still_rejecting and thresholds[idx] > p_values[idx]:
            states[idx] = 'YES'
        else:
            still_rejecting = False

    return [[name, rank, p_value, threshold, state]
            for name, rank, p_value, threshold, state
            in zip(names, ranks, p_values, thresholds, states)]

In [7]:
def calculate_dunn_test2(feature, strong_pd, weak_pd, not_pres_pd):
    """Run Dunn's post-hoc test for one feature across the three groups.

    Parameters
    ----------
    feature : str
        Column name of the feature to compare.
    strong_pd, weak_pd, not_pres_pd : pandas.DataFrame
        Per-group tables (strong/moderate, weak, not present); each must
        contain a ``feature`` column.

    Returns
    -------
    tuple
        ``(feature.upper(), dunn)`` where ``dunn`` is whatever
        ``sp.posthoc_dunn`` returns for the three groups with Holm adjustment.
    """
    # Use the three frames that were passed in. The groups may differ in
    # length, so missing values are dropped per group rather than after
    # padding them into one table.
    groups = [frame[feature].dropna()
              for frame in (strong_pd, weak_pd, not_pres_pd)]

    dunn = sp.posthoc_dunn(groups, p_adjust='holm')

    return feature.upper(), dunn


In [8]:
def generate_df(strong_array, weak_array, notpres_array):
    """Run Kruskal-Wallis on the three groups and tabulate the screening.

    Returns a pair: the raw rows produced by ``get_relevant_features`` and the
    same rows as a DataFrame with the columns
    'feature', 'rank', 'KW p', 'adjusted p', 'state'.
    """
    # stats.kruskal returns (statistic, p-value); unpack it in one step.
    kw_statistic, kw_p = stats.kruskal(strong_array, weak_array, notpres_array)
    rows = get_relevant_features(kw_statistic, kw_p)
    table = pd.DataFrame(
        rows,
        columns=['feature', 'rank', 'KW p', 'adjusted p', 'state'],
    )
    return rows, table

    

In [23]:
def display_posthoc_results(feats, strong, weak, notpres):
    """Print Dunn post-hoc results per feature and collect the significant pairs.

    Input: the feature names and the strong/moderate, weak and not-present
    DataFrames. For every feature ``calculate_dunn_test2`` compares that
    feature's measurements across the three groups; each pairwise p-value
    <= 0.05 is printed together with the mean and median of all three groups.

    Returns three lists (strong/moderate vs weak, strong/moderate vs not
    present, weak vs not present); every entry is
    ``[feature, p rounded to 10 decimals]``.
    """
    strong_w, strong_not, weak_not = [], [], []

    # comparison label -> (banner printed for a hit, list collecting the hit)
    routing = {
        'tugev/mõõdukas vs nõrk': ('GRUPP 1 (tugev/mõõdukas vs nõrk)', strong_w),
        'tugev/mõõdukas vs puudu': ('GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)', strong_not),
        'nõrk vs puudu': ('GRUPP 3 (nõrk vs MITTEEKSISTEERIV)', weak_not),
    }

    for feat in feats:
        title, p_matrix = calculate_dunn_test2(feat, strong, weak, notpres)
        print(f'{title}\n{p_matrix}')
        print('Kui p>0.05, siis need jäetakse välja!\n')

        # Lower-triangle cells of the pairwise p-value matrix, in report order.
        pairwise = (
            (p_matrix.iloc[1, 0], 'tugev/mõõdukas vs nõrk'),
            (p_matrix.iloc[2, 0], 'tugev/mõõdukas vs puudu'),
            (p_matrix.iloc[2, 1], 'nõrk vs puudu'),
        )
        for p_val, label in pairwise:
            if p_val <= 0.05:
                banner, bucket = routing[label]
                shown = round(p_val, 10)
                print(banner)
                print(f'{feat}, {shown} \n')
                bucket.append([feat, shown])
                print(f'TUGEV/MÕÕDUKAS -> keskmine: {round(strong[feat].mean(), 4)}; mediaan: {round(strong[feat].median(), 4)}')
                print(f'NÕRK -> keskmine: {round(weak[feat].mean(), 4)}; mediaan: {round(weak[feat].median(), 4)}')
                print(f'MITTEEKSISTEERIV -> keskmine: {round(notpres[feat].mean(), 4)}; mediaan: {round(notpres[feat].median(), 4)}')
            # Separator is emitted after every comparison, significant or not.
            print('================')

    return strong_w, strong_not, weak_not

### INSTRUEERIVUS

In [26]:
# Dimension 'inst': per-group feature tables (strong, weak, not present).
f1 = 'dimensioonide_grupid/inst/inst_strong.csv'
f2 = 'dimensioonide_grupid/inst/inst_weak.csv'
f3 = 'dimensioonide_grupid/inst/inst_not_present.csv'

# Kruskal-Wallis inputs (three groups)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)


# Group sizes: number of rows read from each file.
print(f'"tugev/mõõdukas" kokku -> {len(strong)}')
print(f'"nõrk" kokku -> {len(weak)}')
print(f'"puudu" kokku -> {len(notpres)}')


# Per-feature Kruskal-Wallis screening; show the whole table untruncated.
feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)

"tugev/mõõdukas" kokku -> 15
"nõrk" kokku -> 18
"puudu" kokku -> 58


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,19.0,0.04986,0.001667,NO
1,adj,58.0,0.50356,0.004762,NO
2,propn,9.0,0.00616,0.001429,NO
3,adv,27.0,0.06997,0.001923,NO
4,intj,75.5,0.90499,0.028571,NO
5,cconj,32.0,0.10623,0.002128,NO
6,sconj,53.0,0.45403,0.003846,NO
7,adp,49.0,0.38203,0.003333,NO
8,det,77.0,0.95758,0.05,NO
9,num,40.0,0.25733,0.002564,NO


In [27]:
# Keep only the features the screening marked 'YES'.
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

# Re-read the same three files as DataFrames so that features can be
# selected by column name in the post-hoc step.
strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

for r in relevant_feats:
    print(r)

# Dunn post-hoc comparisons for the retained features.
res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

pres_tense
PRES_TENSE
          1         2         3
1  1.000000  0.684228  0.001032
2  0.684228  1.000000  0.001824
3  0.001032  0.001824  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
pres_tense, 0.0010315064 

TUGEV/MÕÕDUKAS -> keskmine: 0.1151; mediaan: 0.1156
NÕRK -> keskmine: 0.1059; mediaan: 0.1084
MITTEEKSISTEERIV -> keskmine: 0.0714; mediaan: 0.0707
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
pres_tense, 0.0018241219 

TUGEV/MÕÕDUKAS -> keskmine: 0.1151; mediaan: 0.1156
NÕRK -> keskmine: 0.1059; mediaan: 0.1084
MITTEEKSISTEERIV -> keskmine: 0.0714; mediaan: 0.0707


### ABSTRAKTSUS

In [28]:
# Dimension 'abs': per-group feature tables (strong, weak, not present).
f1 = 'dimensioonide_grupid/abs/abs_strong.csv'
f2 = 'dimensioonide_grupid/abs/abs_weak.csv'
f3 = 'dimensioonide_grupid/abs/abs_not_present.csv'

# Kruskal-Wallis inputs (three groups)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

# Group sizes: number of rows read from each file.
print(f'"tugev/mõõdukas" kokku -> {len(strong)}')
print(f'"nõrk" kokku -> {len(weak)}')
print(f'"puudu" kokku -> {len(notpres)}')

# Per-feature Kruskal-Wallis screening; show the whole table untruncated.
feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
# Keep only the features the screening marked 'YES'.
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

# Re-read the same files as DataFrames for column-name access in the post-hoc step.
strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

print(relevant_feats)

# Dunn post-hoc comparisons for the retained features.
res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

"tugev/mõõdukas" kokku -> 6
"nõrk" kokku -> 36
"puudu" kokku -> 56


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,31.0,0.04637,0.002083,NO
1,adj,43.0,0.14122,0.002778,NO
2,propn,42.0,0.131,0.002703,NO
3,adv,27.0,0.03815,0.001923,NO
4,intj,5.5,0.0,0.001361,YES
5,cconj,64.0,0.55736,0.006667,NO
6,sconj,51.0,0.30248,0.003571,NO
7,adp,49.0,0.29081,0.003333,NO
8,det,30.0,0.04632,0.002041,NO
9,num,38.0,0.06478,0.002439,NO


['intj', 'symbol', '2nd_pron', 'discourse', 'imp_mood', 'quot_mood', 'abl_case', 'ter_case', 'abe_case', 'csubj', 'csubj_cop', 'obl', 'voc', 'yneemid', 'emoticons']
INTJ
     1    2    3
1  1.0  1.0  1.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
Kui p>0.05, siis need jäetakse välja!

SYMBOL
          1         2         3
1  1.000000  0.783403  0.783403
2  0.783403  1.000000  0.783403
3  0.783403  0.783403  1.000000
Kui p>0.05, siis need jäetakse välja!

2ND_PRON
          1         2         3
1  1.000000  0.765263  0.286397
2  0.765263  1.000000  0.059813
3  0.286397  0.059813  1.000000
Kui p>0.05, siis need jäetakse välja!

DISCOURSE
     1    2    3
1  1.0  1.0  1.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
Kui p>0.05, siis need jäetakse välja!

IMP_MOOD
          1         2         3
1  1.000000  0.962507  0.942034
2  0.962507  1.000000  0.528712
3  0.942034  0.528712  1.000000
Kui p>0.05, siis need jäetakse välja!

QUOT_MOOD
     1    2    3
1  1.0  1.0  1.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
Ku

### AFEKTIIVSUS

In [29]:
# Dimension 'afek': per-group feature tables (strong, weak, not present).
f1 = 'dimensioonide_grupid/afek/afek_strong.csv'
f2 = 'dimensioonide_grupid/afek/afek_weak.csv'
f3 = 'dimensioonide_grupid/afek/afek_not_present.csv'

# Kruskal-Wallis inputs (three groups)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

# Group sizes: number of rows read from each file.
print(f'"tugev/mõõdukas" kokku -> {len(strong)}')
print(f'"nõrk" kokku -> {len(weak)}')
print(f'"puudu" kokku -> {len(notpres)}')

# Per-feature Kruskal-Wallis screening; show the whole table untruncated.
feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)


# Keep only the features the screening marked 'YES'.
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

# Re-read the same files as DataFrames for column-name access in the post-hoc step.
strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

print(relevant_feats)

# Dunn post-hoc comparisons for the retained features.
res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

"tugev/mõõdukas" kokku -> 28
"nõrk" kokku -> 18
"puudu" kokku -> 45


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,7.5,0.0,0.001399,YES
1,adj,28.0,0.00056,0.001961,YES
2,propn,23.0,0.00016,0.001786,YES
3,adv,7.5,0.0,0.001399,YES
4,intj,20.0,9e-05,0.001695,YES
5,cconj,64.0,0.36595,0.006667,NO
6,sconj,7.5,0.0,0.001399,YES
7,adp,57.0,0.2465,0.004545,NO
8,det,30.0,0.00189,0.002041,YES
9,num,45.0,0.02876,0.002941,NO


['noun', 'adj', 'propn', 'adv', 'intj', 'sconj', 'det', 'pron', 'avg_word_len', 'coref', 'see_pron', '1st_pron', '2nd_pron', '3rd_pron', 'active_voice', 'passive_voice', '1st_prs_verb', '2nd_prs_verb', 'core_verb', 'supine', 'discourse', 'ind_mood', 'imp_mood', 'neg_polarity', 'gen_case', 'ade_case', 'modal', 'xcomp', 'nmod', 'cop']
NOUN
              1         2             3
1  1.000000e+00  0.077622  2.021318e-10
2  7.762157e-02  1.000000  3.966194e-04
3  2.021318e-10  0.000397  1.000000e+00
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 2e-10 

TUGEV/MÕÕDUKAS -> keskmine: 0.2091; mediaan: 0.2195
NÕRK -> keskmine: 0.2463; mediaan: 0.2414
MITTEEKSISTEERIV -> keskmine: 0.3163; mediaan: 0.3083
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
noun, 0.0003966194 

TUGEV/MÕÕDUKAS -> keskmine: 0.2091; mediaan: 0.2195
NÕRK -> keskmine: 0.2463; mediaan: 0.2414
MITTEEKSISTEERIV -> keskmine: 0.3163; mediaan: 0.3083
ADJ
          1         2         3
1  1.000000  0

### AEG

In [30]:
# Dimension 'aeg': per-group feature tables (strong, weak, not present).
f1 = 'dimensioonide_grupid/aeg/aeg_strong.csv'
f2 = 'dimensioonide_grupid/aeg/aeg_weak.csv'
f3 = 'dimensioonide_grupid/aeg/aeg_not_present.csv'

# Kruskal-Wallis inputs (three groups)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

# Group sizes: number of rows read from each file.
print(len(strong))
print(len(weak))
print(len(notpres))

# Per-feature Kruskal-Wallis screening; show the whole table untruncated.
feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
# Keep only the features the screening marked 'YES'.
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

# Re-read the same files as DataFrames for column-name access in the post-hoc step.
strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

print(relevant_feats)

# Dunn post-hoc comparisons for the retained features.
res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

16
39
19


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,73.0,0.83411,0.016667,NO
1,adj,63.0,0.64618,0.00625,NO
2,propn,9.0,0.01225,0.001429,NO
3,adv,45.0,0.29196,0.002941,NO
4,intj,26.5,0.1576,0.001905,NO
5,cconj,37.5,0.2464,0.00241,NO
6,sconj,18.0,0.09044,0.001639,NO
7,adp,69.0,0.66537,0.01,NO
8,det,30.0,0.18689,0.002041,NO
9,num,5.0,0.00341,0.001351,NO


['da_inf', 'pres_tense', 'past_tense', 'nummod']
DA_INF
          1         2         3
1  1.000000  0.011486  0.000284
2  0.011486  1.000000  0.071271
3  0.000284  0.071271  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
da_inf, 0.0114856217 

TUGEV/MÕÕDUKAS -> keskmine: 0.0087; mediaan: 0.0086
NÕRK -> keskmine: 0.0199; mediaan: 0.0172
MITTEEKSISTEERIV -> keskmine: 0.0252; mediaan: 0.0263
GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
da_inf, 0.0002838366 

TUGEV/MÕÕDUKAS -> keskmine: 0.0087; mediaan: 0.0086
NÕRK -> keskmine: 0.0199; mediaan: 0.0172
MITTEEKSISTEERIV -> keskmine: 0.0252; mediaan: 0.0263
PRES_TENSE
          1         2         3
1  1.000000  0.002708  0.000085
2  0.002708  1.000000  0.093619
3  0.000085  0.093619  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
pres_tense, 0.0027083704 

TUGEV/MÕÕDUKAS -> keskmine: 0.0475; mediaan: 0.0313
NÕRK -> keskmine: 0.0861; mediaan: 0.0879
MITTEEKSISTEERIV -> keskm

### ARGUMENTATIIVSUS

In [37]:
# Dimension 'arg': per-group feature tables (strong, weak, not present);
# this dimension reads the '*_emoticons.csv' variants of the files.
f1 = 'dimensioonide_grupid/arg/arg_strong_emoticons.csv'
f2 = 'dimensioonide_grupid/arg/arg_weak_emoticons.csv'
f3 = 'dimensioonide_grupid/arg/arg_not_present_emoticons.csv'

# Kruskal-Wallis inputs (three groups)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)


# NOTE(review): the group-size prints are commented out in this cell only.
# print(len(strong))
# print(len(weak))
# print(len(notpres))

# Per-feature Kruskal-Wallis screening; show the whole table untruncated.
feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)

    
# Keep only the features the screening marked 'YES'.
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']

# Re-read the same files as DataFrames for column-name access in the post-hoc step.
strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

print(relevant_feats)

# Dunn post-hoc comparisons for the retained features.
res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

21
27
31


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,46.0,0.27359,0.00303,NO
1,adj,40.0,0.23324,0.002564,NO
2,propn,3.0,0.00043,0.001316,YES
3,adv,39.0,0.19915,0.0025,NO
4,intj,73.5,0.81804,0.018182,NO
5,cconj,48.0,0.28862,0.003226,NO
6,sconj,20.0,0.06028,0.001695,NO
7,adp,63.0,0.59528,0.00625,NO
8,det,37.0,0.1903,0.002381,NO
9,num,26.0,0.08826,0.001887,NO


['propn', 'da_inf', 'pres_tense', 'past_tense', 'neg_polarity']
PROPN
          1         2         3
1  1.000000  0.231327  0.000532
2  0.231327  1.000000  0.013760
3  0.000532  0.013760  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
propn, 0.000531673 

TUGEV/MÕÕDUKAS -> keskmine: 0.0315; mediaan: 0.0196
NÕRK -> keskmine: 0.0469; mediaan: 0.0376
MITTEEKSISTEERIV -> keskmine: 0.09; mediaan: 0.0763
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
propn, 0.0137603004 

TUGEV/MÕÕDUKAS -> keskmine: 0.0315; mediaan: 0.0196
NÕRK -> keskmine: 0.0469; mediaan: 0.0376
MITTEEKSISTEERIV -> keskmine: 0.09; mediaan: 0.0763
DA_INF
          1         2         3
1  1.000000  0.999051  0.003872
2  0.999051  1.000000  0.002610
3  0.003872  0.002610  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
da_inf, 0.0038717156 

TUGEV/MÕÕDUKAS -> keskmine: 0.0264; mediaan: 0.0245
NÕRK -> keskmine: 0.0247; mediaan: 0.0205
MITTEEKSISTE

### FORMAALSUS

In [39]:
# Dimension 'form': per-group feature tables (strong, weak, not present);
# this dimension reads the '*_emoticons.csv' variants of the files.
f1 = 'dimensioonide_grupid/form/form_strong_emoticons.csv'
f2 = 'dimensioonide_grupid/form/form_weak_emoticons.csv'
f3 = 'dimensioonide_grupid/form/form_not_present_emoticons.csv'

# Kruskal-Wallis inputs (three groups)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

# Group sizes: number of rows read from each file.
print(len(strong))
print(len(weak))
print(len(notpres))

# Per-feature Kruskal-Wallis screening; show the whole table untruncated.
feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
# Keep only the features the screening marked 'YES'.
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']
print(relevant_feats)


# Re-read the same files as DataFrames for column-name access in the post-hoc step.
strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

# Dunn post-hoc comparisons for the retained features.
res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

14
28
49


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,5.5,0.0,0.001361,YES
1,adj,22.0,0.00087,0.001754,YES
2,propn,14.0,0.00014,0.001538,YES
3,adv,5.5,0.0,0.001361,YES
4,intj,44.5,0.07976,0.002899,NO
5,cconj,52.0,0.12085,0.003704,NO
6,sconj,27.0,0.00238,0.001923,NO
7,adp,73.0,0.6906,0.016667,NO
8,det,12.5,0.00012,0.001504,YES
9,num,39.0,0.03563,0.0025,NO


['noun', 'adj', 'propn', 'adv', 'det', 'punct', 'pron', 'avg_word_len', 'avr_sent_len', 'coref', '1st_pron', 'active_voice', 'passive_voice', '1st_prs_verb', '2nd_prs_verb', 'core_verb', 'verbtype_ratio', 'ind_mood', 'imp_mood', 'gen_case', 'nsubj_cop', 'xcomp', 'nmod', 'cop']
NOUN
          1         2         3
1  1.000000  0.114095  0.000005
2  0.114095  1.000000  0.000173
3  0.000005  0.000173  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 5.3796e-06 

TUGEV/MÕÕDUKAS -> keskmine: 0.3471; mediaan: 0.3352
NÕRK -> keskmine: 0.3021; mediaan: 0.2989
MITTEEKSISTEERIV -> keskmine: 0.2421; mediaan: 0.2446
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
noun, 0.0001730261 

TUGEV/MÕÕDUKAS -> keskmine: 0.3471; mediaan: 0.3352
NÕRK -> keskmine: 0.3021; mediaan: 0.2989
MITTEEKSISTEERIV -> keskmine: 0.2421; mediaan: 0.2446
ADJ
          1         2         3
1  1.000000  0.008966  0.000535
2  0.008966  1.000000  0.385642
3  0.000535  0.385642  1.000000
Ku

### IMPERSONAALSUS

In [40]:
# Dimension 'imp': per-group feature tables (strong, weak, not present).
f1 = 'dimensioonide_grupid/imp/imp_strong.csv'
f2 = 'dimensioonide_grupid/imp/imp_weak.csv'
f3 = 'dimensioonide_grupid/imp/imp_not_present.csv'

# Kruskal-Wallis inputs (three groups)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)


# Group sizes: number of rows read from each file.
print(len(strong))
print(len(weak))
print(len(notpres))

# Per-feature Kruskal-Wallis screening; show the whole table untruncated.
feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
# Keep only the features the screening marked 'YES'.
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']
print(relevant_feats)

# Re-read the same files as DataFrames for column-name access in the post-hoc step.
strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

# Dunn post-hoc comparisons for the retained features.
res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

37
21
28


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,4.5,0.0,0.001342,YES
1,adj,16.0,0.00051,0.001587,YES
2,propn,37.0,0.07755,0.002381,NO
3,adv,9.5,1e-05,0.001439,YES
4,intj,21.5,0.00137,0.001739,YES
5,cconj,59.0,0.32321,0.005,NO
6,sconj,29.0,0.00906,0.002,NO
7,adp,67.0,0.4757,0.008333,NO
8,det,19.0,0.00077,0.001667,YES
9,num,48.0,0.13464,0.003226,NO


['noun', 'adj', 'adv', 'intj', 'det', 'pron', 'abbr', 'avg_word_len', 'coref', '1st_pron', '2nd_pron', '3rd_pron', 'active_voice', 'passive_voice', '1st_prs_verb', '2nd_prs_verb', 'core_verb', 'verbtype_ratio', 'supine', 'discourse', 'ind_mood', 'imp_mood', 'gen_case', 'xcomp', 'nmod']
NOUN
              1         2             3
1  1.000000e+00  0.000986  7.810205e-10
2  9.864760e-04  1.000000  2.876591e-02
3  7.810205e-10  0.028766  1.000000e+00
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
noun, 0.000986476 

TUGEV/MÕÕDUKAS -> keskmine: 0.3277; mediaan: 0.3214
NÕRK -> keskmine: 0.2633; mediaan: 0.2625
MITTEEKSISTEERIV -> keskmine: 0.2145; mediaan: 0.2158
GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 8e-10 

TUGEV/MÕÕDUKAS -> keskmine: 0.3277; mediaan: 0.3214
NÕRK -> keskmine: 0.2633; mediaan: 0.2625
MITTEEKSISTEERIV -> keskmine: 0.2145; mediaan: 0.2158
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
noun, 0.0287659082 

TUGEV/MÕÕDUKAS -> keskmine: 0.3277; mediaan: 

### INFOTIHEDUS

In [41]:
# Dimension 'info': per-group feature tables (strong, weak, not present).
f1 = 'dimensioonide_grupid/info/info_strong.csv'
f2 = 'dimensioonide_grupid/info/info_weak.csv'
f3 = 'dimensioonide_grupid/info/info_not_present.csv'

# Kruskal-Wallis inputs (three groups)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

# Group sizes: number of rows read from each file.
print(len(strong))
print(len(weak))
print(len(notpres))

# Per-feature Kruskal-Wallis screening; show the whole table untruncated.
feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
# Keep only the features the screening marked 'YES'.
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']
print(relevant_feats)

# Re-read the same files as DataFrames for column-name access in the post-hoc step.
strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

# Dunn post-hoc comparisons for the retained features.
res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

45
26
5


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,14.0,0.00068,0.001538,YES
1,adj,33.0,0.03403,0.002174,NO
2,propn,24.0,0.01136,0.001818,NO
3,adv,13.0,0.0004,0.001515,YES
4,intj,22.5,0.00949,0.00177,NO
5,cconj,64.0,0.65293,0.006667,NO
6,sconj,15.0,0.0044,0.001563,NO
7,adp,51.0,0.22675,0.003571,NO
8,det,16.0,0.0046,0.001587,NO
9,num,10.5,6e-05,0.00146,YES


['noun', 'adv', 'num', 'pron', 'avg_word_len', 'coref', '1st_pron', '2nd_pron', '1st_prs_verb', '2nd_prs_verb', 'imp_mood', 'neg_polarity', 'xcomp', 'nummod']
NOUN
          1         2         3
1  1.000000  0.002513  0.027851
2  0.002513  1.000000  0.490774
3  0.027851  0.490774  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
noun, 0.0025134747 

TUGEV/MÕÕDUKAS -> keskmine: 0.3005; mediaan: 0.3016
NÕRK -> keskmine: 0.2472; mediaan: 0.2364
MITTEEKSISTEERIV -> keskmine: 0.2298; mediaan: 0.2194
GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 0.0278508216 

TUGEV/MÕÕDUKAS -> keskmine: 0.3005; mediaan: 0.3016
NÕRK -> keskmine: 0.2472; mediaan: 0.2364
MITTEEKSISTEERIV -> keskmine: 0.2298; mediaan: 0.2194
ADV
          1         2         3
1  1.000000  0.000531  0.099573
2  0.000531  1.000000  0.998293
3  0.099573  0.998293  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
adv, 0.0005307072 

TUGEV/MÕÕDUKAS -> keskmine: 0

### INTERAKTIIVSUS

In [42]:
# Dimension 'inter': per-group feature tables (strong, weak, not present).
f1 = 'dimensioonide_grupid/inter/inter_strong.csv'
f2 = 'dimensioonide_grupid/inter/inter_weak.csv'
f3 = 'dimensioonide_grupid/inter/inter_not_present.csv'

# Kruskal-Wallis inputs (three groups)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

# Group sizes: number of rows read from each file.
print(len(strong))
print(len(weak))
print(len(notpres))

# Per-feature Kruskal-Wallis screening; show the whole table untruncated.
feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
# Keep only the features the screening marked 'YES'.
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']
print(relevant_feats)

# Re-read the same files as DataFrames for column-name access in the post-hoc step.
strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

# Dunn post-hoc comparisons for the retained features.
res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

10
15
75


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,7.0,1e-05,0.001389,YES
1,adj,25.0,0.0042,0.001852,NO
2,propn,26.0,0.0048,0.001887,NO
3,adv,18.0,0.00177,0.001639,NO
4,intj,8.5,4e-05,0.001418,YES
5,cconj,78.0,0.96369,0.1,NO
6,sconj,35.0,0.02819,0.002273,NO
7,adp,49.5,0.16392,0.00339,NO
8,det,48.0,0.15517,0.003226,NO
9,num,34.0,0.02727,0.002222,NO


['noun', 'intj', 'pron', 'TTR', 'avg_word_len', 'hapax_legomena', 'coref', '1st_pron', '2nd_pron', '1st_prs_verb', '2nd_prs_verb', 'discourse', 'imp_mood', 'neg_polarity', 'gen_case', 'nmod', 'voc']
NOUN
          1         2         3
1  1.000000  0.522908  0.000532
2  0.522908  1.000000  0.000798
3  0.000532  0.000798  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 0.0005316103 

TUGEV/MÕÕDUKAS -> keskmine: 0.2055; mediaan: 0.2123
NÕRK -> keskmine: 0.2219; mediaan: 0.2265
MITTEEKSISTEERIV -> keskmine: 0.2888; mediaan: 0.2857
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
noun, 0.0007983047 

TUGEV/MÕÕDUKAS -> keskmine: 0.2055; mediaan: 0.2123
NÕRK -> keskmine: 0.2219; mediaan: 0.2265
MITTEEKSISTEERIV -> keskmine: 0.2888; mediaan: 0.2857
INTJ
          1         2         3
1  1.000000  0.026642  0.000033
2  0.026642  1.000000  0.096095
3  0.000033  0.096095  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
intj, 

### KEERULISUS

In [45]:
# Dimension 'keer': per-group feature tables (strong, weak, not present);
# this dimension reads the '*_emoticons.csv' variants of the files.
f1 = 'dimensioonide_grupid/keer/keer_strong_emoticons.csv'
f2 = 'dimensioonide_grupid/keer/keer_weak_emoticons.csv'
f3 = 'dimensioonide_grupid/keer/keer_not_present_emoticons.csv'

# Kruskal-Wallis inputs (three groups)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

# Group sizes: number of rows read from each file.
print(len(strong))
print(len(weak))
print(len(notpres))

# Per-feature Kruskal-Wallis screening; show the whole table untruncated.
feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
# Keep only the features the screening marked 'YES'.
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']
print(relevant_feats)

# Re-read the same files as DataFrames for column-name access in the post-hoc step.
strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

# Dunn post-hoc comparisons for the retained features.
res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

9
25
53


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,16.0,0.01065,0.001587,NO
1,adj,22.0,0.01994,0.001754,NO
2,propn,25.0,0.02839,0.001852,NO
3,adv,26.0,0.03028,0.001887,NO
4,intj,38.5,0.13004,0.002469,NO
5,cconj,74.0,0.82388,0.02,NO
6,sconj,46.0,0.16686,0.00303,NO
7,adp,61.0,0.52861,0.005556,NO
8,det,37.0,0.12714,0.002381,NO
9,num,47.0,0.17091,0.003125,NO


['punct', 'abbr', '1st_pron', '1st_prs_verb', 'core_verb', 'nmod']
PUNCT
          1         2         3
1  1.000000  0.637972  0.016343
2  0.637972  1.000000  0.004476
3  0.016343  0.004476  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
punct, 0.0163434264 

TUGEV/MÕÕDUKAS -> keskmine: 0.1268; mediaan: 0.1184
NÕRK -> keskmine: 0.1265; mediaan: 0.125
MITTEEKSISTEERIV -> keskmine: 0.1478; mediaan: 0.1477
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
punct, 0.0044760158 

TUGEV/MÕÕDUKAS -> keskmine: 0.1268; mediaan: 0.1184
NÕRK -> keskmine: 0.1265; mediaan: 0.125
MITTEEKSISTEERIV -> keskmine: 0.1478; mediaan: 0.1477
ABBR
          1         2         3
1  1.000000  0.054014  0.000676
2  0.054014  1.000000  0.054014
3  0.000676  0.054014  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
abbr, 0.0006755665 

TUGEV/MÕÕDUKAS -> keskmine: 0.0451; mediaan: 0.0537
NÕRK -> keskmine: 0.0169; mediaan: 0.0076
MITTEEKSIS

### SPONTAANSUS

In [46]:
# Dimension 'spont': per-group feature tables (strong, weak, not present).
f1 = 'dimensioonide_grupid/spont/spont_strong.csv'
f2 = 'dimensioonide_grupid/spont/spont_weak.csv'
f3 = 'dimensioonide_grupid/spont/spont_not_present.csv'

# Kruskal-Wallis inputs (three groups)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)


# Group sizes: number of rows read from each file.
print(len(strong))
print(len(weak))
print(len(notpres))

# Per-feature Kruskal-Wallis screening; show the whole table untruncated.
feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
# Keep only the features the screening marked 'YES'.
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']
print(relevant_feats)

# Re-read the same files as DataFrames for column-name access in the post-hoc step.
strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

# Dunn post-hoc comparisons for the retained features.
res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

12
6
79


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,4.0,0.0,0.001333,YES
1,adj,36.0,0.03028,0.002326,NO
2,propn,19.0,0.00054,0.001667,YES
3,adv,9.5,1e-05,0.001439,YES
4,intj,4.0,0.0,0.001333,YES
5,cconj,65.0,0.44997,0.007143,NO
6,sconj,46.0,0.11811,0.00303,NO
7,adp,58.0,0.31165,0.004762,NO
8,det,52.0,0.18464,0.003704,NO
9,num,40.0,0.07267,0.002564,NO


['noun', 'propn', 'adv', 'intj', 'pron', 'avg_word_len', 'coref', '1st_pron', '2nd_pron', '1st_prs_verb', '2nd_prs_verb', 'core_verb', 'discourse', 'imp_mood', 'neg_polarity', 'gen_case', 'nsubj_cop', 'obl', 'nmod', 'cop']
NOUN
          1         2         3
1  1.000000  0.473671  0.000009
2  0.473671  1.000000  0.020539
3  0.000009  0.020539  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 9.2864e-06 

TUGEV/MÕÕDUKAS -> keskmine: 0.1976; mediaan: 0.2123
NÕRK -> keskmine: 0.2369; mediaan: 0.2305
MITTEEKSISTEERIV -> keskmine: 0.296; mediaan: 0.2963
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
noun, 0.0205386879 

TUGEV/MÕÕDUKAS -> keskmine: 0.1976; mediaan: 0.2123
NÕRK -> keskmine: 0.2369; mediaan: 0.2305
MITTEEKSISTEERIV -> keskmine: 0.296; mediaan: 0.2963
PROPN
          1         2         3
1  1.000000  0.618473  0.001343
2  0.618473  1.000000  0.095346
3  0.001343  0.095346  1.000000
Kui p>0.05, siis need jäetakse välja!

GRUPP 2 (tugev/mõõ

### SUBJEKTIIVSUS

In [47]:
# Dimension 'subj': per-group feature tables (strong, weak, not present).
f1 = 'dimensioonide_grupid/subj/subj_strong.csv'
f2 = 'dimensioonide_grupid/subj/subj_weak.csv'
f3 = 'dimensioonide_grupid/subj/subj_not_present.csv'

# Kruskal-Wallis inputs (three groups)
strong = get_data(f1)
weak = get_data(f2)
notpres = get_data(f3)

# Group sizes: number of rows read from each file.
print(len(strong))
print(len(weak))
print(len(notpres))

# Per-feature Kruskal-Wallis screening; show the whole table untruncated.
feature_data, df = generate_df(create_array(strong), create_array(weak), create_array(notpres))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    
# Keep only the features the screening marked 'YES'.
relevant_feats = [feat[0] for feat in feature_data if feat[4] == 'YES']
print(relevant_feats)

# Re-read the same files as DataFrames for column-name access in the post-hoc step.
strong_pd = pd.read_csv(f1, sep=';')
weak_pd = pd.read_csv(f2, sep=';')
notpres_pd = pd.read_csv(f3, sep=';')

# Dunn post-hoc comparisons for the retained features.
res1, res2, res3 = display_posthoc_results(relevant_feats, strong_pd, weak_pd, notpres_pd)

30
15
52


Unnamed: 0,feature,rank,KW p,adjusted p,state
0,noun,7.5,0.0,0.001399,YES
1,adj,23.0,0.00042,0.001786,YES
2,propn,7.5,0.0,0.001399,YES
3,adv,7.5,0.0,0.001399,YES
4,intj,21.5,0.00039,0.001739,YES
5,cconj,76.0,0.79353,0.033333,NO
6,sconj,30.0,0.00202,0.002041,YES
7,adp,69.0,0.7177,0.01,NO
8,det,34.0,0.00273,0.002222,NO
9,num,54.0,0.26488,0.004,NO


['noun', 'adj', 'propn', 'adv', 'intj', 'sconj', 'punct', 'pron', 'avg_word_len', 'coref', 'see_pron', '1st_pron', '2nd_pron', 'active_voice', 'passive_voice', '1st_prs_verb', '2nd_prs_verb', 'core_verb', 'supine', 'discourse', 'ind_mood', 'imp_mood', 'neg_polarity', 'gen_case', 'ade_case', 'nsubj_cop', 'csubj_cop', 'obl', 'nmod', 'amod', 'cop']
NOUN
              1         2             3
1  1.000000e+00  0.032324  2.339419e-09
2  3.232403e-02  1.000000  2.478731e-02
3  2.339419e-09  0.024787  1.000000e+00
Kui p>0.05, siis need jäetakse välja!

GRUPP 1 (tugev/mõõdukas vs nõrk)
noun, 0.0323240309 

TUGEV/MÕÕDUKAS -> keskmine: 0.2169; mediaan: 0.2237
NÕRK -> keskmine: 0.2633; mediaan: 0.2629
MITTEEKSISTEERIV -> keskmine: 0.3115; mediaan: 0.3061
GRUPP 2 (tugev/mõõdukas vs MITTEEKSISTEERIV)
noun, 2.3e-09 

TUGEV/MÕÕDUKAS -> keskmine: 0.2169; mediaan: 0.2237
NÕRK -> keskmine: 0.2633; mediaan: 0.2629
MITTEEKSISTEERIV -> keskmine: 0.3115; mediaan: 0.3061
GRUPP 3 (nõrk vs MITTEEKSISTEERIV)
no

## KORRELATSIOONID

In [None]:
input_f = 'limesurvey_feature_results_w_dims_uus.csv'

# Rows of the comma-delimited results file, one dict per row keyed by header.
data = []

with open(input_f, 'r') as fid:
    csv_reader = csv.DictReader(fid, delimiter=',')
    # Reading .fieldnames consumes the header line before the rows are pulled.
    fieldnames = csv_reader.fieldnames
    data.extend(csv_reader)

In [None]:
# Build a frame from the row dicts, drop the first column by position and cast
# everything else to float (csv.DictReader yields strings).
initial_df = pd.DataFrame(data).iloc[:, 1:].astype(float)

In [None]:
# Show the numeric frame as the cell output.
initial_df

In [None]:
# Columns from position 12 onward are taken as the feature columns
# (presumably the first 12 hold the dimension scores — TODO confirm).
feature_names = initial_df.columns[12:]
feature_names

In [None]:
def save_csv(prefix, df):
    """Write ``df`` to ``korrelatsiooni_csvd/<prefix>.csv``.

    The output directory is created when it is missing, so the call also
    works in a working directory where it does not exist yet.

    Parameters
    ----------
    prefix : str
        File name without the ``.csv`` extension.
    df : pandas.DataFrame
        Table to write; ``to_csv`` is called with its default options.
    """
    import os  # local import keeps this cell self-contained

    out_dir = 'korrelatsiooni_csvd'
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(f'{out_dir}/{prefix}.csv')

In [None]:
def plot_important_features(model, dimname, feature_names=None):
    """Draw a horizontal bar chart of relative coefficient magnitudes.

    The absolute values of ``model.coef_[0]`` are divided by their maximum
    and plotted in ascending order.

    Parameters
    ----------
    model : fitted estimator exposing ``coef_``
        Only the first row of ``coef_`` is used.
    dimname : str
        Upper-cased and appended to the x-axis label.
    feature_names : sequence of str, optional
        Tick labels, in the column order the model was fitted on. When
        omitted, ``model.feature_names_in_`` is used if the model has it;
        otherwise the notebook-level ``X.columns`` is used, as before.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of coefficients.
    """
    print(model.coef_)

    feature_importance = abs(model.coef_[0])

    # Scale to [0, 1]; skip the division when every coefficient is zero so
    # the bars stay at 0 instead of turning into NaN.
    largest = feature_importance.max()
    if largest > 0:
        feature_importance = 1.0 * (feature_importance / largest)

    # Prefer labels that belong to this model over whatever the global X
    # happens to hold when the function is called.
    if feature_names is None:
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X.columns
    labels = np.asarray(feature_names)
    if labels.shape[0] != feature_importance.shape[0]:
        raise ValueError(
            f'{labels.shape[0]} feature names given for '
            f'{feature_importance.shape[0]} coefficients'
        )

    sorted_idx = np.argsort(feature_importance)
    bar_pos = np.arange(sorted_idx.shape[0]) + .5

    featfig = plt.figure(figsize=(15,20))
    featax = featfig.add_subplot(1, 1, 1)

    featax.barh(bar_pos, feature_importance[sorted_idx], align='center')
    featax.set_yticks(bar_pos)
    featax.set_yticklabels(labels[sorted_idx], fontsize=10)
    featax.set_xlabel(f'Relative Feature Importance {str.upper(dimname)}')

    # plt.tight_layout()
    plt.show()

In [None]:
def get_corr_pairs(corr_df, threshold=0.6):
    """Return the distinct column pairs whose correlation exceeds a cut-off.

    Parameters
    ----------
    corr_df : pandas.DataFrame
        Correlation matrix; iterated column by column, then row by row.
    threshold : float, optional
        A pair is kept when the absolute value of its score is strictly
        greater than this (default 0.6).

    Returns
    -------
    list of (list, score)
        One entry per distinct combination of sorted label pair and score,
        in order of first appearance. Cells whose row and column label are
        equal are skipped; NaN scores never pass the comparison.
    """
    seen = set()
    without_duplicates = []

    for column, rows in corr_df.items():
        for m, score in rows.items():
            if column == m or not abs(score) > threshold:
                continue
            names = sorted((column, m))
            # A symmetric matrix yields each pair twice; the set keeps the
            # first occurrence only, without rescanning the result list.
            key = (tuple(names), score)
            if key not in seen:
                seen.add(key)
                without_duplicates.append((names, score))

    return without_duplicates

In [None]:
def generate_heatmap(dim, corr_data):
    """Draw a banded heatmap of ``corr_data`` and save it as a PNG.

    Values are coloured by interval using the bounds -1.0, -0.6, 0.6 and 1.0.
    The upper triangle including the diagonal is masked. The figure is
    written to ``heatmapid/<dim>_heatmap.png`` (the directory is created if
    needed) and then shown.

    Parameters
    ----------
    dim : str
        Used in the output file name.
    corr_data : pandas.DataFrame
        Square correlation matrix to plot.
    """
    import os  # local import keeps this cell self-contained

    # NOTE(review): four colours are supplied for the three intervals defined
    # by `bounds`, so one of the middle colours is presumably never drawn -
    # confirm against BoundaryNorm's colour-index interpolation.
    my_colors = ['black', 'lightgrey', 'white', 'red']
    my_cmap = ListedColormap(my_colors)
    bounds = [-1.0, -0.6, 0.6, 1.0]
    my_norm = BoundaryNorm(bounds, ncolors=len(my_colors))

    # Non-zero entries of the mask hide the corresponding cells.
    mask = np.triu(np.ones_like(corr_data))

    fig, ax = plt.subplots(1, 1, figsize=(20,20))
    hmap = sns.heatmap(corr_data,
                yticklabels=1,
                ax=ax,
                linewidths=1.0,
                cmap=my_cmap,
                norm=my_norm,
                mask=mask,
               cbar_kws = dict(use_gridspec=False,location="top")
               )

    out_dir = 'heatmapid'
    os.makedirs(out_dir, exist_ok=True)
    hmap.figure.savefig(f'{out_dir}/{dim}_heatmap.png', format='png', dpi=150)
    plt.show()


In [None]:
# Pearson correlations between all columns from position 12 onwards,
# rounded to two decimals and written to CSV.
# NOTE(review): the file prefix is 'dimensions' although this is the same
# column slice that feature_names is taken from - confirm the name is intended.
all_df = initial_df.iloc[:, 12:]
pearson_all = all_df.corr(method='pearson')
all_corr = pearson_all.round(2)

save_csv('dimensions', all_corr)

In [None]:
# Render the heatmap of all_corr and save it as heatmapid/all_dims_heatmap.png.
generate_heatmap('all_dims', all_corr)

In [None]:
# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(all_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')

## ABSTRAKTNE

In [None]:
# Pull out the 'abs' column and display it.
abs_df = initial_df.loc[:, 'abs']
abs_df
# Kept for manual inspection: initial_df[initial_df['abs']<=1]

In [None]:
# Split the data for the 'abs' dimension with make_new_df (defined in an
# earlier cell) and print the sizes of the two subsets.
pos, neg = make_new_df('abs')

subset_sizes = (len(pos), len(neg))
print(*subset_sizes)

In [None]:
# Display the first of the two subsets returned by make_new_df('abs').
pos

In [None]:
# Pearson correlations within the `pos` subset, rounded to two decimals,
# saved as korrelatsiooni_csvd/abs.csv and displayed.
abs_pearson = pos.corr(method='pearson')
abs_corr = abs_pearson.round(2)
save_csv('abs', abs_corr)
abs_corr


In [None]:
# Render the heatmap of abs_corr and save it as heatmapid/abs_heatmap.png.
generate_heatmap('abs', abs_corr)

In [None]:
# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(abs_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')


### NORMAALJAOTUS

In [None]:
def generate_plot(dim, df1, df2, features):
    """Plot fitted normal density curves of two frames, one panel per feature.

    For every feature a normal pdf is evaluated at the sorted values of that
    column, using the column's own mean and standard deviation, once for
    ``df1`` and once for ``df2``. Panels are laid out in two columns.

    Parameters
    ----------
    dim : str
        Currently unused; only the commented-out ``savefig`` line refers to it.
    df1, df2 : pandas.DataFrame
        Frames that both contain every column named in ``features``.
    features : iterable
        Column labels to plot.

    Notes
    -----
    The grid has at least 40 rows, matching the previous fixed 40x2 layout,
    and grows when more than 80 features are given, which used to raise
    inside ``plt.subplot``. Legend labels are the fixed strings '>=2' and
    '<2'. NOTE(review): confirm these match the split that produced
    ``df1``/``df2``.
    """
    import math  # local import keeps this cell self-contained

    features = list(features)
    n_cols = 2
    n_rows = max(40, math.ceil(len(features) / n_cols))

    # 5 inches per row reproduces the former 20x200 figure for 40 rows. The
    # panels are added one by one; no axes are pre-created and left unused.
    fig = plt.figure(figsize=(20, 5 * n_rows))
    fig.subplots_adjust(hspace=0.5)

    for i, feature in enumerate(features):
        name = str(feature)

        xs1 = df1[name].sort_values()
        df1_mean = np.mean(df1[name])
        df1_std = np.std(df1[name])
        pdf1 = stats.norm.pdf(xs1, df1_mean, df1_std)

        xs2 = df2[name].sort_values()
        df2_mean = np.mean(df2[name])
        df2_std = np.std(df2[name])
        pdf2 = stats.norm.pdf(xs2, df2_mean, df2_std)

        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        ax.plot(xs1, pdf1, label='>=2')
        ax.plot(xs2, pdf2, label='<2')
        ax.legend(loc=1, prop={'size': 20})
        ax.set_xlabel(name.upper())

#         plt.savefig(f'normdist_plots/{dim}.pdf')

In [None]:
# Draw the per-feature normal curves for the two 'abs' subsets.
generate_plot('abs', pos, neg, feature_names)

### LOGISTILINE REGRESSIOON

In [None]:
# Plain Python list of the feature column labels, so it can be concatenated
# with a dimension name when selecting columns.
feats = list(feature_names)

In [None]:
# Logistic regression for 'abs': a row is labelled 1 when its 'abs' score is
# at least 1, otherwise 0, and the model is fitted on every feature column.
# NOTE(review): the cut-off here is 1, whereas the other dimension cells in
# this notebook use 1.5 - confirm this difference is intended.
only_abs = initial_df[feats + ['abs']].astype('float64')
only_abs['state'] = (only_abs['abs'] >= 1).astype('int64')

# Everything except the two trailing columns ('abs', 'state') is a predictor.
final_features = list(only_abs.columns[:-2])

X, y = only_abs[final_features], only_abs['state']

model = LogisticRegression().fit(X, y)

plot_important_features(model, 'abstraktsus')


In [None]:

# Logit model (statsmodels formula API) of the binary `state` label on a
# hand-picked subset of features; the last expression renders the summary.
formula = "state ~ coref + hapax_legomena + noun + nom_case + avg_word_len + past_tense + pres_tense + obl + adv + num + ade_case + nummod + part_case"
log_reg = smf.logit(formula, data=only_abs).fit()
log_reg.summary()

## AFEKTIIVNE

In [None]:
# Name of the dimension analysed in the following cells.
dim = 'afek'

In [None]:
# Split the data for the current dimension with make_new_df and print the
# sizes of the two subsets.
pos, neg = make_new_df(dim)
subset_sizes = (len(pos), len(neg))
print(*subset_sizes)

In [None]:


# Pearson correlations within the `pos` subset, rounded to two decimals,
# saved to CSV and drawn as a heatmap.
pearson = pos.corr(method='pearson')
dim_df_corr = pearson.round(2)
save_csv(dim, dim_df_corr)
generate_heatmap(dim, dim_df_corr)

# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(dim_df_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')

# Per-feature normal curves of the two subsets.
generate_plot(dim, pos, neg, feature_names)

In [None]:
# Logistic regression for 'afek': a row is labelled 1 when its score is at
# least 1.5, otherwise 0, and the model is fitted on every feature column.
df = initial_df[feats + ['afek']].astype('float64')
df['state'] = (df['afek'] >= 1.5).astype('int64')

# Everything except the two trailing columns ('afek', 'state') is a predictor.
final_features = list(df.columns[:-2])

X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'afektiivsus')

In [None]:
# Logit model of the binary `state` label on a hand-picked feature subset;
# the last expression renders the summary.
formula = "state ~ avg_word_len + coref + noun + adv + pron + verbtype_ratio + TTR + punct + active_voice + adj + conj + avr_sent_len"
log_reg = smf.logit(formula, data=df).fit()
log_reg.summary()

## AEG

In [None]:
# Select the 'aeg' dimension, split the data with make_new_df and print the
# sizes of the two subsets.
dim = 'aeg'
pos, neg = make_new_df(dim)
subset_sizes = (len(pos), len(neg))
print(*subset_sizes)

In [None]:
# Pearson correlations within the `pos` subset, rounded to two decimals,
# saved to CSV and drawn as a heatmap.
pearson = pos.corr(method='pearson')
dim_df_corr = pearson.round(2)
save_csv(dim, dim_df_corr)
generate_heatmap(dim, dim_df_corr)

# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(dim_df_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')

# Per-feature normal curves of the two subsets.
generate_plot(dim, pos, neg, feature_names)


In [None]:
# Logistic regression for 'aeg': a row is labelled 1 when its score is at
# least 1.5, otherwise 0, and the model is fitted on every feature column.
df = initial_df[feats + ['aeg']].astype('float64')
df['state'] = (df['aeg'] >= 1.5).astype('int64')

# Everything except the two trailing columns ('aeg', 'state') is a predictor.
final_features = list(df.columns[:-2])

X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'aeg')

In [None]:
# Logit model of the binary `state` label on a hand-picked feature subset;
# the last expression renders the summary.
formula = "state ~ coref + past_tense + pres_tense + obl + verbtype_ratio + adv + num + avg_word_len + gen_case + nummod + noun + core_verb + ade_case + propn + da_inf + cop"
log_reg = smf.logit(formula, data=df).fit()
log_reg.summary()

## ARGUMENTATIIVNE

In [None]:
# Select the 'arg' dimension, split the data with make_new_df and print the
# sizes of the two subsets.
dim = 'arg'
pos, neg = make_new_df(dim)
subset_sizes = (len(pos), len(neg))
print(*subset_sizes)

In [None]:
# Pearson correlations within the `pos` subset, rounded to two decimals,
# saved to CSV and drawn as a heatmap.
pearson = pos.corr(method='pearson')
dim_df_corr = pearson.round(2)
save_csv(dim, dim_df_corr)
generate_heatmap(dim, dim_df_corr)

# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(dim_df_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')

# Per-feature normal curves of the two subsets.
generate_plot(dim, pos, neg, feature_names)

In [None]:
# Logistic regression for 'arg': a row is labelled 1 when its score is at
# least 1.5, otherwise 0, and the model is fitted on every feature column.
df = initial_df[feats + ['arg']].astype('float64')
df['state'] = (df['arg'] >= 1.5).astype('int64')

# Everything except the two trailing columns ('arg', 'state') is a predictor.
final_features = list(df.columns[:-2])

X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'arg')



In [None]:
# Logit model of the binary `state` label on a hand-picked feature subset;
# the last expression renders the summary.
formula = "state ~ coref + hapax_legomena + propn + verbtype_ratio + past_tense + pres_tense + avg_word_len + gen_case + conj + TTR + adj + neg_polarity + obl + part_case"
log_reg = smf.logit(formula, data=df).fit()
log_reg.summary()

## FORMAALNE

In [None]:
# Select the 'form' dimension, split the data with make_new_df and print the
# sizes of the two subsets.
dim = 'form'
pos, neg = make_new_df(dim)
subset_sizes = (len(pos), len(neg))
print(*subset_sizes)

In [None]:
# Pearson correlations within the `pos` subset, rounded to two decimals,
# saved to CSV and drawn as a heatmap.
pearson = pos.corr(method='pearson')
dim_df_corr = pearson.round(2)
save_csv(dim, dim_df_corr)
generate_heatmap(dim, dim_df_corr)

# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(dim_df_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')

# Per-feature normal curves of the two subsets.
generate_plot(dim, pos, neg, feature_names)

In [None]:
# Logistic regression for 'form': a row is labelled 1 when its score is at
# least 1.5, otherwise 0, and the model is fitted on every feature column.
df = initial_df[feats + ['form']].astype('float64')
df['state'] = (df['form'] >= 1.5).astype('int64')

# Everything except the two trailing columns ('form', 'state') is a predictor.
final_features = list(df.columns[:-2])

X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'formaalsus')




In [None]:

# Logit model of the binary `state` label on a hand-picked feature subset;
# the last expression renders the summary.
formula = "state ~ avg_word_len + coref + gen_case + noun + obl + adv + nmod + num"
log_reg = smf.logit(formula, data=df).fit()
log_reg.summary()

## IMPERSONAALNE

In [None]:
# Select the 'imp' dimension, split the data with make_new_df and print the
# sizes of the two subsets.
dim = 'imp'
pos, neg = make_new_df(dim)
subset_sizes = (len(pos), len(neg))
print(*subset_sizes)

In [None]:
# Pearson correlations within the `pos` subset, rounded to two decimals,
# saved to CSV and drawn as a heatmap.
pearson = pos.corr(method='pearson')
dim_df_corr = pearson.round(2)
save_csv(dim, dim_df_corr)
generate_heatmap(dim, dim_df_corr)

# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(dim_df_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')

# Per-feature normal curves of the two subsets.
generate_plot(dim, pos, neg, feature_names)

In [None]:
# Logistic regression for 'imp': a row is labelled 1 when its score is at
# least 1.5, otherwise 0, and the model is fitted on every feature column.
df = initial_df[feats + ['imp']].astype('float64')
df['state'] = (df['imp'] >= 1.5).astype('int64')

# Everything except the two trailing columns ('imp', 'state') is a predictor.
final_features = list(df.columns[:-2])

X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'impersonaalsus')





In [None]:
# Logit model of the binary `state` label on a hand-picked feature subset;
# the last expression renders the summary.
formula = "state ~ avg_word_len + coref + noun + past_tense + pres_tense + conj + pron + num"
log_reg = smf.logit(formula, data=df).fit()
log_reg.summary()

## INFO

In [None]:
# Select the 'info' dimension, split the data with make_new_df and print the
# sizes of the two subsets.
dim = 'info'
pos, neg = make_new_df(dim)
subset_sizes = (len(pos), len(neg))
print(*subset_sizes)

In [None]:
# Pearson correlations within the `pos` subset, rounded to two decimals,
# saved to CSV and drawn as a heatmap.
pearson = pos.corr(method='pearson')
dim_df_corr = pearson.round(2)
save_csv(dim, dim_df_corr)
generate_heatmap(dim, dim_df_corr)

# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(dim_df_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')

# Per-feature normal curves of the two subsets.
generate_plot(dim, pos, neg, feature_names)

In [None]:
# Logistic regression for 'info': a row is labelled 1 when its score is at
# least 1.5, otherwise 0, and the model is fitted on every feature column.
df = initial_df[feats + ['info']].astype('float64')
df['state'] = (df['info'] >= 1.5).astype('int64')

# Everything except the two trailing columns ('info', 'state') is a predictor.
final_features = list(df.columns[:-2])

X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'info')


In [None]:

# Logit model of the binary `state` label on a hand-picked feature subset;
# the last expression renders the summary.
formula = "state ~ coref + num + avg_word_len + active_voice + propn + nummod + pron + ind_mood + hapax_legomena + nsubj"
log_reg = smf.logit(formula, data=df).fit()
log_reg.summary()

## INSTRUEERIV

In [None]:
# Select the 'inst' dimension, split the data with make_new_df and print the
# sizes of the two subsets.
dim = 'inst'
pos, neg = make_new_df(dim)
subset_sizes = (len(pos), len(neg))
print(*subset_sizes)

In [None]:
# Pearson correlations within the `pos` subset, rounded to two decimals,
# saved to CSV and drawn as a heatmap.
pearson = pos.corr(method='pearson')
dim_df_corr = pearson.round(2)
save_csv(dim, dim_df_corr)
generate_heatmap(dim, dim_df_corr)

# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(dim_df_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')

# Per-feature normal curves of the two subsets.
generate_plot(dim, pos, neg, feature_names)

In [None]:
# Logistic regression for 'inst': a row is labelled 1 when its score is at
# least 1.5, otherwise 0, and the model is fitted on every feature column.
df = initial_df[feats + ['inst']].astype('float64')
df['state'] = (df['inst'] >= 1.5).astype('int64')

# Everything except the two trailing columns ('inst', 'state') is a predictor.
final_features = list(df.columns[:-2])

X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'instrueerivus')


In [None]:
# Logit model of the binary `state` label on a hand-picked feature subset;
# the last expression renders the summary.
# The formula used to list `pron` twice. In the formula language `+` is a set
# union, so the repeated term had no effect and is dropped here.
# NOTE(review): `third_prs_verb` / `second_prs_verb` must exist as columns of
# df; the feature list at the top of the notebook spells them '3rd_prs_verb' /
# '2nd_prs_verb' - confirm the CSV uses the spelled-out names.
log_reg = smf.logit("state ~ coref + pres_tense + nom_case + pron + hapax_legomena + past_tense + verbtype_ratio + noun + third_prs_verb + second_prs_verb + imp_mood", data=df).fit()
log_reg.summary()

## INTERAKTIIVNE

In [None]:
# Select the 'inter' dimension, split the data with make_new_df and print the
# sizes of the two subsets.
dim = 'inter'
pos, neg = make_new_df(dim)
subset_sizes = (len(pos), len(neg))
print(*subset_sizes)

In [None]:
# Pearson correlations within the `pos` subset, rounded to two decimals,
# saved to CSV and drawn as a heatmap.
pearson = pos.corr(method='pearson')
dim_df_corr = pearson.round(2)
save_csv(dim, dim_df_corr)
generate_heatmap(dim, dim_df_corr)

# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(dim_df_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')

# Per-feature normal curves of the two subsets.
generate_plot(dim, pos, neg, feature_names)

In [None]:
# Logistic regression for 'inter': a row is labelled 1 when its score is at
# least 1.5, otherwise 0, and the model is fitted on every feature column.
df = initial_df[feats + ['inter']].astype('float64')
df['state'] = (df['inter'] >= 1.5).astype('int64')

# Everything except the two trailing columns ('inter', 'state') is a predictor.
final_features = list(df.columns[:-2])

X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'inter')

In [None]:
# Logit model of the binary `state` label on a hand-picked feature subset;
# the last expression renders the summary.
formula = "state ~ avg_word_len + hapax_legomena + TTR + gen_case + noun + verbtype_ratio + avr_sent_len"
log_reg = smf.logit(formula, data=df).fit()
log_reg.summary()

## KEER

In [None]:
# Select the 'keer' dimension, split the data with make_new_df and print the
# sizes of the two subsets.
dim = 'keer'
pos, neg = make_new_df(dim)
subset_sizes = (len(pos), len(neg))
print(*subset_sizes)

In [None]:
# Pearson correlations within the `pos` subset, rounded to two decimals,
# saved to CSV and drawn as a heatmap.
pearson = pos.corr(method='pearson')
dim_df_corr = pearson.round(2)
save_csv(dim, dim_df_corr)
generate_heatmap(dim, dim_df_corr)

# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(dim_df_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')

# Per-feature normal curves of the two subsets.
generate_plot(dim, pos, neg, feature_names)

In [None]:
# Logistic regression for 'keer': a row is labelled 1 when its score is at
# least 1.5, otherwise 0, and the model is fitted on every feature column.
df = initial_df[feats + ['keer']].astype('float64')
df['state'] = (df['keer'] >= 1.5).astype('int64')

# Everything except the two trailing columns ('keer', 'state') is a predictor.
final_features = list(df.columns[:-2])

X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'keerulisus')

In [None]:
# Logit model of the binary `state` label on a hand-picked feature subset;
# the last expression renders the summary.
formula = "state ~ avg_word_len + coref + hapax_legomena + verbtype_ratio + abbr + nmod + past_tense + obl + active_voice"
log_reg = smf.logit(formula, data=df).fit()
log_reg.summary()

## SPONT

In [None]:
# Select the 'spont' dimension, split the data with make_new_df and print the
# sizes of the two subsets.
dim = 'spont'
pos, neg = make_new_df(dim)
subset_sizes = (len(pos), len(neg))
print(*subset_sizes)

In [None]:
# Pearson correlations within the `pos` subset, rounded to two decimals,
# saved to CSV and drawn as a heatmap.
pearson = pos.corr(method='pearson')
dim_df_corr = pearson.round(2)
save_csv(dim, dim_df_corr)
generate_heatmap(dim, dim_df_corr)

# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(dim_df_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')

# Per-feature normal curves of the two subsets.
generate_plot(dim, pos, neg, feature_names)

In [None]:
# Logistic regression for 'spont': a row is labelled 1 when its score is at
# least 1.5, otherwise 0, and the model is fitted on every feature column.
df = initial_df[feats + ['spont']].astype('float64')
df['state'] = (df['spont'] >= 1.5).astype('int64')

# Everything except the two trailing columns ('spont', 'state') is a predictor.
final_features = list(df.columns[:-2])

X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'spontaansus')





In [None]:
# Logit model of the binary `state` label on a hand-picked feature subset;
# the last expression renders the summary.
formula = "state ~ avg_word_len + coref + noun + gen_case + nom_case + avr_sent_len + obl + propn"
log_reg = smf.logit(formula, data=df).fit()
log_reg.summary()

## SUBJ

In [None]:
# Select the 'subj' dimension, split the data with make_new_df and print the
# sizes of the two subsets.
dim = 'subj'
pos, neg = make_new_df(dim)
subset_sizes = (len(pos), len(neg))
print(*subset_sizes)

In [None]:
# Pearson correlations within the `pos` subset, rounded to two decimals,
# saved to CSV and drawn as a heatmap.
pearson = pos.corr(method='pearson')
dim_df_corr = pearson.round(2)
save_csv(dim, dim_df_corr)
generate_heatmap(dim, dim_df_corr)

# Print each pair returned by get_corr_pairs next to its coefficient.
pairs = get_corr_pairs(dim_df_corr)
for pair_names, pair_score in pairs:
    print(f'{pair_names} = {pair_score}')

# Per-feature normal curves of the two subsets.
generate_plot(dim, pos, neg, feature_names)

In [None]:
# Logistic regression for 'subj': a row is labelled 1 when its score is at
# least 1.5, otherwise 0, and the model is fitted on every feature column.
df = initial_df[feats + ['subj']].astype('float64')
df['state'] = (df['subj'] >= 1.5).astype('int64')

# Everything except the two trailing columns ('subj', 'state') is a predictor.
final_features = list(df.columns[:-2])

X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'subjektiivsus')





In [None]:
# Logit model of the binary `state` label on a hand-picked feature subset;
# the last expression renders the summary.
formula = "state ~ avg_word_len + coref + verbtype_ratio + adv + propn + noun"
log_reg = smf.logit(formula, data=df).fit()
log_reg.summary()