In [None]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats 
import seaborn as sns
# import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Global plotting defaults for the notebook.
plt.rc("font", size=14)
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
# Two consecutive sns.set calls: the second one passes `style` again, so the
# "whitegrid" style is the one that ends up in effect.
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)


from matplotlib.colors import BoundaryNorm, ListedColormap


In [None]:
input_f = 'limesurvey_feature_results_w_dims.csv'

# Read every CSV row as a dict keyed by the header names.
# newline='' hands newline handling to the csv module (as its documentation
# recommends), so quoted fields that contain line breaks are parsed correctly.
with open(input_f, 'r', newline='') as fid:
    csv_reader = csv.DictReader(fid, delimiter=',')
    fieldnames = csv_reader.fieldnames
    data = list(csv_reader)

In [None]:
# Build the working frame from the parsed rows, dropping the first CSV column
# and converting every remaining value to float.
initial_df = pd.DataFrame(data).iloc[:, 1:].astype(float)

In [None]:
# Columns from position 12 onward are treated as the feature columns.
# NOTE(review): presumably the first 12 columns hold the dimension scores — confirm.
feature_names = initial_df.columns[12:]

In [None]:
def save_csv(prefix, df):
    """Write ``df`` to ``korrelatsiooni_csvd/<prefix>.csv``.

    The output directory is created on demand so the notebook also runs from a
    clean checkout where it does not exist yet.

    Parameters
    ----------
    prefix : str
        File name without the ``.csv`` extension.
    df : pandas.DataFrame
        Frame to serialise (the index is written as well).
    """
    import os  # function-scope import keeps this cell self-contained

    out_dir = 'korrelatsiooni_csvd'
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(f'{out_dir}/{prefix}.csv')

In [None]:
def plot_important_features(model, dimname, feature_names=None):
    """Plot the relative magnitude of a fitted linear model's coefficients.

    The absolute values of ``model.coef_[0]`` are scaled so the largest equals
    1.0 and drawn as a horizontal bar chart with the smallest at the bottom.

    Parameters
    ----------
    model : fitted estimator exposing ``coef_`` (e.g. ``LogisticRegression``).
    dimname : str
        Dimension name; upper-cased into the x-axis label.
    feature_names : sequence of str, optional
        Labels for the coefficients. When omitted, the names recorded on the
        model (``feature_names_in_``) are used, falling back to the columns of
        the notebook-level ``X`` as the original implementation did.
    """
    print(model.coef_)

    # Prefer names tied to the model itself so the labels cannot drift from
    # the coefficients when cells are re-run out of order.
    if feature_names is None:
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X.columns
    labels = np.asarray(feature_names)

    feature_importance = np.abs(model.coef_[0])
    peak = feature_importance.max()
    if peak > 0:  # avoid 0/0 when every coefficient is zero
        feature_importance = feature_importance / peak

    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5

    featfig = plt.figure(figsize=(15, 20))
    featax = featfig.add_subplot(1, 1, 1)

    featax.barh(pos, feature_importance[sorted_idx], align='center')
    featax.set_yticks(pos)
    featax.set_yticklabels(labels[sorted_idx], fontsize=10)
    featax.set_xlabel(f'Relative Feature Importance {dimname.upper()}')

    plt.show()

In [None]:
def get_corr_pairs(corr_df, threshold=0.75):
    """Return the strongly correlated column pairs of a correlation matrix.

    Parameters
    ----------
    corr_df : pandas.DataFrame
        Square correlation matrix (labels on both axes).
    threshold : float, default 0.75
        A pair is reported when the absolute correlation is strictly greater
        than this value.

    Returns
    -------
    list of tuple
        ``([label_a, label_b], score)`` entries with the two labels sorted, in
        first-seen order. Mirrored entries (a/b and b/a with the same score)
        are reported once; NaN scores and the diagonal are skipped.
    """
    seen = set()
    pairs = []

    for column, rows in corr_df.items():
        for row_label, score in rows.items():
            # `not abs(score) > threshold` is also True for NaN, so NaN is skipped.
            if column == row_label or not abs(score) > threshold:
                continue
            names = sorted((column, row_label))
            key = (tuple(names), score)
            if key not in seen:
                seen.add(key)
                pairs.append((names, score))

    return pairs

In [None]:
def generate_heatmap(dim, corr_data):
    """Draw a thresholded correlation heatmap and save it as a PNG.

    Values are coloured by the intervals bounded at -1.0, -0.75, 0.75 and 1.0.
    The upper triangle (diagonal included) is masked. The figure is written to
    ``heatmapid/<dim>_heatmap.png`` and then shown.

    Parameters
    ----------
    dim : str
        Name used in the output file name.
    corr_data : pandas.DataFrame
        Correlation matrix to plot.
    """
    import os  # function-scope import keeps this cell self-contained

    # NOTE(review): four colours are supplied for three intervals — confirm
    # which of them BoundaryNorm actually selects.
    my_colors = ['black', 'lightgrey', 'white', 'red']
    my_cmap = ListedColormap(my_colors)
    bounds = [-1.0, -0.75, 0.75, 1.0]
    my_norm = BoundaryNorm(bounds, ncolors=len(my_colors))

    # Boolean mask hiding the redundant upper triangle of the matrix.
    mask = np.triu(np.ones_like(corr_data, dtype=bool))

    fig, ax = plt.subplots(1, 1, figsize=(20, 20))
    hmap = sns.heatmap(
        corr_data,
        yticklabels=1,
        ax=ax,
        linewidths=1.0,
        cmap=my_cmap,
        norm=my_norm,
        mask=mask,
        cbar_kws=dict(use_gridspec=False, location="top"),
    )

    out_dir = 'heatmapid'
    os.makedirs(out_dir, exist_ok=True)  # savefig does not create directories
    hmap.figure.savefig(f'{out_dir}/{dim}_heatmap.png', format='png', dpi=150)
    plt.show()


In [None]:
# Pearson correlations (the pandas default) between all columns from
# position 12 onward, rounded to two decimals and written to disk.
all_df = initial_df.iloc[:, 12:]
all_corr = all_df.corr().round(2)
save_csv('dimensions', all_corr)

In [None]:
# Heatmap of the correlations over the whole data set.
generate_heatmap('all_dims', all_corr)

In [None]:
# List every strongly correlated pair found in the full correlation matrix.
pairs = get_corr_pairs(all_corr)
for names, score in pairs:
    print(f'{names} = {score}')

In [None]:
def make_new_df(dim_name, threshold=2):
    """Split the feature columns of ``initial_df`` by a dimension score.

    Parameters
    ----------
    dim_name : str
        Column of ``initial_df`` holding the dimension score to split on.
        (Previously ignored: the split was always made on ``'abs'``.)
    threshold : float, default 2
        Rows scoring ``>= threshold`` go to the first frame, rows scoring
        ``< threshold`` to the second. Rows with a NaN score land in neither.

    Returns
    -------
    (positive, negative) : tuple of pandas.DataFrame
        Both contain only the columns from position 12 onward.
    """
    feature_cols = initial_df.columns[12:]
    scores = initial_df[dim_name]

    positive = initial_df.loc[scores >= threshold, feature_cols]
    negative = initial_df.loc[scores < threshold, feature_cols]

    return positive, negative



## ABSTRAKTNE

In [None]:
# Split on the 'abs' dimension and report the size of each group.
pos, neg = make_new_df('abs')
print(pos.shape[0], neg.shape[0])

In [None]:
# Pearson correlations (the pandas default) within the positive 'abs' group.
abs_corr = pos.corr().round(2)
save_csv('abs', abs_corr)
abs_corr


In [None]:
# Heatmap of the correlations within the positive 'abs' group.
generate_heatmap('abs', abs_corr)

In [None]:
# List every strongly correlated pair within the positive 'abs' group.
pairs = get_corr_pairs(abs_corr)
for names, score in pairs:
    print(f'{names} = {score}')


### NORMAALJAOTUS

In [None]:
def generate_plot(dim, df1, df2, features):
    """Plot fitted normal densities of each feature for two groups of rows.

    For every feature a normal distribution is fitted to each frame (mean and
    ``np.std`` of the column) and its density is drawn over the sorted observed
    values; ``df1`` is labelled ``'>=2'`` and ``df2`` ``'<2'``.

    Parameters
    ----------
    dim : str
        Dimension name; currently unused (kept for the caller's interface).
    df1, df2 : pandas.DataFrame
        Frames containing a column for every entry of ``features``.
    features : iterable of str
        Column names to plot, one panel each, laid out two per row.
    """
    features = list(features)
    n_cols = 2
    # Keep the original 40-row grid, but grow it when there are more than
    # 80 features instead of failing on an out-of-range subplot index.
    n_rows = max(40, -(-len(features) // n_cols))

    # A bare figure: plt.subplots(ncols=2) would leave two full-size axes
    # behind the grid, which recent matplotlib versions no longer remove.
    fig = plt.figure(figsize=(20, 200))
    fig.subplots_adjust(hspace=0.5)

    for i, feature in enumerate(features):
        name = str(feature)
        ax = fig.add_subplot(n_rows, n_cols, i + 1)

        for frame, label in ((df1, '>=2'), (df2, '<2')):
            values = frame[name].sort_values()
            density = stats.norm.pdf(values, np.mean(values), np.std(values))
            ax.plot(values, density, label=label)

        ax.legend(loc=1, prop={'size': 20})
        ax.set_xlabel(name.upper())

In [None]:
# Per-feature density comparison of the two 'abs' groups.
generate_plot('abs', pos, neg, feature_names)

### LOGISTILINE REGRESSIOON

In [None]:
# Plain list of the feature column names, so it can be concatenated with lists.
feats = list(feature_names)

In [None]:
# Binary target: 1 when the 'abs' score is at least 1, otherwise 0.
# NOTE(review): the other dimensions in this notebook use 1.5 here — confirm
# that 1 is intended for 'abs'.
only_abs = initial_df.loc[:, feats + ['abs']].astype('float64')
only_abs['state'] = (only_abs['abs'] >= 1).astype('int64')

# Predictors are every column except the trailing score and target columns.
final_features = only_abs.columns[:-2].tolist()
X, y = only_abs[final_features], only_abs['state']

model = LogisticRegression().fit(X, y)

plot_important_features(model, 'abstraktsus')


In [None]:

# Logit model of the binary 'abs' target on a hand-picked set of predictors.
log_reg = smf.logit(
    "state ~ " + " + ".join((
        "coref", "hapax_legomena", "noun", "nom_case", "avg_word_len",
        "past_tense", "pres_tense", "obl", "adv", "num", "ade_case",
        "nummod", "part_case",
    )),
    data=only_abs,
).fit()
log_reg.summary()

## AFEKTIIVNE

In [None]:
# Dimension analysed by the following cells.
dim = 'afek'

In [None]:

# Split on the 'afek' dimension and report the size of each group.
afek_pos, afek_neg = make_new_df('afek')
print(afek_pos.shape[0], afek_neg.shape[0])

In [None]:


# Correlations, heatmap, strong pairs and density plots for the 'afek' split.
# The original cell reused `pos`/`neg` from the ABSTRAKTNE section and so
# analysed the wrong subset; use the afek_pos/afek_neg frames instead.
dim_df_corr = afek_pos.corr(method='pearson').round(2)
save_csv(dim, dim_df_corr)

generate_heatmap(dim, dim_df_corr)

pairs = get_corr_pairs(dim_df_corr)
for pair in pairs:
    print(f'{pair[0]} = {pair[1]}')

generate_plot(dim, afek_pos, afek_neg, feature_names)

In [None]:
# Binary target: 1 when the 'afek' score is at least 1.5, otherwise 0.
df = initial_df.loc[:, feats + ['afek']].astype('float64')
df['state'] = (df['afek'] >= 1.5).astype('int64')

# Predictors are every column except the trailing score and target columns.
final_features = df.columns[:-2].tolist()
X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'afektiivsus')

In [None]:
# Logit model of the binary 'afek' target on a hand-picked set of predictors.
log_reg = smf.logit(
    "state ~ " + " + ".join((
        "avg_word_len", "coref", "noun", "adv", "pron", "verbtype_ratio",
        "TTR", "punct", "active_voice", "adj", "conj", "avr_sent_len",
    )),
    data=df,
).fit()
log_reg.summary()

## AEG

In [None]:
dim = 'aeg'

# Number of texts per strength band (strong / moderate / weak).
# The original strict comparisons (2 < x < 3, 1 < x < 2, x < 1) excluded every
# text scoring exactly 1, 2 or 3.
# NOTE(review): band edges follow make_new_df (>= 2 counts as present) — confirm.
scores = initial_df[dim]
tugev = int((scores >= 2).sum())
moodukas = int(((scores >= 1) & (scores < 2)).sum())
nork = int((scores < 1).sum())

print(tugev, moodukas, nork)



In [None]:
# make_new_df takes the dimension name and save_csv takes (prefix, df); the
# previous calls passed extra arguments and raised TypeError.
aeg_df, aeg_not_df = make_new_df(dim)

aeg_corr = aeg_df.corr(method='pearson').round(2)
save_csv(dim, aeg_corr)
aeg_corr

In [None]:
# save_heatmap is not defined in the visible notebook (NameError); use the
# generate_heatmap helper, which draws the heatmap and writes the PNG itself.
generate_heatmap(dim, aeg_corr)

In [None]:
# generate_plot needs both groups; the second frame was missing (TypeError).
generate_plot('aeg', aeg_df, aeg_not_df, feature_names)

In [None]:
# Binary target: 1 when the 'aeg' score is at least 1.5, otherwise 0.
df = initial_df.loc[:, feats + ['aeg']].astype('float64')
df['state'] = (df['aeg'] >= 1.5).astype('int64')

# Predictors are every column except the trailing score and target columns.
final_features = df.columns[:-2].tolist()
X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'aeg')

In [None]:
# Logit model of the binary 'aeg' target on a hand-picked set of predictors.
log_reg = smf.logit(
    "state ~ " + " + ".join((
        "coref", "past_tense", "pres_tense", "obl", "verbtype_ratio", "adv",
        "num", "avg_word_len", "gen_case", "nummod", "noun", "core_verb",
        "ade_case", "propn", "da_inf", "cop",
    )),
    data=df,
).fit()
log_reg.summary()

## ARGUMENTATIIVNE

In [None]:
dim = 'arg'

# Number of texts per strength band (strong / moderate / weak).
# The original strict comparisons (2 < x < 3, 1 < x < 2, x < 1) excluded every
# text scoring exactly 1, 2 or 3.
# NOTE(review): band edges follow make_new_df (>= 2 counts as present) — confirm.
scores = initial_df[dim]
tugev = int((scores >= 2).sum())
moodukas = int(((scores >= 1) & (scores < 2)).sum())
nork = int((scores < 1).sum())

print(tugev, moodukas, nork)

In [None]:
# make_new_df takes the dimension name and save_csv takes (prefix, df); the
# previous calls passed extra arguments and raised TypeError.
arg_df, arg_not_df = make_new_df(dim)

arg_corr = arg_df.corr(method='pearson').round(2)
save_csv(dim, arg_corr)
arg_corr

In [None]:
# save_heatmap is not defined in the visible notebook (NameError); use the
# generate_heatmap helper, which draws the heatmap and writes the PNG itself.
generate_heatmap(dim, arg_corr)

In [None]:
# generate_plot needs both groups; the second frame was missing (TypeError).
generate_plot('arg', arg_df, arg_not_df, feature_names)

In [None]:
# Binary target: 1 when the 'arg' score is at least 1.5, otherwise 0.
df = initial_df.loc[:, feats + ['arg']].astype('float64')
df['state'] = (df['arg'] >= 1.5).astype('int64')

# Predictors are every column except the trailing score and target columns.
final_features = df.columns[:-2].tolist()
X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'arg')



In [None]:
# Logit model of the binary 'arg' target on a hand-picked set of predictors.
log_reg = smf.logit(
    "state ~ " + " + ".join((
        "coref", "hapax_legomena", "propn", "verbtype_ratio", "past_tense",
        "pres_tense", "avg_word_len", "gen_case", "conj", "TTR", "adj",
        "neg_polarity", "obl", "part_case",
    )),
    data=df,
).fit()
log_reg.summary()

## FORMAALNE

In [None]:
dim = 'form'

# Number of texts per strength band (strong / moderate / weak).
# The original strict comparisons (2 < x < 3, 1 < x < 2, x < 1) excluded every
# text scoring exactly 1, 2 or 3.
# NOTE(review): band edges follow make_new_df (>= 2 counts as present) — confirm.
scores = initial_df[dim]
tugev = int((scores >= 2).sum())
moodukas = int(((scores >= 1) & (scores < 2)).sum())
nork = int((scores < 1).sum())

print(dim)
print(tugev, moodukas, nork)

In [None]:
# make_new_df takes the dimension name and save_csv takes (prefix, df); the
# previous calls passed extra arguments and raised TypeError.
form_df, form_not_df = make_new_df(dim)

form_corr = form_df.corr(method='pearson').round(2)
save_csv(dim, form_corr)
form_corr

In [None]:
# save_heatmap is not defined in the visible notebook (NameError); use the
# generate_heatmap helper, which draws the heatmap and writes the PNG itself.
generate_heatmap(dim, form_corr)

In [None]:
# generate_plot needs both groups; the second frame was missing (TypeError).
generate_plot('form', form_df, form_not_df, feature_names)

In [None]:
# Binary target: 1 when the 'form' score is at least 1.5, otherwise 0.
df = initial_df.loc[:, feats + ['form']].astype('float64')
df['state'] = (df['form'] >= 1.5).astype('int64')

# Predictors are every column except the trailing score and target columns.
final_features = df.columns[:-2].tolist()
X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'formaalsus')




In [None]:

# Logit model of the binary 'form' target on a hand-picked set of predictors.
log_reg = smf.logit(
    "state ~ " + " + ".join((
        "avg_word_len", "coref", "gen_case", "noun", "obl", "adv", "nmod",
        "num",
    )),
    data=df,
).fit()
log_reg.summary()

## IMPERSONAALNE

In [None]:
dim = 'imp'

# Number of texts per strength band (strong / moderate / weak).
# The original strict comparisons (2 < x < 3, 1 < x < 2, x < 1) excluded every
# text scoring exactly 1, 2 or 3.
# NOTE(review): band edges follow make_new_df (>= 2 counts as present) — confirm.
scores = initial_df[dim]
tugev = int((scores >= 2).sum())
moodukas = int(((scores >= 1) & (scores < 2)).sum())
nork = int((scores < 1).sum())

print(dim)
print(tugev, moodukas, nork)

In [None]:
# make_new_df takes the dimension name and save_csv takes (prefix, df); the
# previous calls passed extra arguments and raised TypeError.
imp_df, imp_not_df = make_new_df(dim)

imp_corr = imp_df.corr(method='pearson').round(2)
save_csv(dim, imp_corr)
imp_corr

In [None]:
# save_heatmap is not defined in the visible notebook (NameError); use the
# generate_heatmap helper, which draws the heatmap and writes the PNG itself.
generate_heatmap(dim, imp_corr)

In [None]:
# generate_plot needs both groups; the second frame was missing (TypeError).
generate_plot('imp', imp_df, imp_not_df, feature_names)

In [None]:
# Binary target: 1 when the 'imp' score is at least 1.5, otherwise 0.
df = initial_df.loc[:, feats + ['imp']].astype('float64')
df['state'] = (df['imp'] >= 1.5).astype('int64')

# Predictors are every column except the trailing score and target columns.
final_features = df.columns[:-2].tolist()
X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'impersonaalsus')





In [None]:
# Logit model of the binary 'imp' target on a hand-picked set of predictors.
log_reg = smf.logit(
    "state ~ " + " + ".join((
        "avg_word_len", "coref", "noun", "past_tense", "pres_tense", "conj",
        "pron", "num",
    )),
    data=df,
).fit()
log_reg.summary()

## INFO

In [None]:
dim = 'info'

# Number of texts per strength band (strong / moderate / weak).
# The original strict comparisons (2 < x < 3, 1 < x < 2, x < 1) excluded every
# text scoring exactly 1, 2 or 3.
# NOTE(review): band edges follow make_new_df (>= 2 counts as present) — confirm.
scores = initial_df[dim]
tugev = int((scores >= 2).sum())
moodukas = int(((scores >= 1) & (scores < 2)).sum())
nork = int((scores < 1).sum())
print(dim)
print(tugev, moodukas, nork)

In [None]:
# make_new_df takes the dimension name and save_csv takes (prefix, df); the
# previous calls passed extra arguments and raised TypeError.
info_df, info_not_df = make_new_df(dim)

info_corr = info_df.corr(method='pearson').round(2)
save_csv(dim, info_corr)
info_corr

In [None]:
# save_heatmap is not defined in the visible notebook (NameError); use the
# generate_heatmap helper, which draws the heatmap and writes the PNG itself.
generate_heatmap(dim, info_corr)

In [None]:
# generate_plot needs both groups; the second frame was missing (TypeError).
generate_plot('info', info_df, info_not_df, feature_names)

In [None]:
# Binary target: 1 when the 'info' score is at least 1.5, otherwise 0.
df = initial_df.loc[:, feats + ['info']].astype('float64')
df['state'] = (df['info'] >= 1.5).astype('int64')

# Predictors are every column except the trailing score and target columns.
final_features = df.columns[:-2].tolist()
X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'info')


In [None]:

# Logit model of the binary 'info' target on a hand-picked set of predictors.
log_reg = smf.logit(
    "state ~ " + " + ".join((
        "coref", "num", "avg_word_len", "active_voice", "propn", "nummod",
        "pron", "ind_mood", "hapax_legomena", "nsubj",
    )),
    data=df,
).fit()
log_reg.summary()

## INSTRUEERIV

In [None]:
dim = 'inst'

# Number of texts per strength band (strong / moderate / weak).
# The original strict comparisons (2 < x < 3, 1 < x < 2, x < 1) excluded every
# text scoring exactly 1, 2 or 3.
# NOTE(review): band edges follow make_new_df (>= 2 counts as present) — confirm.
scores = initial_df[dim]
tugev = int((scores >= 2).sum())
moodukas = int(((scores >= 1) & (scores < 2)).sum())
nork = int((scores < 1).sum())

print(dim)
print(tugev, moodukas, nork)

In [None]:
# make_new_df takes the dimension name and save_csv takes (prefix, df); the
# previous calls passed extra arguments and raised TypeError.
inst_df, inst_not_df = make_new_df(dim)

inst_corr = inst_df.corr(method='pearson').round(2)
save_csv(dim, inst_corr)
inst_corr

In [None]:
# save_heatmap is not defined in the visible notebook (NameError); use the
# generate_heatmap helper, which draws the heatmap and writes the PNG itself.
generate_heatmap(dim, inst_corr)

In [None]:
# generate_plot needs both groups; the second frame was missing (TypeError).
generate_plot('inst', inst_df, inst_not_df, feature_names)

In [None]:
# Binary target: 1 when the 'inst' score is at least 1.5, otherwise 0.
df = initial_df.loc[:, feats + ['inst']].astype('float64')
df['state'] = (df['inst'] >= 1.5).astype('int64')

# Predictors are every column except the trailing score and target columns.
final_features = df.columns[:-2].tolist()
X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'instrueerivus')


In [None]:
# Logit model of the binary 'inst' target on a hand-picked set of predictors.
# 'pron' was listed twice in the formula; the repeat is dropped so each
# predictor appears exactly once.
log_reg = smf.logit("state ~ coref + pres_tense + nom_case + pron + hapax_legomena + past_tense + verbtype_ratio + noun + third_prs_verb + second_prs_verb + imp_mood", data=df).fit()
log_reg.summary()

## INTERAKTIIVNE

In [None]:
dim = 'inter'

# Number of texts per strength band (strong / moderate / weak).
# The original strict comparisons (2 < x < 3, 1 < x < 2, x < 1) excluded every
# text scoring exactly 1, 2 or 3.
# NOTE(review): band edges follow make_new_df (>= 2 counts as present) — confirm.
scores = initial_df[dim]
tugev = int((scores >= 2).sum())
moodukas = int(((scores >= 1) & (scores < 2)).sum())
nork = int((scores < 1).sum())

print(dim)
print(tugev, moodukas, nork)

In [None]:
# make_new_df takes the dimension name and save_csv takes (prefix, df); the
# previous calls passed extra arguments and raised TypeError.
inter_df, inter_not_df = make_new_df(dim)

inter_corr = inter_df.corr(method='pearson').round(2)
save_csv(dim, inter_corr)
inter_corr

In [None]:
# save_heatmap is not defined in the visible notebook (NameError); use the
# generate_heatmap helper, which draws the heatmap and writes the PNG itself.
generate_heatmap(dim, inter_corr)

In [None]:
# generate_plot needs both groups; the second frame was missing (TypeError).
generate_plot('inter', inter_df, inter_not_df, feature_names)

In [None]:
# Binary target: 1 when the 'inter' score is at least 1.5, otherwise 0.
df = initial_df.loc[:, feats + ['inter']].astype('float64')
df['state'] = (df['inter'] >= 1.5).astype('int64')

# Predictors are every column except the trailing score and target columns.
final_features = df.columns[:-2].tolist()
X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'inter')





In [None]:
# Logit model of the binary 'inter' target on a hand-picked set of predictors.
log_reg = smf.logit(
    "state ~ " + " + ".join((
        "avg_word_len", "hapax_legomena", "TTR", "gen_case", "noun",
        "verbtype_ratio", "avr_sent_len",
    )),
    data=df,
).fit()
log_reg.summary()

## KEER

In [None]:
dim = 'keer'

# Number of texts per strength band (strong / moderate / weak).
# The original strict comparisons (2 < x < 3, 1 < x < 2, x < 1) excluded every
# text scoring exactly 1, 2 or 3.
# NOTE(review): band edges follow make_new_df (>= 2 counts as present) — confirm.
scores = initial_df[dim]
tugev = int((scores >= 2).sum())
moodukas = int(((scores >= 1) & (scores < 2)).sum())
nork = int((scores < 1).sum())

print(dim)
print(tugev, moodukas, nork)

In [None]:
# make_new_df takes the dimension name and save_csv takes (prefix, df); the
# previous calls passed extra arguments and raised TypeError.
keer_df, keer_not_df = make_new_df(dim)

keer_corr = keer_df.corr(method='pearson').round(2)
save_csv(dim, keer_corr)
keer_corr

In [None]:
# save_heatmap is not defined in the visible notebook (NameError); use the
# generate_heatmap helper, which draws the heatmap and writes the PNG itself.
generate_heatmap(dim, keer_corr)

In [None]:
# generate_plot needs both groups; the second frame was missing (TypeError).
generate_plot('keer', keer_df, keer_not_df, feature_names)

In [None]:
# Binary target: 1 when the 'keer' score is at least 1.5, otherwise 0.
df = initial_df.loc[:, feats + ['keer']].astype('float64')
df['state'] = (df['keer'] >= 1.5).astype('int64')

# Predictors are every column except the trailing score and target columns.
final_features = df.columns[:-2].tolist()
X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'keerulisus')





In [None]:
# Logit model of the binary 'keer' target on a hand-picked set of predictors.
log_reg = smf.logit(
    "state ~ " + " + ".join((
        "avg_word_len", "coref", "hapax_legomena", "verbtype_ratio", "abbr",
        "nmod", "past_tense", "obl", "active_voice",
    )),
    data=df,
).fit()
log_reg.summary()

## SPONT

In [None]:
dim = 'spont'

# Number of texts per strength band (strong / moderate / weak).
# The original strict comparisons (2 < x < 3, 1 < x < 2, x < 1) excluded every
# text scoring exactly 1, 2 or 3.
# NOTE(review): band edges follow make_new_df (>= 2 counts as present) — confirm.
scores = initial_df[dim]
tugev = int((scores >= 2).sum())
moodukas = int(((scores >= 1) & (scores < 2)).sum())
nork = int((scores < 1).sum())

print(dim)
print(tugev, moodukas, nork)

In [None]:
# make_new_df takes the dimension name and save_csv takes (prefix, df); the
# previous calls passed extra arguments and raised TypeError.
spont_df, spont_not_df = make_new_df(dim)

spont_corr = spont_df.corr(method='pearson').round(2)
save_csv(dim, spont_corr)
spont_corr

In [None]:
# save_heatmap is not defined in the visible notebook (NameError); use the
# generate_heatmap helper, which draws the heatmap and writes the PNG itself.
generate_heatmap(dim, spont_corr)

In [None]:
# generate_plot needs both groups; the second frame was missing (TypeError).
generate_plot('spont', spont_df, spont_not_df, feature_names)

In [None]:
# Binary target: 1 when the 'spont' score is at least 1.5, otherwise 0.
df = initial_df.loc[:, feats + ['spont']].astype('float64')
df['state'] = (df['spont'] >= 1.5).astype('int64')

# Predictors are every column except the trailing score and target columns.
final_features = df.columns[:-2].tolist()
X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'spontaansus')





In [None]:
# Logit model of the binary 'spont' target on a hand-picked set of predictors.
log_reg = smf.logit(
    "state ~ " + " + ".join((
        "avg_word_len", "coref", "noun", "gen_case", "nom_case",
        "avr_sent_len", "obl", "propn",
    )),
    data=df,
).fit()
log_reg.summary()

## SUBJ

In [None]:
dim = 'subj'

# Number of texts per strength band (strong / moderate / weak).
# The original strict comparisons (2 < x < 3, 1 < x < 2, x < 1) excluded every
# text scoring exactly 1, 2 or 3.
# NOTE(review): band edges follow make_new_df (>= 2 counts as present) — confirm.
scores = initial_df[dim]
tugev = int((scores >= 2).sum())
moodukas = int(((scores >= 1) & (scores < 2)).sum())
nork = int((scores < 1).sum())

print(dim)
print(tugev, moodukas, nork)

In [None]:
# make_new_df takes the dimension name and save_csv takes (prefix, df); the
# previous calls passed extra arguments and raised TypeError.
subj_df, subj_not_df = make_new_df(dim)

# Correlate the 'subj' subset; the original line correlated aeg_df (a
# copy-paste slip), so the 'subj' outputs actually described 'aeg'.
subj_corr = subj_df.corr(method='pearson').round(2)
save_csv(dim, subj_corr)
subj_corr

In [None]:
# save_heatmap is not defined in the visible notebook (NameError); use the
# generate_heatmap helper, which draws the heatmap and writes the PNG itself.
generate_heatmap(dim, subj_corr)

In [None]:
# generate_plot needs both groups; the second frame was missing (TypeError).
generate_plot('subj', subj_df, subj_not_df, feature_names)

In [None]:
# Binary target: 1 when the 'subj' score is at least 1.5, otherwise 0.
df = initial_df.loc[:, feats + ['subj']].astype('float64')
df['state'] = (df['subj'] >= 1.5).astype('int64')

# Predictors are every column except the trailing score and target columns.
final_features = df.columns[:-2].tolist()
X, y = df[final_features], df['state']

model = LogisticRegression().fit(X, y)
plot_important_features(model, 'subjektiivsus')





In [None]:
# Logit model of the binary 'subj' target on a hand-picked set of predictors.
log_reg = smf.logit(
    "state ~ " + " + ".join((
        "avg_word_len", "coref", "verbtype_ratio", "adv", "propn", "noun",
    )),
    data=df,
).fit()
log_reg.summary()