In [3]:
import numpy as np
import scipy as scipy
from scipy.stats import ttest_ind, f_oneway
import pandas as pd

In [2]:
	sims = []
	metric_to_plot = ['unigrams', 'bigrams', 'embeddings'][2]
	with(open('sims.txt')) as f:
		for line in f:
			line = line.strip()
			domain, metric, expertise = line.split()[0].split(",")
			if metric == metric_to_plot:
				for correlation in line.split()[1].split(","):
					sims.append([str(domain),str(metric),str(expertise),float(correlation)])
	
	
	sims = pd.DataFrame(sims, columns=['Domain', "Metric", "Expertise", "Correlation"])
	sims.Domain = sims.Domain.astype(str)
	sims.Metric = sims.Metric.astype(str)
	sims.Expertise = sims.Expertise.astype(str)
	print(sims.dtypes)

	# Make a grouped barplot
	print(sims)

Domain          object
Metric          object
Expertise       object
Correlation    float64
dtype: object
   Domain      Metric Expertise  Correlation
0    Text  embeddings    Novice     0.234264
1    Text  embeddings    Novice     0.295957
2    Text  embeddings    Novice     0.352927
3    Text  embeddings    Novice     0.311403
4    Text  embeddings    Novice     0.383802
5    Text  embeddings    Expert     0.449876
6    Text  embeddings    Expert     0.659829
7    Text  embeddings    Expert     0.420126
8    Text  embeddings    Expert     0.415363
9    Text  embeddings    Expert     0.362667
10   List  embeddings    Novice     0.044027
11   List  embeddings    Novice     0.120915
12   List  embeddings    Novice     0.038308
13   List  embeddings    Novice     0.158708
14   List  embeddings    Novice     0.104545
15   List  embeddings    Expert     0.221276
16   List  embeddings    Expert     0.215433
17   List  embeddings    Expert     0.310650
18   List  embeddings    Expert     0.2

In [7]:
p_vals = []
for d in set(sims['Domain']):
    vals = []
    for e in set(sims['Expertise']):
        print(d,e)
        corrs = sims[(sims['Domain'] == d) & (sims['Expertise'] == e)]['Correlation'].to_numpy()
        print(corrs)
        vals.append(corrs)
    novice, expert = vals
    print('Domain t-test ind %s' % d)
    statistic, p_value = ttest_ind(novice, expert, equal_var=True)
    print(statistic, p_value)
    p_vals.append(p_value)

from scipy.stats import combine_pvalues
print(combine_pvalues(p_vals))

Logo Expert
[0.32249803 0.49014966 0.38370211 0.4204246 ]
Logo Novice
[0.1434666  0.05440446 0.26038937 0.23873982]
Domain t-test ind Logo
3.903733504039721 0.00794964565664826
Tower Expert
[0.39257758 0.35932962 0.35782114 0.40458462 0.3405212 ]
Tower Novice
[0.33942967 0.30609356 0.37664109 0.27721878 0.21711648]
Domain t-test ind Tower
2.2789641431247936 0.052155582647982125
List Expert
[0.22127555 0.2154325  0.31064976 0.22101491 0.30326127]
List Novice
[0.04402692 0.12091454 0.03830804 0.15870838 0.10454505]
Domain t-test ind List
5.104756928028637 0.0009244050261512192
Text Expert
[0.44987594 0.65982916 0.42012576 0.41536291 0.36266738]
Text Novice
[0.23426417 0.29595741 0.35292732 0.31140265 0.38380173]
Domain t-test ind Text
2.537099545086011 0.0348678678023194
(36.26140357167559, 1.5727010804749426e-05)


In [None]:
# Two way ANOVA



In [6]:
all_novice, all_expert = [], []
for d in set(sims['Domain']):
    vals = []
    for e in set(sims['Expertise']):
        print(d,e)
        corrs = sims[(sims['Domain'] == d) & (sims['Expertise'] == e)]['Correlation'].to_numpy()
        print(corrs)
        vals.append(corrs)
    novice, expert = vals
    all_novice += list(novice)
    all_expert += list(expert)
    print('Domain t-test ind %s' % d)
    print(ttest_ind(novice, expert, equal_var=True))
print(all_novice)
print(all_expert)
print(ttest_ind(all_novice, all_expert, equal_var=True))

Tower Expert
[0.39257758 0.35932962 0.35782114 0.40458462 0.3405212 ]
Tower Novice
[0.33942967 0.30609356 0.37664109 0.27721878 0.21711648]
Domain t-test ind Tower
Ttest_indResult(statistic=2.2789641431247936, pvalue=0.052155582647982125)
Logo Expert
[0.32249803 0.49014966 0.38370211 0.4204246 ]
Logo Novice
[0.1434666  0.05440446 0.26038937 0.23873982]
Domain t-test ind Logo
Ttest_indResult(statistic=3.903733504039721, pvalue=0.00794964565664826)
Text Expert
[0.44987594 0.65982916 0.42012576 0.41536291 0.36266738]
Text Novice
[0.23426417 0.29595741 0.35292732 0.31140265 0.38380173]
Domain t-test ind Text
Ttest_indResult(statistic=2.537099545086011, pvalue=0.0348678678023194)
List Expert
[0.22127555 0.2154325  0.31064976 0.22101491 0.30326127]
List Novice
[0.04402692 0.12091454 0.03830804 0.15870838 0.10454505]
Domain t-test ind List
Ttest_indResult(statistic=5.104756928028637, pvalue=0.0009244050261512192)
[0.39257758192608067, 0.35932961728341883, 0.35782114290397443, 0.40458461760947