In [19]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon

portuguese_liwc = pd.read_csv('portuguese_stories_liwc.csv.gz', compression='gzip')
english_liwc = pd.read_csv('icwsm09_stories_liwc.csv.gz', compression='gzip')

In [20]:
print("postuguese: " + str(len(portuguese_liwc)))
print("english: " + str(len(english_liwc)))

postuguese: 37746
english: 383361


## Corpus Filter

In [21]:
portuguese_liwc = portuguese_liwc[((portuguese_liwc['wc'] > 10) & (portuguese_liwc['wc'] < 1000))]
portuguese_liwc = portuguese_liwc[((portuguese_liwc['wps'] > 3) & (portuguese_liwc['wps'] < 30))]
portuguese_liwc = portuguese_liwc[portuguese_liwc['i'] > 2]
portuguese_liwc = portuguese_liwc[(portuguese_liwc['negemo'] + portuguese_liwc['posemo']) > 2]
portuguese_liwc = portuguese_liwc[portuguese_liwc['score'] > 0]
len(portuguese_liwc)

28847

In [22]:
english_liwc = english_liwc[((english_liwc['wc'] > 10) & (english_liwc['wc'] < 1000))]
english_liwc = english_liwc[((english_liwc['wps'] > 3) & (english_liwc['wps'] < 30))]
english_liwc = english_liwc[english_liwc['i'] > 2]
english_liwc = english_liwc[(english_liwc['negemo'] + english_liwc['posemo']) > 2]
english_liwc = english_liwc[english_liwc['score'] > 0]
len(english_liwc)

161704

## Wilcoxon Test

In [23]:
english_sample = english_liwc.sample(len(portuguese_liwc))

In [24]:
def diff(x): 
    return abs(x['portuguese_mean'] - x['english_mean'])

stats = pd.DataFrame(data={'portuguese_mean': portuguese_liwc.mean(axis=0)}, index=portuguese_liwc.columns.values)
stats['english_mean'] = english_sample.mean(axis=0)

stats['diff'] = stats.apply(diff,axis=1)

significance = []
for column in list(stats.index.values):
    a = portuguese_liwc[column]
    b = english_sample[column]
    t, p = wilcoxon(a, b)
    significance.append(p)
stats['significance'] = significance

In [25]:
stats[stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,portuguese_mean,english_mean,diff,significance


In [26]:
stats[stats.significance < 0.05].sort_values('diff',ascending=True).head(10)

Unnamed: 0,portuguese_mean,english_mean,diff,significance
future,2.229937,2.170971,0.058966,0.01249412
assent,1.062156,1.001803,0.060353,4.960689999999999e-19
wps,15.950989,15.761487,0.189502,0.001163319
family,1.301487,1.53399,0.232503,4.2723689999999996e-63
score,0.479365,0.046321,0.433044,0.0
death,0.933477,0.414116,0.519361,0.0
anx,1.688321,1.158249,0.530072,8.073989000000001e-241
home,2.091483,2.773391,0.681908,1.8195610000000001e-165
filler,0.495407,1.403855,0.908448,0.0
anger,3.07512,2.108122,0.966998,0.0


In [27]:
stats[stats.significance < 0.05].sort_values('diff',ascending=False).head(10)

Unnamed: 0,portuguese_mean,english_mean,diff,significance
Unnamed: 0,18842.674871,198210.061636,179367.386765,0.0
cogmech,141.564634,58.841162,82.723472,0.0
social,69.569765,28.017471,41.552293,0.0
incl,51.971297,21.016224,30.955073,0.0
ipron,44.538878,17.793289,26.745589,0.0
tentat,34.992339,8.863764,26.128575,0.0
relativ,79.698756,56.132041,23.566714,0.0
you,24.81492,1.833466,22.981454,0.0
ingest,24.720699,2.312753,22.407945,0.0
humans,24.318161,2.365723,21.952439,0.0
