In [18]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon

portuguese_liwc = pd.read_csv('portuguese_stories_liwc.csv.gz', compression='gzip')
english_liwc = pd.read_csv('icwsm09_stories_liwc.csv.gz', compression='gzip')

In [19]:
print("postuguese: " + str(len(portuguese_liwc)))
print("english: " + str(len(english_liwc)))

postuguese: 37746
english: 383361


## Corpus Filter

In [22]:
portuguese_liwc = portuguese_liwc[((portuguese_liwc['wc'] > 10) & (portuguese_liwc['wc'] < 1000))]
portuguese_liwc = portuguese_liwc[((portuguese_liwc['wps'] > 3) & (portuguese_liwc['wps'] < 30))]
portuguese_liwc = portuguese_liwc[portuguese_liwc['i'] > 2]
portuguese_liwc = portuguese_liwc[(portuguese_liwc['negemo'] + portuguese_liwc['posemo']) > 2]
portuguese_liwc = portuguese_liwc[portuguese_liwc['score'] > 0]
len(portuguese_liwc)

28847

In [24]:
english_liwc = english_liwc[((english_liwc['wc'] > 10) & (english_liwc['wc'] < 1000))]
english_liwc = english_liwc[((english_liwc['wps'] > 3) & (english_liwc['wps'] < 30))]
english_liwc = english_liwc[english_liwc['i'] > 2]
english_liwc = english_liwc[(english_liwc['negemo'] + english_liwc['posemo']) > 2]
english_liwc = english_liwc[english_liwc['score'] > 0]
len(english_liwc)

161704

## Wilcoxon Test

In [25]:
english_sample = english_liwc.sample(len(portuguese_liwc))

In [35]:
def diff(x): 
    return abs(x['portuguese_mean'] - x['english_mean'])

stats = pd.DataFrame(data={'portuguese_mean': portuguese_liwc.mean(axis=0)}, index=portuguese_liwc.columns.values)
stats['english_mean'] = english_sample.mean(axis=0)

stats['diff'] = stats.apply(diff,axis=1)

significance = []
for column in list(stats.index.values):
    a = portuguese_liwc[column]
    b = english_sample[column]
    t, p = wilcoxon(a, b)
    significance.append(p)
stats['significance'] = significance

In [36]:
stats[stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,portuguese_mean,english_mean,diff,significance
preps,50.713211,49.482095,1.231116,0.084829
future,2.229937,2.176864,0.053073,0.082062


In [37]:
stats[stats.significance < 0.05].sort_values('diff',ascending=True).head(10)

Unnamed: 0,portuguese_mean,english_mean,diff,significance
assent,1.062156,1.025722,0.036434,5.451402e-14
wps,15.950989,15.766239,0.18475,0.0005994626
family,1.301487,1.570666,0.269179,4.395356e-78
score,0.479365,0.046262,0.433103,0.0
death,0.933477,0.408465,0.525011,0.0
anx,1.688321,1.154713,0.533608,1.071054e-243
home,2.091483,2.844906,0.753423,5.797805e-198
filler,0.495407,1.429993,0.934586,0.0
anger,3.07512,2.121503,0.953617,4.77274e-309
we,1.651506,3.167019,1.515513,0.0


In [38]:
stats[stats.significance < 0.05].sort_values('diff',ascending=False).head(10)

Unnamed: 0,portuguese_mean,english_mean,diff,significance
Unnamed: 0,18842.674871,198994.058758,180151.383887,0.0
cogmech,141.564634,59.300309,82.264326,0.0
social,69.569765,28.306687,41.263078,0.0
incl,51.971297,21.166291,30.805006,0.0
ipron,44.538878,17.92637,26.612507,0.0
tentat,34.992339,8.938954,26.053385,0.0
you,24.81492,1.870766,22.944154,0.0
relativ,79.698756,56.778521,22.920234,0.0
ingest,24.720699,2.327382,22.393316,0.0
humans,24.318161,2.3712,21.946962,0.0


## Regression Evaluation