In [17]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus_liwc = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')

In [18]:
corpus_liwc['class'] = corpus['qual_a_melhor_classificao_para_esse_texto']
corpus_liwc['confidence'] = corpus['qual_a_melhor_classificao_para_esse_texto:confidence']
corpus_liwc['wc'] = corpus['contentcount']
corpus_liwc['judges'] = corpus['_trusted_judgments']

In [19]:
corpus_liwc = corpus_liwc[corpus_liwc.wc.apply(lambda x: str(x).isnumeric())]
corpus_liwc = corpus_liwc[corpus_liwc['judges'] == 3]

In [20]:
diario = corpus_liwc[corpus_liwc['class'] == 'diario'].sample(330)
outro = corpus_liwc[corpus_liwc['class'] == 'outro'].sample(330)

In [21]:
columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number','swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']

In [22]:
stats = pd.DataFrame(data={'diario_mean': diario.mean(axis=0)}, index=columns)
stats['outro_mean'] = outro.mean(axis=0)
stats['diff'] = stats['diario_mean'] - stats['outro_mean']

In [23]:
significance = []
for column in list(stats.index.values):
    a = diario[column]
    b = outro[column]
    t, p = wilcoxon(a, b)
    significance.append(p)
stats['significance'] = significance

## Wilcoxon Not Rejected Categories
#### two populations are equally interested

In [24]:
liguistic_columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number']
liguistic_stats = stats.ix[liguistic_columns]
liguistic_stats[['significance','diff']][liguistic_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
past,0.803038,0.424242
negate,0.582544,-0.190909
adverb,0.254213,-0.70303
number,0.222036,-0.515152
quant,0.067461,-2.009091


In [25]:
psychological_columns = ['swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
psychological_stats = stats.ix[psychological_columns]
psychological_stats[['significance','diff']][psychological_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
sexual,0.912331,0.1
anx,0.88816,0.021212
feel,0.779848,-0.036364
friend,0.76828,0.212121
see,0.701963,-0.172727
family,0.60637,-0.245455
health,0.550396,-0.421212
humans,0.364492,-1.509091
assent,0.284289,-0.187879
sad,0.24404,-0.427273


## Wilcoxon Rejected Categories
#### two populations has different interests

In [26]:
liguistic_columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number']
liguistic_stats = stats.ix[liguistic_columns]
liguistic_stats[['significance','diff']][liguistic_stats.significance <= 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
verb,0.048505,-5.918182
future,0.020749,-0.509091
we,0.016447,-0.845455
auxverb,0.010987,-3.457576
pronoun,0.004995,-10.193939
funct,0.00408,-27.8
conj,0.004049,-5.893939
ipron,0.001615,-8.590909
present,0.001165,-6.454545
ppron,0.001125,-7.906061


In [27]:
psychological_columns = ['swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
psychological_stats = stats.ix[psychological_columns]
psychological_stats[['significance','diff']][psychological_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
sexual,0.912331,0.1
anx,0.88816,0.021212
feel,0.779848,-0.036364
friend,0.76828,0.212121
see,0.701963,-0.172727
family,0.60637,-0.245455
health,0.550396,-0.421212
humans,0.364492,-1.509091
assent,0.284289,-0.187879
sad,0.24404,-0.427273


In [28]:
## story categories
psychological_stats[psychological_stats.significance <= 0.05].sort_values('diff',ascending=False).head(10)

Unnamed: 0,diario_mean,outro_mean,diff,significance
filler,0.454545,0.336364,0.118182,0.044004
death,0.912121,1.245455,-0.333333,0.004795
anger,3.20303,4.181818,-0.978788,0.012911
leisure,7.733333,8.872727,-1.139394,0.03398
certain,9.227273,10.521212,-1.293939,0.015936
hear,4.863636,6.4,-1.536364,0.000818
relig,2.236364,3.912121,-1.675758,0.000155
inhib,16.09697,18.112121,-2.015152,0.043226
achieve,11.154545,13.757576,-2.60303,0.002742
nonfl,8.936364,11.709091,-2.772727,6.4e-05


In [29]:
## non-story categories
psychological_stats[psychological_stats.significance <= 0.05].sort_values('diff',ascending=True).head(10)

Unnamed: 0,diario_mean,outro_mean,diff,significance
cogmech,141.830303,165.872727,-24.042424,0.004966
social,69.345455,83.906061,-14.560606,0.000831
relativ,79.293939,89.578788,-10.284848,0.013153
incl,52.445455,62.157576,-9.712121,0.001298
space,36.272727,44.133333,-7.860606,0.000345
tentat,35.178788,40.072727,-4.893939,0.025914
affect,31.651515,36.106061,-4.454545,0.022497
swear,19.306061,23.669697,-4.363636,0.001177
work,8.554545,12.833333,-4.278788,1.1e-05
excl,25.448485,29.454545,-4.006061,0.017434
