In [3]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus_liwc = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')

In [4]:
corpus_liwc['class'] = corpus['qual_a_melhor_classificao_para_esse_texto']
corpus_liwc['confidence'] = corpus['qual_a_melhor_classificao_para_esse_texto:confidence']
corpus_liwc['wc'] = corpus['contentcount']

In [5]:
diario = corpus_liwc[corpus_liwc['class'] == 'diario'].sample(330)
outro = corpus_liwc[corpus_liwc['class'] == 'outro'].sample(330)

In [6]:
columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number','swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']

In [7]:
stats = pd.DataFrame(data={'diario_mean': diario.mean(axis=0)}, index=columns)
stats['outro_mean'] = outro.mean(axis=0)
stats['diff'] = stats['diario_mean'] - stats['outro_mean']

In [8]:
significance = []
for column in list(stats.index.values):
    a = diario[column]
    b = outro[column]
    t, p = wilcoxon(a, b)
    significance.append(p)
stats['significance'] = significance

## Wilcoxon Not Rejected Categories
#### two populations are equally interested

In [9]:
liguistic_columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number']
liguistic_stats = stats.ix[liguistic_columns]
liguistic_stats[['significance','diff']][liguistic_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
adverb,0.88624,-0.230303
negate,0.757456,-0.112121
past,0.645682,1.333333
number,0.111106,-0.657576
quant,0.070627,-2.109091


In [10]:
psychological_columns = ['swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
psychological_stats = stats.ix[psychological_columns]
psychological_stats[['significance','diff']][psychological_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
family,0.958735,-0.012121
anx,0.92491,-0.009091
feel,0.842379,-0.127273
assent,0.774638,-0.106061
friend,0.615237,0.2
sad,0.556463,-0.309091
ingest,0.37399,-1.124242
home,0.326099,0.248485
see,0.317357,-0.548485
sexual,0.3155,-0.278788


## Wilcoxon Rejected Categories
#### two populations has different interests

In [11]:
liguistic_columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number']
liguistic_stats = stats.ix[liguistic_columns]
liguistic_stats[['significance','diff']][liguistic_stats.significance <= 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
future,0.04456357,-0.527273
verb,0.03820484,-6.130303
we,0.02963067,-0.79697
auxverb,0.01714816,-3.409091
pronoun,0.009719053,-9.272727
conj,0.005828724,-5.727273
funct,0.003043082,-28.00303
ipron,0.00290267,-8.036364
ppron,0.0005909215,-7.875758
present,0.0001311722,-7.284848


In [12]:
psychological_columns = ['swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
psychological_stats = stats.ix[psychological_columns]
psychological_stats[['significance','diff']][psychological_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
family,0.958735,-0.012121
anx,0.92491,-0.009091
feel,0.842379,-0.127273
assent,0.774638,-0.106061
friend,0.615237,0.2
sad,0.556463,-0.309091
ingest,0.37399,-1.124242
home,0.326099,0.248485
see,0.317357,-0.548485
sexual,0.3155,-0.278788


In [13]:
## story categories
psychological_stats[psychological_stats.significance <= 0.05].sort_values('diff',ascending=False).head(10)

Unnamed: 0,diario_mean,outro_mean,diff,significance
filler,0.527273,0.333333,0.193939,0.01373
death,0.930303,1.278788,-0.348485,0.015792
anger,3.00303,4.166667,-1.163636,0.001779
hear,5.075758,6.290909,-1.215152,0.005713
body,9.918182,11.266667,-1.348485,0.017375
relig,2.30303,3.781818,-1.478788,0.00015
negemo,9.184848,11.130303,-1.945455,0.027243
inhib,16.066667,18.360606,-2.293939,0.030045
achieve,11.248485,13.984848,-2.736364,0.001318
nonfl,8.884848,11.763636,-2.878788,1e-05


In [14]:
## non-story categories
psychological_stats[psychological_stats.significance <= 0.05].sort_values('diff',ascending=True).head(10)

Unnamed: 0,diario_mean,outro_mean,diff,significance
cogmech,142.678788,168.451515,-25.772727,0.001624
social,70.363636,84.409091,-14.045455,0.000619
incl,52.348485,62.790909,-10.442424,0.00013
relativ,80.969697,90.466667,-9.49697,0.025738
space,36.109091,44.633333,-8.524242,3.5e-05
tentat,35.348485,40.824242,-5.475758,0.02098
affect,31.218182,36.50303,-5.284848,0.015811
swear,19.078788,23.678788,-4.6,0.000459
insight,23.006061,27.021212,-4.015152,0.010344
work,9.063636,12.80303,-3.739394,5.4e-05
