In [29]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus_liwc = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')

In [30]:
corpus_liwc['class'] = corpus['qual_a_melhor_classificao_para_esse_texto']
corpus_liwc['confidence'] = corpus['qual_a_melhor_classificao_para_esse_texto:confidence']
corpus_liwc['wc'] = corpus['contentcount']

In [51]:
diario = corpus_liwc[corpus_liwc['class'] == 'diario'].sample(330)
outro = corpus_liwc[corpus_liwc['class'] == 'outro'].sample(330)

In [38]:
import re

def wc(x): 
    try:
        return len(re.findall(r'\w+', x['content']))
    except:
        return 0
    
corpus['wc'] = corpus.apply(wc,axis=1)
corpus[['qual_a_melhor_classificao_para_esse_texto','wc']].groupby(['qual_a_melhor_classificao_para_esse_texto']).agg(['mean'])

Unnamed: 0_level_0,wc
Unnamed: 0_level_1,mean
qual_a_melhor_classificao_para_esse_texto,Unnamed: 1_level_2
diario,373.531546
outro,448.420765


In [4]:
columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number','swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']

In [63]:
def diff(x): 
    return 100 * ((x['diario_mean']/373) - (x['outro_mean']/448))
    
stats = pd.DataFrame(data={'diario_mean': diario.mean(axis=0)}, index=columns)
stats['outro_mean'] = outro.mean(axis=0)

stats['diff'] = stats.apply(diff,axis=1)

In [64]:
significance = []
for column in list(stats.index.values):
    a = diario[column]
    b = outro[column]
    t, p = wilcoxon(a, b)
    significance.append(p)
stats['significance'] = significance

## Wilcoxon Not Rejected Categories
#### two populations are equally interested

In [65]:
liguistic_columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number']
liguistic_stats = stats.ix[liguistic_columns]
liguistic_stats[['significance','diff']][liguistic_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
past,0.911653,0.651369
negate,0.563017,0.198329
adverb,0.38235,0.430911
we,0.074792,-0.077741
number,0.068814,0.053253


In [66]:
psychological_columns = ['swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
psychological_stats = stats.ix[psychological_columns]
psychological_stats[['significance','diff']][psychological_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
sad,0.907298,0.144327
feel,0.881418,0.392118
family,0.655018,0.093446
sexual,0.613948,0.219892
anx,0.606215,0.068065
friend,0.480087,0.185595
see,0.415161,0.1484
humans,0.377707,0.708303
health,0.367773,0.015795
assent,0.28907,0.012308


## Wilcoxon Rejected Categories
#### two populations has different interests

In [67]:
liguistic_columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number']
liguistic_stats = stats.ix[liguistic_columns]
liguistic_stats[['significance','diff']][liguistic_stats.significance <= 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
quant,0.02700406,0.190497
verb,0.02001081,0.793327
future,0.007640074,-0.049668
pronoun,0.004678376,0.361524
conj,0.004590971,0.100125
auxverb,0.003403468,0.07803
funct,0.002282232,0.819664
ipron,0.0005650039,-0.368602
ppron,0.0003130705,-0.074838
present,0.0002787303,-0.156834


In [73]:
psychological_columns = ['swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
psychological_stats = stats.ix[psychological_columns]
psychological_stats[['significance','diff']][psychological_stats.significance <= 0.05].sort_values('significance',ascending=True)

Unnamed: 0,significance,diff
nonfl,4.481162e-07,-0.358976
work,1.51106e-06,-0.628144
incl,0.0001212413,-0.26439
space,0.0001227628,-0.295822
swear,0.000308671,-0.190471
social,0.0005599783,-0.335637
relig,0.0006422451,-0.24701
cogmech,0.0009673212,-0.113749
achieve,0.002072578,-0.185265
death,0.006050294,-0.036165


In [69]:
## story categories
psychological_stats[psychological_stats.significance <= 0.05].sort_values('diff',ascending=False).head(10)

Unnamed: 0,diario_mean,outro_mean,diff,significance
relativ,80.163636,91.233333,1.127008,0.016144
bio,23.645455,26.257576,0.478198,0.026998
affect,31.642424,36.657576,0.300729,0.021864
inhib,16.330303,18.321212,0.288541,0.040421
insight,23.218182,26.99697,0.198604,0.01006
discrep,22.527273,26.478788,0.12904,0.014568
tentat,34.763636,41.209091,0.121552,0.01943
body,9.851515,11.345455,0.108689,0.006515
excl,25.593939,30.312121,0.095547,0.014294
filler,0.512121,0.351515,0.058835,0.00768


In [70]:
## non-story categories
psychological_stats[psychological_stats.significance <= 0.05].sort_values('diff',ascending=True).head(10)

Unnamed: 0,diario_mean,outro_mean,diff,significance
work,8.339394,12.830303,-0.628144,1.51106e-06
nonfl,8.730303,12.093939,-0.358976,4.481162e-07
social,70.060606,85.651515,-0.335637,0.0005599783
space,36.330303,44.960606,-0.295822,0.0001227628
incl,51.784848,63.381818,-0.26439,0.0001212413
relig,2.318182,3.890909,-0.24701,0.0006422451
swear,19.236364,23.957576,-0.190471,0.000308671
achieve,11.033333,14.081818,-0.185265,0.002072578
cogmech,140.818182,169.642424,-0.113749,0.0009673212
anger,3.169697,4.236364,-0.095832,0.006205329
