In [10]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus_liwc = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')

In [11]:
corpus_liwc['class'] = corpus['qual_a_melhor_classificao_para_esse_texto']
corpus_liwc['confidence'] = corpus['qual_a_melhor_classificao_para_esse_texto:confidence']
corpus_liwc['wc'] = corpus['contentcount']
corpus_liwc['judges'] = corpus['_trusted_judgments']

In [12]:
corpus_liwc = corpus_liwc[corpus_liwc.wc.apply(lambda x: str(x).isnumeric())]
corpus_liwc = corpus_liwc[corpus_liwc['judges'] == 3]

In [13]:
diario = corpus_liwc[corpus_liwc['class'] == 'diario'].sample(330)
outro = corpus_liwc[corpus_liwc['class'] == 'outro'].sample(330)

In [14]:
columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number','swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']

In [15]:
stats = pd.DataFrame(data={'diario_mean': diario.mean(axis=0)}, index=columns)
stats['outro_mean'] = outro.mean(axis=0)
stats['diff'] = stats['diario_mean'] - stats['outro_mean']

In [16]:
significance = []
for column in list(stats.index.values):
    a = diario[column]
    b = outro[column]
    t, p = wilcoxon(a, b)
    significance.append(p)
stats['significance'] = significance

In [17]:
liguistic_columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number']
liguistic_stats = stats.ix[liguistic_columns]
liguistic_stats[['significance','diff']][liguistic_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
past,0.971392,0.936364
negate,0.548572,-0.148485
adverb,0.445675,-0.636364
number,0.196496,-0.542424
future,0.104429,-0.418182
quant,0.09399,-1.972727
we,0.065124,-0.775758


In [18]:
psychological_columns = ['swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
psychological_stats = stats.ix[psychological_columns]
psychological_stats[['significance','diff']][psychological_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
sexual,0.944137,-0.009091
family,0.791442,0.039394
anx,0.714057,-0.009091
friend,0.636196,0.218182
health,0.608327,-0.451515
home,0.54336,0.209091
feel,0.530055,-0.227273
sad,0.52389,-0.215152
see,0.481451,-0.427273
ingest,0.245384,-1.057576


In [20]:
## story categories
psychological_stats[['significance','diff']][psychological_stats.significance > 0.05].sort_values('diff',ascending=False).head(10)

Unnamed: 0,significance,diff
friend,0.636196,0.218182
home,0.54336,0.209091
family,0.791442,0.039394
anx,0.714057,-0.009091
sexual,0.944137,-0.009091
assent,0.218159,-0.172727
sad,0.52389,-0.215152
feel,0.530055,-0.227273
death,0.086738,-0.260606
see,0.481451,-0.427273


In [21]:
## non-story categories
psychological_stats[['significance','diff']][psychological_stats.significance > 0.05].sort_values('diff',ascending=True).head(10)

Unnamed: 0,significance,diff
time,0.175111,-2.315152
bio,0.054083,-2.145455
humans,0.161752,-1.878788
percept,0.096585,-1.748485
inhib,0.129642,-1.675758
negemo,0.122245,-1.442424
certain,0.068497,-1.19697
ingest,0.245384,-1.057576
money,0.148604,-0.869697
health,0.608327,-0.451515
