In [31]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus_liwc = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')

In [32]:
corpus_liwc['class'] = corpus['qual_a_melhor_classificao_para_esse_texto']
corpus_liwc['confidence'] = corpus['qual_a_melhor_classificao_para_esse_texto:confidence']

In [44]:
import re

def wc(x): 
    try:
        return len(re.findall(r'\w+', x['content']))
    except:
        return 0
    
corpus['wc'] = corpus.apply(wc,axis=1)
corpus_liwc['wc'] = corpus['wc']
corpus_liwc = corpus_liwc[corpus_liwc['confidence'] == 1]
corpus_liwc[['class','wc']].groupby(['class']).agg(['mean','count'])

Unnamed: 0_level_0,wc,wc
Unnamed: 0_level_1,mean,count
class,Unnamed: 1_level_2,Unnamed: 2_level_2
diario,371.153179,346
outro,477.047872,188


In [34]:
outro = corpus_liwc[corpus_liwc['class'] == 'outro']
diario = corpus_liwc[corpus_liwc['class'] == 'diario'].sample(len(outro))

In [35]:
columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number','swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']

In [46]:
def diff(x): 
    return (x['diario_mean']) - (x['outro_mean'])
    #return 100 * ((x['diario_mean']/diario.wc.mean()) - (x['outro_mean']/outro.wc.mean()))
    
stats = pd.DataFrame(data={'diario_mean': diario.mean(axis=0)}, index=columns)
stats['outro_mean'] = outro.mean(axis=0)

stats['diff'] = stats.apply(diff,axis=1)

In [47]:
significance = []
for column in list(stats.index.values):
    a = diario[column]
    b = outro[column]
    t, p = wilcoxon(a, b)
    significance.append(p)
stats['significance'] = significance

## Wilcoxon Not Rejected Categories
#### two populations are equally interested

In [48]:
liguistic_columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number']
liguistic_stats = stats.ix[liguistic_columns]
liguistic_stats[['significance','diff']][liguistic_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
negate,0.918024,0.037234
adverb,0.843622,-0.175532
past,0.414383,1.361702
future,0.128246,-0.5
quant,0.104887,-2.569149


In [49]:
psychological_columns = ['swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
psychological_stats = stats.ix[psychological_columns]
psychological_stats[['significance','diff']][psychological_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
anx,0.920032,0.239362
sexual,0.853533,-0.255319
friend,0.806944,0.06383
home,0.790528,-0.085106
feel,0.660014,-0.239362
see,0.294309,-0.569149
assent,0.276611,-0.207447
certain,0.271694,-1.473404
sad,0.23855,-0.446809
family,0.223343,-0.590426


## Wilcoxon Rejected Categories
#### two populations has different interests

In [50]:
liguistic_columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number']
liguistic_stats = stats.ix[liguistic_columns]
liguistic_stats[['significance','diff']][liguistic_stats.significance <= 0.05].sort_values('significance',ascending=False)

Unnamed: 0,significance,diff
auxverb,0.03437545,-4.0
verb,0.03075465,-7.601064
number,0.01374029,-1.047872
pronoun,0.002403478,-13.590426
conj,0.001211852,-7.925532
funct,0.0007842559,-38.867021
ipron,0.0002667456,-11.643617
we,0.0001235138,-1.579787
ppron,3.375651e-05,-12.111702
present,2.071924e-05,-9.994681


In [51]:
psychological_columns = ['swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
psychological_stats = stats.ix[psychological_columns]
print(len(psychological_stats[psychological_stats.significance <= 0.05]))
psychological_stats[['significance','diff']][psychological_stats.significance <= 0.05].sort_values('significance',ascending=True)

28


Unnamed: 0,significance,diff
nonfl,2.648187e-08,-4.489362
space,4.476228e-06,-12.361702
relig,7.700756e-06,-2.409574
incl,1.430851e-05,-15.505319
social,5.259275e-05,-20.803191
work,0.0001695272,-4.510638
swear,0.000312036,-6.281915
achieve,0.0004190609,-3.87766
cogmech,0.0006824609,-33.340426
anger,0.001280878,-1.643617


In [52]:
## story categories
psychological_stats[psychological_stats.significance <= 0.05].sort_values('diff',ascending=False).head(10)

Unnamed: 0,diario_mean,outro_mean,diff,significance
filler,0.5,0.292553,0.207447,0.005353
death,0.941489,1.473404,-0.531915,0.016553
health,3.930851,5.047872,-1.117021,0.027567
hear,4.845745,6.37766,-1.531915,0.009077
anger,3.0,4.643617,-1.643617,0.001281
body,9.255319,11.324468,-2.069149,0.003561
money,7.276596,9.393617,-2.117021,0.010215
negemo,9.601064,11.861702,-2.260638,0.01637
relig,2.239362,4.648936,-2.409574,8e-06
motion,23.851064,26.388298,-2.537234,0.036953


In [53]:
## non-story categories
psychological_stats[psychological_stats.significance <= 0.05].sort_values('diff',ascending=True).head(10)

Unnamed: 0,diario_mean,outro_mean,diff,significance
cogmech,141.526596,174.867021,-33.340426,0.000682
social,68.010638,88.81383,-20.803191,5.3e-05
incl,52.010638,67.515957,-15.505319,1.4e-05
relativ,79.946809,94.329787,-14.382979,0.005722
space,35.723404,48.085106,-12.361702,4e-06
affect,30.707447,37.952128,-7.244681,0.002873
tentat,34.930851,41.324468,-6.393617,0.027612
swear,19.196809,25.478723,-6.281915,0.000312
time,29.579787,34.898936,-5.319149,0.017193
work,9.053191,13.56383,-4.510638,0.00017
