In [1]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon

# load corpus
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
corpus = corpus.reset_index()

corpus_feat = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')
corpus_feat.shape

(534, 68)

In [2]:
import re

def wc(x): 
    try:
        return len(re.findall(r'\w+', x['content']))
    except:
        return 0
    
corpus['wc'] = corpus.apply(wc,axis=1)
corpus_feat['wc'] = corpus['wc']

In [3]:
corpus_feat.drop('Unnamed: 0', axis=1,inplace=True)
corpus_feat.drop('confidence', axis=1,inplace=True)

In [4]:
wc_vector = corpus_feat['wc']
class_vector = corpus_feat['class']

corpus_feat.drop('class',axis=1,inplace=True)
corpus_feat.drop('wc',axis=1,inplace=True)

In [5]:
data = corpus_feat.as_matrix().astype(float) / wc_vector.as_matrix().astype(float)[:, np.newaxis]
data[np.isnan(data)] = 0
data[data >= 1E308] = 0
data.shape

  if __name__ == '__main__':


(534, 64)

In [6]:
columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number','swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
prop_liwc = pd.DataFrame(data, columns=columns)
prop_liwc['class'] = class_vector

In [7]:
outro = prop_liwc[prop_liwc['class'] == 'outro']
diario = prop_liwc[prop_liwc['class'] == 'diario']

In [8]:
def diff(x): 
    return (x['diario_mean']) - (x['outro_mean'])
    #return 100 * ((x['diario_mean']/diario.wc.mean()) - (x['outro_mean']/outro.wc.mean()))
    
stats = pd.DataFrame(data={'diario_mean': diario.mean(axis=0)}, index=columns)
stats['diario_std'] = diario.std(axis=0)
stats['outro_mean'] = outro.mean(axis=0)
stats['outro_std'] = outro.std(axis=0)
stats['diff'] = stats.apply(diff,axis=1)
stats = stats * 100

In [9]:
outro = prop_liwc[prop_liwc['class'] == 'outro']
diario = prop_liwc[prop_liwc['class'] == 'diario'].sample(len(outro))

significance = []
for column in list(stats.index.values):
    a = diario[column]
    b = outro[column]
    t, p = wilcoxon(a, b)
    significance.append(p)
stats['significance'] = significance

### Wilcoxon Rejected Linguistic Categories

In [14]:
linguistic_columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number']
linguistic_stats = stats.ix[linguistic_columns]
linguistic_stats.sort_values('diff',ascending=False)
linguistic_stats[linguistic_stats.significance <= 0.05].sort_values('significance',ascending=True)

Unnamed: 0,diario_mean,diario_std,outro_mean,outro_std,diff,significance
i,3.443419,2.105991,1.939779,1.614582,1.50364,3.526405e-14
you,6.368339,2.258465,7.655838,2.29598,-1.287499,9.123177e-08
shehe,5.72946,2.00856,6.969947,2.039647,-1.240487,2.155176e-07
adverb,3.80135,1.57137,2.972749,1.349097,0.828601,2.016648e-06
funct,48.074812,6.240477,44.914592,6.389209,3.160221,3.090001e-06
verb,14.640012,3.383896,13.058287,3.319171,1.581725,3.090001e-06
past,4.339875,2.268508,3.251727,1.721237,1.088148,8.725385e-06
preps,13.385029,3.045543,14.666125,3.216046,-1.281096,2.731157e-05
they,1.613463,1.174379,2.020093,1.16732,-0.40663,0.0001434357
article,7.246217,2.025399,7.858596,2.06649,-0.612378,0.0005163599


### Wilcoxon Not Rejected Linguistic Categories

In [11]:
linguistic_stats[linguistic_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,diario_mean,diario_std,outro_mean,outro_std,diff,significance
ppron,11.286512,3.014084,11.265673,3.006212,0.020839,0.616648
present,7.815691,2.640534,7.912399,2.73801,-0.096709,0.481394
future,0.594461,0.529025,0.549943,0.557398,0.044517,0.418611
we,0.426263,0.555506,0.553866,0.73992,-0.127603,0.41779
ipron,11.785252,3.022925,11.517185,2.908371,0.268067,0.404338
number,1.495991,0.971771,1.311089,0.785835,0.184902,0.052191


### Wilcoxon Rejected Psychological Categories

In [12]:
psychological_columns = ['swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
psychoProc_stats = stats.ix[psychological_columns]
psychoProc_stats[psychoProc_stats.significance <= 0.05].sort_values('significance',ascending=True).head(10)

Unnamed: 0,diario_mean,diario_std,outro_mean,outro_std,diff,significance
relativ,21.787661,4.22296,19.606526,3.610765,2.181135,5e-06
feel,2.224723,1.191583,1.829541,1.156732,0.395182,3.3e-05
humans,6.877624,2.503676,5.898336,2.514503,0.979288,4.4e-05
percept,5.688852,2.211484,4.992647,2.107234,0.696204,4.8e-05
sexual,1.398003,1.05858,1.066595,0.823168,0.331408,5.6e-05
filler,0.155947,0.344473,0.083384,0.270106,0.072563,0.000116
ingest,6.800587,2.385995,5.814962,1.987943,0.985625,0.000283
certain,2.486801,1.156612,2.145316,1.098878,0.341485,0.000431
discrep,6.122851,2.199861,5.3713,1.797836,0.751551,0.000639
tentat,9.36638,2.865459,8.321216,2.464374,1.045164,0.001075


### Wilcoxon Not Rejected Psychological Categories

In [13]:
psychoProc_stats[psychoProc_stats.significance > 0.05].sort_values('significance',ascending=False).head(10)

Unnamed: 0,diario_mean,diario_std,outro_mean,outro_std,diff,significance
money,1.95392,1.125584,1.963778,1.227436,-0.009858,0.755287
health,1.047675,0.991947,0.9683,0.923305,0.079375,0.575502
swear,5.179338,1.927663,5.353678,2.051963,-0.17434,0.49314
sad,1.034851,1.010062,0.919221,0.898489,0.11563,0.460145
social,18.581261,4.463031,18.555436,4.58337,0.025824,0.446288
incl,13.843956,2.971938,13.900658,2.923731,-0.056702,0.414216
hear,1.342451,1.059902,1.368052,1.056469,-0.025601,0.385011
negemo,2.493855,1.57273,2.424255,1.580562,0.0696,0.371968
family,0.42612,0.766946,0.401683,0.805169,0.024437,0.31246
death,0.229088,0.382567,0.30489,0.491463,-0.075802,0.271313
