In [1]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon

# load corpus
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
corpus = corpus.reset_index()

corpus_feat = pd.read_csv('corpus_liwc_mtx.csv.gz', compression='gzip')
corpus_feat.shape

(534, 68)

In [2]:
import re

def wc(x): 
    try:
        return len(re.findall(r'\w+', x['content']))
    except:
        return 0
    
corpus['wc'] = corpus.apply(wc,axis=1)
corpus_feat['wc'] = corpus['wc']

In [3]:
corpus_feat.drop('Unnamed: 0', axis=1,inplace=True)
corpus_feat.drop('confidence', axis=1,inplace=True)

In [4]:
wc_vector = corpus_feat['wc']
class_vector = corpus_feat['class']

corpus_feat.drop('class',axis=1,inplace=True)
corpus_feat.drop('wc',axis=1,inplace=True)

In [5]:
data = corpus_feat.as_matrix().astype(float) / wc_vector.as_matrix().astype(float)[:, np.newaxis]
data[np.isnan(data)] = 0
data[data >= 1E308] = 0
data.shape

  """Entry point for launching an IPython kernel.


(534, 64)

In [6]:
columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number','swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
prop_liwc = pd.DataFrame(data, columns=columns)
prop_liwc['class'] = class_vector

In [7]:
outro = prop_liwc[prop_liwc['class'] == 'outro']
diario = prop_liwc[prop_liwc['class'] == 'diario']

In [8]:
def diff(x): 
    return (x['diario_mean']) - (x['outro_mean'])
    #return 100 * ((x['diario_mean']/diario.wc.mean()) - (x['outro_mean']/outro.wc.mean()))
    
stats = pd.DataFrame(data={'diario_mean': diario.mean(axis=0)}, index=columns)
stats['diario_std'] = diario.std(axis=0)
stats['outro_mean'] = outro.mean(axis=0)
stats['outro_std'] = outro.std(axis=0)
stats['diff'] = stats.apply(diff,axis=1)
stats = stats * 100

In [9]:
outro = prop_liwc[prop_liwc['class'] == 'outro']
diario = prop_liwc[prop_liwc['class'] == 'diario'].sample(len(outro))

significance = []
for column in list(stats.index.values):
    a = diario[column]
    b = outro[column]
    t, p = wilcoxon(a, b)
    significance.append(p)
stats['significance'] = significance

### Wilcoxon Rejected Linguistic Categories

In [10]:
linguistic_columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number']
linguistic_stats = stats.ix[linguistic_columns]
linguistic_stats.sort_values('diff',ascending=False)
linguistic_stats[linguistic_stats.significance <= 0.05].sort_values('diff',ascending=False)

Unnamed: 0,diario_mean,diario_std,outro_mean,outro_std,diff,significance
funct,48.074812,6.240477,44.914592,6.389209,3.160221,1.280426e-06
verb,14.640012,3.383896,13.058287,3.319171,1.581725,1.797862e-05
i,3.443419,2.105991,1.939779,1.614582,1.50364,1.288248e-13
pronoun,18.011798,4.372227,16.738534,4.048724,1.273263,0.0009235415
past,4.339875,2.268508,3.251727,1.721237,1.088148,2.202102e-07
quant,5.374879,1.811825,4.52171,1.636805,0.853169,8.83472e-06
adverb,3.80135,1.57137,2.972749,1.349097,0.828601,1.574009e-05
conj,9.588038,2.314788,8.958689,2.294111,0.62935,0.006504289
negate,1.697326,1.182546,1.352419,0.931901,0.344907,0.00119052
present,7.815691,2.640534,7.912399,2.73801,-0.096709,0.04654905


### Wilcoxon Rejected Linguistic Categories

In [11]:
linguistic_stats[linguistic_stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,diario_mean,diario_std,outro_mean,outro_std,diff,significance
ppron,11.286512,3.014084,11.265673,3.006212,0.020839,0.584986
future,0.594461,0.529025,0.549943,0.557398,0.044517,0.176078
ipron,11.785252,3.022925,11.517185,2.908371,0.268067,0.166759
number,1.495991,0.971771,1.311089,0.785835,0.184902,0.101707
auxverb,5.849782,2.112848,5.32122,2.042196,0.528562,0.090922
we,0.426263,0.555506,0.553866,0.73992,-0.127603,0.05189


### Wilcoxon Rejected Psychological Categories

In [16]:
psychological_columns = ['swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']
psychoProc_stats = stats.ix[psychological_columns]
psychoProc_stats[psychoProc_stats.significance <= 0.05].sort_values('diff',ascending=False).tail(6)

Unnamed: 0,diario_mean,diario_std,outro_mean,outro_std,diff,significance
home,0.590785,0.740793,0.431651,0.592966,0.159134,0.000689
anx,0.478541,0.568647,0.375735,0.418989,0.102806,0.034675
filler,0.155947,0.344473,0.083384,0.270106,0.072563,0.001704
anger,0.727392,0.687519,0.926319,0.91358,-0.198927,0.033156
nonfl,2.267855,1.115228,2.670826,1.146495,-0.40297,0.007251
work,2.302282,1.718505,2.747805,1.991083,-0.445523,0.006322


### Wilcoxon Not Rejected Psychological Categories

In [13]:
psychoProc_stats[psychoProc_stats.significance > 0.05].sort_values('significance',ascending=False).head(6)

Unnamed: 0,diario_mean,diario_std,outro_mean,outro_std,diff,significance
negemo,2.493855,1.57273,2.424255,1.580562,0.0696,0.994617
incl,13.843956,2.971938,13.900658,2.923731,-0.056702,0.818957
family,0.42612,0.766946,0.401683,0.805169,0.024437,0.751502
health,1.047675,0.991947,0.9683,0.923305,0.079375,0.713529
hear,1.342451,1.059902,1.368052,1.056469,-0.025601,0.709812
social,18.581261,4.463031,18.555436,4.58337,0.025824,0.623259
