In [98]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon

portuguese_liwc = pd.read_csv('portuguese_stories_liwc.csv.gz', compression='gzip')
english_liwc = pd.read_csv('icwsm09_stories_liwc.csv.gz', compression='gzip')

In [99]:
print("postuguese: " + str(len(portuguese_liwc)))
print("english: " + str(len(english_liwc)))

postuguese: 37746
english: 383361


## Corpus Filter

In [100]:
portuguese_liwc = portuguese_liwc[((portuguese_liwc['wc'] > 10) & (portuguese_liwc['wc'] < 1000))]
portuguese_liwc = portuguese_liwc[((portuguese_liwc['wps'] > 3) & (portuguese_liwc['wps'] < 30))]
portuguese_liwc = portuguese_liwc[portuguese_liwc['i'] > 2]
portuguese_liwc = portuguese_liwc[(portuguese_liwc['negemo'] + portuguese_liwc['posemo']) > 2]
portuguese_liwc = portuguese_liwc[portuguese_liwc['score'] > 0]
len(portuguese_liwc)

28847

In [101]:
english_liwc = english_liwc[((english_liwc['wc'] > 10) & (english_liwc['wc'] < 1000))]
english_liwc = english_liwc[((english_liwc['wps'] > 3) & (english_liwc['wps'] < 30))]
english_liwc = english_liwc[english_liwc['i'] > 2]
english_liwc = english_liwc[(english_liwc['negemo'] + english_liwc['posemo']) > 2]
english_liwc = english_liwc[english_liwc['score'] > 0]
len(english_liwc)

161704

## Proportional LIWC

In [102]:
portuguese_feat = portuguese_liwc.drop(['Unnamed: 0','score','wc','wps'],axis=1)
portuguese_wc = portuguese_liwc['wc']

portuguese_data = portuguese_feat.as_matrix().astype(float) / portuguese_wc.as_matrix().astype(float)[:, np.newaxis]
portuguese_data[np.isnan(portuguese_data)] = 0
portuguese_data[portuguese_data >= 1E308] = 0
portuguese_data.shape

(28847, 64)

In [103]:
english_feat = english_liwc.drop(['Unnamed: 0','score','wc','wps'],axis=1)
english_wc = english_liwc['wc']

english_data = english_feat.as_matrix().astype(float) / english_wc.as_matrix().astype(float)[:, np.newaxis]
english_data[np.isnan(english_data)] = 0
english_data[english_data >= 1E308] = 0
english_data.shape

(161704, 64)

In [104]:
english_sample = english_data[np.random.choice(english_data.shape[0], len(portuguese_data), replace=False)]
english_sample.shape

(28847, 64)

In [105]:
columns = ['funct','pronoun','ppron','i','we','you','shehe','they','ipron','article','verb','auxverb','past','present','future','adverb','preps','conj','negate','quant','number','swear','social','family','friend','humans','affect','posemo','negemo','anx','anger','sad','cogmech','insight','cause','discrep','tentat','certain','inhib','incl','excl','percept','see','hear','feel','bio','body','health','sexual','ingest','relativ','motion','space','time','work','achieve','leisure','home','money','relig','death','assent','nonfl','filler']

## Wilcoxon Test

In [106]:
def diff(x): 
    return abs(x['portuguese_mean'] - x['english_mean'])

stats = pd.DataFrame(data={'portuguese_mean': np.mean(portuguese_data, axis=0)}, index=columns)
stats['english_mean'] = np.mean(english_sample, axis=0)

stats['diff'] = stats.apply(diff,axis=1)

In [107]:
significance = []
for column in range(0,len(columns)):
    a = portuguese_data[:,column]
    b = english_sample[:,column]
    t, p = wilcoxon(a, b)
    significance.append(p)
stats['significance'] = significance

In [108]:
stats[stats.significance > 0.05].sort_values('significance',ascending=False)

Unnamed: 0,portuguese_mean,english_mean,diff,significance


In [109]:
stats[stats.significance < 0.05].sort_values('diff',ascending=True).head(10)

Unnamed: 0,portuguese_mean,english_mean,diff,significance
assent,0.002881,0.002779,0.000102,8.559474e-18
future,0.00589,0.005425,0.000465,8.880326000000001e-17
family,0.003254,0.004052,0.000798,1.10793e-76
anx,0.00441,0.003039,0.001371,1.639316e-279
death,0.002406,0.001021,0.001385,0.0
home,0.005356,0.007147,0.001791,1.028044e-190
filler,0.001459,0.003776,0.002318,0.0
anger,0.00791,0.005589,0.002321,0.0
we,0.00418,0.007415,0.003235,0.0
i,0.031258,0.03496,0.003703,2.03021e-25


In [110]:
stats[stats.significance < 0.05].sort_values('diff',ascending=False).head(10)

Unnamed: 0,portuguese_mean,english_mean,diff,significance
cogmech,0.37156,0.150263,0.221297,0.0
social,0.183999,0.070568,0.113431,0.0
incl,0.135478,0.053676,0.081802,0.0
ipron,0.116244,0.045323,0.070921,0.0
tentat,0.091577,0.0226,0.068977,0.0
relativ,0.210647,0.144416,0.066231,0.0
pronoun,0.17519,0.114345,0.060846,0.0
humans,0.066145,0.005807,0.060338,0.0
you,0.064564,0.004524,0.06004,0.0
ingest,0.064926,0.005943,0.058983,0.0
