# Descriptive Statistics
### Based on Chua, Alton Y.K. and Banerjee, Snehasish (2015)
#### http://onlinelibrary.wiley.com/doi/10.1002/asi.23180/abstract

In [1]:
import sys
sys.path.append("../pylinguistics/pylinguistics/")

import Pylinguistics as pl
import pandas as pd
import plotsfunc as pf
import numpy as np
from scipy.stats import spearmanr
%matplotlib inline

# Load the review features from a gzip-compressed CSV file.
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
# Alternative input kept for reference (presumably the Amazon dataset -- TODO confirm).
#reviews_features = pd.read_csv('amazon-help.csv.gz')

  interactivity=interactivity, compiler=compiler, result=result)


In [64]:
## Add the helpfulness-related columns to the corpus
def helpf(x):
    """Return the fraction of positive votes for one review row.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series or dict) exposing
        'thumbsup' and 'thumbsdown'.

    Returns
    -------
    float or int
        thumbsup / (thumbsup + thumbsdown). Falls back to 0 when the
        votes are missing, cannot be converted to float, sum to zero,
        or when the ratio is NaN.
    """
    try:
        up_votes = float(x['thumbsup'])
        down_votes = float(x['thumbsdown'])
        ratio = up_votes / (up_votes + down_votes)
    except Exception:
        # Best-effort fallback (missing key, bad value, zero total votes),
        # but no longer swallows KeyboardInterrupt/SystemExit.
        return 0
    # NaN inputs do not raise; NaN is the only float not equal to itself.
    if ratio != ratio:
        return 0
    return ratio
    
def thumbssum(x):
    """Return the total number of votes (up + down) for one review row.

    Parameters
    ----------
    x : mapping-like row exposing 'thumbsup' and 'thumbsdown'.

    Returns
    -------
    int
        int(thumbsup) + int(thumbsdown), or 0 when either value is
        missing or cannot be converted to int (NaN included, since
        int(nan) raises).
    """
    try:
        up_votes = int(x['thumbsup'])
        down_votes = int(x['thumbsdown'])
    except Exception:
        # Best-effort fallback; `Exception` instead of a bare except so
        # KeyboardInterrupt/SystemExit still propagate.
        return 0
    return up_votes + down_votes
    
def fillnanup(x):
    """Return the row's 'thumbsup' value as a float, with missing values as 0.

    Parameters
    ----------
    x : mapping-like row exposing 'thumbsup'.

    Returns
    -------
    float or int
        float(thumbsup), or 0 when the value is missing, NaN, or cannot
        be converted to float.
    """
    try:
        votes = float(x['thumbsup'])
    except Exception:
        # Best-effort fallback without swallowing KeyboardInterrupt/SystemExit.
        return 0
    # float(nan) does not raise, so NaN has to be replaced explicitly.
    if votes != votes:
        return 0
    return votes
    
def fillnandown(x):
    """Return the row's 'thumbsdown' value as a float, with missing values as 0.

    Parameters
    ----------
    x : mapping-like row exposing 'thumbsdown'.

    Returns
    -------
    float or int
        float(thumbsdown), or 0 when the value is missing, NaN, or
        cannot be converted to float.
    """
    try:
        votes = float(x['thumbsdown'])
    except Exception:
        # Best-effort fallback without swallowing KeyboardInterrupt/SystemExit.
        return 0
    # float(nan) does not raise, so NaN has to be replaced explicitly.
    if votes != votes:
        return 0
    return votes

def length(x):
    """Approximate a review's length as word count times mean word length.

    Parameters
    ----------
    x : mapping-like row exposing 'word_count' and 'mean_word_length'.

    Returns
    -------
    int
        word_count * mean_word_length rounded to the nearest integer,
        or 0 when either value is missing, NaN, or not numeric.
    """
    try:
        # Multiply before converting to int: truncating the mean word
        # length first (4.7 -> 4) would systematically underestimate.
        product = float(x['word_count']) * float(x['mean_word_length'])
        return int(round(product))
    except Exception:
        # Best-effort fallback (missing key, bad value, NaN/inf product).
        return 0
    
def sentiment(x):
    """Label a review row by its star rating.

    A 'stars' value equal to 5 gives 'favorable', a value equal to 1
    gives 'unfavorable', and anything else gives 'mixed'.
    """
    rating = x['stars']
    if rating == 5:
        label = 'favorable'
    elif rating == 1:
        label = 'unfavorable'
    else:
        label = 'mixed'
    return label
    
# Derive the row-wise columns in order: the vote counts are cleaned first
# because the later helpers read them.
for target_column, row_function in (
    ('thumbsup', fillnanup),
    ('thumbsdown', fillnandown),
    ('helpfulness', helpf),
    ('thumbstotal', thumbssum),
    ('length', length),
):
    reviews_features[target_column] = reviews_features.apply(row_function, axis=1)

# Keep only reviews with more than 4 votes, a positive star rating and
# more than 9 words.
keep_row = (
    (reviews_features.thumbstotal > 4)
    & (reviews_features.stars > 0)
    & (reviews_features.word_count > 9)
)
reviews_features = reviews_features[keep_row]

## for use in Weka
#reviews_features.to_csv('amazon-help.csv',encoding='utf-8')

In [54]:
def mean_data(df):
    """Build the 'mean' column of the descriptive table for `df`.

    Returns a list holding the mean of thumbstotal, thumbsup, thumbsdown,
    stars, word_count and helpfulness (in that order), followed by the
    number of rows in `df`.
    """
    averaged_columns = (
        'thumbstotal',
        'thumbsup',
        'thumbsdown',
        'stars',
        'word_count',
        'helpfulness',
    )
    mean_stats = [df[name].mean() for name in averaged_columns]
    # The last slot carries the sample size rather than a mean.
    mean_stats.append(len(df))
    return mean_stats

def std_data(df):
    """Build the 'std' column of the descriptive table for `df`.

    Returns a list holding the standard deviation (pandas `Series.std`)
    of thumbstotal, thumbsup, thumbsdown, stars, word_count and
    helpfulness (in that order), followed by the number of rows in `df`.
    """
    measured_columns = (
        'thumbstotal',
        'thumbsup',
        'thumbsdown',
        'stars',
        'word_count',
        'helpfulness',
    )
    std_stats = [df[name].std() for name in measured_columns]
    # The last slot carries the sample size rather than a deviation.
    std_stats.append(len(df))
    return std_stats

# Row labels of the descriptive table, in the order mean_data/std_data emit values.
columns = ['TOT', 'UP', 'DOWN', 'RAT', 'DEP', 'HEL', 'LEN']

# full dataset
stats = pd.DataFrame(
    data={'mean_full': np.array(mean_data(reviews_features))},
    index=columns,
).assign(std_full=std_data(reviews_features))

In [55]:
# Low-DEP subset: reviews whose word count is at or below the median.
low_reviews = reviews_features.loc[
    reviews_features['word_count'] <= reviews_features['word_count'].median()
]

In [56]:
# High-DEP subset: reviews whose word count is strictly above the median.
high_reviews = reviews_features.loc[
    reviews_features['word_count'] > reviews_features['word_count'].median()
]

In [57]:
# Append the low/high split statistics as extra columns, in display order.
for column_name, column_values in (
    ('mean_low', mean_data(low_reviews)),
    ('std_low', std_data(low_reviews)),
    ('mean_high', mean_data(high_reviews)),
    ('std_high', std_data(high_reviews)),
):
    stats[column_name] = column_values

## Descriptive Table Buscape Reviews

In [51]:
# Display the descriptive table built in the cells above.
stats

Unnamed: 0,mean_full,std_full,mean_low,std_low,mean_high,std_high
TOT,16.565078,21.320393,13.656141,15.369274,19.509027,25.659157
UP,12.040635,17.203314,8.772401,11.310367,15.348205,21.078742
DOWN,4.524443,7.183114,4.88374,7.029224,4.160822,7.31836
RAT,3.619018,1.263387,3.866926,1.198905,3.368126,1.277495
DEP,77.9737,69.09881,33.167111,13.909446,123.319568,72.959362
HEL,0.719318,0.261293,0.653463,0.28621,0.785965,0.213656
LEN,9696.0,9696.0,4877.0,4877.0,4819.0,4819.0


## Descriptive Table Amazon Reviews

In [58]:
# Display the descriptive table built in the cells above.
# NOTE(review): the output below presumably comes from re-running the notebook
# with the Amazon input file -- confirm.
stats

Unnamed: 0,mean_full,std_full,mean_low,std_low,mean_high,std_high
TOT,19.375397,47.207289,12.897547,14.991762,25.894506,64.502898
UP,8.642791,23.120956,5.31039,7.019381,11.996417,31.632618
DOWN,10.732606,24.311323,7.587157,8.693276,13.898089,33.013483
RAT,3.281415,1.59733,3.169831,1.679961,3.39371,1.501376
DEP,159.292262,186.368676,55.663107,25.191845,263.581476,217.640171
HEL,0.42908,0.107987,0.40952,0.124308,0.448766,0.084112
LEN,15120.0,15120.0,7584.0,7584.0,7536.0,7536.0


## Correlations

In [65]:
def corr_table(df):
    """Correlate every numeric feature with helpfulness, thumbsup and thumbsdown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'helpfulness', 'thumbsup' and
        'thumbsdown' with a selected numeric dtype (a KeyError is raised
        otherwise, as before).

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with the columns pearson_helpfull,
        spearmanr_helpfull, pearson_up, spearmanr_up, pearson_down and
        spearmanr_down (in that order).
    """
    # Feature columns: the selected numeric dtypes minus the three targets.
    # NOTE(review): 'int' may resolve to the platform default integer, so
    # integer columns of another width could be skipped -- confirm.
    df_columns = (
        df.select_dtypes(include=['float64', 'int'])
        .columns
        .drop(['helpfulness', 'thumbsup', 'thumbsdown'])
    )

    # (output column, target column, correlation method), in output order.
    layout = [
        ('pearson_helpfull', 'helpfulness', 'pearson'),
        ('spearmanr_helpfull', 'helpfulness', 'spearman'),
        ('pearson_up', 'thumbsup', 'pearson'),
        ('spearmanr_up', 'thumbsup', 'spearman'),
        ('pearson_down', 'thumbsdown', 'pearson'),
        ('spearmanr_down', 'thumbsdown', 'spearman'),
    ]

    corr_df = pd.DataFrame(index=df_columns)
    for output_name, target_name, method in layout:
        target = df[target_name]
        # Series.corr drops NaN pairs for both methods, so Pearson and
        # Spearman are computed on the same observations; calling
        # scipy's spearmanr directly would propagate NaN instead.
        corr_df[output_name] = [
            target.corr(df[feature], method=method) for feature in df_columns
        ]
    return corr_df

### Top 10 full dataset correlation

In [66]:
# Ten features with the highest Pearson correlation to helpfulness (filtered full dataset).
corr_table(reviews_features).sort_values('pearson_helpfull',ascending=False).head(10)

Unnamed: 0,pearson_helpfull,spearmanr_helpfull,pearson_up,spearmanr_up,pearson_down,spearmanr_down
sentence_count,0.233848,0.272718,0.190857,0.287981,-0.035178,-0.134337
syllable_count,0.232391,0.296291,0.194409,0.323703,-0.039854,-0.141024
functionalIncidence,0.110437,0.06883,0.084086,0.084159,-0.018186,-0.029111
percentile_90_sentence_length,0.098786,0.13766,0.077552,0.158846,-0.021252,-0.062589
percentile_90_word_length,0.075885,0.088339,0.028634,0.074606,-0.026484,-0.055376
percentile_75_sentence_length,0.060591,0.090048,0.046828,0.109611,-0.013834,-0.040199
ConnectiveTemporalIncidence,0.051017,0.098355,0.01362,0.086583,-0.017233,-0.057675
percentile_75_word_length,0.045398,0.058189,0.026592,0.054365,-0.009669,-0.036048
ConnectiveCasualIncidence,0.042612,0.09068,0.010783,0.07989,-0.014929,-0.053222
LogicOperatorsIncidence,0.030953,0.116469,0.006677,0.103053,-0.009675,-0.068792


### Top 10 low DEP dataset correlation

In [67]:
# Ten features with the highest Pearson correlation to helpfulness (low-DEP subset).
corr_table(low_reviews).sort_values('pearson_helpfull',ascending=False).head(10)

Unnamed: 0,pearson_helpfull,spearmanr_helpfull,pearson_up,spearmanr_up,pearson_down,spearmanr_down
syllable_count,0.185388,0.149697,0.154238,0.223266,0.067121,0.091218
stars,0.18008,0.196163,-0.000423,0.041512,-0.081649,-0.120368
functionalIncidence,0.115534,0.077617,0.021609,0.030997,-0.028293,-0.03979
percentile_25_word_length,0.083768,0.071708,0.044956,0.078699,0.009883,0.013252
ConnectiveAdditiveIncidence,0.077189,0.08921,0.01119,0.046811,-0.03172,-0.037591
adjectiveIncidence,0.071355,0.077307,-0.023111,0.014845,-0.058455,-0.070442
LogicAndIncidence,0.067295,0.080492,0.004441,0.042763,-0.033654,-0.033196
ConnectiveLogicIncidence,0.049975,0.049391,0.014903,0.044792,-0.014295,-0.003261
ConnectiveIncidence,0.047431,0.045475,-0.014266,0.011321,-0.047313,-0.042335
ConnectiveTemporalIncidence,0.034641,0.050578,-0.000866,0.044414,-0.020823,-0.010164


### Top 10 high DEP dataset correlation

In [68]:
# Ten features with the highest Pearson correlation to helpfulness (high-DEP subset).
corr_table(high_reviews).sort_values('pearson_helpfull',ascending=False).head(10)

Unnamed: 0,pearson_helpfull,spearmanr_helpfull,pearson_up,spearmanr_up,pearson_down,spearmanr_down
stars,0.350135,0.346877,0.065115,0.147193,0.030787,0.002196
syllable_count,0.111279,0.083781,0.256442,0.288677,0.251299,0.241433
adjectiveIncidence,0.083698,0.078417,0.008842,0.046462,0.001826,0.011216
percentile_25_word_length,0.072519,0.045733,0.007891,0.042583,-0.001622,0.009744
functionalIncidence,0.065833,0.047813,-0.032081,-0.098412,-0.041994,-0.122943
percentile_75_word_length,0.047864,0.037707,0.045441,0.068751,0.042438,0.049195
mean_word_length,0.045986,0.032428,0.068299,0.082658,0.066559,0.065293
ConnectiveAdditiveIncidence,0.04396,0.047231,-0.017295,-0.002807,-0.023872,-0.024964
LogicAndIncidence,0.031469,0.036221,-0.020414,-0.009616,-0.025273,-0.026873
avg_syllables_per_word,0.027475,0.019646,0.062945,0.071578,0.062698,0.06092
