# Descriptive Statistics
### Based on Chua, Alton Y.K. and Banerjee, Snehasish (2015)
#### http://onlinelibrary.wiley.com/doi/10.1002/asi.23180/abstract

In [26]:
import sys
sys.path.append("../pylinguistics/pylinguistics/")

import Pylinguistics as pl
import pandas as pd
import plotsfunc as pf
import numpy as np
from scipy.stats import spearmanr
%matplotlib inline

# Load the gzip-compressed CSV of reviews into a DataFrame; every later cell
# in this notebook reads from (or derives columns on) `reviews_features`.
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')

In [45]:
## add the helpfulness column to the corpus
def helpf(x):
    """Return the helpfulness ratio ``thumbsup / (thumbsup + thumbsdown)`` of a row.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'thumbsup'`` and ``'thumbsdown'``.

    Returns
    -------
    float or int
        The ratio as a float, or ``0`` when it cannot be computed: a vote
        count is not convertible to ``float``, a count is NaN, or the row
        has no votes at all.

    Raises
    ------
    KeyError
        If one of the two vote keys is missing. A missing column is a schema
        problem and is no longer silently turned into ``0``.
    """
    try:
        up_votes = float(x['thumbsup'])
        down_votes = float(x['thumbsdown'])
    except (TypeError, ValueError):
        # Unparseable vote counts: keep the original "best effort" fallback.
        return 0

    total_votes = up_votes + down_votes
    # `total_votes != total_votes` is true only for NaN; a zero total would
    # otherwise divide by zero.
    if total_votes != total_votes or total_votes == 0:
        return 0
    return up_votes / total_votes
    
def thumbssum(x):
    """Return the total number of votes (up + down) of a row as an ``int``.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'thumbsup'`` and ``'thumbsdown'``.

    Returns
    -------
    int
        ``int(thumbsup) + int(thumbsdown)``, or ``0`` when either count
        cannot be converted to ``int`` (``None``, NaN, infinity, or a
        non-integer string).

    Raises
    ------
    KeyError
        If one of the two vote keys is missing. A missing column is a schema
        problem and is no longer silently turned into ``0``.
    """
    try:
        return int(x['thumbsup']) + int(x['thumbsdown'])
    except (TypeError, ValueError, OverflowError):
        # TypeError: None; ValueError: NaN / bad string; OverflowError: infinity.
        return 0
    
def fillnanup(x):
    """Return the row's ``'thumbsup'`` count as a float, with ``0`` for missing values.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the key ``'thumbsup'``.

    Returns
    -------
    float or int
        ``float(x['thumbsup'])``, or ``0`` when the value is NaN or cannot
        be converted to ``float``.

    Raises
    ------
    KeyError
        If ``'thumbsup'`` is missing from the row.
    """
    try:
        up_votes = float(x['thumbsup'])
    except (TypeError, ValueError):
        return 0
    # float(nan) succeeds without raising, so NaN has to be filled explicitly;
    # NaN is the only float that is not equal to itself.
    if up_votes != up_votes:
        return 0
    return up_votes
    
def fillnandown(x):
    """Return the row's ``'thumbsdown'`` count as a float, with ``0`` for missing values.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the key ``'thumbsdown'``.

    Returns
    -------
    float or int
        ``float(x['thumbsdown'])``, or ``0`` when the value is NaN or cannot
        be converted to ``float``.

    Raises
    ------
    KeyError
        If ``'thumbsdown'`` is missing from the row.
    """
    try:
        down_votes = float(x['thumbsdown'])
    except (TypeError, ValueError):
        return 0
    # float(nan) succeeds without raising, so NaN has to be filled explicitly;
    # NaN is the only float that is not equal to itself.
    if down_votes != down_votes:
        return 0
    return down_votes
    
# Row-wise derivation of the vote columns. Order matters: the two vote
# columns are cleaned first so that `helpfulness` and `thumbstotal` are
# computed from the cleaned values.
for column_name, row_function in (
    ('thumbsup', fillnanup),
    ('thumbsdown', fillnandown),
    ('helpfulness', helpf),
    ('thumbstotal', thumbssum),
):
    reviews_features[column_name] = reviews_features.apply(row_function, axis=1)

In [36]:
def mean_data(df):
    """Return the column means used in the descriptive table.

    The result is a four-element list ordered as total votes
    (``thumbstotal``), rating (``stars``), review depth (``word_count``)
    and ``helpfulness``.
    """
    ordered_columns = ('thumbstotal', 'stars', 'word_count', 'helpfulness')
    return [df[name].mean() for name in ordered_columns]

def std_data(df):
    """Return the column standard deviations used in the descriptive table.

    Uses pandas' ``Series.std()`` with its default arguments. The result is
    a four-element list ordered as ``thumbstotal``, ``stars``,
    ``word_count`` and ``helpfulness``.
    """
    return [
        df[name].std()
        for name in ('thumbstotal', 'stars', 'word_count', 'helpfulness')
    ]

# Row labels of the descriptive table, in the order mean_data()/std_data()
# return their values: total votes, rating, depth (word count), helpfulness.
columns = ['TOT', 'RAT', 'DEP', 'HEL']

# Descriptive table, starting with the statistics of the full dataset.
stats = pd.DataFrame(
    {
        'mean_full': np.array(mean_data(reviews_features)),
        'std_full': std_data(reviews_features),
    },
    index=columns,
    columns=['mean_full', 'std_full'],
)

In [46]:
# Full dataset: (n_rows, n_columns) of reviews_features
reviews_features.shape

(32226, 49)

In [38]:
# Low-DEP subset: reviews whose word count is at or below the median
low_reviews = reviews_features.loc[
    reviews_features['word_count'].le(reviews_features['word_count'].median())
]
low_reviews.shape

(16271, 49)

In [39]:
# High-DEP subset: reviews whose word count is strictly above the median
high_reviews = reviews_features.loc[
    reviews_features['word_count'].gt(reviews_features['word_count'].median())
]
high_reviews.shape

(15955, 49)

In [44]:
# Append mean/std columns for the low- and high-DEP subsets, in that order.
for subset_label, subset_frame in (('low', low_reviews), ('high', high_reviews)):
    stats['mean_' + subset_label] = mean_data(subset_frame)
    stats['std_' + subset_label] = std_data(subset_frame)

## Descriptive Table

In [43]:
# Display the descriptive table: rows TOT/RAT/DEP/HEL, mean and std columns
# for the full, low-DEP and high-DEP datasets.
stats

Unnamed: 0,mean_full,std_full,mean_low,std_low,mean_high,std_high
TOT,6.187302,13.557119,4.010325,7.726384,8.407396,17.337848
RAT,3.823951,1.188895,4.09638,1.055879,3.546161,1.251068
DEP,59.670049,57.907309,23.922746,10.535257,96.125353,63.463516
HEL,0.729849,0.377087,0.67641,0.414662,0.78434,0.325555


## Correlations

In [62]:
def corr_table(df):
    """Correlate every numeric feature with helpfulness, thumbs-up and thumbs-down.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``helpfulness``, ``thumbsup`` and
        ``thumbsdown``. Every other ``float64``/``int`` column is treated as
        a feature.

    Returns
    -------
    pandas.DataFrame
        One row per feature column and six columns, in this order:
        ``pearson_helpfull``, ``spearmanr_helpfull``, ``pearson_up``,
        ``spearmanr_up``, ``pearson_down``, ``spearmanr_down``.

    Notes
    -----
    Both coefficients are computed with ``Series.corr``, which drops pairs
    containing a missing value before computing. Pearson and Spearman are
    therefore computed on the same observations, and a single NaN in a
    feature no longer turns its Spearman coefficient into NaN.
    """
    # (output-column suffix, target column) in output order
    targets = (
        ('helpfull', 'helpfulness'),
        ('up', 'thumbsup'),
        ('down', 'thumbsdown'),
    )
    # (output-column prefix, Series.corr method) in output order
    methods = (('pearson', 'pearson'), ('spearmanr', 'spearman'))

    # Feature columns: numeric columns minus the three targets themselves.
    feature_columns = (
        df.select_dtypes(include=['float64', 'int'])
        .columns
        .drop([target for _, target in targets])
    )

    labels = []
    coefficients = {}
    for suffix, target in targets:
        for prefix, method in methods:
            label = prefix + '_' + suffix
            labels.append(label)
            coefficients[label] = [
                df[target].corr(df[feature], method=method)
                for feature in feature_columns
            ]

    # `columns=labels` pins the column order regardless of dict ordering.
    return pd.DataFrame(coefficients, index=feature_columns, columns=labels, dtype=float)

### Top 10 full dataset correlation

In [63]:
# Ten features with the highest Pearson correlation to helpfulness (full dataset)
(
    corr_table(reviews_features)
    .sort_values(by='pearson_helpfull', ascending=False)
    .iloc[:10]
)

Unnamed: 0,pearson_helpfull,spearmanr_helpfull,pearson_up,spearmanr_up,pearson_down,spearmanr_down
sentence_count,0.141733,0.10546,0.206188,0.285767,0.04684,-0.002055
syllable_count,0.134348,0.098542,0.218057,0.33094,0.053729,0.020001
functionalIncidence,0.067938,0.027364,0.072279,0.091068,0.016081,0.006552
percentile_90_sentence_length,0.050461,0.042376,0.097291,0.195694,0.03193,0.029446
ConnectiveTemporalIncidence,0.030754,0.039276,0.013497,0.111303,-0.004567,0.001193
percentile_90_word_length,0.0292,0.062258,0.008188,0.074663,-0.006464,-0.03743
percentile_75_word_length,0.027247,0.037398,0.010063,0.028726,-0.010924,-0.02969
percentile_75_sentence_length,0.026271,0.024356,0.06527,0.147194,0.025491,0.029419
ConnectiveCasualIncidence,0.023214,0.034393,0.011033,0.10716,-0.003066,0.004731
adjectiveIncidence,0.022691,0.064751,-0.053766,-0.024516,-0.043407,-0.073442


### Top 10 low DEP dataset correlation

In [64]:
# Ten features with the highest Pearson correlation to helpfulness (low-DEP subset)
(
    corr_table(low_reviews)
    .sort_values(by='pearson_helpfull', ascending=False)
    .iloc[:10]
)

Unnamed: 0,pearson_helpfull,spearmanr_helpfull,pearson_up,spearmanr_up,pearson_down,spearmanr_down
syllable_count,0.141626,0.112717,0.147265,0.210492,-0.002371,-0.044831
sentence_count,0.11857,0.105242,0.104107,0.146778,-0.022372,-0.06181
adjectiveIncidence,0.070917,0.097127,-0.002002,0.052284,-0.043061,-0.085521
stars,0.053487,0.033619,-0.051734,-0.035768,-0.055837,-0.046388
percentile_90_sentence_length,0.047815,0.040658,0.07319,0.108029,0.018485,-0.000371
ConnectiveTemporalIncidence,0.033895,0.045813,0.004598,0.039315,-0.017077,-0.03081
percentile_75_sentence_length,0.031798,0.026749,0.059959,0.088105,0.021609,0.006944
ConnectiveCasualIncidence,0.027588,0.039794,0.003578,0.037134,-0.015618,-0.025013
functionalIncidence,0.024906,0.012504,0.008115,0.011908,-0.004315,-0.008104
percentile_90_word_length,0.023993,0.06908,0.004198,0.062537,-0.004716,-0.055347


### Top 10 high DEP dataset correlation

In [65]:
# Ten features with the highest Pearson correlation to helpfulness (high-DEP subset)
(
    corr_table(high_reviews)
    .sort_values(by='pearson_helpfull', ascending=False)
    .iloc[:10]
)

Unnamed: 0,pearson_helpfull,spearmanr_helpfull,pearson_up,spearmanr_up,pearson_down,spearmanr_down
sentence_count,0.08983,0.050144,0.133964,0.174297,0.046498,0.015717
ContentDiversty,0.084987,0.055985,-0.017535,-0.047876,-0.044519,-0.068071
avg_syllables_per_word,0.078178,0.059042,0.001261,0.029503,-0.03135,-0.046681
syllable_count,0.063939,0.022452,0.132193,0.183437,0.047304,0.040973
LexicalDiversty,0.062494,0.047847,-0.049236,-0.080315,-0.053996,-0.071041
percentile_90_word_length,0.058238,0.042701,0.013736,0.043441,-0.025603,-0.025877
percentile_75_word_length,0.049325,0.04194,0.014359,0.025177,-0.015301,-0.031352
adjectiveIncidence,0.048812,0.057356,-0.009556,0.025523,-0.028984,-0.046346
stars,0.04563,0.06298,-0.05195,-0.073512,-0.056922,-0.081744
mean_word_length,0.039573,0.035735,-0.022738,-0.013176,-0.033937,-0.041731
