# Descriptive Statistics
### base on Chua, Alton Y.K. and Banerjee, Snehasish 2015
#### http://onlinelibrary.wiley.com/doi/10.1002/asi.23180/abstract

In [2]:
import sys
sys.path.append("../pylinguistics/pylinguistics/")

import Pylinguistics as pl
import pandas as pd
import plotsfunc as pf
import numpy as np
from scipy.stats import spearmanr
%matplotlib inline

#reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
reviews_features = pd.read_csv('reviews-amazon.csv')

In [3]:
## adiciona coluna de helpfulness no corpus
def helpf(x): 
    try:
        return float ( float(x['thumbsup']) / (float(x['thumbsup']) + float(x['thumbsdown'])) )
    except:
        return 0
    
def thumbssum(x): 
    try:
        return int(x['thumbsup']) + int(x['thumbsdown']) 
    except:
        return 0
    
def fillnanup(x): 
    try:
        return float(x['thumbsup'])
    except:
        return 0
    
def fillnandown(x): 
    try:
        return float(x['thumbsdown'])
    except:
        return 0
    
reviews_features['thumbsup'] = reviews_features.apply(fillnanup,axis=1)
reviews_features['thumbsdown'] = reviews_features.apply(fillnandown,axis=1)
reviews_features['helpfulness'] = reviews_features.apply(helpf,axis=1)
reviews_features['thumbstotal'] = reviews_features.apply(thumbssum,axis=1)

In [4]:
def mean_data(df):
    mean_stats = []
    mean_stats.append(df.thumbstotal.mean())
    mean_stats.append(df.stars.mean())
    mean_stats.append(df.word_count.mean())
    mean_stats.append(df.helpfulness.mean())
    mean_stats.append(len(df))
    return mean_stats

def std_data(df):
    std_stats = []
    std_stats.append(df.thumbstotal.std())
    std_stats.append(df.stars.std())
    std_stats.append(df.word_count.std())
    std_stats.append(df.helpfulness.std())
    std_stats.append(len(df))
    return std_stats

columns = ['TOT','RAT','DEP','HEL','LEN']

# full dataset
stats = pd.DataFrame(data={'mean_full': np.array(mean_data(reviews_features))}, index=columns)
stats['std_full'] = std_data(reviews_features)

In [5]:
# split low DEP dataset
low_reviews = reviews_features[reviews_features.word_count <= reviews_features.word_count.median()]

In [6]:
# split high DEP dataset
high_reviews = reviews_features[reviews_features.word_count > reviews_features.word_count.median()]

In [7]:
stats['mean_low'] = mean_data(low_reviews)
stats['std_low'] = std_data(low_reviews)
stats['mean_high'] = mean_data(high_reviews)
stats['std_high'] = std_data(high_reviews)

## Descriptive Table Buscape Reviews

In [20]:
stats

Unnamed: 0,mean_full,std_full,mean_low,std_low,mean_high,std_high
TOT,6.187302,13.557119,4.010325,7.726384,8.407396,17.337848
RAT,3.823951,1.188895,4.09638,1.055879,3.546161,1.251068
DEP,59.670049,57.907309,23.922746,10.535257,96.125353,63.463516
HEL,0.729849,0.377087,0.67641,0.414662,0.78434,0.325555
LEN,32226.0,32226.0,16271.0,16271.0,15955.0,15955.0


## Descriptive Table Amazon Reviews

In [8]:
stats

Unnamed: 0,mean_full,std_full,mean_low,std_low,mean_high,std_high
TOT,9.741407,32.15269,5.410121,8.017842,14.125733,44.46373
RAT,3.393389,1.578104,3.412961,1.641476,3.373577,1.511041
DEP,117.650124,144.24746,40.880616,16.96697,195.359722,171.971119
HEL,0.403156,0.164908,0.37507,0.187611,0.431586,0.132242
LEN,35001.0,35001.0,17607.0,17607.0,17394.0,17394.0


## Correlations

In [21]:
def corr_table(df):
    # create new dict for correlation
    pearson_helpfull_list = []
    spearmanr_helpfull_list = []
    pearson_up_list = []
    spearmanr_up_list = []
    pearson_down_list = []
    spearmanr_down_list = []
    
    # build columns list
    df_columns = df.select_dtypes(include=['float64','int']).columns.drop('helpfulness').drop('thumbsup').drop('thumbsdown')
    
    #compute correlation
    for i in df_columns:
        pearson_helpfull_list.append(df.helpfulness.corr(df[i]))
        spearmanr_helpfull_list.append(spearmanr(df.helpfulness, df[i])[0])
        pearson_up_list.append(df.thumbsup.corr(df[i]))
        spearmanr_up_list.append(spearmanr(df.thumbsup, df[i])[0])
        pearson_down_list.append(df.thumbsdown.corr(df[i]))
        spearmanr_down_list.append(spearmanr(df.thumbsdown, df[i])[0])

    corr_df = pd.DataFrame(data={'pearson_helpfull': np.array(pearson_helpfull_list)}, index=df_columns)
    corr_df['spearmanr_helpfull'] = spearmanr_helpfull_list
    corr_df['pearson_up'] = pearson_up_list
    corr_df['spearmanr_up'] = spearmanr_up_list
    corr_df['pearson_down'] = pearson_down_list
    corr_df['spearmanr_down'] = spearmanr_down_list
    return corr_df

### Top 10 full dataset correlation

In [22]:
corr_table(reviews_features).sort_values('pearson_helpfull',ascending=False).head(10)

Unnamed: 0,pearson_helpfull,spearmanr_helpfull,pearson_up,spearmanr_up,pearson_down,spearmanr_down
sentence_count,0.141733,,0.206188,,0.04684,
syllable_count,0.134348,,0.218057,,0.053729,
word_count,0.126666,,0.218488,,0.05779,
functionalIncidence,0.067938,,0.072279,,0.016081,
percentile_90_sentence_length,0.050461,,0.097291,,0.03193,
ConnectiveTemporalIncidence,0.030754,,0.013497,,-0.004567,
Unnamed: 0,0.030495,,-0.02312,,-0.04814,
percentile_90_word_length,0.0292,,0.008188,,-0.006464,
percentile_75_word_length,0.027247,,0.010063,,-0.010924,
percentile_75_sentence_length,0.026271,,0.06527,,0.025491,


### Top 10 low DEP dataset correlation

In [23]:
corr_table(low_reviews).sort_values('pearson_helpfull',ascending=False).head(10)

Unnamed: 0,pearson_helpfull,spearmanr_helpfull,pearson_up,spearmanr_up,pearson_down,spearmanr_down
syllable_count,0.141626,,0.147265,,-0.002371,
word_count,0.135052,,0.155602,,0.006513,
sentence_count,0.11857,,0.104107,,-0.022372,
adjectiveIncidence,0.070917,,-0.002002,,-0.043061,
stars,0.053487,,-0.051734,,-0.055837,
percentile_90_sentence_length,0.047815,,0.07319,,0.018485,
Unnamed: 0,0.044402,,-0.011098,,-0.042966,
ConnectiveTemporalIncidence,0.033895,,0.004598,,-0.017077,
percentile_75_sentence_length,0.031798,,0.059959,,0.021609,
ConnectiveCasualIncidence,0.027588,,0.003578,,-0.015618,


### Top 10 high DEP dataset correlation

In [24]:
corr_table(high_reviews).sort_values('pearson_helpfull',ascending=False).head(10)

Unnamed: 0,pearson_helpfull,spearmanr_helpfull,pearson_up,spearmanr_up,pearson_down,spearmanr_down
sentence_count,0.08983,0.050144,0.133964,0.174297,0.046498,0.015717
ContentDiversty,0.084987,0.055985,-0.017535,-0.047876,-0.044519,-0.068071
avg_syllables_per_word,0.078178,0.059042,0.001261,0.029503,-0.03135,-0.046681
syllable_count,0.063939,0.022452,0.132193,0.183437,0.047304,0.040973
LexicalDiversty,0.062494,0.047847,-0.049236,-0.080315,-0.053996,-0.071041
percentile_90_word_length,0.058238,0.042701,0.013736,0.043441,-0.025603,-0.025877
word_count,0.050844,0.009013,0.132179,0.179819,0.052581,0.052764
percentile_75_word_length,0.049325,0.04194,0.014359,0.025177,-0.015301,-0.031352
adjectiveIncidence,0.048812,0.057356,-0.009556,0.025523,-0.028984,-0.046346
stars,0.04563,0.06298,-0.05195,-0.073512,-0.056922,-0.081744
