# Descriptive Statistics
### base on Chua, Alton Y.K. and Banerjee, Snehasish 2016
#### http://www.sciencedirect.com/science/article/pii/S074756321530131X

In [19]:
import sys
sys.path.append("../pylinguistics/pylinguistics/")

import Pylinguistics as pl
import pandas as pd
import plotsfunc as pf
import numpy as np
%matplotlib inline

In [20]:
## adiciona coluna de helpfulness no corpus
def helpf(x): 
    try:
        return float ( float(x['thumbsup']) / (float(x['thumbsup']) + float(x['thumbsdown'])) )
    except:
        return 0
    
def thumbssum(x): 
    try:
        return int(x['thumbsup']) + int(x['thumbsdown']) 
    except:
        return 0
    
def fillnanup(x): 
    try:
        return float(x['thumbsup'])
    except:
        return 0
    
def fillnandown(x): 
    try:
        return float(x['thumbsdown'])
    except:
        return 0
    
def fillnanstarts(x): 
    try:
        return float(x['stars'])
    except:
        return 0
    
def length(x): 
    try:
        return int(x['word_count']) * int(x['mean_word_length'])
    except:
        return 0
    
def sentiment(x): 
    if x['stars'] == 5:
        return 'favorable'
    if x['stars'] == 1:
        return 'unfavorable'
    else:
        return 'mixed'

In [21]:
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
#reviews_features = pd.read_csv('amazon-help.csv.gz')

reviews_features['thumbsup'] = reviews_features.apply(fillnanup,axis=1)
reviews_features['thumbsdown'] = reviews_features.apply(fillnandown,axis=1)
reviews_features['helpfulness'] = reviews_features.apply(helpf,axis=1)
reviews_features['thumbstotal'] = reviews_features.apply(thumbssum,axis=1)
reviews_features['length'] = reviews_features.apply(length,axis=1)
reviews_features['stars'] = reviews_features.apply(fillnanstarts,axis=1)
reviews_features = reviews_features[reviews_features.thumbstotal>4]
reviews_features = reviews_features[reviews_features.word_count>4]
#reviews_features = reviews_features[~reviews_features.applymap(np.isnan).all(1)]
## para usar no weka
#reviews_features['sentiment'] = reviews_features.apply(sentiment,axis=1)
#reviews_features.to_csv('buscape-help.csv',encoding='utf-8')
reviews_features.shape

(9633, 50)

In [22]:
list_experience_categories = [ 'Livros' , 'Perfume', 'Roupas', 'Jogos', 'Bonecas', 'Chapinha / Prancha', 'Esteira'
                              , 'Cadeira para Auto']
experience_products = reviews_features[reviews_features.category.isin(list_experience_categories)]
experience_products.shape

(294, 50)

In [23]:
list_search_categories = [ 'Celular e Smartphone' , 'TV', 'Camera Digital', 'Maquina de Lavar Roupas'
                          , 'Geladeira / Refrigerador', 'Ar Condicionado', 'Tablet', 'Notebook'
                          ,'Console de Videogame', 'Impressora', 'Fogao', 'Microondas'
                          , 'Aparelho de Telefone', 'MP3 Player / MP4 Player', 'Aquecedor de Ambiente'
                          , 'Forno Eletrico']
search_products = reviews_features[reviews_features.category.isin(list_search_categories)]
search_products.shape

(7574, 50)

### 1.1 Categorias mais comentadas

In [24]:
df_agg = reviews_features[['word_count','category']].groupby(['category']).agg(['count','mean'])
df_agg['word_count'].sort_values('count',ascending=False).head(50)

Unnamed: 0_level_0,count,mean
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Celular e Smartphone,2051,85.900049
TV,1712,81.258762
Camera Digital,716,76.604749
Maquina de Lavar Roupas,488,94.383197
Geladeira / Refrigerador,413,75.307506
Ar Condicionado,370,77.081081
Tablet,363,82.663912
Notebook,278,84.913669
Console de Videogame,231,83.017316
Impressora,209,75.215311


In [25]:
def mean_data(df):
    mean_stats = []
    mean_stats.append(df.thumbstotal.mean())
    mean_stats.append(df.thumbsup.mean())
    mean_stats.append(df.thumbsdown.mean())
    mean_stats.append(df.stars.mean())
    mean_stats.append(df.word_count.mean())
    mean_stats.append(df.helpfulness.mean())
    mean_stats.append(len(df))
    return mean_stats

def std_data(df):
    std_stats = []
    std_stats.append(df.thumbstotal.std())
    std_stats.append(df.thumbsup.std())
    std_stats.append(df.thumbsdown.std())
    std_stats.append(df.stars.std())
    std_stats.append(df.word_count.std())
    std_stats.append(df.helpfulness.std())
    std_stats.append(len(df))
    return std_stats

columns = ['TOT','UP','DOWN','RAT','DEP','HEL','LEN']

In [26]:
reviews_features = reviews_features[reviews_features.category.isin(list_search_categories)]
# split mixed subset
mixed_reviews = reviews_features[reviews_features.stars > 1]
mixed_reviews = mixed_reviews[reviews_features.stars < 5]



In [27]:
# split favorable subset
favorable_reviews = reviews_features[reviews_features.stars == 5]

In [28]:
# split unfavorable subset
unfavorable_reviews = reviews_features[reviews_features.stars == 1]

In [29]:
stats = pd.DataFrame(data={'mean_favorable': np.array(mean_data(favorable_reviews))}, index=columns)
stats['std_favorable'] = std_data(favorable_reviews)
stats['mean_unfavorable'] = mean_data(unfavorable_reviews)
stats['std_unfavorable'] = std_data(unfavorable_reviews)
stats['mean_mixed'] = mean_data(mixed_reviews)
stats['std_mixed'] = std_data(mixed_reviews)

## Descriptive Table Buscape Reviews
### statistics (mean ± sd) of reviews as a function of review sentiment

In [30]:
stats

Unnamed: 0,mean_favorable,std_favorable,mean_unfavorable,std_unfavorable,mean_mixed,std_mixed
TOT,14.976558,16.723113,21.551935,35.134535,17.81347,22.916652
UP,9.721555,12.413762,14.767821,26.453561,13.538043,19.058364
DOWN,5.255003,7.486841,6.784114,12.277665,4.275427,6.720567
RAT,5.0,0.0,1.0,0.0,3.510093,0.699096
DEP,59.916524,59.744181,95.026477,71.232283,87.344332,73.50385
HEL,0.642369,0.285214,0.669592,0.291098,0.748274,0.241408
LEN,1749.0,1749.0,491.0,491.0,5152.0,5152.0


## Results of the multiple regression analyses

In [31]:
from scipy.stats import spearmanr, f_oneway
from sklearn.svm import SVR, LinearSVR
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import make_scorer

def simple_spearman(x,y): return np.abs(spearmanr(x,y)[0])
spearmanr_scorer = make_scorer(simple_spearman)

In [32]:
# create new dict for correlation
mixed_list = []
mixed_list_p = []
favorable_list = []
favorable_list_p = []
unfavorable_list = []
unfavorable_list_p = []
columns_pd = []

#svm model
model = LinearSVR(epsilon=0.01)

# build columns list
df_columns = mixed_reviews.select_dtypes(include=['float64','int']).columns.drop('helpfulness')
df_columns

Index([u'Unnamed: 0', u'ConnectiveAdditiveIncidence', u'sentence_sized_30',
       u'redability', u'mean_sentence_length',
       u'percentile_75_sentence_length', u'percentile_75_word_length',
       u'word_count', u'ConnectiveLogicIncidence',
       u'percentile_90_word_length', u'mean_word_length', u'syllable_count',
       u'avg_word_per_sentence', u'LogicIfIncidence',
       u'percentile_50_sentence_length', u'median_sentence_length',
       u'LogicAndIncidence', u'percentile_90_sentence_length',
       u'median_word_length', u'pronIncidence', u'LogicOperatorsIncidence',
       u'percentile_25_word_length', u'ContentDiversty', u'verbIncidence',
       u'functionalIncidence', u'adjectiveIncidence', u'ConnectiveIncidence',
       u'contentIncidence', u'LogicNegationIncidence',
       u'avg_syllables_per_word', u'ConnectiveTemporalIncidence',
       u'sentence_count', u'ConnectiveCasualIncidence', u'advIncidence',
       u'LexicalDiversty', u'adpPronRatio', u'thumbsup', u'thumbsdown'

In [33]:
#compute correlation
for i in df_columns:
    #compute mixed score
    labels = mixed_reviews["helpfulness"].values
    features = mixed_reviews[list([i])].values
    features_flat = mixed_reviews[i].values
    scores = cross_val_score(model, features, labels, cv=5, scoring=spearmanr_scorer)
    f_val,p_val = f_oneway(features_flat, labels)
    mixed_list_p.append(p_val)
    mixed_list.append(scores.mean())
    #compute favorable score
    labels = favorable_reviews["helpfulness"].values
    features = favorable_reviews[list([i])].values
    features_flat = favorable_reviews[i].values
    scores = cross_val_score(model, features, labels, cv=5, scoring=spearmanr_scorer)
    f_val,p_val = f_oneway(features_flat, labels)
    favorable_list_p.append(p_val)
    favorable_list.append(scores.mean())
    #compute unfavorable score
    labels = unfavorable_reviews["helpfulness"].values
    features = unfavorable_reviews[list([i])].values
    features_flat = unfavorable_reviews[i].values
    scores = cross_val_score(model, features, labels, cv=5, scoring=spearmanr_scorer)
    f_val,p_val = f_oneway(features_flat, labels)
    unfavorable_list_p.append(p_val)
    unfavorable_list.append(scores.mean())
    #print status
    columns_pd.append(i);
    print i
    #if len(columns_pd) > 3:
    #    break

Unnamed: 0
ConnectiveAdditiveIncidence
sentence_sized_30
redability
mean_sentence_length
percentile_75_sentence_length
percentile_75_word_length
word_count
ConnectiveLogicIncidence
percentile_90_word_length
mean_word_length
syllable_count
avg_word_per_sentence
LogicIfIncidence
percentile_50_sentence_length
median_sentence_length
LogicAndIncidence
percentile_90_sentence_length
median_word_length
pronIncidence
LogicOperatorsIncidence
percentile_25_word_length
ContentDiversty
verbIncidence
functionalIncidence
adjectiveIncidence
ConnectiveIncidence
contentIncidence
LogicNegationIncidence
avg_syllables_per_word
ConnectiveTemporalIncidence
sentence_count
ConnectiveCasualIncidence
advIncidence
LexicalDiversty
adpPronRatio
thumbsup
thumbsdown
stars
thumbstotal
length


In [34]:
corr_df = pd.DataFrame(data={'favorable': np.array(favorable_list)}, index=columns_pd)
corr_df['fav_p'] = favorable_list_p
corr_df['unfavorable'] = unfavorable_list
corr_df['unfav_p'] = unfavorable_list_p
corr_df['mixed'] = mixed_list
corr_df['mix_p'] = mixed_list_p

## Buscape Correlation Features

In [35]:
corr_df.sort_values('favorable',ascending=False)

Unnamed: 0,favorable,fav_p,unfavorable,unfav_p,mixed,mix_p
thumbsdown,0.7858,5.166167999999999e-134,0.759262,9.303524e-27,0.804168,1.013775e-290
thumbsup,0.555349,3.404134e-182,0.449057,3.5131649999999996e-30,0.361896,0.0
syllable_count,0.306545,2.564e-321,0.298297,3.7378139999999997e-134,0.269739,0.0
sentence_count,0.297849,1.278574e-282,0.213132,4.229878e-112,0.247399,0.0
word_count,0.293997,2.523305e-306,0.304526,2.160006e-136,0.261807,0.0
length,0.278223,4.109549e-311,0.289343,9.921931999999999e-134,0.253684,0.0
percentile_25_word_length,0.217549,6.199593e-92,0.035202,1.1440649999999999e-38,0.065217,0.0
LexicalDiversty,0.192725,3.4736679999999996e-63,0.113933,0.1682841,0.130376,2.510263e-13
ContentDiversty,0.150393,1.551026e-143,0.060443,1.289432e-13,0.078587,1.155313e-64
percentile_90_sentence_length,0.147443,0.0,0.181487,3.40313e-115,0.111425,0.0
