# Descriptive Statistics
### Based on Chua, Alton Y.K. and Banerjee, Snehasish (2016)
#### http://www.sciencedirect.com/science/article/pii/S074756321530131X

In [86]:
import sys
sys.path.append("../pylinguistics/pylinguistics/")

import Pylinguistics as pl
import pandas as pd
import plotsfunc as pf
import numpy as np
%matplotlib inline

# Load the reviews table from a gzip-compressed CSV; later cells read columns
# such as thumbsup, thumbsdown, stars, word_count and category from it.
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
# Alternative input file, kept disabled:
#reviews_features = pd.read_csv('amazon-help.csv.gz')

In [87]:
## row-wise helpers used to add the helpfulness-related columns to the corpus
def helpf(x):
    """Return the share of positive votes of a review row.

    Reads ``x['thumbsup']`` and ``x['thumbsdown']`` and returns
    ``thumbsup / (thumbsup + thumbsdown)`` as a float.

    Returns 0 when a vote count is missing from the row, cannot be converted
    to a number, or when both counts add up to zero.
    """
    try:
        up_votes = float(x['thumbsup'])
        all_votes = up_votes + float(x['thumbsdown'])
        if all_votes == 0:
            # No votes at all: report 0 instead of dividing by zero.
            return 0
        return up_votes / all_votes
    except Exception:
        # Best-effort fallback for malformed rows.  Unlike a bare ``except``
        # this lets KeyboardInterrupt / SystemExit propagate.
        return 0
    
def thumbssum(x):
    """Return the total number of votes of a review row.

    Adds ``int(x['thumbsup'])`` and ``int(x['thumbsdown'])``.  Returns 0 when
    either value is missing from the row or cannot be converted to an int
    (for example NaN).
    """
    try:
        return int(x['thumbsup']) + int(x['thumbsdown'])
    except Exception:
        # Best-effort fallback for malformed rows.  Unlike a bare ``except``
        # this lets KeyboardInterrupt / SystemExit propagate.
        return 0
    
def fillnanup(x):
    """Return ``x['thumbsup']`` as a float, with 0 for missing or invalid values.

    Returns 0 when the value is NaN, is absent from the row, or cannot be
    converted to a float.
    """
    try:
        up_votes = float(x['thumbsup'])
    except Exception:
        # Best-effort fallback; KeyboardInterrupt / SystemExit still propagate.
        return 0
    # float() converts NaN without raising, so NaN has to be replaced
    # explicitly; NaN is the only float that is not equal to itself.
    if up_votes != up_votes:
        return 0
    return up_votes
    
def fillnandown(x):
    """Return ``x['thumbsdown']`` as a float, with 0 for missing or invalid values.

    Returns 0 when the value is NaN, is absent from the row, or cannot be
    converted to a float.
    """
    try:
        down_votes = float(x['thumbsdown'])
    except Exception:
        # Best-effort fallback; KeyboardInterrupt / SystemExit still propagate.
        return 0
    # float() converts NaN without raising, so NaN has to be replaced
    # explicitly; NaN is the only float that is not equal to itself.
    if down_votes != down_votes:
        return 0
    return down_votes
    
def length(x):
    """Return ``word_count * mean_word_length`` of a review row as an int.

    The product is computed on the unrounded mean word length and rounded
    once at the end; truncating the mean to an int first (4.9 -> 4) would
    systematically under-estimate the result.

    Returns 0 when either value is missing from the row or not numeric
    (including NaN).
    """
    try:
        words = float(x['word_count'])
        mean_chars = float(x['mean_word_length'])
        return int(round(words * mean_chars))
    except Exception:
        # Best-effort fallback; KeyboardInterrupt / SystemExit still propagate.
        return 0
    
def sentiment(x):
    """Label a review row by its star rating.

    Returns 'favorable' for 5 stars, 'unfavorable' for 1 star and 'mixed'
    for every other value of ``x['stars']``.
    """
    stars = x['stars']
    if stars == 5:
        label = 'favorable'
    elif stars == 1:
        label = 'unfavorable'
    else:
        label = 'mixed'
    return label

In [88]:
# Derive the vote, helpfulness and length columns row by row.  Order matters:
# thumbsup / thumbsdown are rewritten first because helpf and thumbssum read them.
for column_name, row_func in [('thumbsup', fillnanup),
                              ('thumbsdown', fillnandown),
                              ('helpfulness', helpf),
                              ('thumbstotal', thumbssum),
                              ('length', length)]:
    reviews_features[column_name] = reviews_features.apply(row_func, axis=1)

# Keep only reviews with more than 4 votes in total and more than 9 words.
has_enough_votes = reviews_features.thumbstotal > 4
has_enough_words = reviews_features.word_count > 9
reviews_features = reviews_features[has_enough_votes & has_enough_words]
#reviews_features = reviews_features[~reviews_features.applymap(np.isnan).all(1)]
## for use in Weka
#reviews_features['sentiment'] = reviews_features.apply(sentiment,axis=1)
#reviews_features.to_csv('buscape-help.csv',encoding='utf-8')
reviews_features.shape

(9451, 50)

In [89]:
# Categories treated as "experience" products, and the reviews that belong to them.
list_experience_categories = [
    'Livros', 'Perfume', 'Roupas', 'Jogos', 'Bonecas',
    'Chapinha / Prancha', 'Esteira', 'Cadeira para Auto',
]
is_experience = reviews_features['category'].isin(list_experience_categories)
experience_products = reviews_features[is_experience]
experience_products.shape

(289, 50)

In [90]:
# Categories treated as "search" products, and the reviews that belong to them.
list_search_categories = [
    'Celular e Smartphone', 'TV', 'Camera Digital', 'Maquina de Lavar Roupas',
    'Geladeira / Refrigerador', 'Ar Condicionado', 'Tablet', 'Notebook',
    'Console de Videogame', 'Impressora', 'Fogao', 'Microondas',
    'Aparelho de Telefone', 'MP3 Player / MP4 Player', 'Aquecedor de Ambiente',
    'Forno Eletrico',
]
is_search = reviews_features['category'].isin(list_search_categories)
search_products = reviews_features[is_search]
search_products.shape

(7444, 50)

### 1.1 Categorias mais comentadas

In [91]:
# Number of reviews and mean word count per category, most-reviewed first.
words_by_category = reviews_features[['word_count', 'category']].groupby(['category'])
df_agg = words_by_category.agg(['count', 'mean'])
df_agg['word_count'].sort_values('count', ascending=False).head(50)

Unnamed: 0_level_0,count,mean
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Celular e Smartphone,2017,87.234507
TV,1682,82.585018
Camera Digital,710,77.184507
Maquina de Lavar Roupas,483,95.296066
Geladeira / Refrigerador,407,76.321867
Ar Condicionado,369,77.273713
Tablet,341,87.554252
Notebook,270,87.214815
Console de Videogame,228,84.008772
Impressora,204,76.852941


In [92]:
def mean_data(df):
    """Return the summary means of a review frame as a list.

    The list holds, in order, the mean of thumbstotal, thumbsup, thumbsdown,
    stars, word_count and helpfulness, followed by the number of rows.
    """
    averaged = ('thumbstotal', 'thumbsup', 'thumbsdown',
                'stars', 'word_count', 'helpfulness')
    mean_stats = [df[name].mean() for name in averaged]
    mean_stats.append(len(df))
    return mean_stats

def std_data(df):
    """Return the summary standard deviations of a review frame as a list.

    The list holds, in order, the ``.std()`` of thumbstotal, thumbsup,
    thumbsdown, stars, word_count and helpfulness.  The last element is the
    number of rows (not a standard deviation), so the list lines up with the
    output of mean_data.
    """
    measured = ('thumbstotal', 'thumbsup', 'thumbsdown',
                'stars', 'word_count', 'helpfulness')
    std_stats = [df[name].std() for name in measured]
    std_stats.append(len(df))
    return std_stats

# Row labels for the summary table, in the order mean_data() / std_data()
# emit their values: total votes, thumbs up, thumbs down, star rating,
# word count, helpfulness and number of reviews.
columns = [
    'TOT', 'UP', 'DOWN', 'RAT',
    'DEP', 'HEL', 'LEN',
]

In [93]:
# From here on the analysis only uses reviews of the search-product categories.
reviews_features = reviews_features[reviews_features.category.isin(list_search_categories)]
# split mixed subset: ratings strictly between 1 and 5 stars.
# Both conditions are evaluated on the same frame and combined into a single
# mask.  Filtering an already-reduced frame with a mask built from the full
# frame forces pandas to reindex the mask and emits a warning.
is_mixed = (reviews_features.stars > 1) & (reviews_features.stars < 5)
mixed_reviews = reviews_features[is_mixed]

In [94]:
# Favorable subset: reviews rated with exactly 5 stars.
is_favorable = reviews_features['stars'] == 5
favorable_reviews = reviews_features[is_favorable]

In [95]:
# Unfavorable subset: reviews rated with exactly 1 star.
is_unfavorable = reviews_features['stars'] == 1
unfavorable_reviews = reviews_features[is_unfavorable]

In [96]:
# Summary table: one row per label in `columns`, and a mean/std column pair
# for each sentiment subset, in the order favorable, unfavorable, mixed.
stats = pd.DataFrame(index=columns)
for subset_name, subset in [('favorable', favorable_reviews),
                            ('unfavorable', unfavorable_reviews),
                            ('mixed', mixed_reviews)]:
    stats['mean_' + subset_name] = mean_data(subset)
    stats['std_' + subset_name] = std_data(subset)

## Descriptive Table Buscape Reviews
### statistics (mean ± sd) of reviews as a function of review sentiment

In [97]:
# Display the per-sentiment summary table (rows TOT/UP/DOWN/RAT/DEP/HEL/LEN).
stats

Unnamed: 0,mean_favorable,std_favorable,mean_unfavorable,std_unfavorable,mean_mixed,std_mixed
TOT,15.168667,17.015725,21.577551,35.165852,17.881316,22.995865
UP,9.980192,12.629684,14.781633,26.478823,13.616921,19.120519
DOWN,5.188475,7.510266,6.795918,12.287423,4.264395,6.724305
RAT,5.0,0.0,1.0,0.0,3.506855,0.700673
DEP,62.564826,59.994312,95.204082,71.196172,88.065609,73.438448
HEL,0.65194,0.280843,0.669145,0.291226,0.750974,0.238948
LEN,1666.0,1666.0,490.0,490.0,5106.0,5106.0


## Results of the multiple regression analyses

In [98]:
from scipy.stats import spearmanr, f_oneway
from sklearn.svm import SVR, LinearSVR
try:
    # scikit-learn >= 0.18 provides cross_val_score in model_selection.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Older releases only ship the cross_validation module (removed in 0.20).
    from sklearn.cross_validation import cross_val_score
from sklearn.metrics import make_scorer

def simple_spearman(x, y):
    """Return the absolute value of the Spearman rank correlation of x and y."""
    rho = spearmanr(x, y)[0]
    return np.abs(rho)
# Wrap simple_spearman with make_scorer so it can be passed as the `scoring`
# argument of cross_val_score below.
spearmanr_scorer = make_scorer(simple_spearman)

In [103]:
# Accumulators filled by the correlation loop: for each review subset one
# list of cross-validated scores and one list of p-values, plus the feature
# names in the order they were processed.
mixed_list = []
mixed_list_p = []
favorable_list = []
favorable_list_p = []
unfavorable_list = []
unfavorable_list_p = []
columns_pd = []

# Linear support vector regressor used to score each feature.  The seed is
# fixed so that repeated runs of the notebook produce the same scores.
model = LinearSVR(epsilon=0.01, random_state=0)

# build columns list: candidate features are the numeric columns except the
# target column 'helpfulness'.
# NOTE(review): 'int' selects the platform's default integer dtype, so integer
# columns stored with a different width may be left out -- confirm that the
# resulting feature list is the intended one.
df_columns = mixed_reviews.select_dtypes(include=['float64','int']).columns.drop('helpfulness')
df_columns

Index([u'ConnectiveAdditiveIncidence', u'redability', u'mean_sentence_length',
       u'percentile_75_sentence_length', u'percentile_75_word_length',
       u'ConnectiveLogicIncidence', u'percentile_90_word_length',
       u'mean_word_length', u'syllable_count', u'avg_word_per_sentence',
       u'LogicIfIncidence', u'percentile_50_sentence_length',
       u'median_sentence_length', u'LogicAndIncidence',
       u'percentile_90_sentence_length', u'median_word_length',
       u'pronIncidence', u'LogicOperatorsIncidence',
       u'percentile_25_word_length', u'ContentDiversty', u'verbIncidence',
       u'functionalIncidence', u'adjectiveIncidence', u'ConnectiveIncidence',
       u'contentIncidence', u'LogicNegationIncidence',
       u'avg_syllables_per_word', u'ConnectiveTemporalIncidence',
       u'sentence_count', u'ConnectiveCasualIncidence', u'advIncidence',
       u'LexicalDiversty', u'adpPronRatio', u'thumbsup', u'thumbsdown',
       u'stars'],
      dtype='object')

In [104]:
#compute correlation
# Each feature is scored on its own against helpfulness, separately for the
# mixed, favorable and unfavorable subsets (in that order).  Every subset is
# paired with the score list and the p-value list that receive its results.
subset_outputs = [
    (mixed_reviews, mixed_list, mixed_list_p),
    (favorable_reviews, favorable_list, favorable_list_p),
    (unfavorable_reviews, unfavorable_list, unfavorable_list_p),
]
for i in df_columns:
    for subset, score_list, p_list in subset_outputs:
        labels = subset["helpfulness"].values
        # single-column frame for the estimator, flat array for f_oneway
        features = subset[[i]].values
        features_flat = subset[i].values
        # mean |Spearman| between predictions and labels over 5 folds
        scores = cross_val_score(model, features, labels, cv=5, scoring=spearmanr_scorer)
        # NOTE(review): f_oneway treats the feature values and the helpfulness
        # values as two groups; this p-value is not the significance of the
        # Spearman score above -- confirm that this is what is intended.
        f_val, p_val = f_oneway(features_flat, labels)
        p_list.append(p_val)
        score_list.append(scores.mean())
    #print status
    columns_pd.append(i)
    # function-call form prints the same text under Python 2 and Python 3
    print(i)

ConnectiveAdditiveIncidence
redability
mean_sentence_length
percentile_75_sentence_length
percentile_75_word_length
ConnectiveLogicIncidence
percentile_90_word_length
mean_word_length
syllable_count
avg_word_per_sentence
LogicIfIncidence
percentile_50_sentence_length
median_sentence_length
LogicAndIncidence
percentile_90_sentence_length
median_word_length
pronIncidence
LogicOperatorsIncidence
percentile_25_word_length
ContentDiversty
verbIncidence
functionalIncidence
adjectiveIncidence
ConnectiveIncidence
contentIncidence
LogicNegationIncidence
avg_syllables_per_word
ConnectiveTemporalIncidence
sentence_count
ConnectiveCasualIncidence
advIncidence
LexicalDiversty
adpPronRatio
thumbsup
thumbsdown
stars


In [105]:
# Correlation table: one row per processed feature, with the cross-validated
# score and the p-value for each sentiment subset.
corr_df = pd.DataFrame(
    data={
        'favorable': favorable_list,
        'fav_p': favorable_list_p,
        'unfavorable': unfavorable_list,
        'unfav_p': unfavorable_list_p,
        'mixed': mixed_list,
        'mix_p': mixed_list_p,
    },
    index=columns_pd,
    columns=['favorable', 'fav_p', 'unfavorable', 'unfav_p', 'mixed', 'mix_p'],
)

## Buscape Correlation Features

In [106]:
# Show the features ordered by their score on the favorable subset, best first.
corr_df.sort_values(by='favorable', ascending=False)

Unnamed: 0,favorable,fav_p,unfavorable,unfav_p,mixed,mix_p
thumbsdown,0.784824,2.888953e-123,0.759056,9.175931e-27,0.804514,9.066557e-286
thumbsup,0.537347,1.131216e-176,0.446382,3.988332e-30,0.35396,0.0
syllable_count,0.281946,0.0,0.302278,2.29579e-134,0.26067,0.0
sentence_count,0.280478,7.389362999999999e-283,0.216593,2.990303e-112,0.239225,0.0
percentile_25_word_length,0.204342,1.445181e-108,0.045125,1.176684e-38,0.057922,0.0
LexicalDiversty,0.159239,1.73847e-45,0.116898,0.1725741,0.119071,1.890132e-18
ContentDiversty,0.114548,9.350744000000001e-120,0.054594,1.372241e-13,0.067344,5.721431e-57
percentile_90_sentence_length,0.11061,0.0,0.185972,3.196791e-115,0.099479,0.0
ConnectiveTemporalIncidence,0.101573,8.809595e-37,0.131341,9.368337e-12,0.062736,1.625713e-158
ConnectiveCasualIncidence,0.098769,2.468862e-41,0.123085,6.377181e-14,0.052902,6.901421000000001e-175
