# Descriptive Statistics
### Based on Chua, Alton Y.K. and Banerjee, Snehasish (2016)
#### http://www.sciencedirect.com/science/article/pii/S074756321530131X

In [2]:
import sys
sys.path.append("../pylinguistics/pylinguistics/")

import Pylinguistics as pl
import pandas as pd
import plotsfunc as pf
import numpy as np
%matplotlib inline

# Load the gzip-compressed review feature table; the commented line below is
# the alternative input file kept by the original author.
reviews_features = pd.read_csv(
    'reviews.csv.gz',
    compression='gzip',
)
#reviews_features = pd.read_csv('amazon-help.csv.gz')

In [3]:
## adds the helpfulness column to the corpus
def helpf(x):
    """Return the share of positive votes for one review row.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series) with 'thumbsup' and
        'thumbsdown' entries.

    Returns
    -------
    thumbsup / (thumbsup + thumbsdown) as a float, or 0 when the row has no
    votes, a field is missing, or a value cannot be converted to float.
    """
    try:
        up = float(x['thumbsup'])
        down = float(x['thumbsdown'])
        return up / (up + down)
    # Only the expected data problems fall back to 0; anything else
    # (e.g. KeyboardInterrupt) is allowed to propagate.
    except (KeyError, TypeError, ValueError, ZeroDivisionError):
        return 0
    
def thumbssum(x):
    """Return the total number of votes (up + down) for one review row.

    Parameters
    ----------
    x : mapping-like row with 'thumbsup' and 'thumbsdown' entries.

    Returns
    -------
    int sum of both counts, or 0 when a field is missing or cannot be
    converted to int (this includes NaN and infinite values).
    """
    try:
        return int(x['thumbsup']) + int(x['thumbsdown'])
    # int(nan) raises ValueError and int(inf) raises OverflowError; other,
    # unexpected exceptions are no longer swallowed.
    except (KeyError, TypeError, ValueError, OverflowError):
        return 0
    
def fillnanup(x):
    """Return the row's 'thumbsup' count as a float, with 0 for missing data.

    Parameters
    ----------
    x : mapping-like row with a 'thumbsup' entry.

    Returns
    -------
    float(x['thumbsup']), or 0 when the entry is absent, not convertible to
    float, or NaN.
    """
    try:
        value = float(x['thumbsup'])
    except (KeyError, TypeError, ValueError):
        return 0
    # float(nan) does not raise, so NaN has to be detected explicitly;
    # NaN is the only float that is not equal to itself.
    if value != value:
        return 0
    return value
    
def fillnandown(x):
    """Return the row's 'thumbsdown' count as a float, with 0 for missing data.

    Parameters
    ----------
    x : mapping-like row with a 'thumbsdown' entry.

    Returns
    -------
    float(x['thumbsdown']), or 0 when the entry is absent, not convertible to
    float, or NaN.
    """
    try:
        value = float(x['thumbsdown'])
    except (KeyError, TypeError, ValueError):
        return 0
    # float(nan) does not raise, so NaN has to be detected explicitly;
    # NaN is the only float that is not equal to itself.
    if value != value:
        return 0
    return value
    
def length(x):
    """Return word_count * mean_word_length for one review row, as an int.

    The product is computed on the float values and rounded afterwards, so a
    fractional mean word length is not truncated before the multiplication
    (100 words * 4.9 gives 490, not 400).

    Parameters
    ----------
    x : mapping-like row with 'word_count' and 'mean_word_length' entries.

    Returns
    -------
    int, or 0 when a field is missing or not numeric (including NaN/inf).
    """
    try:
        product = float(x['word_count']) * float(x['mean_word_length'])
        # int(round(...)) keeps the integer return type in Python 2 and 3.
        return int(round(product))
    except (KeyError, TypeError, ValueError, OverflowError):
        return 0
    
def sentiment(x):
    """Map a review row's star rating to a sentiment label.

    5 stars -> 'favorable', 1 star -> 'unfavorable', anything else -> 'mixed'.
    """
    rating = x['stars']
    if rating == 5:
        return 'favorable'
    return 'unfavorable' if rating == 1 else 'mixed'

In [12]:
# Clean the vote counts first: helpf and thumbssum read these two columns.
# The assignment order is kept because it determines the column order.
reviews_features['thumbsup'] = reviews_features.apply(fillnanup, axis=1)
reviews_features['thumbsdown'] = reviews_features.apply(fillnandown, axis=1)
reviews_features['helpfulness'] = reviews_features.apply(helpf, axis=1)
reviews_features['thumbstotal'] = reviews_features.apply(thumbssum, axis=1)
reviews_features['length'] = reviews_features.apply(length, axis=1)

# Keep only reviews with more than 4 votes, a positive star rating and
# more than 9 words.
reviews_features = reviews_features[
    (reviews_features['thumbstotal'] > 4)
    & (reviews_features['stars'] > 0)
    & (reviews_features['word_count'] > 9)
]

## for use in weka
#reviews_features['sentiment'] = reviews_features.apply(sentiment,axis=1)
#reviews_features.to_csv('buscape-help.csv',encoding='utf-8')
reviews_features.shape

(9215, 50)

In [5]:
def mean_data(df):
    """Return the column means used in the descriptive table.

    The result is a list with the mean of thumbstotal, thumbsup, thumbsdown,
    stars, word_count and helpfulness (in that order), followed by the number
    of rows in `df`.
    """
    averaged = ('thumbstotal', 'thumbsup', 'thumbsdown',
                'stars', 'word_count', 'helpfulness')
    mean_stats = [df[name].mean() for name in averaged]
    mean_stats.append(len(df))
    return mean_stats

def std_data(df):
    """Return the column standard deviations used in the descriptive table.

    The result is a list with the pandas `.std()` of thumbstotal, thumbsup,
    thumbsdown, stars, word_count and helpfulness (in that order), followed by
    the number of rows in `df`.
    """
    measured = ('thumbstotal', 'thumbsup', 'thumbsdown',
                'stars', 'word_count', 'helpfulness')
    std_stats = [df[name].std() for name in measured]
    std_stats.append(len(df))
    return std_stats

# Row labels for the descriptive table, in the order produced by
# mean_data / std_data.
columns = [
    'TOT',   # thumbstotal
    'UP',    # thumbsup
    'DOWN',  # thumbsdown
    'RAT',   # stars
    'DEP',   # word_count
    'HEL',   # helpfulness
    'LEN',   # len(df), i.e. the number of rows in the subset
]

In [7]:
# split mixed subset: ratings strictly between 1 and 5 stars.
# Both conditions are evaluated on the same frame and combined into a single
# mask, so the mask always has the same index as the frame it filters
# (the previous two-step version filtered the reduced frame with a mask built
# from the full frame, relying on implicit reindexing).
mixed_reviews = reviews_features[
    (reviews_features.stars > 1) & (reviews_features.stars < 5)
]

In [8]:
# split favorable subset: reviews rated exactly 5 stars
favorable_reviews = reviews_features[reviews_features['stars'] == 5]

In [9]:
# split unfavorable subset: reviews rated exactly 1 star
unfavorable_reviews = reviews_features[reviews_features['stars'] == 1]

In [15]:
# Descriptive table: one row per label in `columns`, one mean/std column
# pair per sentiment subset (favorable, unfavorable, mixed).
stats = pd.DataFrame(index=columns)
stats['mean_favorable'] = np.array(mean_data(favorable_reviews))
stats['std_favorable'] = std_data(favorable_reviews)
stats['mean_unfavorable'] = mean_data(unfavorable_reviews)
stats['std_unfavorable'] = std_data(unfavorable_reviews)
stats['mean_mixed'] = mean_data(mixed_reviews)
stats['std_mixed'] = std_data(mixed_reviews)

## Descriptive Table Buscape Reviews
### statistics (mean ± sd) of reviews as a function of review sentiment

In [17]:
# Display the descriptive table (rows labelled by `columns`).
stats

Unnamed: 0,mean_favorable,std_favorable,mean_unfavorable,std_unfavorable,mean_mixed,std_mixed
TOT,14.546702,16.143837,19.922018,32.043209,17.094891,21.803487
UP,9.580345,11.877403,14.04893,24.318661,12.971438,18.051412
DOWN,4.966357,7.378825,5.873089,11.031381,4.123453,6.535705
RAT,5.0,0.0,1.0,0.0,3.497302,0.707887
DEP,60.096503,57.558823,96.701835,69.359825,84.895113,70.780472
HEL,0.664571,0.281268,0.699282,0.280326,0.7524,0.238471
LEN,2259.0,2259.0,654.0,654.0,6302.0,6302.0


## Descriptive Table Amazon Reviews

In [10]:
# NOTE(review): this cell only re-displays `stats`; the saved output under it
# has row labels that differ from `columns`, so it was presumably produced by
# a separate run on the Amazon dataset - confirm before relying on it.
stats

Unnamed: 0,mean_mixed,std_mixed,mean_favorable,std_favorable,mean_unfavorable,std_unfavorable
Length,86.346112,70.885989,59.604149,57.262348,97.819423,69.541724
Redability,15.725786,6.439299,15.328631,7.127779,15.756449,9.20711
Verbs incidence,177.65298,54.690528,176.992329,78.050514,198.50238,60.183015
Adv Pron Ratio,1.304249,1.213923,1.406624,1.5309,1.198007,1.07133
Lexical Diversty,0.723967,0.106067,0.774523,0.139167,0.688236,0.127435
Reviews,6391.0,6391.0,2410.0,2410.0,659.0,659.0


## Results of the multiple regression analyses

In [9]:
from scipy.stats import spearmanr
from sklearn.svm import SVR, LinearSVR
try:
    # Current location of cross_val_score.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Older scikit-learn releases, where the module was still named
    # cross_validation (it has since been removed).
    from sklearn.cross_validation import cross_val_score
from sklearn.metrics import make_scorer

def simple_spearman(x, y):
    """Return the absolute value of the Spearman correlation between x and y."""
    result = spearmanr(x, y)
    # The first element of the result is the correlation coefficient.
    return np.abs(result[0])
# Wrap the metric with make_scorer so it can be passed as `scoring=` to
# cross_val_score.
spearmanr_scorer = make_scorer(simple_spearman)

In [10]:
# Per-subset score lists; the correlation loop appends one entry per feature
# column to each of them.
mixed_list = []
favorable_list = []
unfavorable_list = []

# svm model; random_state is fixed so repeated runs produce the same scores
model = LinearSVR(epsilon=0.01, random_state=0)

# build columns list: every float64/int column except the prediction target
df_columns = mixed_reviews.select_dtypes(include=['float64','int']).columns.drop('helpfulness')
df_columns

Index([u'ConnectiveAdditiveIncidence', u'redability', u'mean_sentence_length',
       u'percentile_75_sentence_length', u'percentile_75_word_length',
       u'ConnectiveLogicIncidence', u'percentile_90_word_length',
       u'mean_word_length', u'syllable_count', u'avg_word_per_sentence',
       u'LogicIfIncidence', u'percentile_50_sentence_length',
       u'median_sentence_length', u'LogicAndIncidence',
       u'percentile_90_sentence_length', u'median_word_length',
       u'pronIncidence', u'LogicOperatorsIncidence',
       u'percentile_25_word_length', u'ContentDiversty', u'verbIncidence',
       u'functionalIncidence', u'nounIncidence', u'percentile_50_word_length',
       u'percentile_25_sentence_length', u'LogicOrIncidence',
       u'adjectiveIncidence', u'ConnectiveIncidence', u'contentIncidence',
       u'LogicNegationIncidence', u'avg_syllables_per_word',
       u'ConnectiveTemporalIncidence', u'ConnectiveCasualIncidence',
       u'advIncidence', u'LexicalDiversty', u'adpPronRa

In [None]:
#compute correlation
def _cv_spearman(subset, column):
    """Mean 5-fold cross-validated score of `model` predicting helpfulness from one column.

    `subset` is one of the sentiment data frames and `column` the name of the
    single feature used as predictor; scoring uses `spearmanr_scorer`.
    """
    labels = subset["helpfulness"].values
    features = subset[[column]].values
    scores = cross_val_score(model, features, labels, cv=5, scoring=spearmanr_scorer)
    return scores.mean()

# Start from empty lists so that re-running this cell does not append a
# second set of scores to the results of a previous run.
mixed_list = []
favorable_list = []
unfavorable_list = []

for i in df_columns:
    mixed_list.append(_cv_spearman(mixed_reviews, i))
    favorable_list.append(_cv_spearman(favorable_reviews, i))
    unfavorable_list.append(_cv_spearman(unfavorable_reviews, i))
    #print status (call form works in both Python 2 and Python 3)
    print(i)

In [16]:
# Collect the per-feature scores into one table: a row per feature column,
# a column per sentiment subset.
corr_df = pd.DataFrame(index=df_columns)
corr_df['favorable'] = np.array(favorable_list)
corr_df['unfavorable'] = unfavorable_list
corr_df['mixed'] = mixed_list

## Buscape Correlation Features

In [14]:
# Rank the features by their score on the favorable subset, highest first.
corr_df.sort_values(by='favorable', ascending=False)

Unnamed: 0,favorable,unfavorable,mixed
thumbsdown,0.810246,0.792424,0.814404
thumbsup,0.534107,0.408524,0.363443
syllable_count,0.307581,0.309379,0.264287
sentence_count,0.298929,0.228727,0.239436
LexicalDiversty,0.190878,0.131078,0.128995
percentile_25_word_length,0.189424,0.084586,0.066233
percentile_90_sentence_length,0.155722,0.182862,0.116944
ContentDiversty,0.155609,0.089447,0.079891
ConnectiveTemporalIncidence,0.123886,0.157397,0.075437
ConnectiveCasualIncidence,0.123343,0.144324,0.065941


## Amazon Correlation Features

In [17]:
# Rank the features by their score on the favorable subset, highest first.
# NOTE(review): same statement as the Buscape cell; presumably corr_df was
# rebuilt from the Amazon dataset before this cell was run - confirm.
corr_df.sort_values(by='favorable', ascending=False)

Unnamed: 0,favorable,unfavorable,mixed
thumbsdown,0.25819,0.47746,0.350453
avg_word_per_sentence,0.217883,0.048024,0.200442
adpPronRatio,0.179355,0.03785,0.170806
stars,0.165443,0.247281,0.158017
median_word_length,0.129115,0.021671,0.107442
verbIncidence,0.117268,0.033231,0.111078
LogicNegationIncidence,0.114759,0.045579,0.090884
adjectiveIncidence,0.109548,0.029496,0.0723
percentile_75_word_length,0.105693,0.023853,0.081942
ContentDiversty,0.105627,0.05945,0.064507
