# Descriptive Statistics
### base on Chua, Alton Y.K. and Banerjee, Snehasish 2016
#### http://www.sciencedirect.com/science/article/pii/S074756321530131X

In [1]:
import sys
sys.path.append("../pylinguistics/pylinguistics/")

#import Pylinguistics as pl
import pandas as pd
import plotsfunc as pf
import numpy as np
%matplotlib inline

In [2]:
## adiciona coluna de helpfulness no corpus
def helpf(x): 
    try:
        return float ( float(x['thumbsup']) / float(x['thumbsdown']) )
    except:
        return 0
    
def thumbssum(x): 
    try:
        return int(x['thumbsdown']) - int(x['thumbsup']) 
    except:
        return 0
    
def fillnanup(x): 
    try:
        return float(x['thumbsup'])
    except:
        return 0
    
def fillnandown(x): 
    try:
        return float(x['thumbsdown'])
    except:
        return 0
    
def length(x): 
    try:
        return int(x['word_count']) * int(x['mean_word_length'])
    except:
        return 0
    
def sentiment(x): 
    if x['stars'] == 5:
        return 'favorable'
    if x['stars'] == 1:
        return 'unfavorable'
    else:
        return 'mixed'

In [4]:
#reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
reviews_features = pd.read_csv('amazon-help.csv.gz')
reviews_features['thumbsup'] = reviews_features.apply(fillnanup,axis=1)
reviews_features['thumbsdown'] = reviews_features.apply(fillnandown,axis=1)
reviews_features['helpfulness'] = reviews_features.apply(helpf,axis=1)
reviews_features['thumbstotal'] = reviews_features.apply(thumbssum,axis=1)
reviews_features['length'] = reviews_features.apply(length,axis=1)
reviews_features = reviews_features[reviews_features.thumbsdown>4]
reviews_features = reviews_features[reviews_features.word_count>9]
#reviews_features = reviews_features[~reviews_features.applymap(np.isnan).all(1)]
## para usar no weka
#reviews_features['sentiment'] = reviews_features.apply(sentiment,axis=1)
#reviews_features.to_csv('buscape-help.csv',encoding='utf-8')
reviews_features.shape

(9801, 47)

In [5]:
def mean_data(df):
    mean_stats = []
    mean_stats.append(df.thumbstotal.mean())
    mean_stats.append(df.thumbsup.mean())
    mean_stats.append(df.thumbsdown.mean())
    mean_stats.append(df.stars.mean())
    mean_stats.append(df.word_count.mean())
    mean_stats.append(df.helpfulness.mean())
    mean_stats.append(len(df))
    return mean_stats

def std_data(df):
    std_stats = []
    std_stats.append(df.thumbstotal.std())
    std_stats.append(df.thumbsup.std())
    std_stats.append(df.thumbsdown.std())
    std_stats.append(df.stars.std())
    std_stats.append(df.word_count.std())
    std_stats.append(df.helpfulness.std())
    std_stats.append(len(df))
    return std_stats

columns = ['TOT','UP','DOWN','RAT','DEP','HEL','LEN']

In [6]:
# split mixed subset
mixed_reviews = reviews_features[reviews_features.stars > 1]
mixed_reviews = mixed_reviews[reviews_features.stars < 5]

  app.launch_new_instance()


In [7]:
# split favorable subset
favorable_reviews = reviews_features[reviews_features.stars == 5]

In [8]:
# split unfavorable subset
unfavorable_reviews = reviews_features[reviews_features.stars == 1]

In [9]:
stats = pd.DataFrame(data={'mean_favorable': np.array(mean_data(favorable_reviews))}, index=columns)
stats['std_favorable'] = std_data(favorable_reviews)
stats['mean_unfavorable'] = mean_data(unfavorable_reviews)
stats['std_unfavorable'] = std_data(unfavorable_reviews)
stats['mean_mixed'] = mean_data(mixed_reviews)
stats['std_mixed'] = std_data(mixed_reviews)

## Descriptive Table Amazon Reviews
### statistics (mean ± sd) of reviews as a function of review sentiment

In [10]:
stats

Unnamed: 0,mean_favorable,std_favorable,mean_unfavorable,std_unfavorable,mean_mixed,std_mixed
TOT,1.916587,3.97765,4.660156,7.289426,2.62122,5.416328
UP,13.513849,29.328084,8.699609,20.518542,12.291951,31.230406
DOWN,15.430436,30.162036,13.359766,22.593297,14.913171,32.497419
RAT,5.0,0.0,1.0,0.0,3.220976,0.828234
DEP,191.90226,237.816366,126.012109,143.007801,208.493415,217.372258
HEL,0.84318,0.236037,0.642799,0.306009,0.790375,0.262502
LEN,3141.0,3141.0,2560.0,2560.0,4100.0,4100.0


## Results of the multiple regression analyses

In [11]:
from scipy.stats import spearmanr, f_oneway
from sklearn.svm import SVR, LinearSVR
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import make_scorer

def simple_spearman(x,y): return np.abs(spearmanr(x,y)[0])
spearmanr_scorer = make_scorer(simple_spearman)

In [12]:
# create new dict for correlation
mixed_list = []
mixed_list_p = []
favorable_list = []
favorable_list_p = []
unfavorable_list = []
unfavorable_list_p = []
columns_pd = []

#svm model
model = LinearSVR(epsilon=0.01)

# build columns list
df_columns = mixed_reviews.select_dtypes(include=['float64','int']).columns.drop('helpfulness')
df_columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'ConnectiveAdditiveIncidence',
       u'sentence_sized_30', u'redability', u'mean_sentence_length',
       u'percentile_75_sentence_length', u'percentile_75_word_length',
       u'word_count', u'ConnectiveLogicIncidence',
       u'percentile_90_word_length', u'mean_word_length', u'syllable_count',
       u'avg_word_per_sentence', u'LogicIfIncidence',
       u'percentile_50_sentence_length', u'median_sentence_length',
       u'LogicAndIncidence', u'percentile_90_sentence_length',
       u'median_word_length', u'pronIncidence', u'LogicOperatorsIncidence',
       u'percentile_25_word_length', u'ContentDiversty', u'verbIncidence',
       u'functionalIncidence', u'nounIncidence', u'percentile_50_word_length',
       u'percentile_25_sentence_length', u'LogicOrIncidence',
       u'adjectiveIncidence', u'ConnectiveIncidence', u'contentIncidence',
       u'LogicNegationIncidence', u'avg_syllables_per_word',
       u'ConnectiveTemporalIncidence', u'sentenc

In [13]:
#compute correlation
for i in df_columns:
    #compute mixed score
    labels = mixed_reviews["helpfulness"].values
    features = mixed_reviews[list([i])].values
    features_flat = mixed_reviews[i].values
    scores = cross_val_score(model, features, labels, cv=5, scoring=spearmanr_scorer)
    f_val,p_val = f_oneway(features_flat, labels)
    mixed_list_p.append(p_val)
    mixed_list.append(scores.mean())
    #compute favorable score
    labels = favorable_reviews["helpfulness"].values
    features = favorable_reviews[list([i])].values
    features_flat = favorable_reviews[i].values
    scores = cross_val_score(model, features, labels, cv=5, scoring=spearmanr_scorer)
    f_val,p_val = f_oneway(features_flat, labels)
    favorable_list_p.append(p_val)
    favorable_list.append(scores.mean())
    #compute unfavorable score
    labels = unfavorable_reviews["helpfulness"].values
    features = unfavorable_reviews[list([i])].values
    features_flat = unfavorable_reviews[i].values
    scores = cross_val_score(model, features, labels, cv=5, scoring=spearmanr_scorer)
    f_val,p_val = f_oneway(features_flat, labels)
    unfavorable_list_p.append(p_val)
    unfavorable_list.append(scores.mean())
    #print status
    columns_pd.append(i);
    print i
    #if len(columns_pd) > 3:
    #    break

Unnamed: 0
Unnamed: 0.1
ConnectiveAdditiveIncidence
sentence_sized_30
redability
mean_sentence_length
percentile_75_sentence_length
percentile_75_word_length
word_count
ConnectiveLogicIncidence
percentile_90_word_length
mean_word_length
syllable_count
avg_word_per_sentence
LogicIfIncidence
percentile_50_sentence_length
median_sentence_length
LogicAndIncidence
percentile_90_sentence_length
median_word_length
pronIncidence
LogicOperatorsIncidence
percentile_25_word_length
ContentDiversty
verbIncidence
functionalIncidence
nounIncidence
percentile_50_word_length
percentile_25_sentence_length
LogicOrIncidence
adjectiveIncidence
ConnectiveIncidence
contentIncidence
LogicNegationIncidence
avg_syllables_per_word
ConnectiveTemporalIncidence
sentence_count
ConnectiveCasualIncidence
advIncidence
LexicalDiversty
adpPronRatio
thumbsup
thumbsdown
stars
thumbstotal
length


In [14]:
corr_df = pd.DataFrame(data={'favorable': np.array(favorable_list)}, index=columns_pd)
corr_df['fav_p'] = favorable_list_p
corr_df['unfavorable'] = unfavorable_list
corr_df['unfav_p'] = unfavorable_list_p
corr_df['mixed'] = mixed_list
corr_df['mix_p'] = mixed_list_p

## Amazon Correlation Features

In [15]:
corr_df.sort_values('favorable',ascending=False)

Unnamed: 0,favorable,fav_p,unfavorable,unfav_p,mixed,mix_p
thumbstotal,0.927629,1.287741e-50,0.875659,3.621105e-159,0.918483,7.555207e-101
thumbsup,0.389397,6.811899e-124,0.66467,1.179425e-84,0.506149,5.213385e-119
syllable_count,0.309641,0.0,0.10066,0.0,0.282291,0.0
length,0.307966,0.0,0.099342,0.0,0.281343,0.0
word_count,0.306472,0.0,0.095297,0.0,0.281338,0.0
sentence_count,0.271598,0.0,0.108828,0.0,0.251331,0.0
LexicalDiversty,0.265379,2.528765e-206,0.077094,1.692447e-42,0.247636,6.55258e-153
ContentDiversty,0.192358,0.0003391654,0.051189,9.917106000000001e-211,0.181877,1.150466e-06
percentile_90_sentence_length,0.192186,0.0,0.02721,0.0,0.152354,0.0
LogicOrIncidence,0.157647,1.359356e-99,0.048775,8.232291e-72,0.110797,2.46443e-154
