# Descriptive Statistics
### base on Chua, Alton Y.K. and Banerjee, Snehasish 2016
#### http://www.sciencedirect.com/science/article/pii/S074756321530131X

In [1]:
import sys
sys.path.append("../pylinguistics/pylinguistics/")

import Pylinguistics as pl
import pandas as pd
import plotsfunc as pf
import numpy as np
%matplotlib inline

reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
#reviews_features = pd.read_csv('amazon-help.csv.gz')

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
## adiciona coluna de helpfulness no corpus
def helpf(x): 
    try:
        return float ( float(x['thumbsup']) / (float(x['thumbsup']) + float(x['thumbsdown'])) )
    except:
        return 0
    
def thumbssum(x): 
    try:
        return int(x['thumbsup']) + int(x['thumbsdown']) 
    except:
        return 0
    
def fillnanup(x): 
    try:
        return float(x['thumbsup'])
    except:
        return 0
    
def fillnandown(x): 
    try:
        return float(x['thumbsdown'])
    except:
        return 0
    
def length(x): 
    try:
        return int(x['word_count']) * int(x['mean_word_length'])
    except:
        return 0
    
def sentiment(x): 
    if x['stars'] == 5:
        return 'favorable'
    if x['stars'] == 1:
        return 'unfavorable'
    else:
        return 'mixed'

In [3]:
reviews_features['thumbsup'] = reviews_features.apply(fillnanup,axis=1)
reviews_features['thumbsdown'] = reviews_features.apply(fillnandown,axis=1)
reviews_features['helpfulness'] = reviews_features.apply(helpf,axis=1)
reviews_features['thumbstotal'] = reviews_features.apply(thumbssum,axis=1)
reviews_features['length'] = reviews_features.apply(length,axis=1)
reviews_features = reviews_features[reviews_features.thumbstotal>4]
reviews_features = reviews_features[reviews_features.stars>0]

## para usar no weka
reviews_features['sentiment'] = reviews_features.apply(sentiment,axis=1)
reviews_features.to_csv('buscape-help.csv',encoding='utf-8')

In [4]:
def mean_data(df):
    mean_stats = []
    mean_stats.append(df.length.mean())
    mean_stats.append(df.redability.mean())
    mean_stats.append(df.verbIncidence.mean())
    mean_stats.append(df.adpPronRatio.mean())
    mean_stats.append(df.LexicalDiversty.mean())
    mean_stats.append(len(df))
    return mean_stats

def std_data(df):
    std_stats = []
    std_stats.append(df.word_count.std())
    std_stats.append(df.redability.std())
    std_stats.append(df.verbIncidence.std())
    std_stats.append(df.adpPronRatio.std())
    std_stats.append(df.LexicalDiversty.std())
    std_stats.append(len(df))
    return std_stats

columns = ['Length','Redability','Verbs incidence','Adv Pron Ratio','Lexical Diversty','Reviews']

In [5]:
# split mixed subset
mixed_reviews = reviews_features[reviews_features.stars > 1]
mixed_reviews = mixed_reviews[reviews_features.stars < 5]



In [6]:
# split favorable subset
favorable_reviews = reviews_features[reviews_features.stars == 5]

In [7]:
# split unfavorable subset
unfavorable_reviews = reviews_features[reviews_features.stars == 1]

In [8]:
stats = pd.DataFrame(data={'mean_mixed': np.array(mean_data(mixed_reviews))}, index=columns)
stats['std_mixed'] = std_data(mixed_reviews)
stats['mean_favorable'] = mean_data(favorable_reviews)
stats['std_favorable'] = std_data(favorable_reviews)
stats['mean_unfavorable'] = mean_data(unfavorable_reviews)
stats['std_unfavorable'] = std_data(unfavorable_reviews)

## Descriptive Table Buscape Reviews
### statistics (mean ± sd) of reviews as a function of review sentiment

In [9]:
stats

Unnamed: 0,mean_mixed,std_mixed,mean_favorable,std_favorable,mean_unfavorable,std_unfavorable
Length,86.346112,70.885989,59.604149,57.262348,97.819423,69.541724
Redability,15.725786,6.439299,15.328631,7.127779,15.756449,9.20711
Verbs incidence,177.65298,54.690528,176.992329,78.050514,198.50238,60.183015
Adv Pron Ratio,1.304249,1.213923,1.406624,1.5309,1.198007,1.07133
Lexical Diversty,0.723967,0.106067,0.774523,0.139167,0.688236,0.127435
Reviews,6391.0,6391.0,2410.0,2410.0,659.0,659.0


## Descriptive Table Amazon Reviews

In [10]:
stats

Unnamed: 0,mean_mixed,std_mixed,mean_favorable,std_favorable,mean_unfavorable,std_unfavorable
Length,86.346112,70.885989,59.604149,57.262348,97.819423,69.541724
Redability,15.725786,6.439299,15.328631,7.127779,15.756449,9.20711
Verbs incidence,177.65298,54.690528,176.992329,78.050514,198.50238,60.183015
Adv Pron Ratio,1.304249,1.213923,1.406624,1.5309,1.198007,1.07133
Lexical Diversty,0.723967,0.106067,0.774523,0.139167,0.688236,0.127435
Reviews,6391.0,6391.0,2410.0,2410.0,659.0,659.0


## Results of the multiple regression analyses

In [11]:
from scipy.stats import spearmanr
from sklearn.svm import SVR, LinearSVR
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import make_scorer

def simple_spearman(x,y): return np.abs(spearmanr(x,y)[0])
spearmanr_scorer = make_scorer(simple_spearman)

In [12]:
# create new dict for correlation
mixed_list = []
favorable_list = []
unfavorable_list = []

#svm model
model = LinearSVR(epsilon=0.01)

# build columns list
df_columns = mixed_reviews.select_dtypes(include=['float64','int']).columns.drop('helpfulness')
df_columns

Index([u'ConnectiveAdditiveIncidence', u'redability', u'mean_sentence_length',
       u'percentile_75_sentence_length', u'percentile_75_word_length',
       u'ConnectiveLogicIncidence', u'percentile_90_word_length',
       u'mean_word_length', u'syllable_count', u'avg_word_per_sentence',
       u'LogicIfIncidence', u'percentile_50_sentence_length',
       u'median_sentence_length', u'LogicAndIncidence',
       u'percentile_90_sentence_length', u'median_word_length',
       u'pronIncidence', u'LogicOperatorsIncidence',
       u'percentile_25_word_length', u'ContentDiversty', u'verbIncidence',
       u'functionalIncidence', u'adjectiveIncidence', u'ConnectiveIncidence',
       u'contentIncidence', u'LogicNegationIncidence',
       u'avg_syllables_per_word', u'ConnectiveTemporalIncidence',
       u'sentence_count', u'ConnectiveCasualIncidence', u'advIncidence',
       u'LexicalDiversty', u'adpPronRatio', u'thumbsup', u'thumbsdown',
       u'stars'],
      dtype='object')

In [13]:
#compute correlation
for i in df_columns:
    #compute mixed score
    labels = mixed_reviews["helpfulness"].values
    features = mixed_reviews[list([i])].values
    scores = cross_val_score(model, features, labels, cv=5, scoring=spearmanr_scorer)
    mixed_list.append(scores.mean())
    #compute favorable score
    labels = favorable_reviews["helpfulness"].values
    features = favorable_reviews[list([i])].values
    scores = cross_val_score(model, features, labels, cv=5, scoring=spearmanr_scorer)
    favorable_list.append(scores.mean())
    #compute unfavorable score
    labels = unfavorable_reviews["helpfulness"].values
    features = unfavorable_reviews[list([i])].values
    scores = cross_val_score(model, features, labels, cv=5, scoring=spearmanr_scorer)
    unfavorable_list.append(scores.mean())
    #print status
    print i
    break

corr_df = pd.DataFrame(data={'favorable': np.array(favorable_list)}, index=df_columns)
corr_df['unfavorable'] = unfavorable_list
corr_df['mixed'] = mixed_list

ConnectiveAdditiveIncidence
redability
mean_sentence_length
percentile_75_sentence_length
percentile_75_word_length
ConnectiveLogicIncidence
percentile_90_word_length
mean_word_length
syllable_count
avg_word_per_sentence
LogicIfIncidence
percentile_50_sentence_length
median_sentence_length
LogicAndIncidence
percentile_90_sentence_length
median_word_length
pronIncidence
LogicOperatorsIncidence
percentile_25_word_length
ContentDiversty
verbIncidence
functionalIncidence
adjectiveIncidence
ConnectiveIncidence
contentIncidence
LogicNegationIncidence
avg_syllables_per_word
ConnectiveTemporalIncidence
sentence_count
ConnectiveCasualIncidence
advIncidence
LexicalDiversty
adpPronRatio
thumbsup
thumbsdown
stars


In [14]:
corr_df.sort_values('favorable',ascending=False)

Unnamed: 0,favorable,unfavorable,mixed
thumbsdown,0.810246,0.792424,0.814404
thumbsup,0.534107,0.408524,0.363443
syllable_count,0.307581,0.309379,0.264287
sentence_count,0.298929,0.228727,0.239436
LexicalDiversty,0.190878,0.131078,0.128995
percentile_25_word_length,0.189424,0.084586,0.066233
percentile_90_sentence_length,0.155722,0.182862,0.116944
ContentDiversty,0.155609,0.089447,0.079891
ConnectiveTemporalIncidence,0.123886,0.157397,0.075437
ConnectiveCasualIncidence,0.123343,0.144324,0.065941
