In [8]:
import pandas as pd
import numpy as np

def helpf(x):
    """Return the share of helpful votes for one review row.

    Computes ``thumbsup / (thumbsup + thumbsdown)`` after converting both
    fields with ``float()``.

    Args:
        x: mapping-like row supporting ``x['thumbsup']`` and
            ``x['thumbsdown']``.

    Returns:
        The ratio as a float, or 0.0 when either field is missing or cannot
        be converted, or when the two counts sum to zero.
    """
    try:
        up = float(x['thumbsup'])
        down = float(x['thumbsdown'])
    except Exception:
        # Best-effort on bad rows, but catch Exception rather than using a
        # bare except so KeyboardInterrupt / SystemExit still propagate
        # when a long row-wise apply is interrupted.
        return 0.0
    total = up + down
    if total == 0:
        # No votes at all: report zero helpfulness instead of dividing by 0.
        return 0.0
    return up / total
    
def thumbssum(x):
    """Return the total number of votes (up + down) for one review row.

    Args:
        x: mapping-like row supporting ``x['thumbsup']`` and
            ``x['thumbsdown']``.

    Returns:
        ``int(thumbsup) + int(thumbsdown)``, or 0 when either field is
        missing or cannot be converted by ``int()`` (NaN included, since
        ``int(nan)`` raises).
    """
    try:
        return int(x['thumbsup']) + int(x['thumbsdown'])
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit are not silently turned into a 0.
        return 0
    
def fillnanup(x):
    """Return the row's ``thumbsup`` value as a float, using 0.0 when unusable.

    Args:
        x: mapping-like row supporting ``x['thumbsup']``.

    Returns:
        ``float(x['thumbsup'])``, or 0.0 when the field is missing, cannot
        be converted, or is NaN.
    """
    try:
        value = float(x['thumbsup'])
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        return 0.0
    # float(nan) succeeds without raising, so NaN has to be detected
    # explicitly; NaN is the only float that compares unequal to itself.
    if value != value:
        return 0.0
    return value
    
def fillnandown(x):
    """Return the row's ``thumbsdown`` value as a float, using 0.0 when unusable.

    Args:
        x: mapping-like row supporting ``x['thumbsdown']``.

    Returns:
        ``float(x['thumbsdown'])``, or 0.0 when the field is missing,
        cannot be converted, or is NaN.
    """
    try:
        value = float(x['thumbsdown'])
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        return 0.0
    # float(nan) succeeds without raising, so NaN has to be detected
    # explicitly; NaN is the only float that compares unequal to itself.
    if value != value:
        return 0.0
    return value
    
def fillnanstarts(x):
    """Return the row's ``stars`` value as a float, using 0.0 when unusable.

    Args:
        x: mapping-like row supporting ``x['stars']``.

    Returns:
        ``float(x['stars'])``, or 0.0 when the field is missing, cannot be
        converted, or is NaN.
    """
    try:
        value = float(x['stars'])
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        return 0.0
    # float(nan) succeeds without raising, so NaN has to be detected
    # explicitly; NaN is the only float that compares unequal to itself.
    if value != value:
        return 0.0
    return value

In [9]:
# Load the gzip-compressed reviews table.
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')

# Row-wise derived/cleaned columns, applied in this order because the later
# functions read the columns rewritten by the earlier ones.
for column, row_func in [
    ('thumbsup', fillnanup),
    ('thumbsdown', fillnandown),
    ('helpfulness', helpf),
    ('thumbstotal', thumbssum),
    ('stars', fillnanstarts),
]:
    reviews_features[column] = reviews_features.apply(row_func, axis=1)

# Keep only rows with more than 4 total votes and more than 4 words.
keep = (reviews_features['thumbstotal'] > 4) & (reviews_features['word_count'] > 4)
reviews_features = reviews_features[keep]
reviews_features.shape

(9633, 49)

In [5]:
def nounIncidence(x):
    """Return contentIncidence minus the adjective, verb, pronoun and adverb incidences.

    Args:
        x: mapping-like row supporting lookups of 'contentIncidence',
            'adjectiveIncidence', 'verbIncidence', 'pronIncidence' and
            'advIncidence'.

    Returns:
        The difference as a float, or 0.0 when any field is missing or
        cannot be converted by ``float()``.
    """
    try:
        remainder = float(x['contentIncidence'])
        # Subtract in a fixed left-to-right order so the floating-point
        # result matches a single chained subtraction expression.
        for column in ('adjectiveIncidence', 'verbIncidence',
                       'pronIncidence', 'advIncidence'):
            remainder -= float(x[column])
        return remainder
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        return 0.0
    
# Compute the noun incidence row by row, store it as a column, then show its mean.
noun_incidence_by_row = reviews_features.apply(nounIncidence, axis=1)
reviews_features['nounIncidence'] = noun_incidence_by_row
reviews_features['nounIncidence'].mean()

123.223848733726

In [10]:
def verbNounRatio(x):
    """Return verbIncidence divided by nounIncidence for one review row.

    Args:
        x: mapping-like row supporting ``x['verbIncidence']`` and
            ``x['nounIncidence']``.

    Returns:
        The ratio as a float, or 0.0 when either field is missing or cannot
        be converted by ``float()``, or when nounIncidence is zero.
    """
    try:
        verbs = float(x['verbIncidence'])
        nouns = float(x['nounIncidence'])
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        return 0.0
    if nouns == 0:
        # Avoid dividing by zero; a row without nouns gets a ratio of 0.
        return 0.0
    return verbs / nouns
    
# Compute the verb/noun ratio row by row, store it as a column, then show its mean.
verb_noun_ratio_by_row = reviews_features.apply(verbNounRatio, axis=1)
reviews_features['verbNounRatio'] = verb_noun_ratio_by_row
reviews_features['verbNounRatio'].mean()

0.88747482603364503

In [12]:
# Product categories kept for the analysis; reviews whose `category` is not
# in this list are dropped.
list_search_categories = [
    'Celular e Smartphone', 'TV', 'Camera Digital', 'Maquina de Lavar Roupas',
    'Geladeira / Refrigerador', 'Ar Condicionado', 'Tablet', 'Notebook',
    'Console de Videogame', 'Impressora', 'Fogao', 'Microondas',
    'Aparelho de Telefone', 'MP3 Player / MP4 Player', 'Aquecedor de Ambiente',
    'Forno Eletrico',
]
reviews_features = reviews_features[reviews_features.category.isin(list_search_categories)]

# Split the reviews by star rating:
#   mixed       -> strictly between 1 and 5 stars
#   favorable   -> exactly 5 stars
#   unfavorable -> exactly 1 star
# Both bounds of the mixed group are evaluated on the frame being indexed, so
# the boolean mask has the same index as that frame and pandas does not have
# to reindex the key.
is_mixed = (reviews_features.stars > 1) & (reviews_features.stars < 5)
mixed_reviews = reviews_features[is_mixed]
favorable_reviews = reviews_features[reviews_features.stars == 5]
unfavorable_reviews = reviews_features[reviews_features.stars == 1]
reviews_features.shape



(7574, 50)

In [15]:
from scipy.stats import spearmanr, f_oneway
from sklearn.svm import SVR, LinearSVR
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import make_scorer

def simple_spearman(x, y):
    """Return the absolute value of the Spearman correlation between x and y.

    Only the first element of the ``spearmanr`` result (the correlation
    coefficient) is used; the p-value is discarded.
    """
    correlation = spearmanr(x, y)[0]
    return np.abs(correlation)
# Wrap the Spearman helper so it can be passed as a `scoring` callable.
spearmanr_scorer = make_scorer(simple_spearman)
# Fix random_state so repeated runs of the notebook fit the same model;
# without it LinearSVR shuffles the data with an unseeded generator and the
# reported scores can vary from run to run.
model = LinearSVR(epsilon=0.01, random_state=0)

In [16]:
# Score each star-rating group the same way. For every group:
#   * cross-validate `model` on the single 'verbNounRatio' feature against
#     'helpfulness' (5 folds, scored with spearmanr_scorer);
#   * run f_oneway on the flat feature values and the labels;
#   * print the group name, the f_oneway p-value and the mean CV score.
# The loop stays at top level so that labels / features / scores / p_val
# remain available as notebook globals after the cell runs (holding the
# values of the last group, unfavorable_reviews).
for group_name, group_reviews in [
    ('mixed_reviews', mixed_reviews),
    ('favorable_reviews', favorable_reviews),
    ('unfavorable_reviews', unfavorable_reviews),
]:
    labels = group_reviews["helpfulness"].values
    # 2-D (n_rows, 1) array for the estimator, 1-D array for f_oneway.
    features = group_reviews[['verbNounRatio']].values
    features_flat = group_reviews['verbNounRatio'].values
    scores = cross_val_score(model, features, labels, cv=5, scoring=spearmanr_scorer)
    f_val, p_val = f_oneway(features_flat, labels)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(group_name)
    print(p_val)
    print(scores.mean())

mixed_reviews
5.38248776523e-17
0.0966073789312
favorable_reviews
6.98592281355e-34
0.068129599659
unfavorable_reviews
5.28441349151e-15
0.134565961077
