In [18]:
import pandas as pd
import plotsfunc as pf
import numpy as np
%matplotlib inline

In [19]:
## add the helpfulness column (and related derived columns) to the corpus
def helpf(x):
    """Return the share of votes on a review row that were thumbs-up.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series passed by ``DataFrame.apply``)
        Must provide ``'thumbsup'`` and ``'thumbsdown'`` entries.

    Returns
    -------
    float
        ``thumbsup / (thumbsup + thumbsdown)``. Falls back to 0 when either
        count is missing, not convertible to float, NaN, or when the two
        counts sum to zero.
    """
    try:
        # Convert each count once instead of re-parsing 'thumbsup' twice.
        up = float(x['thumbsup'])
        down = float(x['thumbsdown'])
        ratio = up / (up + down)
    except Exception:
        # Best-effort fallback for bad rows. ``Exception`` (not a bare
        # ``except``) so KeyboardInterrupt/SystemExit still propagate and a
        # long row-wise ``apply`` can be interrupted.
        return 0
    # NaN counts do not raise; they silently produce a NaN ratio, so map
    # that to the same fallback value as every other unusable row.
    if np.isnan(ratio):
        return 0
    return ratio
    
def thumbssum(x):
    """Return the total number of votes (up + down) on a review row.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series passed by ``DataFrame.apply``)
        Must provide ``'thumbsup'`` and ``'thumbsdown'`` entries.

    Returns
    -------
    int
        ``int(thumbsup) + int(thumbsdown)``, or 0 when either value is
        missing or cannot be converted to an integer (this includes NaN,
        for which ``int()`` raises).
    """
    try:
        return int(x['thumbsup']) + int(x['thumbsdown'])
    except Exception:
        # Best-effort fallback for bad rows. ``Exception`` (not a bare
        # ``except``) so KeyboardInterrupt/SystemExit still propagate and a
        # long row-wise ``apply`` can be interrupted.
        return 0
    
def fillnanup(x):
    """Return a row's thumbs-up count as a float, substituting 0 for bad values.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series passed by ``DataFrame.apply``)
        Must provide a ``'thumbsup'`` entry.

    Returns
    -------
    float or int
        ``float(x['thumbsup'])``, or 0 when the value is missing, cannot be
        converted to float, or is NaN.
    """
    try:
        value = float(x['thumbsup'])
    except Exception:
        # Best-effort fallback. ``Exception`` (not a bare ``except``) so
        # KeyboardInterrupt/SystemExit still propagate.
        return 0
    # float(NaN) succeeds without raising, so NaN has to be replaced
    # explicitly; otherwise it would pass straight through.
    if np.isnan(value):
        return 0
    return value
    
def fillnandown(x):
    """Return a row's thumbs-down count as a float, substituting 0 for bad values.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series passed by ``DataFrame.apply``)
        Must provide a ``'thumbsdown'`` entry.

    Returns
    -------
    float or int
        ``float(x['thumbsdown'])``, or 0 when the value is missing, cannot
        be converted to float, or is NaN.
    """
    try:
        value = float(x['thumbsdown'])
    except Exception:
        # Best-effort fallback. ``Exception`` (not a bare ``except``) so
        # KeyboardInterrupt/SystemExit still propagate.
        return 0
    # float(NaN) succeeds without raising, so NaN has to be replaced
    # explicitly; otherwise it would pass straight through.
    if np.isnan(value):
        return 0
    return value
    
def length(x):
    """Estimate a review's length as word count times mean word length.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series passed by ``DataFrame.apply``)
        Must provide ``'word_count'`` and ``'mean_word_length'`` entries.

    Returns
    -------
    int
        ``word_count * mean_word_length`` rounded to the nearest integer,
        or 0 when either value is missing or not numeric (NaN included).
    """
    try:
        words = int(x['word_count'])
        # Keep the mean as a float: truncating it to an int before the
        # multiplication (e.g. 4.75 -> 4) systematically underestimates the
        # length. Round only the final product.
        mean_len = float(x['mean_word_length'])
        return int(round(words * mean_len))
    except Exception:
        # Best-effort fallback. ``Exception`` (not a bare ``except``) so
        # KeyboardInterrupt/SystemExit still propagate.
        return 0
    
def sentiment(x):
    """Map a review row's star rating to a coarse sentiment label.

    Returns 'favorable' when ``x['stars']`` equals 5, 'unfavorable' when it
    equals 1, and 'mixed' for every other value.
    """
    stars = x['stars']
    if stars == 5:
        label = 'favorable'
    elif stars == 1:
        label = 'unfavorable'
    else:
        label = 'mixed'
    return label

In [24]:
# Load the review corpus, then add the derived columns one row at a time.
reviews_features = pd.read_csv('amazon-help.csv.gz')

# Order matters: helpf and thumbssum read the 'thumbsup'/'thumbsdown'
# columns, so those two are rewritten first.
for column, row_func in [
    ('thumbsup', fillnanup),
    ('thumbsdown', fillnandown),
    ('helpfulness', helpf),
    ('thumbstotal', thumbssum),
    ('length', length),
]:
    reviews_features[column] = reviews_features.apply(row_func, axis=1)

# Keep only reviews with more than 4 votes and more than 10 words.
enough_votes = reviews_features.thumbstotal > 4
enough_words = reviews_features.word_count > 10
reviews_features = reviews_features[enough_votes & enough_words]
reviews_features.shape

(15096, 47)

In [25]:
from sklearn.svm import SVR, LinearSVR
from sklearn.grid_search import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from time import time

def simple_spearman(x, y):
    """Return the absolute value of the Spearman rank correlation of x and y.

    Only the correlation coefficient returned by ``spearmanr`` is used; the
    accompanying p-value is discarded.
    """
    rho, _pvalue = spearmanr(x, y)
    return np.abs(rho)
# Wrap simple_spearman with make_scorer so it can be passed as the
# `scoring=` argument of the GridSearchCV calls in the cells below.
spearmanr_scorer = make_scorer(simple_spearman)

## grid search LinearSVR

In [28]:
# Single predictor (word count) regressed against the helpfulness ratio.
features = reviews_features[['word_count']].values
labels = reviews_features['helpfulness'].values

# Candidate epsilon values: 10**-4, 10**-3 and 10**-2.
param_grid = [
    {'epsilon': [10 ** i for i in range(-4, -1)]},
]
grid_search = GridSearchCV(
    LinearSVR(),
    param_grid=param_grid,
    scoring=spearmanr_scorer,
    cv=10,
    n_jobs=-1,
)

# Time only the fit; the elapsed time is reported together with the number
# of parameter combinations that were evaluated.
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (
    time() - start,
    len(grid_search.grid_scores_),
))
grid_search.grid_scores_

GridSearchCV took 33.79 seconds for 3 candidates


[mean: 0.18736, std: 0.04971, params: {'epsilon': 0.0001},
 mean: 0.18736, std: 0.04971, params: {'epsilon': 0.001},
 mean: 0.18736, std: 0.04971, params: {'epsilon': 0.01}]

## grid search SVR with RBF kernel

In [29]:
# Single predictor (word count) regressed against the helpfulness ratio.
features = reviews_features[['word_count']].values
labels = reviews_features['helpfulness'].values

param_grid = [
    # Wider candidate grid, currently disabled:
    #{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    {'C': [1], 'gamma': [0.001], 'kernel': ['rbf']},
]
grid_search = GridSearchCV(
    SVR(),
    param_grid=param_grid,
    scoring=spearmanr_scorer,
    cv=10,
    n_jobs=-1,
)

# Time only the fit; the elapsed time is reported together with the number
# of parameter combinations that were evaluated.
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (
    time() - start,
    len(grid_search.grid_scores_),
))
grid_search.grid_scores_

GridSearchCV took 126.44 seconds for 1 candidates


[mean: 0.06266, std: 0.02579, params: {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}]