In [1]:
import pandas as pd
import plotsfunc as pf
import numpy as np
%matplotlib inline

In [2]:
## adiciona coluna de helpfulness no corpus
def helpf(x): 
    try:
        return float ( float(x['thumbsup']) / (float(x['thumbsup']) + float(x['thumbsdown'])) )
    except:
        return 0
    
def thumbssum(x): 
    try:
        return int(x['thumbsup']) + int(x['thumbsdown']) 
    except:
        return 0
    
def fillnanup(x): 
    try:
        return float(x['thumbsup'])
    except:
        return 0
    
def fillnandown(x): 
    try:
        return float(x['thumbsdown'])
    except:
        return 0
    
def length(x): 
    try:
        return int(x['word_count']) * int(x['mean_word_length'])
    except:
        return 0
    
def sentiment(x): 
    if x['stars'] == 5:
        return 'favorable'
    if x['stars'] == 1:
        return 'unfavorable'
    else:
        return 'mixed'

In [3]:
reviews_features = pd.read_csv('amazon-help.csv.gz')
reviews_features['thumbsup'] = reviews_features.apply(fillnanup,axis=1)
reviews_features['thumbsdown'] = reviews_features.apply(fillnandown,axis=1)
reviews_features['helpfulness'] = reviews_features.apply(helpf,axis=1)
reviews_features['thumbstotal'] = reviews_features.apply(thumbssum,axis=1)
reviews_features['length'] = reviews_features.apply(length,axis=1)
reviews_features = reviews_features[reviews_features.thumbstotal>4]
reviews_features = reviews_features[reviews_features.word_count>10]
reviews_features.shape

(15096, 47)

In [4]:
from sklearn.svm import SVR, LinearSVR
from sklearn.grid_search import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from time import time

def simple_spearman(x,y): return np.abs(spearmanr(x,y)[0])
spearmanr_scorer = make_scorer(simple_spearman)

## grid search LinearSVM

In [5]:
features = reviews_features[list(['word_count'])].values
labels = reviews_features["helpfulness"].values

param_grid = [
    {'epsilon':[10**i for i in range(-4,-1)]},
]
grid_search = GridSearchCV(LinearSVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5, verbose=3)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] ......................... epsilon=0.0001, score=0.164195 -   3.4s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.174803 -   3.5s
[CV] ......................... epsilon=0.0001, score=0.213562 -   3.4s
[CV] ......................... epsilon=0.0001, score=0.223151 -   3.5s
[CV] epsilon=0.001 ...................................................
[CV] epsilon=0.001 ...................................................
[CV] epsilon=0.001 ...................................................
[CV] ............

[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed:   11.3s finished


GridSearchCV took 13.29 seconds for 3 candidates


[mean: 0.18315, std: 0.03102, params: {'epsilon': 0.0001},
 mean: 0.18315, std: 0.03102, params: {'epsilon': 0.001},
 mean: 0.18315, std: 0.03102, params: {'epsilon': 0.01}]

## grid search SVM RBF

In [7]:
features = reviews_features[list(['word_count'])].values
labels = reviews_features["helpfulness"].values

gamma_range = 10 ** np.arange(-4,4,1)
C_range = 10.0 ** np.arange(-3,4,1)
param_grid = [{'kernel': ['rbf'], 'gamma': gamma_range, 'C': C_range}]
grid_search = GridSearchCV(SVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5, verbose=3)

start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

Fitting 5 folds for each of 56 candidates, totalling 280 fits
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=0, score=0.038810 -   6.3s
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=0, score=0.022643 -   6.4s
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=0, score=0.047001 -   6.6s
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=0, score=0.033575 -   6.6s
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] ..........

[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:   31.9s


[CV] ........... kernel=rbf, C=0.001, gamma=1, score=0.011501 -   6.6s
[CV] kernel=rbf, C=0.001, gamma=10 ...................................
[CV] ........... kernel=rbf, C=0.001, gamma=1, score=0.033575 -   6.4s
[CV] ........... kernel=rbf, C=0.001, gamma=1, score=0.047001 -   6.4s
[CV] kernel=rbf, C=0.001, gamma=10 ...................................
[CV] kernel=rbf, C=0.001, gamma=10 ...................................
[CV] .......... kernel=rbf, C=0.001, gamma=10, score=0.050203 -   5.9s
[CV] kernel=rbf, C=0.001, gamma=100 ..................................
[CV] .......... kernel=rbf, C=0.001, gamma=10, score=0.039324 -   5.8s
[CV] kernel=rbf, C=0.001, gamma=100 ..................................
[CV] .......... kernel=rbf, C=0.001, gamma=10, score=0.075430 -   6.3s
[CV] kernel=rbf, C=0.001, gamma=100 ..................................
[CV] .......... kernel=rbf, C=0.001, gamma=10, score=0.063527 -   6.1s
[CV] .......... kernel=rbf, C=0.001, gamma=10, score=0.029840 -   6.1s
[CV] k

[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:  2.8min


[CV] .......... kernel=rbf, C=0.1, gamma=1000, score=0.016809 -   7.3s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] .......... kernel=rbf, C=0.1, gamma=1000, score=0.001543 -   7.3s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] ............. kernel=rbf, C=1.0, gamma=0, score=0.002282 -   9.1s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] ............. kernel=rbf, C=1.0, gamma=0, score=0.045393 -   9.4s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] ............. kernel=rbf, C=1.0, gamma=0, score=0.017916 -  10.5s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] ............. kernel=rbf, C=1.0, gamma=0, score=0.003767 -   9.6s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] ............. kernel=rbf, C=1.0, gamma=0, score=0.006073 -   9.6s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] .

[Parallel(n_jobs=5)]: Done 280 out of 280 | elapsed:  7.9min finished


GridSearchCV took 478.63 seconds for 56 candidates


[mean: 0.03070, std: 0.01244, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.03070, std: 0.01244, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.03070, std: 0.01244, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.03070, std: 0.01244, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.03070, std: 0.01244, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 1},
 mean: 0.05167, std: 0.01635, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 10},
 mean: 0.06356, std: 0.01323, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 100},
 mean: 0.06356, std: 0.01323, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 1000},
 mean: 0.01064, std: 0.00619, params: {'kernel': 'rbf', 'C': 0.01, 'gamma': 0},
 mean: 0.01064, std: 0.00619, params: {'kernel': 'rbf', 'C': 0.01, 'gamma': 0},
 mean: 0.01064, std: 0.00619, params: {'kernel': 'rbf', 'C': 0.01, 'gamma': 0},
 mean: 0.01064, std: 0.00619, params: {'kernel': 'rbf', 'C': 0.01, 'gamma': 0},
 mean: 0.01064, std: 0.006