In [1]:
import pandas as pd
import plotsfunc as pf
import numpy as np
%matplotlib inline

In [2]:
## adiciona coluna de helpfulness no corpus
def helpf(x): 
    try:
        return float ( float(x['thumbsup']) / float(x['thumbsdown']) )
    except:
        return 0
    
def thumbssum(x): 
    try:
        return int(x['thumbsdown']) - int(x['thumbsup']) 
    except:
        return 0
    
def fillnanup(x): 
    try:
        return float(x['thumbsup'])
    except:
        return 0
    
def fillnandown(x): 
    try:
        return float(x['thumbsdown'])
    except:
        return 0
    
def length(x): 
    try:
        return int(x['word_count']) * int(x['mean_word_length'])
    except:
        return 0
    
def sentiment(x): 
    if x['stars'] == 5:
        return 'favorable'
    if x['stars'] == 1:
        return 'unfavorable'
    else:
        return 'mixed'

In [5]:
reviews_features = pd.read_csv('amazon-help.csv.gz')
reviews_features['thumbsup'] = reviews_features.apply(fillnanup,axis=1)
reviews_features['thumbsdown'] = reviews_features.apply(fillnandown,axis=1)
reviews_features['helpfulness'] = reviews_features.apply(helpf,axis=1)
reviews_features['thumbstotal'] = reviews_features.apply(thumbssum,axis=1)
reviews_features['length'] = reviews_features.apply(length,axis=1)
reviews_features = reviews_features[reviews_features.thumbsdown>4]
reviews_features = reviews_features[reviews_features.word_count>4]
reviews_features.shape

(9814, 47)

In [6]:
from sklearn.svm import SVR, LinearSVR
from sklearn.grid_search import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from time import time

def simple_spearman(x,y): return np.abs(spearmanr(x,y)[0])
spearmanr_scorer = make_scorer(simple_spearman)

## grid search LinearSVM

In [7]:
features = reviews_features[list(['word_count'])].values
labels = reviews_features["helpfulness"].values

param_grid = [
    {'epsilon':[10**i for i in range(-4,-1)]},
]
grid_search = GridSearchCV(LinearSVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5, verbose=3)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] ......................... epsilon=0.0001, score=0.216205 -   0.9s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.302265 -   0.9s
[CV] ......................... epsilon=0.0001, score=0.264332 -   0.9s
[CV] epsilon=0.001 ...................................................
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.282996 -   1.0s
[CV] epsilon=0.001 ...................................................
[CV] ............

[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed:    3.0s finished


GridSearchCV took 3.87 seconds for 3 candidates


[mean: 0.27811, std: 0.03690, params: {'epsilon': 0.0001},
 mean: 0.27811, std: 0.03690, params: {'epsilon': 0.001},
 mean: 0.27811, std: 0.03690, params: {'epsilon': 0.01}]

## grid search SVM RBF

In [8]:
features = reviews_features[list(['word_count'])].values
labels = reviews_features["helpfulness"].values

gamma_range = 10 ** np.arange(-4,4,1)
C_range = 10.0 ** np.arange(-3,4,1)
param_grid = [{'kernel': ['rbf'], 'gamma': gamma_range, 'C': C_range}]
grid_search = GridSearchCV(SVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5, verbose=3)

start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

Fitting 5 folds for each of 56 candidates, totalling 280 fits
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=0, score=0.181850 -   4.2s
[CV] ........... kernel=rbf, C=0.001, gamma=0, score=0.244048 -   4.2s
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=0, score=0.199405 -   4.3s
[CV] ........... kernel=rbf, C=0.001, gamma=0, score=0.168926 -   4.3s
[CV] kernel=rbf, C=0.001, gamma=0 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=0, score=0.179391 -   4.4s
[CV] kernel=rbf

[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:   22.4s


[CV] ........... kernel=rbf, C=0.001, gamma=1, score=0.179391 -   4.7s
[CV] kernel=rbf, C=0.001, gamma=10 ...................................
[CV] ........... kernel=rbf, C=0.001, gamma=1, score=0.199405 -   4.7s
[CV] kernel=rbf, C=0.001, gamma=10 ...................................
[CV] .......... kernel=rbf, C=0.001, gamma=10, score=0.156072 -   4.1s
[CV] kernel=rbf, C=0.001, gamma=100 ..................................
[CV] .......... kernel=rbf, C=0.001, gamma=10, score=0.218260 -   4.1s
[CV] kernel=rbf, C=0.001, gamma=100 ..................................
[CV] .......... kernel=rbf, C=0.001, gamma=10, score=0.155753 -   4.1s
[CV] kernel=rbf, C=0.001, gamma=100 ..................................
[CV] .......... kernel=rbf, C=0.001, gamma=10, score=0.162151 -   4.1s
[CV] kernel=rbf, C=0.001, gamma=100 ..................................
[CV] .......... kernel=rbf, C=0.001, gamma=10, score=0.173375 -   4.2s
[CV] kernel=rbf, C=0.001, gamma=100 ..................................
[CV] .

[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:  1.7min


[CV] .......... kernel=rbf, C=0.1, gamma=1000, score=0.180997 -   3.8s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] .......... kernel=rbf, C=0.1, gamma=1000, score=0.170835 -   3.8s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] ............. kernel=rbf, C=1.0, gamma=0, score=0.163766 -   5.8s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] ............. kernel=rbf, C=1.0, gamma=0, score=0.209288 -   5.7s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] ............. kernel=rbf, C=1.0, gamma=0, score=0.169838 -   5.8s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] ............. kernel=rbf, C=1.0, gamma=0, score=0.162669 -   5.2s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] ............. kernel=rbf, C=1.0, gamma=0, score=0.176114 -   5.7s
[CV] kernel=rbf, C=1.0, gamma=0 ......................................
[CV] .

[Parallel(n_jobs=5)]: Done 280 out of 280 | elapsed:  4.9min finished


GridSearchCV took 297.78 seconds for 56 candidates


[mean: 0.19472, std: 0.02654, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.19472, std: 0.02654, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.19472, std: 0.02654, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.19472, std: 0.02654, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.19472, std: 0.02654, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 1},
 mean: 0.17312, std: 0.02345, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 10},
 mean: 0.17071, std: 0.02223, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 100},
 mean: 0.17071, std: 0.02223, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 1000},
 mean: 0.19591, std: 0.02252, params: {'kernel': 'rbf', 'C': 0.01, 'gamma': 0},
 mean: 0.19591, std: 0.02252, params: {'kernel': 'rbf', 'C': 0.01, 'gamma': 0},
 mean: 0.19591, std: 0.02252, params: {'kernel': 'rbf', 'C': 0.01, 'gamma': 0},
 mean: 0.19591, std: 0.02252, params: {'kernel': 'rbf', 'C': 0.01, 'gamma': 0},
 mean: 0.19591, std: 0.022