In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

In [2]:
def helpInt(x): 
    try:
        return float ( float(x['thumbsup']) / (float(x['thumbsup']) + float(x['thumbsdown'])) )
    except:
        return 0
def thumbsSum(x): 
    try:
        return int(x['thumbsup']) + int(x['thumbsdown']) 
    except:
        return 0

### load data

In [4]:
# reset variables
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
#reviews_features = pd.read_csv('amazon-help.csv.gz')
reviews_columns = reviews_features.select_dtypes(include=['float64','int']).columns
reviews_features['helpfulness'] = reviews_features.apply(helpInt,axis=1)
reviews_features['thumbstotal'] = reviews_features.apply(thumbsSum,axis=1)
reviews_features.shape

(32226, 49)

### filter dataset

In [5]:
#reviews_features = reviews_features.sample(frac=0.5)
reviews_features = reviews_features[np.isfinite(reviews_features.helpfulness)]
reviews_features = reviews_features[reviews_features.thumbstotal>4]
reviews_features.shape

(9696, 49)

### split data and class

In [6]:
# split class and features
labels = reviews_features["helpfulness"].values
del reviews_features['helpfulness']
features = reviews_features[list(reviews_columns)].values

# clean features
features = np.abs(features)
features[np.isnan(features) == True] = 0

features.shape

(9696, 37)

### SVR Training Script
#### based on https://github.com/ajschumacher/ajschumacher.github.io/blob/master/20150417-negative_r_squared/index.md

In [32]:
from sklearn.svm import SVR
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

### define functions

In [36]:
from sklearn.metrics import make_scorer
def simple_spearman(x,y): return spearmanr(x,y)[0]
spearmanr_scorer = make_scorer(simple_spearman)

def val_table(model, features, target, cvOpt=0):
    feature_train, feature_test, label_train, label_test = train_test_split(features, target)

    model.fit(feature_train, label_train)

    columns = ['self validation', '1-fold validation']
    if cvOpt > 0: columns.append(str(cvOpt) + '-fold cross validation ')
    
    score_list = []
    score_list.append(model.score(feature_train, label_train))
    score_list.append(model.score(feature_test,  label_test))
    if cvOpt > 0: score_list.append(0)
    
    corr_df = pd.DataFrame(data={'score': np.array(score_list)}, index=columns)

    pearson_list = []
    pearson_list.append(np.corrcoef(model.predict(feature_train), label_train)[0, 1]**2)
    pearson_list.append(np.corrcoef(model.predict(feature_test), label_test)[0, 1]**2)
    if cvOpt > 0: pearson_list.append(0)
    corr_df['pearson^2'] = pearson_list
    
    spearman_list = []
    spearman_list.append(simple_spearman(model.predict(feature_train), label_train))
    spearman_list.append(simple_spearman(model.predict(feature_test), label_test))
    if cvOpt > 0: 
        scores = cross_val_score(model, features, target, cv=cvOpt, scoring=spearmanr_scorer)
        spearman_list.append(scores.mean())
    corr_df['spearman'] = spearman_list   
    
    squared_list = []
    squared_list.append(np.abs(mean_absolute_error(label_train, model.predict(feature_train))))
    squared_list.append(np.abs(mean_absolute_error(label_test, model.predict(feature_test))))
    if cvOpt > 0: 
        scores = cross_val_score(model, features, target, cv=cvOpt, scoring='mean_absolute_error')
        squared_list.append(np.abs(scores.mean()))
    corr_df['mean_absolute_error'] = squared_list   
    
    return corr_df

def name_columns(features):
    kbest_columns = []

    for value in features[0]:
        for column in reviews_columns:
            if reviews_features[column].iloc[0] == value:
                kbest_columns.append(column)
            
    return kbest_columns

### SVR Linear

In [37]:
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression

kbest_features = SelectKBest(f_regression, k=15).fit_transform(features, labels)

model = LinearSVR()
val_table(model, kbest_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman,mean_absolute_error
self validation,-1.890929,0.070805,0.267863,0.396733
1-fold validation,-1.860022,0.097121,0.321466,0.401548
10-fold cross validation,0.0,0.0,0.163377,0.27808


### SVR RBF with Obviuos

In [50]:
obvious_features = reviews_features[list(['thumbsup','thumbsdown'])].values

model = SVR(C=1.0, epsilon=0.01, kernel='rbf')

val_table(model, obvious_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman,mean_absolute_error
self validation,0.998693,0.999125,0.995545,0.009134
1-fold validation,0.980383,0.981158,0.983794,0.015504
10-fold cross validation,0.0,0.0,0.98375,0.015364


### SVR RBF with Manual Selection

In [39]:
manual_features = reviews_features[list(['word_count','stars'])].values

model = SVR(C=1.0, epsilon=0.01, kernel='rbf')

val_table(model, manual_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman,mean_absolute_error
self validation,0.193038,0.217277,0.436651,0.164923
1-fold validation,-0.052608,0.054271,0.21391,0.20498
10-fold cross validation,0.0,0.0,0.204739,0.20181


### SVR RBF with 5 KBest

In [40]:
from sklearn.feature_selection import SelectKBest, f_regression

kbest_features = SelectKBest(f_regression, k=5).fit_transform(features, labels)

model = SVR(C=1.0, epsilon=0.001, kernel='rbf')

name_columns(kbest_features)

['word_count',
 'syllable_count',
 'pronIncidence',
 'sentence_count',
 'LexicalDiversty']

In [41]:
val_table(model, kbest_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman,mean_absolute_error
self validation,0.935558,0.935763,0.974875,0.013917
1-fold validation,-0.05082,0.030829,0.143275,0.207383
10-fold cross validation,0.0,0.0,0.126684,0.209835


### SVR RBF with 10 KBest

In [42]:
from sklearn.feature_selection import SelectKBest, f_regression

kbest_features = SelectKBest(f_regression, k=10).fit_transform(features, labels)

model = SVR(C=1.0, epsilon=0.001, kernel='rbf')

val_table(model, kbest_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman,mean_absolute_error
self validation,0.979402,0.979413,0.988792,0.00483
1-fold validation,0.03196,0.036176,0.064758,0.207136
10-fold cross validation,0.0,0.0,0.063474,0.206886


### SVR RBF with Percentile

In [43]:
from sklearn.feature_selection import SelectPercentile

percentile_features = SelectPercentile(f_regression).fit_transform(features, labels)

percentile_features.shape

(9696, 4)

In [44]:
model = SVR(C=1.0, epsilon=0.01, kernel='rbf')

val_table(model, percentile_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman,mean_absolute_error
self validation,0.59897,0.602408,0.805051,0.085593
1-fold validation,-0.140728,0.048337,0.159689,0.213468
10-fold cross validation,0.0,0.0,0.170376,0.216116


### recursive feature elimination and cross-validated

In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV

model = LinearSVR()
selector = RFECV(model, step=5, cv=10)
rfecv_features = selector.fit_transform(features, labels)

name_columns(rfecv_features)

['mean_word_length', 'avg_syllables_per_word']

In [53]:
val_table(model, rfecv_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman,mean_absolute_error
self validation,-0.068495,0.007487,0.090083,0.199436
1-fold validation,-0.06835,0.01505,0.136093,0.202495
10-fold cross validation,0.0,0.0,0.091704,0.200973


### Linear Regression

In [47]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
kbest_features = SelectKBest(f_regression, k=10).fit_transform(features, labels)

val_table(model, kbest_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman,mean_absolute_error
self validation,0.100305,0.100305,0.282318,0.194735
1-fold validation,0.087557,0.087698,0.261688,0.203676
10-fold cross validation,0.0,0.0,0.27326,0.198058
