In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

In [2]:
def helpInt(x): 
    try:
        return float ( float(x['thumbsup']) / (float(x['thumbsup']) + float(x['thumbsdown'])) )
    except:
        return 0
def thumbsSum(x): 
    try:
        return int(x['thumbsup']) + int(x['thumbsdown']) 
    except:
        return 0

### load data

In [4]:
# reset variables
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
#reviews_features = pd.read_csv('amazon-help.csv.gz')
reviews_columns = reviews_features.select_dtypes(include=['float64','int']).columns
reviews_features['helpfulness'] = reviews_features.apply(helpInt,axis=1)
reviews_features['thumbstotal'] = reviews_features.apply(thumbsSum,axis=1)
reviews_features.shape

(32226, 49)

### filter dataset

In [5]:
#reviews_features = reviews_features.sample(frac=0.5)
reviews_features = reviews_features[np.isfinite(reviews_features.helpfulness)]
reviews_features = reviews_features[reviews_features.thumbstotal>4]
reviews_features.shape

(9696, 49)

### split data and class

In [6]:
# split class and features
labels = reviews_features["helpfulness"].values
del reviews_features['helpfulness']
features = reviews_features[list(reviews_columns)].values

# clean features
features = np.abs(features)
features[np.isnan(features) == True] = 0

features.shape

(9696, 34)

### SVR Training Script
#### based on https://github.com/ajschumacher/ajschumacher.github.io/blob/master/20150417-negative_r_squared/index.md

In [7]:
from sklearn.svm import SVR
from sklearn.cross_validation import train_test_split, cross_val_score

### define functions

In [8]:
from sklearn.metrics import make_scorer
def simple_spearman(x,y): return spearmanr(x,y)[0]
spearmanr_scorer = make_scorer(simple_spearman)

def val_table(model, features, target, cvOpt=0):
    feature_train, feature_test, label_train, label_test = train_test_split(features, target)

    model.fit(feature_train, label_train)

    columns = ['self validation', '1-fold validation']
    if cvOpt > 0: columns.append(str(cvOpt) + '-fold cross validation ')
    
    score_list = []
    score_list.append(model.score(feature_train, label_train))
    score_list.append(model.score(feature_test,  label_test))
    if cvOpt > 0: score_list.append(0)
    
    corr_df = pd.DataFrame(data={'score': np.array(score_list)}, index=columns)

    pearson_list = []
    pearson_list.append(np.corrcoef(model.predict(feature_train), label_train)[0, 1]**2)
    pearson_list.append(np.corrcoef(model.predict(feature_test), label_test)[0, 1]**2)
    if cvOpt > 0: pearson_list.append(0)
    corr_df['pearson^2'] = pearson_list

    spearman_list = []
    spearman_list.append(simple_spearman(model.predict(feature_train), label_train))
    spearman_list.append(simple_spearman(model.predict(feature_test), label_test))
    if cvOpt > 0: 
        scores = cross_val_score(model, features, target, cv=cvOpt, scoring=spearmanr_scorer)
        spearman_list.append(scores.mean())
    corr_df['spearman'] = spearman_list   
    
    return corr_df

def name_columns(features):
    kbest_columns = []

    for value in features[0]:
        for column in reviews_columns:
            if reviews_features[column].iloc[0] == value:
                kbest_columns.append(column)
            
    return kbest_columns

### SVR Linear

In [9]:
from sklearn.svm import LinearSVR
from sklearn.feature_selection import SelectKBest, f_regression

kbest_features = SelectKBest(f_regression, k=15).fit_transform(features, labels)

model = LinearSVR(epsilon=0.01)
val_table(model, kbest_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman
self validation,-0.06387,0.002647,0.052807
1-fold validation,-0.081492,0.002546,0.036138
10-fold cross validation,0.0,0.0,0.164418


### SVR RBF with Obviuos

In [10]:
obvious_features = reviews_features[list(['thumbsup','thumbsdown'])].values

model = SVR(C=1.0, epsilon=0.01, kernel='rbf')

val_table(model, obvious_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman
self validation,0.998705,0.999141,0.995415
1-fold validation,0.976047,0.976917,0.982348
10-fold cross validation,0.0,0.0,0.98375


### SVR RBF with Manual Selection

In [11]:
manual_features = reviews_features[list(['word_count','stars'])].values

model = SVR(C=1.0, epsilon=0.01, kernel='rbf')

val_table(model, manual_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman
self validation,0.200808,0.225494,0.449868
1-fold validation,-0.060397,0.054587,0.17975
10-fold cross validation,0.0,0.0,0.204524


### SVR RBF with 5 KBest

In [12]:
from sklearn.feature_selection import SelectKBest, f_regression

kbest_features = SelectKBest(f_regression, k=5).fit_transform(features, labels)

model = SVR(C=1.0, epsilon=0.001, kernel='rbf')

name_columns(kbest_features)

['syllable_count',
 'pronIncidence',
 'ConnectiveAdditiveIncidence',
 'ConnectiveLogicIncidence',
 'LogicIfIncidence',
 'LogicAndIncidence',
 'LogicOperatorsIncidence',
 'percentile_25_word_length',
 'LogicNegationIncidence',
 'ConnectiveTemporalIncidence',
 'ConnectiveCasualIncidence',
 'sentence_count',
 'LexicalDiversty']

In [13]:
val_table(model, kbest_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman
self validation,0.86683,0.868405,0.948715
1-fold validation,-0.161551,0.031984,0.149779
10-fold cross validation,0.0,0.0,0.133382


### SVR RBF with 10 KBest

In [14]:
from sklearn.feature_selection import SelectKBest, f_regression

kbest_features = SelectKBest(f_regression, k=10).fit_transform(features, labels)

model = SVR(C=1.0, epsilon=0.001, kernel='rbf')

val_table(model, kbest_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman
self validation,0.98089,0.980892,0.988075
1-fold validation,0.035452,0.038194,0.111033
10-fold cross validation,0.0,0.0,0.08067


### SVR RBF with Percentile

In [15]:
from sklearn.feature_selection import SelectPercentile

percentile_features = SelectPercentile(f_regression).fit_transform(features, labels)

percentile_features.shape

(9696, 4)

In [16]:
model = SVR(C=1.0, epsilon=0.01, kernel='rbf')

val_table(model, percentile_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman
self validation,0.86087,0.86238,0.942507
1-fold validation,-0.103548,0.041579,0.15924
10-fold cross validation,0.0,0.0,0.126978


### recursive feature elimination and cross-validated

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV

model = LinearRegression()
selector = RFECV(model, step=1, cv=10)
rfecv_features = selector.fit_transform(features, labels)

name_columns(rfecv_features)

['ConnectiveAdditiveIncidence',
 'ConnectiveLogicIncidence',
 'LogicIfIncidence',
 'LogicAndIncidence',
 'LogicOperatorsIncidence',
 'percentile_25_word_length',
 'LogicNegationIncidence',
 'ConnectiveTemporalIncidence',
 'ConnectiveCasualIncidence']

In [18]:
val_table(model, rfecv_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman
self validation,0.000718,0.000718,0.113057
1-fold validation,0.001368,0.001978,0.126914
10-fold cross validation,0.0,0.0,0.114982


### Linear Regression

In [19]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
kbest_features = SelectKBest(f_regression, k=10).fit_transform(features, labels)

val_table(model, kbest_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman
self validation,0.085369,0.085369,0.266209
1-fold validation,0.09602,0.096496,0.291289
10-fold cross validation,0.0,0.0,0.263472
