In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

In [2]:
def helpInt(x): 
    try:
        return float ( float(x['thumbsup']) / (float(x['thumbsup']) + float(x['thumbsdown'])) )
    except:
        return 0

### load data

In [4]:
# reset variables
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
reviews_columns = reviews_features.select_dtypes(include=['float64','int']).columns
reviews_features['helpfulness'] = reviews_features.apply(helpInt,axis=1)
reviews_features.shape

(32226, 48)

### category TV

In [5]:
reviews_features = reviews_features[reviews_features.category == 'TV']
reviews_features = reviews_features[reviews_features.helpfulness != 0]
#reviews_features.to_csv('reviews-help.csv')
reviews_features.shape

(4567, 48)

### split data and class

In [6]:
# split class and features
labels = reviews_features["helpfulness"].values
del reviews_features['helpfulness']
features = reviews_features[list(reviews_columns)].values

# clean features
features = np.abs(features)
features[np.isnan(features) == True] = 0

features.shape

(4567, 34)

### SVR Training Script
#### based on https://github.com/ajschumacher/ajschumacher.github.io/blob/master/20150417-negative_r_squared/index.md

In [7]:
from sklearn.svm import SVR
from sklearn.cross_validation import train_test_split, cross_val_score

### define functions

In [46]:
from sklearn.metrics import make_scorer
def simple_spearman(x,y): return spearmanr(x,y)[0]
spearmanr_scorer = make_scorer(simple_spearman)

def val_table(model, features, target, cvOpt=0):
    feature_train, feature_test, label_train, label_test = train_test_split(features, target)

    model.fit(feature_train, label_train)

    columns = ['self validation', '1-fold validation']
    if cvOpt > 0: columns.append(str(cvOpt) + '-fold cross validation ')
    
    score_list = []
    score_list.append(model.score(feature_train, label_train))
    score_list.append(model.score(feature_test,  label_test))
    if cvOpt > 0: score_list.append(0)
    
    corr_df = pd.DataFrame(data={'score': np.array(score_list)}, index=columns)

    pearson_list = []
    pearson_list.append(np.corrcoef(model.predict(feature_train), label_train)[0, 1]**2)
    pearson_list.append(simple_spearman(model.predict(feature_train), label_train))
    if cvOpt > 0: pearson_list.append(0)
    corr_df['pearson^2'] = pearson_list

    spearman_list = []
    spearman_list.append(np.corrcoef(model.predict(feature_test), label_test)[0, 1]**2)
    spearman_list.append(simple_spearman(model.predict(feature_test), label_test))
    if cvOpt > 0: 
        scores = cross_val_score(model, features, target, cv=cvOpt, scoring=spearmanr_scorer)
        spearman_list.append(scores.mean())
    corr_df['spearman'] = spearman_list        
    
    return corr_df

def name_columns(features):
    kbest_columns = []

    for value in features[0]:
        for column in reviews_columns:
            if reviews_features[column].iloc[0] == value:
                kbest_columns.append(column)
            
    return kbest_columns

### SVR Linear

In [12]:
from sklearn.svm import LinearSVR
from sklearn.feature_selection import SelectKBest, f_regression

kbest_features = SelectKBest(f_regression, k=5).fit_transform(features, labels)

model = LinearSVR(epsilon=0.2)
val_table(model, kbest_features, labels)

Unnamed: 0,score,pearson^2,spearman
self validation,-0.089976,0.011332,0.001527
1-fold validation,-0.13183,0.059185,0.02699


### SVR RBF with Obviuos

In [48]:
obvious_features = reviews_features[list(['thumbsup','thumbsdown'])].values

model = SVR(C=1.0, epsilon=0.01, kernel='rbf')

val_table(model, obvious_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman
self validation,0.998234,0.99935,0.968181
1-fold validation,0.966645,0.890253,0.881917
10-fold cross validation,0.0,0.0,0.883147


### SVR RBF with Manual Selection

In [23]:
manual_features = reviews_features[list(['sentence_count','stars','word_count'])].values

model = SVR(C=1.0, epsilon=0.01, kernel='rbf')

val_table(model, manual_features, labels)

Unnamed: 0,score,pearson^2,spearman
self validation,0.216845,0.291771,0.004792
1-fold validation,-0.262054,0.531171,0.061776


### SVR RBF with 5 KBest

In [24]:
from sklearn.feature_selection import SelectKBest, f_regression

kbest_features = SelectKBest(f_regression, k=5).fit_transform(features, labels)

model = SVR(C=1.0, epsilon=0.001, kernel='rbf')

name_columns(kbest_features)

['syllable_count',
 'pronIncidence',
 'ContentDiversty',
 'sentence_count',
 'stars']

In [25]:
val_table(model, kbest_features, labels)

Unnamed: 0,score,pearson^2,spearman
self validation,0.900543,0.901976,0.008118
1-fold validation,-0.126945,0.84935,0.050313


### SVR RBF with 10 KBest

In [26]:
from sklearn.feature_selection import SelectKBest, f_regression

kbest_features = SelectKBest(f_regression, k=10).fit_transform(features, labels)

model = SVR(C=1.0, epsilon=0.001, kernel='rbf')

val_table(model, kbest_features, labels)

Unnamed: 0,score,pearson^2,spearman
self validation,0.958651,0.95875,0.017187
1-fold validation,-0.00499,0.861189,0.041759


### SVR RBF with Percentile

In [50]:
from sklearn.feature_selection import SelectPercentile

percentile_features = SelectPercentile(f_regression).fit_transform(features, labels)

percentile_features.shape

(4567, 4)

In [51]:
model = SVR(C=1.0, epsilon=0.01, kernel='rbf')

val_table(model, percentile_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman
self validation,0.328663,0.387776,0.002922
1-fold validation,-0.31411,0.62251,0.038774
10-fold cross validation,0.0,0.0,0.042755


### recursive feature elimination and cross-validated

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV

model = LinearRegression()
selector = RFECV(model, step=1, cv=5)
rfecv_features = selector.fit_transform(features, labels)

name_columns(rfecv_features)

['ConnectiveAdditiveIncidence',
 'ConnectiveLogicIncidence',
 'LogicIfIncidence',
 'LogicAndIncidence',
 'LogicOperatorsIncidence',
 'percentile_25_word_length',
 'LogicNegationIncidence',
 'ConnectiveAdditiveIncidence',
 'ConnectiveLogicIncidence',
 'LogicIfIncidence',
 'LogicAndIncidence',
 'LogicOperatorsIncidence',
 'percentile_25_word_length',
 'LogicNegationIncidence',
 'ConnectiveAdditiveIncidence',
 'ConnectiveLogicIncidence',
 'LogicIfIncidence',
 'LogicAndIncidence',
 'LogicOperatorsIncidence',
 'percentile_25_word_length',
 'LogicNegationIncidence']

In [30]:
val_table(model, rfecv_features, labels)

Unnamed: 0,score,pearson^2,spearman
self validation,0.002329,0.002329,0.000251
1-fold validation,-0.000225,0.041032,0.008756


### Linear Regression

In [49]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
kbest_features = SelectKBest(f_regression, k=10).fit_transform(features, labels)

val_table(model, kbest_features, labels, 10)

Unnamed: 0,score,pearson^2,spearman
self validation,0.034345,0.034345,0.023134
1-fold validation,0.022664,0.126914,0.1017
10-fold cross validation,0.0,0.0,0.093417


### SVR Cross Validation

In [20]:
kbest_features = SelectKBest(f_regression, k=2).fit_transform(features, labels)
model = SVR(C=1.0, epsilon=0.2, kernel='rbf')
scores = cross_val_score(model, kbest_features, labels, cv=5, scoring=spearmanr_scorer)
scores.mean()

0.030293908643749895