In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

In [2]:
def helpInt(x): 
    try:
        return int ( float(x['thumbsup']) * 1000 / (float(x['thumbsup']) + float(x['thumbsdown'])) )
    except:
        return 0

### load data

In [4]:
# reset variables
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
reviews_columns = reviews_features.select_dtypes(include=['float64','int']).columns
reviews_features['helpfulness'] = reviews_features.apply(helpInt,axis=1)
reviews_features.shape

(32226, 48)

### category TV

In [5]:
reviews_features = reviews_features[reviews_features.category == 'TV']
#reviews_features = reviews_features[np.abs(reviews_features.helpfulness-reviews_features.helpfulness.mean())<=(reviews_features.helpfulness.std())]
#reviews_features.to_csv('reviews-help.csv')
reviews_features.shape

(5748, 48)

### split data and class

In [6]:
# split class and features
labels = reviews_features["helpfulness"].values
del reviews_features['helpfulness']
features = reviews_features[list(reviews_columns)].values

# clean features
features = np.abs(features)
features[np.isnan(features) == True] = 0

features.shape

(5748, 34)

### feature selection

In [7]:
from sklearn.feature_selection import SelectKBest, f_regression

new_features = SelectKBest(f_regression, k=10).fit_transform(features, labels)
# manually feature selection
# new_features = reviews_features[list(['syllable_count','sentence_count','word_count'])].values
new_features.shape

(5748, 10)

### SVR Training Script
#### based on https://github.com/ajschumacher/ajschumacher.github.io/blob/master/20150417-negative_r_squared/index.md

In [13]:
from sklearn.svm import SVR
from sklearn.cross_validation import train_test_split, cross_val_score

### compute validation table

In [8]:
def val_table(model, features, target):
    feature_train, feature_test, label_train, label_test = train_test_split(features, target)

    model.fit(feature_train, label_train)

    score_list = []
    score_list.append(model.score(feature_train, label_train))
    score_list.append(model.score(feature_test,  label_test))

    corr_df = pd.DataFrame(data={'score': np.array(score_list)}, index=['self validation', '1-fold validation'])

    pearson_list = []
    pearson_list.append(np.corrcoef(model.predict(feature_train), label_train)[0, 1]**2)
    pearson_list.append(spearmanr(model.predict(feature_train), label_train)[0])
    corr_df['pearson^2'] = pearson_list

    spearman_list = []
    spearman_list.append(np.corrcoef(model.predict(feature_test), label_test)[0, 1]**2)
    spearman_list.append(spearmanr(model.predict(feature_test), label_test)[0])
    corr_df['spearman'] = spearman_list

    return corr_df

### SVR Linear

In [9]:
model = SVR(C=1.0, epsilon=0.01, kernel='linear')

val_table(model, new_features, labels)

Unnamed: 0,score,pearson^2,spearman
self validation,-0.160828,0.047525,0.026263
1-fold validation,-0.159271,0.157828,0.134584


### SVR RBF

In [10]:
model = SVR(C=1.0, epsilon=0.01, kernel='linear')

val_table(model, new_features, labels)

Unnamed: 0,score,pearson^2,spearman
self validation,-0.190992,0.038536,0.05083
1-fold validation,-0.179308,0.141167,0.180757


### Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

val_table(model, new_features, labels)

Unnamed: 0,score,pearson^2,spearman
self validation,0.058816,0.058816,0.045049
1-fold validation,0.042546,0.193275,0.157358


### SVR Cross Validation

In [15]:
model = SVR(C=1.0, epsilon=0.01, kernel='rbf')
scores = cross_val_score(model, new_features, labels, cv=10)
scores.mean()

-0.48343292149876038