In [2]:
import os

import numpy as np
import pandas as pd
import snap
import pickle

from sklearn.ensemble import RandomForestClassifier

In [3]:
# Directory locations, given relative to the notebook's working directory.
# DATA_DIR holds the pickled feature/rating files loaded by later cells.
DATA_DIR = "../yelp_data/dataset"
# NOTE(review): OUTPUT_DIR is not referenced in any visible cell; presumably
# meant for saved figures -- confirm it is still needed.
OUTPUT_DIR = "../shared/figures"

In [10]:
# Load the pickled training and validation features and rating targets.
# The files are opened in binary mode ("rb"): pickle streams are byte data, and
# text mode breaks on Python 3 and on platforms that translate line endings.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that were produced by a trusted pipeline.
with open(os.path.join(DATA_DIR, "val_features.pkl"), "rb") as fval,\
      open(os.path.join(DATA_DIR, "train_features.pkl"), "rb") as ftrain,\
      open(os.path.join(DATA_DIR, "train_rating.pkl"), "rb") as rtrain,\
      open(os.path.join(DATA_DIR, "val_rating.pkl"), "rb") as rval:
    # Each *_features.pkl unpickles to a 2-tuple; only the second element (the
    # features) is used here and the first is discarded.
    (_, valFeats) = pickle.load(fval)
    valY = pickle.load(rval)
    (_, trainFeats) = pickle.load(ftrain)
    trainY = pickle.load(rtrain)

In [143]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import BayesianRidge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

In [79]:
class BaselineModel(object):
    """Constant predictor that always outputs the mean of the training targets.

    Implements the small part of the scikit-learn regressor interface that this
    notebook uses (``fit``, ``predict``, ``score``, ``get_params``), so it can
    be passed to the same helper functions as the real estimators.
    """

    def fit(self, X, y):
        """Store the mean of ``y``; ``X`` is ignored.

        Returns:
            self, so calls can be chained like scikit-learn estimators.
        """
        self._mean = np.mean(y)
        return self

    def predict(self, X):
        """Return a 1-D float array of length ``len(X)`` filled with the mean.

        Raises:
            AttributeError: if called before ``fit``.
        """
        # np.full replaces the Python-2-only ``xrange`` list comprehension and
        # builds the array without a Python-level loop.
        return np.full(len(X), self._mean, dtype=float)

    def score(self, X, true):
        """Coefficient of determination (R^2) of the constant prediction.

        Args:
            X: Samples; only ``len(X)`` is used.
            true: Target values, array-like of the same length as ``X``.

        Returns:
            ``1 - u/v`` where ``u`` is the residual sum of squares and ``v`` is
            the total sum of squares of ``true``. When ``true`` is constant
            (``v == 0``) the result is 1.0 for a perfect prediction and 0.0
            otherwise, instead of nan/inf from a division by zero.
        """
        true = np.asarray(true, dtype=float)
        predicted = self.predict(X)
        u = np.sum((true - predicted)**2)
        v = np.sum((true - np.mean(true))**2)
        # Constant targets make the denominator zero; follow the convention
        # used by scikit-learn's r2_score rather than dividing by zero.
        if true.size and v == 0:
            return 1.0 if u == 0 else 0.0
        return 1 - u/v

    def get_params(self, deep=True):
        """Return the (empty) hyper-parameter description of this model.

        ``deep`` is accepted only so the call signature matches scikit-learn's
        ``get_params(deep=True)``; it is ignored.

        NOTE(review): scikit-learn expects a dict here; the empty string is
        kept for backward compatibility with existing callers.
        """
        return ""

In [80]:
# Linear regression estimator constructed with scikit-learn's default settings.
linearRegressor = LinearRegression()

In [90]:
# Ridge regression with regularisation strength alpha=1.
# NOTE(review): the name is spelled "rideRegressor" (sic) and later cells refer
# to it by this spelling, so any rename must be applied everywhere at once.
rideRegressor = Ridge(alpha=1)

In [159]:
# Bayesian ridge regression limited to 100 iterations, with compute_score
# enabled.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version before upgrading.
bayesianRegressor = BayesianRidge(n_iter=100, compute_score=True)

In [147]:
# Constant-mean baseline (see BaselineModel): predicts the training-set mean.
baselinePredictor = BaselineModel()

In [148]:
# Multi-layer perceptron with four hidden layers of 200, 40, 8 and 2 units,
# at most 1000 training iterations, and early stopping enabled.
# random_state is pinned so that the stochastic parts of training (weight
# initialisation, the early-stopping hold-out split) give the same result on
# every Restart & Run All.
neuralNetworkPredictor = MLPRegressor(
    hidden_layer_sizes=(200,40,8,2), max_iter=1000, early_stopping=True,
    random_state=0)

In [149]:
# Random forest of 1000 trees. random_state is pinned so that the bootstrap
# sampling and feature sub-sampling are reproducible across notebook runs.
randomForestPredictor = RandomForestRegressor(n_estimators=1000, random_state=0)

In [150]:
def rmse(predicted, true):
    """Root-mean-square error between predictions and targets.

    Args:
        predicted: Array-like of predicted values.
        true: Array-like of target values, same shape as ``predicted``.

    Returns:
        ``sqrt(sum((predicted - true)**2) / len(true))`` as a numpy float.
    """
    # Convert to float arrays up front: this lets plain Python lists be passed
    # and prevents integer inputs from being floor-divided under Python 2.
    errors = np.asarray(predicted, dtype=float) - np.asarray(true, dtype=float)
    return np.sqrt(np.sum(errors**2) / float(len(true)))

In [151]:
def relative_error(predicted, true):
    """Mean absolute error as a fraction of the largest value observed.

    The normaliser is the larger of the maximum prediction and the maximum
    target, so the result is the average absolute deviation relative to that
    single scale value (not a per-sample relative error).
    """
    largest_prediction = np.max(predicted)
    largest_target = np.max(true)
    scale = max(largest_prediction, largest_target)
    mean_abs_error = np.mean(np.abs(predicted - true))
    return mean_abs_error / scale

In [152]:
def fitModel(name, model):
    """Fit ``model`` on the training split and print its validation metrics.

    Reads the notebook-level globals ``trainFeats``/``trainY`` for fitting and
    ``valFeats``/``valY`` for evaluation.

    Args:
        name: Human-readable label used in the printed report.
        model: Estimator exposing ``fit``, ``predict`` and ``score``; ``fit``
            must return the fitted estimator.

    Returns:
        None. Three lines are printed: the model's own ``score``, the RMSE and
        the relative error expressed as a percentage.
    """
    model = model.fit(trainFeats, trainY)
    predicted = model.predict(valFeats)
    # Each print takes a single pre-formatted string in parentheses, which
    # produces identical output under the Python 2 print statement and the
    # Python 3 print function.
    print("Score for %s is %s" % (
        name, model.score(valFeats, valY)))
    print("RMSE for %s is %s" % (
        name, rmse(predicted, valY)))
    print("Average relative error for %s is %s percent." % (
        name, 100*relative_error(predicted, valY)))

In [153]:
# Fit the constant-mean baseline and print its validation metrics.
fitModel("baseline", baselinePredictor)

Score for baseline is -4.63415358343e-05
RMSE for baseline is 1.42776997765
Average relative error for baseline is 24.1191750901 percent.


In [154]:
# Fit linear regression on the training split and print validation metrics.
fitModel("linear regression", linearRegressor)

Score for linear regression is 0.312514493616
RMSE for linear regression is 1.18380441761
Average relative error for linear regression is 18.2853412809 percent.


In [155]:
# Fit ridge regression on the training split and print validation metrics.
fitModel("ridge regression", rideRegressor)

Score for ridge regression is 0.312515072251
RMSE for ridge regression is 1.18380391943
Average relative error for ridge regression is 18.2853539128 percent.


In [166]:
# Fit Bayesian ridge regression on the training split and print validation metrics.
fitModel("bayesian regression", bayesianRegressor)

Score for bayesian regression is 0.312534028173
RMSE for bayesian regression is 1.18378759889
Average relative error for bayesian regression is 18.2857790551 percent.


In [157]:
# Fit the multi-layer perceptron on the training split and print validation metrics.
fitModel("neural network", neuralNetworkPredictor)

Score for neural network is 0.334842664919
RMSE for neural network is 1.16442192777
Average relative error for neural network is 16.0869919882 percent.


In [169]:
# Fit the random forest on the training split and print validation metrics.
fitModel("random forest", randomForestPredictor)

Score for random forest is 0.307618649842
RMSE for random forest is 1.18801209881
Average relative error for random forest is 18.6598898159 percent.


In [160]:
# Load the pickled held-out test features and rating targets.
# The files are opened in binary mode ("rb"): pickle streams are byte data, and
# text mode breaks on Python 3 and on platforms that translate line endings.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that were produced by a trusted pipeline.
with open(os.path.join(DATA_DIR, "test_features.pkl"), "rb") as ftest,\
      open(os.path.join(DATA_DIR, "test_rating.pkl"), "rb") as rtest:
    # test_features.pkl unpickles to a 2-tuple; only the second element (the
    # features) is used here and the first is discarded.
    (_, testFeats) = pickle.load(ftest)
    testY = pickle.load(rtest)

In [161]:
# Now that we have the trained models, test them with the test data.
def finalTestModel(name, model):
    """Print the metrics of an already-fitted ``model`` on the test split.

    Reads the notebook-level globals ``testFeats`` and ``testY``. The model is
    not re-fitted here, so it must have been trained by an earlier cell.

    Args:
        name: Human-readable label used in the printed report.
        model: Fitted estimator exposing ``predict`` and ``score``.

    Returns:
        None. Three lines are printed: the model's own ``score``, the RMSE and
        the relative error expressed as a percentage.
    """
    predicted = model.predict(testFeats)
    # Each print takes a single pre-formatted string in parentheses, which
    # produces identical output under the Python 2 print statement and the
    # Python 3 print function.
    print("Score for %s is %s" % (
        name, model.score(testFeats, testY)))
    print("RMSE for %s is %s" % (
        name, rmse(predicted, testY)))
    print("Average relative error for %s is %s percent." % (
        name, 100*relative_error(predicted, testY)))

In [162]:
# Evaluate the previously fitted constant-mean baseline on the test split.
finalTestModel("baseline", baselinePredictor)

Score for baseline is -0.000855120695457
RMSE for baseline is 1.4634860104
Average relative error for baseline is 24.7465614817 percent.


In [163]:
# Evaluate the previously fitted linear regression on the test split.
finalTestModel("linear regression", linearRegressor)

Score for linear regression is 0.327892972837
RMSE for linear regression is 1.19928440313
Average relative error for linear regression is 18.5669587686 percent.


In [164]:
# Evaluate the previously fitted ridge regression on the test split.
finalTestModel("ridge regression", rideRegressor)

Score for ridge regression is 0.327893367682
RMSE for ridge regression is 1.19928405085
Average relative error for ridge regression is 18.5669755633 percent.


In [167]:
# Evaluate the previously fitted Bayesian ridge regression on the test split.
finalTestModel("bayesian regression", bayesianRegressor)

Score for bayesian regression is 0.327906243749
RMSE for bayesian regression is 1.19927256299
Average relative error for bayesian regression is 18.567542783 percent.


In [168]:
# Evaluate the previously fitted multi-layer perceptron on the test split.
finalTestModel("neural network", neuralNetworkPredictor)

Score for neural network is 0.34511029369
RMSE for neural network is 1.1838237529
Average relative error for neural network is 16.3219078547 percent.


In [170]:
# Evaluate the previously fitted random forest on the test split.
finalTestModel("random forest", randomForestPredictor)

Score for random forest is 0.335125985417
RMSE for random forest is 1.19281377927
Average relative error for random forest is 18.7137709175 percent.


In [171]:
def testTrainModel(name, model):
    """Print the metrics of an already-fitted ``model`` on the training split.

    Reads the notebook-level globals ``trainFeats`` and ``trainY``. Scoring a
    model on the data it was trained on shows how closely it fits that data,
    for comparison with the validation and test metrics.

    Args:
        name: Human-readable label used in the printed report.
        model: Fitted estimator exposing ``predict`` and ``score``.

    Returns:
        None. Three lines are printed: the model's own ``score``, the RMSE and
        the relative error expressed as a percentage.
    """
    predicted = model.predict(trainFeats)
    # Each print takes a single pre-formatted string in parentheses, which
    # produces identical output under the Python 2 print statement and the
    # Python 3 print function.
    print("Score for %s is %s" % (
        name, model.score(trainFeats, trainY)))
    print("RMSE for %s is %s" % (
        name, rmse(predicted, trainY)))
    print("Average relative error for %s is %s percent." % (
        name, 100*relative_error(predicted, trainY)))

In [172]:
# Score the fitted baseline on its own training data.
testTrainModel("baseline", baselinePredictor)

Score for baseline is 0.0
RMSE for baseline is 1.50142049076
Average relative error for baseline is 25.7312431633 percent.


In [174]:
# Score the fitted linear regression on its own training data.
testTrainModel("linear regression", linearRegressor)

Score for linear regression is 0.257107970487
RMSE for linear regression is 1.29409210615
Average relative error for linear regression is 20.397681968 percent.


In [175]:
# Score the fitted ridge regression on its own training data.
testTrainModel("ridge regression", rideRegressor)

Score for ridge regression is 0.257107970462
RMSE for ridge regression is 1.29409210617
Average relative error for ridge regression is 20.3976788744 percent.


In [177]:
# Score the fitted Bayesian ridge regression on its own training data.
testTrainModel("bayesian regression", bayesianRegressor)

Score for bayesian regression is 0.257107941987
RMSE for bayesian regression is 1.29409213097
Average relative error for bayesian regression is 20.3975770983 percent.


In [178]:
# Score the fitted multi-layer perceptron on its own training data.
testTrainModel("neural network", neuralNetworkPredictor)

Score for neural network is 0.290030838364
RMSE for neural network is 1.26509191767
Average relative error for neural network is 18.7831282852 percent.


In [180]:
# Score the fitted random forest on its own training data.
# NOTE(review): in the recorded run the training score (~0.75) is far above the
# validation and test scores (~0.31 and ~0.34), which suggests the forest
# overfits the training data.
testTrainModel("random forest", randomForestPredictor)

Score for random forest is 0.750702893173
RMSE for random forest is 0.749654164334
Average relative error for random forest is 10.0184313324 percent.
