# 3. Data Modeling

## Implement Baselines

To fit our baseline model we will use OLS (Ordinary Least Squares) regression. We will split our dataset into train and test sets (65% / 35%) and run 10 linear regression simulations, each on a fresh random split, to calculate the mean train and test $R^2$ scores.

In [17]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression as Lin_Reg
from sklearn.linear_model import Ridge as Ridge_Reg
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso as Lasso_Reg
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline

In [18]:
# Load the cleaned listings table and preview its first five rows
data = pd.read_csv(filepath_or_buffer='../datasets/listings_clean.csv')
data.head(n=5)

Unnamed: 0,id,host_id,accommodates,bedrooms,beds,guests_included,number_of_reviews,host_listing_count,10001,10002,...,40-49,50-59,60-69,70-79,80-84,85-89,90-95,95-100,No Reviews,price
0,1069266,5867023,-0.517323,-0.40596,-0.490869,0.500815,2.67242,-0.359693,0,0,...,0,0,0,0,0,1,0,0,0,160.0
1,2061725,4601412,-0.517323,-0.40596,0.387294,-0.459368,1.269925,0.927756,0,0,...,0,0,0,0,0,0,0,1,0,58.0
2,44974,198425,-0.517323,-0.40596,-0.490869,-0.459368,0.802427,-0.359693,0,0,...,0,0,0,0,0,0,0,1,0,185.0
3,4701675,22590025,-0.517323,-0.40596,0.387294,-0.459368,-0.496178,-0.359693,0,0,...,0,0,0,0,0,0,0,1,0,195.0
4,68914,343302,1.693096,1.271321,1.265456,0.500815,0.282985,0.069457,0,0,...,0,0,0,0,0,0,0,1,0,165.0


In [19]:
# Separate predictors from the response by column position: the last column
# (price) is the response, every column from the third up to it is a predictor.
# NOTE(review): the slice start of 2 assumes id and host_id are the first two
# columns of the CSV; if a saved index column precedes them, host_id would be
# kept as a predictor -- confirm against data.columns.
x, y = data.iloc[:, 2:-1], data.iloc[:, -1]

In [7]:
def linear_regression(x, y, test_size=0.35, random_state=None):
    """Fit OLS on one random train/test split and score it on both parts.

    Parameters
    ----------
    x : predictors, passed straight to ``train_test_split``.
    y : response, passed straight to ``train_test_split``.
    test_size : fraction of the rows held out for testing (default 0.35).
    random_state : optional seed forwarded to ``train_test_split`` so a split
        can be reproduced; ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    (train_score, test_score) : the values of ``LinearRegression.score`` on
        the training rows and on the held-out rows.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)
    regression = Lin_Reg(fit_intercept=True)
    regression.fit(x_train, y_train)
    train_score = regression.score(x_train, y_train)
    test_score = regression.score(x_test, y_test)
    return train_score, test_score

In [8]:
# Repeat the OLS fit on 10 independent random splits and average the scores
n_iterations = 10
results = [linear_regression(x, y) for i in range(n_iterations)]
training_scores = [result[0] for result in results]
testing_scores = [result[1] for result in results]

# Single-argument print(...) produces the same text on Python 2 and Python 3
print('Mean Train Score: {}'.format(np.mean(training_scores)))
print('Mean Test Score: {}'.format(np.mean(testing_scores)))

Mean Train Score: 0.315415815287
Mean Test Score: -3.41570000691e+18


We see that while we are achieving a low $R^2$ score on the train set, we are achieving an extremely negative $R^2$ score on the test set, i.e. on unseen data the unregularized fit does far worse than simply predicting the mean price. Let's try a RidgeCV regression instead.

In [12]:
# ridge regression: compute train and test score
def ridge_regression(x, y, test_size=0.35, random_state=None):
    """Fit RidgeCV on one random train/test split and score it on both parts.

    The penalty is picked by 5-fold cross-validation on the training rows
    only, from 10 log-spaced candidates between 1e-10 and 1e5.

    Parameters
    ----------
    x : predictors, passed straight to ``train_test_split``.
    y : response, passed straight to ``train_test_split``.
    test_size : fraction of the rows held out for testing (default 0.35).
    random_state : optional seed forwarded to ``train_test_split`` so a split
        can be reproduced; ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    (train_score, test_score) : the values of ``RidgeCV.score`` on the
        training rows and on the held-out rows.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)
    reg_params = 10.**np.linspace(-10, 5, 10)
    ridge = RidgeCV(alphas=reg_params, fit_intercept=True, cv=5)
    ridge.fit(x_train, y_train)
    train_score = ridge.score(x_train, y_train)
    test_score = ridge.score(x_test, y_test)
    return train_score, test_score

In [11]:
# Repeat the ridge fit on 10 independent random splits and average the scores
n_iterations = 10
results = [ridge_regression(x, y) for i in range(n_iterations)]
training_scores = [result[0] for result in results]
testing_scores = [result[1] for result in results]

# Single-argument print(...) produces the same text on Python 2 and Python 3
print('Mean Train Score: {}'.format(np.mean(training_scores)))
print('Mean Test Score: {}'.format(np.mean(testing_scores)))

Mean Train Score: 0.298820752829
Mean Test Score: 0.292137226259


In [10]:
# lasso regression: compute train and test score
def lasso_regression(x, y, test_size=0.35, random_state=None):
    """Fit LassoCV on one random train/test split and score it on both parts.

    The penalty is picked by 5-fold cross-validation on the training rows
    only, from 10 log-spaced candidates between 1e-10 and 1e5.

    Parameters
    ----------
    x : predictors, passed straight to ``train_test_split``.
    y : response, passed straight to ``train_test_split``.
    test_size : fraction of the rows held out for testing (default 0.35).
    random_state : optional seed forwarded to ``train_test_split`` so a split
        can be reproduced; ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    (train_score, test_score) : the values of ``LassoCV.score`` on the
        training rows and on the held-out rows.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)
    reg_params = 10.**np.linspace(-10, 5, 10)
    lasso = LassoCV(alphas=reg_params, fit_intercept=True, cv=5)
    lasso.fit(x_train, y_train)
    train_score = lasso.score(x_train, y_train)
    test_score = lasso.score(x_test, y_test)
    return train_score, test_score

In [11]:
# Repeat the lasso fit on 10 independent random splits and average the scores
n_iterations = 10
results = [lasso_regression(x, y) for i in range(n_iterations)]
training_scores = [result[0] for result in results]
testing_scores = [result[1] for result in results]

# Single-argument print(...) produces the same text on Python 2 and Python 3
print('Mean Train Score: {}'.format(np.mean(training_scores)))
print('Mean Test Score: {}'.format(np.mean(testing_scores)))



Mean Train Score: 0.298946032747
Mean Test Score: 0.291913140191


We see that using Ridge regression as well as Lasso regression has dramatically increased our test $R^2$. Both regressions perform very similarly. We also see that for both, the train $R^2$ stays at roughly the same level (about 0.30). Let's try a RidgeCV polynomial regression next.

In [8]:
def BIC(n, d, RSS):
    """Return n * log(RSS / n) + d * log(n).

    n is the number of observations, d the parameter count used for the
    complexity penalty and RSS the residual sum of squares of the fit.
    """
    goodness_of_fit = n * np.log(RSS * 1. / n)
    complexity_penalty = d * np.log(n)
    return goodness_of_fit + complexity_penalty


def RSS(predict, actual):
    """Return the sum of squared differences between predict and actual."""
    residuals = predict - actual
    return np.sum(residuals**2)

#Function for finding the best polynomial model using Ridge polynomial regression
def best_poly_model(pred, response, max_deg, reg_params, min_deg=2):
    """Pick the polynomial degree whose RidgeCV fit has the lowest BIC.

    For every degree in ``range(min_deg, max_deg)`` the predictors are
    expanded with ``PolynomialFeatures``, a ``RidgeCV`` model chooses its
    regularization strength from ``reg_params`` by 5-fold cross-validation,
    and the in-sample BIC of that fit is recorded.

    Parameters
    ----------
    pred : predictor matrix; must expose ``.shape``.
    response : response values aligned with the rows of ``pred``.
    max_deg : exclusive upper bound of the degrees that are tried.
    reg_params : candidate regularization strengths handed to ``RidgeCV``.
    min_deg : first degree that is tried (default 2, the original start).

    Returns
    -------
    (best_degree, best_lambda, best_params) : the degree with the minimum
        BIC, the ``alpha_`` chosen for it and its ``coef_``.

    Raises
    ------
    ValueError : if ``range(min_deg, max_deg)`` contains no degree.
    """
    degrees = list(range(min_deg, max_deg))
    if not degrees:
        raise ValueError(
            'no degree to evaluate: range({}, {}) is empty'.format(min_deg, max_deg))

    n_obs = pred.shape[0]
    #Best regularization parameter for each degree
    lambdas = []
    #BIC for each degree
    bics = []
    #Model parameters for each degree
    params = []

    for degree in degrees:
        #Expand the predictors into all polynomial terms up to this degree
        poly_t = PolynomialFeatures(degree=degree)
        pred_expanded = poly_t.fit_transform(pred)

        #Perform Ridge regression using expanded set of predictors,
        #choose best regularization parameter lambda using 5-fold x-validation
        ridge = RidgeCV(alphas=reg_params, fit_intercept=True, cv=5)
        ridge.fit(pred_expanded, response)

        #Record the parameters and the lambda chosen by 5-fold x-validation
        params.append(ridge.coef_)
        lambdas.append(ridge.alpha_)

        #Record the BIC score of the model chosen by 5-fold x-validation
        response_hat = ridge.predict(pred_expanded)
        error = RSS(response_hat, response)
        # NOTE(review): the penalty term uses the degree as the parameter
        # count; with several predictors the expanded matrix has many more
        # columns than that -- confirm this is the intended penalty.
        bics.append(BIC(n_obs, degree, error))

    # Map the winning position back to its actual degree. The previous
    # "argmin + 1" was off by one because the degrees do not start at 1.
    best_index = int(np.argmin(bics))
    best_degree = degrees[best_index]
    best_lambda = lambdas[best_index]
    best_params = params[best_index]
    return best_degree, best_lambda, best_params

## --------Warning---------

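This code takes forever to run. We first suspected the number of observations, but the more likely cause is the number of predictors: a degree-$d$ polynomial expansion of $p$ predictors produces $\binom{p+d}{d}$ columns, so the expanded design matrix becomes very large even for $d = 2$.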

In [None]:
reg_params = 10.**np.linspace(-10, 5, 10)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.35)
# best_poly_model returns three values (degree, lambda, coefficients);
# unpacking only two raised "too many values to unpack".
best_degree, best_lambda, best_params = best_poly_model(x_train, y_train, 3, reg_params)



In [None]:
# Single-argument print(...) produces the same text on Python 2 and Python 3
print('{} {}'.format(best_degree, best_lambda))

In [None]:
# Refit a ridge model at the selected degree and lambda, then score it
poly_t = PolynomialFeatures(degree=best_degree)
pred_expanded = poly_t.fit_transform(x_train)
# Reuse the transformer fitted on the training rows; the test rows must only
# be transformed, never used to fit a preprocessing step.
pred_test_expanded = poly_t.transform(x_test)
ridge = RidgeCV(alphas=[best_lambda], fit_intercept=True, cv=5)
ridge.fit(pred_expanded, y_train)
train_score = ridge.score(pred_expanded, y_train)
test_score = ridge.score(pred_test_expanded, y_test)

In [None]:
# print(...) with one argument behaves the same on Python 2 and Python 3
print(train_score)
print(test_score)

In [29]:
# random forest regressor
def random_forest_model(x_train, y_train, x_test, y_test, random_state=None):
    """Fit a RandomForestRegressor on the train split and score both splits.

    NOTE: no hyper-parameter tuning is performed here; apart from the
    optional seed the forest is built with the estimator's default settings.

    Parameters
    ----------
    x_train, y_train : predictors and response used to fit the forest.
    x_test, y_test : held-out predictors and response used for scoring.
    random_state : optional seed forwarded to ``RandomForestRegressor`` so a
        run can be reproduced; ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    (score_train, score) : the values of ``RandomForestRegressor.score`` on
        the training rows and on the held-out rows.
    """
    rf = RandomForestRegressor(random_state=random_state)
    rf.fit(x_train, y_train)
    score_train = rf.score(x_train, y_train)
    score = rf.score(x_test, y_test)
    return score_train, score

In [30]:
# Draw one random 65/35 split and score the random forest on it;
# rf_values holds (train score, test score).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.35)
rf_values = random_forest_model(x_train=x_train, y_train=y_train,
                                x_test=x_test, y_test=y_test)

In [34]:
# Single-argument print(...) produces the same text on Python 2 and Python 3
print('Train Score: {}'.format(rf_values[0]))
print('Test Score: {}'.format(rf_values[1]))

Train Score: 0.770322616846
Test Score: 0.213976690139


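We see a significantly higher train score for the Random Forest Regressor but a lower test score than for Ridge and Lasso; the large gap between the two (0.77 vs. 0.21) indicates that the untuned forest is overfitting the training data. This is still promising: with further tuning, we can expect the test score to increase.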