### Import necesssary libraries and CSVs

In [1]:
import tools.helpers as th
import tools.data_preparation as dp

import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PolynomialFeatures

X_train = pd.read_csv('data/bakeoff/Xtrain.csv')
y_train = pd.read_csv('data/bakeoff/ytrain.csv')
X_test = pd.read_csv('data/bakeoff/Xtest.csv')

X_train = dp.initial_clean(X_train, bakeoff=True)
X_test = dp.initial_clean(X_test, bakeoff=True)

In [2]:
# dp.add_distance(X_train)
X_train = X_train.drop(columns=[
    'date',
    'sqft_above',
    'grade',
    'sqft_living15', 
    'sqft_lot15', 
    'sqft_lot', 
    'bathrooms',
    'sqft_basement',
    'zipcode'
])
X_train.head()

Unnamed: 0,bedrooms,sqft_living,floors,waterfront,view,condition,yr_built,yr_renovated,lat,long,dist_from_center
0,3,1880,2.0,0.0,0.0,3,1993,0.0,47.5664,-121.999,0.242195
1,3,2020,1.0,0.0,0.0,3,1994,0.0,47.3545,-122.158,0.283037
2,5,4720,2.0,0.0,0.0,5,1975,0.0,47.4536,-122.009,0.283786
3,2,1430,1.0,0.0,0.0,4,1949,0.0,47.6844,-122.392,0.168553
4,3,2270,1.0,0.0,0.0,4,1980,0.0,47.3451,-122.094,0.314763


In [3]:
X_test = X_test.drop(columns=[
    'date',
    'sqft_above',
    'grade',
    'sqft_living15', 
    'sqft_lot15', 
    'sqft_lot', 
    'bathrooms',
    'sqft_basement',
    'zipcode'
])
X_test.head()

Unnamed: 0,bedrooms,sqft_living,floors,waterfront,view,condition,yr_built,yr_renovated,lat,long,dist_from_center
0,3,850,1.0,0.0,0.0,3,1945,0.0,47.503,-122.356,0.174585
1,3,1510,1.0,0.0,0.0,4,1940,0.0,47.6966,-122.324,0.114052
2,4,1790,1.0,0.0,0.0,3,1983,0.0,47.4819,-121.744,0.510555
3,2,1140,1.0,0.0,1.0,3,1988,0.0,47.5707,-122.359,0.137756
4,3,1500,1.0,0.0,0.0,3,1947,0.0,47.6718,-122.359,0.133237


In [4]:
# log transform continuous variables
to_log = ['sqft_living', 'lat', 'dist_from_center'] # 'dist_from_center'
th.log_transform(X_train, to_log)
th.log_transform(X_test, to_log)

In [5]:
X_train.head()

Unnamed: 0,bedrooms,sqft_living,floors,waterfront,view,condition,yr_built,yr_renovated,lat,long,dist_from_center
0,3,7.539027,2.0,0.0,0.0,3,1993,0.0,3.862127,-121.999,-1.418012
1,3,7.610853,1.0,0.0,0.0,3,1994,0.0,3.857662,-122.158,-1.262177
2,5,8.459564,2.0,0.0,0.0,5,1975,0.0,3.859752,-122.009,-1.259536
3,2,7.26543,1.0,0.0,0.0,4,1949,0.0,3.864604,-122.392,-1.780508
4,3,7.727535,1.0,0.0,0.0,4,1980,0.0,3.857463,-122.094,-1.155935


In [6]:
X_test.isna().sum()

bedrooms            0
sqft_living         0
floors              0
waterfront          0
view                0
condition           0
yr_built            0
yr_renovated        0
lat                 0
long                0
dist_from_center    0
dtype: int64

In [7]:
X_train.isna().sum()

bedrooms            0
sqft_living         0
floors              0
waterfront          0
view                0
condition           0
yr_built            0
yr_renovated        0
lat                 0
long                0
dist_from_center    0
dtype: int64

In [8]:
poly_2 = PolynomialFeatures(2) # 2 is the degree of the polynomial features

poly_2.fit_transform(X_train)

array([[1.00000000e+00, 3.00000000e+00, 7.53902706e+00, ...,
        1.48837560e+04, 1.72996074e+02, 2.01075868e+00],
       [1.00000000e+00, 3.00000000e+00, 7.61085279e+00, ...,
        1.49225770e+04, 1.54185066e+02, 1.59309178e+00],
       [1.00000000e+00, 5.00000000e+00, 8.45956408e+00, ...,
        1.48861961e+04, 1.53674678e+02, 1.58642991e+00],
       ...,
       [1.00000000e+00, 4.00000000e+00, 7.33302301e+00, ...,
        1.48615605e+04, 1.36586412e+02, 1.25530883e+00],
       [1.00000000e+00, 1.00000000e+00, 5.94017125e+00, ...,
        1.49629163e+04, 2.15636550e+02, 3.10762426e+00],
       [1.00000000e+00, 4.00000000e+00, 7.92117272e+00, ...,
        1.48754612e+04, 1.50608676e+02, 1.52485849e+00]])

In [9]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [10]:
predictions = lr.predict(X_test)

In [11]:
print('Train Score:', lr.score(X_train, y_train))

Train Score: 0.6468375588831625


In [12]:
y_test_fake = np.full((5400,1), 0)


# fake predictions using the mean of y_train.
your_y_hat_predictions = np.full((5400,1), np.mean(y_train))

r2_score(your_y_hat_predictions, y_test_fake)


0.0

In [13]:
np.savetxt('samantha_ian_sanjit.csv', predictions, delimiter=',')

In [14]:
# gradient boost regressor using the df_test
# small learning rate of 5% to minimize overfitting
# 400 boosting stages
# limit the number of nodes in the tree to 5
from sklearn import ensemble
clf = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate = 0.05, 
                                         n_estimators = 400, max_depth = 5, min_samples_split = 2,)

In [16]:
# fit a gradient boost model for 'X_train' and 'y_train'
# this may take awhile to load
clf.fit(X_train, y_train)

  return f(**kwargs)


GradientBoostingRegressor(learning_rate=0.05, max_depth=5, n_estimators=400)

In [17]:
# calculate r squared for train and test data, train r squared will likely be higher due to overfitting
print('Train Score:', clf.score(X_train, y_train))
print('Validation Score', clf.score(X_test, y_test))

Train Score: 0.9438008938436105


NameError: name 'y_test' is not defined

In [40]:
# predict the y_test values with the fitted gradient boost regression given the X_test values
predictions = clf.predict(X_test)

In [41]:
# we need to undue the log transform we performed on price to better interpret RMSE
y_test2 = np.expm1(y_test)

# remove log transform off predictions to better interpret RMSE
predictions2 = np.expm1(predictions)