### Import necessary libraries and CSVs

In [54]:
import tools.helpers as th
import tools.data_preparation as dp
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PolynomialFeatures

# Load the training features, training target, and test features.
# The paths are relative, so they resolve against the kernel's working directory.
X_train = pd.read_csv('data/bakeoff/Xtrain.csv')
y_train = pd.read_csv('data/bakeoff/ytrain.csv')
X_test = pd.read_csv('data/bakeoff/Xtest.csv')

In [55]:
# (rows, columns) of the training features.
print(X_train.shape)

(16197, 19)


In [56]:
# (rows, columns) of the training target; the row count should match X_train.
print(y_train.shape)

(16197, 1)


In [57]:
# (rows, columns) of the test features; the column count should match X_train.
print(X_test.shape)

(5400, 19)


In [58]:
# Preview the first rows of the raw test features.
X_test.head()

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,2/20/2015,3,0.75,850,8573,1.0,0.0,0.0,3,6,600,250.0,1945,0.0,98146,47.503,-122.356,850,8382
1,10/8/2014,3,1.0,1510,6083,1.0,0.0,0.0,4,6,860,650.0,1940,0.0,98115,47.6966,-122.324,1510,5712
2,3/25/2015,4,2.25,1790,42000,1.0,0.0,0.0,3,7,1170,620.0,1983,0.0,98045,47.4819,-121.744,2060,50094
3,2/17/2015,2,1.5,1140,2500,1.0,0.0,1.0,3,7,630,510.0,1988,,98106,47.5707,-122.359,1500,5000
4,5/23/2014,3,1.0,1500,3920,1.0,0.0,0.0,3,7,1000,500.0,1947,0.0,98107,47.6718,-122.359,1640,4017


In [59]:
# Preview the first rows of the training target.
y_train.head()

Unnamed: 0,price
0,529000.0
1,253000.0
2,745000.0
3,545000.0
4,390000.0


In [60]:
# Disabled step, kept for reference:
# dp.add_distance(X_train)

# Columns removed from the training features before modelling.
# The identical list must be removed from X_test so both frames keep the same columns.
train_columns_to_drop = [
    'date',
    'sqft_above',
    'grade',
    'sqft_living15',
    'sqft_lot15',
    'sqft_lot',
    'bathrooms',
    'sqft_basement',
    'zipcode',
]
X_train = X_train.drop(columns=train_columns_to_drop)
X_train.head()

Unnamed: 0,bedrooms,sqft_living,floors,waterfront,view,condition,yr_built,yr_renovated,lat,long
0,3,1880,2.0,0.0,0.0,3,1993,0.0,47.5664,-121.999
1,3,2020,1.0,0.0,0.0,3,1994,0.0,47.3545,-122.158
2,5,4720,2.0,0.0,0.0,5,1975,0.0,47.4536,-122.009
3,2,1430,1.0,0.0,0.0,4,1949,0.0,47.6844,-122.392
4,3,2270,1.0,0.0,0.0,4,1980,0.0,47.3451,-122.094


In [61]:
# Columns removed from the test features; this mirrors the list dropped from X_train
# so that both frames end up with the same columns.
test_columns_to_drop = [
    'date',
    'sqft_above',
    'grade',
    'sqft_living15',
    'sqft_lot15',
    'sqft_lot',
    'bathrooms',
    'sqft_basement',
    'zipcode',
]
X_test = X_test.drop(columns=test_columns_to_drop)
X_test.head()

Unnamed: 0,bedrooms,sqft_living,floors,waterfront,view,condition,yr_built,yr_renovated,lat,long
0,3,850,1.0,0.0,0.0,3,1945,0.0,47.503,-122.356
1,3,1510,1.0,0.0,0.0,4,1940,0.0,47.6966,-122.324
2,4,1790,1.0,0.0,0.0,3,1983,0.0,47.4819,-121.744
3,2,1140,1.0,0.0,1.0,3,1988,,47.5707,-122.359
4,3,1500,1.0,0.0,0.0,3,1947,0.0,47.6718,-122.359


In [62]:
# Log-transform the selected continuous columns in both feature sets.
# 'dist_from_center' is not included in the list at the moment.
to_log = ['sqft_living', 'lat']

# The helper's return value is not used, so it is relied on to modify each frame in place.
for features in (X_train, X_test):
    th.log_transform(features, to_log)

In [63]:
# Preview the training features after the log-transform step.
X_train.head()

Unnamed: 0,bedrooms,sqft_living,floors,waterfront,view,condition,yr_built,yr_renovated,lat,long
0,3,7.539027,2.0,0.0,0.0,3,1993,0.0,3.862127,-121.999
1,3,7.610853,1.0,0.0,0.0,3,1994,0.0,3.857662,-122.158
2,5,8.459564,2.0,0.0,0.0,5,1975,0.0,3.859752,-122.009
3,2,7.26543,1.0,0.0,0.0,4,1949,0.0,3.864604,-122.392
4,3,7.727535,1.0,0.0,0.0,4,1980,0.0,3.857463,-122.094


In [64]:
# Number of missing values per column in the test features.
X_test.isna().sum()

bedrooms          0
sqft_living       0
floors            0
waterfront      620
view             14
condition         0
yr_built          0
yr_renovated    963
lat               0
long              0
dtype: int64

In [65]:
# Number of missing values per column in the training features.
X_train.isna().sum()

bedrooms           0
sqft_living        0
floors             0
waterfront      1756
view              49
condition          0
yr_built           0
yr_renovated    2879
lat                0
long               0
dtype: int64

In [66]:
# Fill NaNs with the mode for each column that has missing values.
# th.mode_fill is called once per (frame, column) pair: test frame first, then train.
columns_with_nans = ('waterfront', 'view', 'yr_renovated')
for features in (X_test, X_train):
    for column_name in columns_with_nans:
        th.mode_fill(features, column=column_name)

In [67]:
# Re-check the test features: every count should now be zero after the mode fill.
X_test.isna().sum()

bedrooms        0
sqft_living     0
floors          0
waterfront      0
view            0
condition       0
yr_built        0
yr_renovated    0
lat             0
long            0
dtype: int64

In [68]:
# Expand the features with degree-2 polynomial terms (bias, originals, squares, pairwise products).
poly_2 = PolynomialFeatures(2) # 2 is the degree of the polynomial features

# Keep the expanded matrices instead of only displaying them: the transformer is
# fitted on the training features and then reused on the test features so both
# matrices have the same columns in the same order.
X_train_poly = poly_2.fit_transform(X_train)
X_test_poly = poly_2.transform(X_test)

# NOTE(review): the LinearRegression cell that follows is fit on X_train, not on
# X_train_poly, so these polynomial terms are not yet used by the model.
X_train_poly

array([[ 1.00000000e+00,  3.00000000e+00,  7.53902706e+00, ...,
         1.49160221e+01, -4.71175587e+02,  1.48837560e+04],
       [ 1.00000000e+00,  3.00000000e+00,  7.61085279e+00, ...,
         1.48815550e+01, -4.71244257e+02,  1.49225770e+04],
       [ 1.00000000e+00,  5.00000000e+00,  8.45956408e+00, ...,
         1.48976885e+01, -4.70924530e+02,  1.48861961e+04],
       ...,
       [ 1.00000000e+00,  4.00000000e+00,  7.33302301e+00, ...,
         1.49294592e+01, -4.71036157e+02,  1.48615605e+04],
       [ 1.00000000e+00,  1.00000000e+00,  5.94017125e+00, ...,
         1.49021449e+01, -4.72207101e+02,  1.49629163e+04],
       [ 1.00000000e+00,  4.00000000e+00,  7.92117272e+00, ...,
         1.49440776e+01, -4.71487059e+02,  1.48754612e+04]])

In [69]:
# Fit an ordinary least-squares model on the prepared training features.
# NOTE(review): this uses X_train directly; the PolynomialFeatures expansion
# computed in the previous cell is not passed to the model.
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [70]:
# R^2 of the fitted model, evaluated on the same data it was trained on.
train_r2 = lr.score(X_train, y_train)
print('Train Score:', train_r2)
# Validation scoring stays disabled: no y_test is defined in this notebook.
#print('Validation Score:', lr.score(X_test, y_test))

Train Score: 0.5609666494764214


In [None]:
# Size the placeholder arrays from the test features instead of hard-coding 5400,
# so this cell keeps working if the test set changes.
n_test = len(X_test)

# Placeholder ground truth (all zeros); no real test labels are loaded in this notebook.
y_test_fake = np.full((n_test, 1), 0)


# fake predictions using the mean of y_train.
# Taking the mean of the 'price' column gives a plain scalar on every pandas
# version (np.mean on a DataFrame returns a Series or a scalar depending on version).
your_y_hat_predictions = np.full((n_test, 1), y_train['price'].mean())

# r2_score's signature is (y_true, y_pred): ground truth first, predictions second.
r2_score(y_test_fake, your_y_hat_predictions)


In [None]:
# Write the predictions to a comma-delimited text file, one row per test observation.
# NOTE(review): your_y_hat_predictions is the constant mean-of-y_train baseline built in
# the previous cell, not output from the fitted `lr` model, and the filename is still the
# template placeholder — confirm both before submitting.
np.savetxt('your_team_member_names.csv', your_y_hat_predictions, delimiter=',')