## Modeling

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

In [42]:
df = pd.read_csv('../datasets/df.csv')
clean_test_df = pd.read_csv('../datasets/clean_test_df.csv')

In [43]:
clean_test_df.head()

Unnamed: 0,Id,Overall Qual,Gr Liv Area,Garage Area,Garage Cars,Total Bsmt SF,1st Flr SF,Year Built,Year Remod/Add,Full Bath,...,Misc Feature_Shed,Misc Feature_none,Roof Style_Gable,Roof Style_Gambrel,Roof Style_Hip,Roof Style_Mansard,Roof Style_Shed,Neighborhood_Landmrk,Neighborhood_GrnHill,Misc Feature_TenC
0,2658,6,1928,440,1,1020,908,1910,1950,2,...,0,1,1,0,0,0,0,0,0,0
1,2718,5,1967,580,2,1967,1967,1977,1977,2,...,0,1,1,0,0,0,0,0,0,0
2,2414,7,1496,426,2,654,664,2006,2006,2,...,0,1,1,0,0,0,0,0,0,0
3,1989,5,968,480,2,968,968,1923,2006,1,...,0,1,1,0,0,0,0,0,0,0
4,625,6,1394,514,2,1394,1394,1963,1963,1,...,0,1,1,0,0,0,0,0,0,0


In [44]:
df.set_index('Id')
clean_test_df.set_index('Id')

Unnamed: 0_level_0,Overall Qual,Gr Liv Area,Garage Area,Garage Cars,Total Bsmt SF,1st Flr SF,Year Built,Year Remod/Add,Full Bath,Mas Vnr Area,...,Misc Feature_Shed,Misc Feature_none,Roof Style_Gable,Roof Style_Gambrel,Roof Style_Hip,Roof Style_Mansard,Roof Style_Shed,Neighborhood_Landmrk,Neighborhood_GrnHill,Misc Feature_TenC
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,6,1928,440,1,1020,908,1910,1950,2,0.0,...,0,1,1,0,0,0,0,0,0,0
2718,5,1967,580,2,1967,1967,1977,1977,2,0.0,...,0,1,1,0,0,0,0,0,0,0
2414,7,1496,426,2,654,664,2006,2006,2,0.0,...,0,1,1,0,0,0,0,0,0,0
1989,5,968,480,2,968,968,1923,2006,1,0.0,...,0,1,1,0,0,0,0,0,0,0
625,6,1394,514,2,1394,1394,1963,1963,1,247.0,...,0,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1662,6,1877,488,2,1084,1084,1974,1974,2,0.0,...,0,1,1,0,0,0,0,0,0,0
1234,6,1988,480,2,1104,1104,1966,1999,2,410.0,...,0,1,1,0,0,0,0,0,0,0
1373,5,1211,322,1,952,1211,1968,1968,1,0.0,...,0,1,0,0,1,0,0,0,0,0
1672,4,864,528,2,864,864,1971,1971,1,0.0,...,0,1,1,0,0,0,0,0,0,0


In [45]:
X = df.drop(axis = 1, columns = ['SalePrice'])
y = df['SalePrice']

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .75)

In [69]:
cross_val_score(lr, X, y, cv = 5)

array([0.85929971, 0.89256375, 0.87311969, 0.87895563, 0.85972494])

Splitting the data shows the output is close enough.

In [47]:
#From Noah's Workflow lesson

#establish baseline model for comparison, this is a naive model
y_baseline = [y.mean()]*len(y)
baseline_rmse = mean_squared_error(y, y_baseline, squared = False)
baseline_rmse

79079.73066263588

In [48]:
lr= LinearRegression()

lr.fit(X_train, y_train)

LinearRegression()

In [49]:
pd.DataFrame({
    'column' : X.columns,
    'coef' : lr.coef_
})

Unnamed: 0,column,coef
0,Id,8.622880
1,Overall Qual,13651.091589
2,Gr Liv Area,56.847170
3,Garage Area,38.276294
4,Garage Cars,-96.973706
...,...,...
68,Roof Style_Gable,10114.202107
69,Roof Style_Gambrel,1096.315756
70,Roof Style_Hip,19900.258088
71,Roof Style_Mansard,-18990.500810


In [50]:
X_kaggle = clean_test_df.copy()
X_kaggle.shape

(878, 73)

In [51]:
X_kaggle['SalePrice'] = lr.predict(X_kaggle)

In [52]:
output = X_kaggle[["SalePrice"]]
output.head()

Unnamed: 0,SalePrice
0,104635.00095
1,160777.220627
2,168235.04502
3,100235.968011
4,175875.283389


In [54]:
output.to_csv('../datasets/JS_first_submission.csv')

Linear Regression wasnt very good because a lot of variables are not independent, so I'll try ridge regression

In [55]:
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

In [56]:
## using lesson 4.02 for this
ridge = Ridge(alpha = 9.7)
ridge.fit(Z_train, y_train)
# Evaluate model using R2.
print(ridge.score(Z_train, y_train))
print(ridge.score(Z_test, y_test))

pred_ridge = ridge.predict(Z_test)
(mean_squared_error(y_test, pred_ridge))**.5

0.882540004081606
0.8764807691074691


27484.153644358863

In [57]:
r_alphas =  np.logspace(0,3, 150)
ridge_cv = RidgeCV(alphas = r_alphas, scoring = 'r2', cv = 5)
ridge_cv.fit(Z_train, y_train)
ridge_cv.alpha_

103.13897683787211

In [58]:
r_kaggle = clean_test_df.copy()
r_kaggle.head()

Unnamed: 0,Id,Overall Qual,Gr Liv Area,Garage Area,Garage Cars,Total Bsmt SF,1st Flr SF,Year Built,Year Remod/Add,Full Bath,...,Misc Feature_Shed,Misc Feature_none,Roof Style_Gable,Roof Style_Gambrel,Roof Style_Hip,Roof Style_Mansard,Roof Style_Shed,Neighborhood_Landmrk,Neighborhood_GrnHill,Misc Feature_TenC
0,2658,6,1928,440,1,1020,908,1910,1950,2,...,0,1,1,0,0,0,0,0,0,0
1,2718,5,1967,580,2,1967,1967,1977,1977,2,...,0,1,1,0,0,0,0,0,0,0
2,2414,7,1496,426,2,654,664,2006,2006,2,...,0,1,1,0,0,0,0,0,0,0
3,1989,5,968,480,2,968,968,1923,2006,1,...,0,1,1,0,0,0,0,0,0,0
4,625,6,1394,514,2,1394,1394,1963,1963,1,...,0,1,1,0,0,0,0,0,0,0


In [59]:
ksc = StandardScaler()
kaggle_sc = ksc.fit_transform(r_kaggle)

In [60]:
r_kaggle['SalePrice'] = ridge.predict(kaggle_sc)

In [62]:
pd.DataFrame({
    'column' : X.columns,
    'coef' : ridge.coef_
})

Unnamed: 0,column,coef
0,Id,2276.003257
1,Overall Qual,19028.890122
2,Gr Liv Area,24822.070592
3,Garage Area,8019.795596
4,Garage Cars,75.899537
...,...,...
68,Roof Style_Gable,2249.874351
69,Roof Style_Gambrel,-298.954715
70,Roof Style_Hip,6115.758031
71,Roof Style_Mansard,-1468.203847


In [61]:
output = r_kaggle[["SalePrice"]]
output.head()

Unnamed: 0,SalePrice
0,140445.949671
1,190843.827063
2,192942.902348
3,129968.883817
4,193376.066513


In [64]:
output.to_csv('../datasets/JS_ridge5_submission.csv')