In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import copy

sns.set_style('darkgrid')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [2]:
train = pd.read_csv('./train (1).csv')

In [3]:
test = pd.read_csv('./test.csv')

In [4]:
train_obj = train.select_dtypes(exclude=[np.number])
train_num = train.select_dtypes(include=[np.number])
test_obj = test.select_dtypes(exclude=[np.number])
test_num = test.select_dtypes(include=[np.number])

In [5]:
# fill NaNs in numeric columns with mean of column (Train)
train.fillna(train_num.mean(), inplace=True)

In [6]:
# fill NaNs in numeric columns with mean of column (Test)
test.fillna(test_num.mean(), inplace=True)

In [7]:
# Get dummies on categorical columns (Train)
train = pd.concat([train, pd.get_dummies(train_obj)], axis=1)

In [8]:
# Get dummies on categorical columns (Test)
test = pd.concat([test, pd.get_dummies(test_obj)], axis=1)

In [9]:
#Confirm dummies were added
print(train.shape)
print(test.shape)

(2051, 334)
(879, 315)


In [10]:
# Identify Object Columns in Train

train_obj.columns

Index(['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
       'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',
       'Central Air', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature',
       'Sale Type'],
      dtype='object')

In [11]:
# Drop those "object" columns in Train
train.drop(['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
       'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',
       'Central Air', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature',
       'Sale Type'], axis=1, inplace=True)

In [12]:
# Identify Object Columns in Test

test_obj.columns

Index(['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
       'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',
       'Central Air', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature',
       'Sale Type'],
      dtype='object')

In [13]:
# Drop those "object" columns in Test
test.drop(['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
       'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',
       'Central Air', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature',
       'Sale Type'], axis=1, inplace=True)

In [14]:
# Recheck shape of dataset to confirm object columns are dropped
print(train.shape)
print(test.shape)

(2051, 292)
(879, 273)


In [15]:
# Recheck that there are no more null values
train.isnull().sum().sum()

0

In [16]:
extra_train_cols = set(train.columns) - set(test.columns)
extra_train_cols = list(extra_train_cols)
extra_train_cols

['Heating_Wall',
 'Roof Matl_ClyTile',
 'Neighborhood_GrnHill',
 'Condition 2_Artery',
 'MS Zoning_A (agr)',
 'Condition 2_RRAn',
 'Neighborhood_Landmrk',
 'Bsmt Cond_Po',
 'Garage Qual_Ex',
 'Electrical_Mix',
 'SalePrice',
 'Roof Matl_Membran',
 'Misc Feature_Elev',
 'Condition 2_RRNn',
 'Misc Feature_TenC',
 'Bsmt Cond_Ex',
 'Functional_Sev',
 'Heating QC_Po',
 'Heating_OthW',
 'Pool QC_Gd',
 'Utilities_NoSeWa',
 'Exterior 1st_CBlock',
 'Pool QC_Fa',
 'Functional_Sal',
 'Condition 2_RRAe',
 'Exterior 2nd_Stone',
 'Exterior 1st_ImStucc',
 'Exterior 1st_Stone']

In [17]:
# Drop those Extra Columns in Train

train.drop(['Neighborhood_GrnHill',
 'Roof Matl_Membran',
 'Neighborhood_Landmrk',
 'Misc Feature_TenC',
 'Exterior 1st_Stone',
 'Roof Matl_ClyTile',
 'Bsmt Cond_Ex',
 'MS Zoning_A (agr)',
 'Pool QC_Gd',
 'Bsmt Cond_Po',
 'Condition 2_Artery',
 'Condition 2_RRNn',
 'Exterior 2nd_Stone',
 'Utilities_NoSeWa',
 'Exterior 1st_CBlock',
 'Exterior 1st_ImStucc',
 'Functional_Sal',
 'Heating_Wall',
 'Misc Feature_Elev',
 'Condition 2_RRAn',
 'Functional_Sev',
 'Heating_OthW',
 'Heating QC_Po',
 'Electrical_Mix',
 'Garage Qual_Ex',
 'Pool QC_Fa',
 'Condition 2_RRAe'], axis=1, inplace=True)

In [18]:
extra_test_cols = set(test.columns) - set(train.columns)
extra_test_cols = list(extra_test_cols)
extra_test_cols

['Heating_Floor',
 'Exterior 2nd_PreCast',
 'Roof Matl_Metal',
 'Mas Vnr Type_CBlock',
 'Sale Type_VWD',
 'Kitchen Qual_Po',
 'Exterior 1st_PreCast',
 'Exterior 2nd_Other',
 'Roof Matl_Roll']

In [19]:
# Drop those Extra columns in Test

test.drop(['Sale Type_VWD',
 'Roof Matl_Metal',
 'Exterior 2nd_PreCast',
 'Mas Vnr Type_CBlock',
 'Exterior 1st_PreCast',
 'Exterior 2nd_Other',
 'Kitchen Qual_Po',
 'Heating_Floor',
 'Roof Matl_Roll'], axis=1, inplace=True)

In [20]:
# Check Train Shape

train.shape

(2051, 265)

In [21]:
# Check Test Shape

test.shape

(879, 264)

In [22]:
# Fill New column SalePrice in Test with 0s

test['SalePrice'] = 0

In [23]:
# Recheck Test Shape

test.shape

(879, 265)

In [24]:
# Feature Selection with 10 Features

from sklearn.feature_selection import  SelectKBest, f_regression, f_classif
selector = SelectKBest(score_func=f_regression, k=10)

target = 'SalePrice'
not_target = [x for x in train.columns if x != target]

predictors = not_target

selector.fit(train[predictors], train[target])

best_features = selector.get_support(indices=True)

features = list(train[predictors].columns[selector.get_support(indices = True)])
features

['Overall Qual',
 'Year Built',
 'Total Bsmt SF',
 '1st Flr SF',
 'Gr Liv Area',
 'Garage Cars',
 'Garage Area',
 'Exter Qual_TA',
 'Bsmt Qual_Ex',
 'Kitchen Qual_Ex']

In [25]:
# Train-Train-Split on Train Data Set

from sklearn.model_selection import train_test_split, KFold, cross_val_score

target = ['SalePrice']
not_target = ['Overall Qual',
 'Year Built',
 'Total Bsmt SF',
 '1st Flr SF',
 'Gr Liv Area',
 'Garage Cars',
 'Garage Area',
 'Exter Qual_TA',
 'Bsmt Qual_Ex',
 'Kitchen Qual_Ex']

X = train[not_target].values
Y = train[target].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

In [26]:
# Standardize Data Set

ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [27]:
# Fit Linear Regression Model

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train_scaled, Y_train)
print(f"LinReg Training Score: {lr.score(X_test_scaled, Y_test)}")

LinReg Training Score: 0.8685090741271412


In [28]:
# Now Use the Model on the Test Data Set

target = ['SalePrice']
not_target = ['Overall Qual',
 'Year Built',
 'Total Bsmt SF',
 '1st Flr SF',
 'Gr Liv Area',
 'Garage Cars',
 'Garage Area',
 'Exter Qual_TA',
 'Bsmt Qual_Ex',
 'Kitchen Qual_Ex']

X_testvalues = test[not_target].values
Y_testvalues = test[target].values


ss = StandardScaler()
X_testvalues_scaled = ss.fit_transform(X_testvalues)
X_testvalues_scaled = ss.transform(X_testvalues)

yhat_withKbest = lr.predict(X_testvalues_scaled)

In [29]:
# # Feature Selection with 1 Features

# from sklearn.feature_selection import  SelectKBest, f_regression, f_classif
# selector = SelectKBest(score_func=f_regression, k=1)

# target = 'SalePrice'
# not_target = [x for x in train.columns if x != target]

# predictors = not_target

# selector.fit(train[predictors], train[target])

# best_features = selector.get_support(indices=True)

# features = list(train[predictors].columns[selector.get_support(indices = True)])
# features

In [30]:
# # Fit Linear Model on 1 Feature

# target = ['SalePrice']
# not_target = ['Overall Qual']

# X_testvalues = test[not_target].values
# Y_testvalues = test[target].values


# X = train[not_target].values
# Y = train[target].values

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

# ss = StandardScaler()
# X_train_scaled = ss.fit_transform(X_train)
# X_test_scaled = ss.transform(X_test)

# from sklearn.linear_model import LinearRegression
# lr = LinearRegression()
# lr.fit(X_train_scaled, Y_train)
# print(f"LinReg Training Score: {lr.score(X_test_scaled, Y_test)}")

In [31]:
# # Feature Selection with 25 Features

# from sklearn.feature_selection import  SelectKBest, f_regression, f_classif
# selector = SelectKBest(score_func=f_regression, k=25)

# target = 'SalePrice'
# not_target = [x for x in train.columns if x != target]

# predictors = not_target

# selector.fit(train[predictors], train[target])

# best_features = selector.get_support(indices=True)

# features = list(train[predictors].columns[selector.get_support(indices = True)])
# features

In [32]:
# # Fit Linear Model on 25 Features

# target = ['SalePrice']
# not_target = ['Overall Qual',
#  'Year Built',
#  'Year Remod/Add',
#  'Mas Vnr Area',
#  'Total Bsmt SF',
#  '1st Flr SF',
#  'Gr Liv Area',
#  'Full Bath',
#  'TotRms AbvGrd',
#  'Fireplaces',
#  'Garage Yr Blt',
#  'Garage Cars',
#  'Garage Area',
#  'Neighborhood_NridgHt',
#  'Exter Qual_Ex',
#  'Exter Qual_Gd',
#  'Exter Qual_TA',
#  'Foundation_PConc',
#  'Bsmt Qual_Ex',
#  'Bsmt Qual_TA',
#  'BsmtFin Type 1_GLQ',
#  'Heating QC_Ex',
#  'Kitchen Qual_Ex',
#  'Kitchen Qual_TA',
#  'Garage Finish_Unf']

# X_testvalues = test[not_target].values
# Y_testvalues = test[target].values


# X = train[not_target].values
# Y = train[target].values

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

# ss = StandardScaler()
# X_train_scaled = ss.fit_transform(X_train)
# X_test_scaled = ss.transform(X_test)

# from sklearn.linear_model import LinearRegression
# lr = LinearRegression()
# lr.fit(X_train_scaled, Y_train)
# print(f"LinReg Training Score: {lr.score(X_test_scaled, Y_test)}")

In [33]:
# Submission for LinReg Model

# submission_yhat_withKbest = pd.DataFrame(data = yhat_withKbest, columns = ['SalePrice'], index=test['Id'])
# submission_yhat_withKbest.to_csv('./submission_yhat_withKbest.csv')

In [34]:
# Ridge Model

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV

ridge_alphas = np.logspace(0, 5, 200)

optimal_ridge = RidgeCV(alphas=ridge_alphas, cv=10)
optimal_ridge.fit(X_train_scaled, Y_train)

print(optimal_ridge.alpha_)

ridge = Ridge(alpha=optimal_ridge.alpha_)

ridge_scores = cross_val_score(ridge, X_train_scaled, Y_train, cv=10)

print(ridge_scores)
print(np.mean(ridge_scores))
yhat_optimal_ridge = optimal_ridge.predict(X_testvalues_scaled)

162.5755666443795
[0.86618331 0.82366984 0.81436082 0.81814988 0.83569357 0.83224428
 0.8726353  0.73754781 0.54307092 0.84258911]
0.7986144838368155


In [35]:
# def plot_cv(alphas, cv_means, optimal_alpha, lr_mse, log=False):
#     # alphas = list of alphas
#     # cv_means = list of CV mean MSE
#     # optimal_alpha
#     # lr_mse
#     fig = plt.figure(figsize=(12,8))
#     ax = plt.gca()

#     if log:
#         ax.semilogx(alphas, cv_means, lw=2)
#     else:
#         ax.plot(alphas, cv_means, lw=2)
#     ax.axvline(optimal_alpha)
#     ax.axhline(lr_mse)
#     ax.set_xlabel('alpha')
#     ax.set_ylabel('Mean Squared Error')

# lr_model = LinearRegression()
# lr_cv_mean_mse = -cross_val_score(lr_model, X_train_scaled, Y_train, cv=5, scoring='neg_mean_squared_error').mean()
    
# plot_cv(optimal_ridge.alphas, ridge_scores, optimal_ridge.alpha_, lr_cv_mean_mse, log=True)




In [36]:
# Submission for Ridge Model

# submission_optimal_ridge = pd.DataFrame(data = yhat_optimal_ridge, columns = ['SalePrice'], index=test['Id'])
# submission_optimal_ridge.to_csv('./submission_optimal_ridge.csv')

In [37]:
# Lasso Model

from sklearn.linear_model import Lasso, LassoCV

optimal_lasso = LassoCV(n_alphas=500, cv=10, verbose=0)
optimal_lasso.fit(X_train_scaled, Y_train)

print(optimal_lasso.alpha_)

lasso = Lasso(alpha=optimal_lasso.alpha_)

lasso_scores = cross_val_score(lasso, X_train_scaled, Y_train, cv=10)

print(lasso_scores)
print(np.mean(lasso_scores))

lasso.fit(X_train_scaled, Y_train)

yhat_optimal_lasso = optimal_lasso.predict(X_testvalues_scaled)

  y = column_or_1d(y, warn=True)


548.531797513093
[0.86780293 0.82694233 0.81276649 0.81846715 0.83758561 0.83224212
 0.87655342 0.73782674 0.51226477 0.83968991]
0.7962141473185773


In [38]:
# Submission for Lasso Model

# submission_optimal_lasso = pd.DataFrame(data = yhat_optimal_lasso, columns = ['SalePrice'], index=test['Id'])
# submission_optimal_lasso.to_csv('./submission_optimal_lasso.csv')

In [39]:
# Elastic Net

from sklearn.linear_model import ElasticNet, ElasticNetCV

l1_ratios = np.linspace(0.01, 1.0, 25)

optimal_enet = ElasticNetCV(l1_ratio=l1_ratios, n_alphas=100, cv=10,
                            verbose=0)
optimal_enet.fit(X_train_scaled, Y_train)

print(optimal_enet.alpha_)
print(optimal_enet.l1_ratio_)

enet = ElasticNet(alpha=optimal_enet.alpha_, l1_ratio=optimal_enet.l1_ratio_)

enet_scores = cross_val_score(enet, X_train_scaled, Y_train, cv=10)

print(enet_scores)
print(np.mean(enet_scores))

yhat_optimal_enet = optimal_enet.predict(X_testvalues_scaled)

  y = column_or_1d(y, warn=True)


590.2323733256419
1.0
[0.86780017 0.82678054 0.81270019 0.81846995 0.83756355 0.83217443
 0.87641525 0.7378993  0.51282098 0.83960057]
0.7962224934334381


In [40]:
# Submission for Elastic Net

# submission_optimal_enet = pd.DataFrame(data = yhat_optimal_enet, columns = ['SalePrice'], index=test['Id'])
# submission_optimal_enet.to_csv('./submission_optimal_enet.csv')