In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


In [2]:
#sklearn imports
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [3]:
#Function definitions
def map_feature(X, to_map, mapping):
    X[to_map] = X[to_map].stack().apply(lambda x: mapping[x]).unstack()

def plot_features(X, features):
    fig, axs = plt.subplots(nrows=len(features), figsize=(20, 100))
    for i in range(len(features)):
        axs[i].scatter(X[features[i]], y)
        axs[i].set_xlabel(features[i], fontsize=15)
    plt.show()

def analyze(data):
    print(data.info(), data.describe(), data.value_counts())
    
def plot_data(data):
    sns.scatterplot(x=data,  y=y)
    plt.show()
    print(data.corr(y), "\n")
    
def poly_transform(X, features, degree=2):
    for col in features:
        X[col] = X[col] ** degree
    
def score_model(model, X_train, y_train, X_valid, y_valid):
    model.fit(X_train, y_train)
    print(f'{model} train accuracy: ', model.score(X_train, y_train).round(3))
    print(f'{model} valid accuracy: ', model.score(X_valid, y_valid).round(3))

In [4]:
#Data preparation
X = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
y = X.pop('SalePrice')
X_test = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

drop_columns = [col for col in X.columns if X[col].isna().sum() > X.shape[0] // 2]
X.drop(drop_columns, axis=1, inplace=True)
X_test.drop(drop_columns, axis=1, inplace=True)

drop_rows = [523, 1298, 1190, 738, 921, 1350, 1061, 496]
X.drop(drop_rows, inplace=True)
y.drop(drop_rows, inplace=True)

numerical_data = X.select_dtypes(['number']).columns
categorical_data = X.select_dtypes(['object']).columns

print(drop_columns, len(drop_columns))
print(numerical_data, len(numerical_data))
print(categorical_data, len(categorical_data))
print(X.shape, X_test.shape, y.shape)
# X.info(), X.describe()

['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature'] 5
Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object') 37
Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFin

In [5]:
#Ordinal mappings
ordinal_mapping0 = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
ordinal_mapping1 = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1}
ordinal_mapping2 = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1}
ordinal_mapping3 = {'Y': 2, 'P': 1, 'N': 0}
ordinal_mapping4 = {'SBrkr': 5, 'FuseA': 4, 'FuseF': 3, 'FuseP': 2, 'Mix': 1}
ordinal_mapping5 = {'Typ': 6, 'Min1': 5, 'Min2': 5, 'Mod': 4, 'Maj1': 3, 'Maj2': 3, 'Sev': 2, 'Sal': 1}
ordinal_mapping6 = {'Fin': 3, 'RFn': 2, 'Unf': 1}

to_map0 = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']
to_map1 = ['BsmtExposure']
to_map2 = ['BsmtFinType1', 'BsmtFinType2']
to_map3 = ['CentralAir', 'PavedDrive']
to_map4 = ['Electrical']
to_map5 = ['Functional']
to_map6 = ['GarageFinish']

map_feature(X, to_map0, ordinal_mapping0)
map_feature(X, to_map1, ordinal_mapping1)
map_feature(X, to_map2, ordinal_mapping2)
map_feature(X, to_map3, ordinal_mapping3)
map_feature(X, to_map4, ordinal_mapping4)
map_feature(X, to_map5, ordinal_mapping5)
map_feature(X, to_map6, ordinal_mapping6)

In [6]:
#Feature selection & engineering
data_corr0 = X[numerical_data].join(y).corr()
data_corr1 = X[to_map0 + to_map1 + to_map2 + to_map3 + to_map4 + to_map5 + to_map6].join(y).corr()
data_corr_to_y0 = data_corr0.SalePrice.sort_values(ascending=False)[1:]
data_corr_to_y1 = data_corr1.SalePrice.sort_values(ascending=False)[1:]
#print(data_corr_to_y1)

good_features0 = data_corr_to_y0[abs(data_corr_to_y0) >= 0.4].index
good_features1 = data_corr_to_y1[abs(data_corr_to_y1) >= 0.4].index
bad_features0 = data_corr_to_y0[abs(data_corr_to_y0) <= 0.20].index
bad_features1 = data_corr_to_y1[abs(data_corr_to_y1) <= 0.20].index
#plot_features(X, bad_features1)

pca = PCA(random_state=42)
X_pca0 = pca.fit_transform(X[['GrLivArea', 'GarageArea', 'LotArea']])
X_pca1 = pca.fit_transform(X[['TotRmsAbvGrd', 'BedroomAbvGr', 'KitchenAbvGr']])
# print("\n", pca.explained_variance_ratio_, "\n")
# print(pca.components_.T)

X_final = X[good_features0].join(X[good_features1])
X_final['HouseQualCond'] = (X[['OverallCond', 'OverallQual', 'HeatingQC', 'FireplaceQu', 'CentralAir', 'Electrical', 'Functional']].sum(axis=1)) * (X.YearBuilt + X.YearRemodAdd + X.GarageYrBlt.fillna(1400) - X.YrSold)
X_final['TotalArea'] = X.GrLivArea + ((X.GarageCars + X.GarageFinish) * X.GarageArea)
X_final['TotalSF'] = X[['TotalBsmtSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'WoodDeckSF', 'OpenPorchSF']].sum(axis=1) - X.LowQualFinSF
X_final['TotalBath'] = X[['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']].sum(axis=1)
X_final['TotalRms'] = X.TotRmsAbvGrd - X.BedroomAbvGr
X_final['OverallQualCond'] = X[['ExterQual', 'ExterCond', 'KitchenQual', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'HeatingQC', 'FireplaceQu']].sum(axis=1)

to_poly_2 = ['OverallQual', 'YearBuilt', 'TotalSF']
to_poly_3 = ['HouseQualCond', 'OverallQualCond']
poly_transform(X_final, to_poly_2)
poly_transform(X_final, to_poly_3, 3)
# plot_features(X_final, X_final.columns)

# to_check1 = 'GarageArea'
# to_check2 = 'TotalSF'
# analyze(X[to_check1])
# analyze(X_final[to_check2])
# print("\n", X_final[to_check2][X_final[to_check2] > 0.95e8].index, "\n")
# plot_data(X_final.TotalSF)

print(X_final.shape)
X_final.info(), X_final.describe()

X_final = X_final.join(X.select_dtypes([object]))
print(X_final.shape)
X_final.info()

(1452, 25)
<class 'pandas.core.frame.DataFrame'>
Index: 1452 entries, 0 to 1459
Data columns (total 25 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   OverallQual      1452 non-null   int64  
 1   GrLivArea        1452 non-null   int64  
 2   GarageCars       1452 non-null   int64  
 3   TotalBsmtSF      1452 non-null   int64  
 4   GarageArea       1452 non-null   int64  
 5   1stFlrSF         1452 non-null   int64  
 6   FullBath         1452 non-null   int64  
 7   TotRmsAbvGrd     1452 non-null   int64  
 8   YearBuilt        1452 non-null   int64  
 9   YearRemodAdd     1452 non-null   int64  
 10  GarageYrBlt      1373 non-null   float64
 11  MasVnrArea       1444 non-null   float64
 12  Fireplaces       1452 non-null   int64  
 13  BsmtFinSF1       1452 non-null   int64  
 14  ExterQual        1452 non-null   float64
 15  KitchenQual      1452 non-null   float64
 16  BsmtQual         1415 non-null   float64
 17  GarageFi

In [7]:
#Data cleaning
num_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=0), 
    #SimpleImputer(strategy='median'), 
    StandardScaler()
)
cat_pipeline = make_pipeline(
    #SimpleImputer(strategy='constant', fill_value='None'), 
    SimpleImputer(strategy='most_frequent'), 
    #OneHotEncoder(handle_unknown='ignore')
)
column_transformer = ColumnTransformer([
    ('num', num_pipeline, make_column_selector(dtype_include=np.number)), 
    ('cat', cat_pipeline, make_column_selector(dtype_include=object))
])
X_trans = pd.DataFrame(column_transformer.fit_transform(X_final))
#X_trans.columns = column_transformer.get_feature_names_out()
X_trans = pd.get_dummies(X_trans, columns=range(25, 46))
X_trans.columns = X_trans.columns.astype(str)

print(X_trans.shape, y.shape)
# X_trans.info(), X_trans.describe()

(1452, 182) (1452,)


In [8]:
#Feature selection 2
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_trans, y)

feature_importance_dict = {}
for feature, importance in zip(X_trans.columns, rfr.feature_importances_):
    feature_importance_dict[feature] = importance.round(4)
print(feature_importance_dict)

important_features = [col for col in feature_importance_dict if feature_importance_dict[col] > 0]
X_trans_final = X_trans[important_features]
print(X_trans_final.shape, y.shape)

{'0': 0.2163, '1': 0.0311, '2': 0.0003, '3': 0.0059, '4': 0.0046, '5': 0.0072, '6': 0.0023, '7': 0.0049, '8': 0.0045, '9': 0.0044, '10': 0.0031, '11': 0.006, '12': 0.0011, '13': 0.0114, '14': 0.0013, '15': 0.0027, '16': 0.0018, '17': 0.0009, '18': 0.0006, '19': 0.0831, '20': 0.3552, '21': 0.1772, '22': 0.0117, '23': 0.0053, '24': 0.0303, '25_C (all)': 0.0003, '25_FV': 0.0001, '25_RH': 0.0, '25_RL': 0.0004, '25_RM': 0.0005, '26_Grvl': 0.0, '26_Pave': 0.0, '27_IR1': 0.0006, '27_IR2': 0.0003, '27_IR3': 0.0002, '27_Reg': 0.0005, '28_Bnk': 0.0001, '28_HLS': 0.0005, '28_Low': 0.0002, '28_Lvl': 0.0005, '29_AllPub': 0.0, '29_NoSeWa': 0.0, '30_Corner': 0.0003, '30_CulDSac': 0.0005, '30_FR2': 0.0001, '30_FR3': 0.0, '30_Inside': 0.0004, '31_Gtl': 0.0009, '31_Mod': 0.0001, '31_Sev': 0.0002, '32_Blmngtn': 0.0002, '32_Blueste': 0.0, '32_BrDale': 0.0, '32_BrkSide': 0.0001, '32_ClearCr': 0.0004, '32_CollgCr': 0.0003, '32_Crawfor': 0.0016, '32_Edwards': 0.0002, '32_Gilbert': 0.0001, '32_IDOTRR': 0.0001

In [9]:
#Train & Test sets preparation
X_train, X_valid, y_train, y_valid = train_test_split(X_trans_final, y, test_size=0.3, random_state=42,)
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(1016, 120) (1016,) (436, 120) (436,)


In [10]:
#Hyperparameter tuning
sgdr = SGDRegressor(random_state=42, max_iter=10000, tol=1e-3)
lsvr = LinearSVR(random_state=42, C=1000, epsilon=10000, max_iter=10000, tol=1e-3)
rfr = RandomForestRegressor(random_state=42, n_jobs=-1)
gbr = GradientBoostingRegressor(random_state=42, n_estimators=1000)

# score_model(sgdr, X_train, y_train, X_valid, y_valid)
# score_model(lsvr, X_train, y_train, X_valid, y_valid)
# score_model(rfr, X_train, y_train, X_valid, y_valid)
# score_model(gbr, X_train, y_train, X_valid, y_valid)

param_grid0 = {'alpha': [10, 0.1, 0.001, 0.00001], 'eta0': [0.1, 0.01, 0.001]}
param_grid1 = {'C': [100, 1000, 10000, 100000], 'epsilon': [100, 1000, 10000, 100000]}
param_grid2 = {'max_depth': [1, 10, 100, 1000], 'max_features': [0.125, 0.25, 0.5, 0.75], 
               'max_samples': [0.25, 0.5, 0.75, 1], 'n_estimators': [10, 50, 100, 500]}
param_grid3 = {'learning_rate': [1, 0.1, 0.01, 0.001], 'max_features': [0.05, 0.125, 0.25, 0.5], 
               'subsample': [0.125, 0.25, 0.5, 0.75]}

gscv_sgdr = GridSearchCV(sgdr, param_grid0, cv=3, scoring='neg_root_mean_squared_error')
gscv_lsvr = GridSearchCV(lsvr, param_grid1, cv=3, scoring='neg_root_mean_squared_error')
gscv_rfr = GridSearchCV(rfr, param_grid2, cv=3, scoring='neg_root_mean_squared_error')
gscv_gbr = GridSearchCV(gbr, param_grid3, cv=3, scoring='neg_root_mean_squared_error')

score_model(gscv_sgdr, X_train, y_train, X_valid, y_valid)
print(gscv_sgdr.best_params_)
score_model(gscv_lsvr, X_train, y_train, X_valid, y_valid)
print(gscv_lsvr.best_params_)
score_model(gscv_rfr, X_train, y_train, X_valid, y_valid)
print(gscv_rfr.best_params_)
score_model(gscv_gbr, X_train, y_train, X_valid, y_valid)
print(gscv_gbr.best_params_)

sgdr.set_params(**gscv_sgdr.best_params_)
lsvr.set_params(**gscv_lsvr.best_params_)
rfr.set_params(**gscv_rfr.best_params_)
gbr.set_params(**gscv_gbr.best_params_)

GridSearchCV(cv=3, estimator=SGDRegressor(max_iter=10000, random_state=42),
             param_grid={'alpha': [10, 0.1, 0.001, 1e-05],
                         'eta0': [0.1, 0.01, 0.001]},
             scoring='neg_root_mean_squared_error') train accuracy:  -22669.342
GridSearchCV(cv=3, estimator=SGDRegressor(max_iter=10000, random_state=42),
             param_grid={'alpha': [10, 0.1, 0.001, 1e-05],
                         'eta0': [0.1, 0.01, 0.001]},
             scoring='neg_root_mean_squared_error') valid accuracy:  -24122.598
{'alpha': 0.001, 'eta0': 0.001}
GridSearchCV(cv=3,
             estimator=LinearSVR(C=1000, epsilon=10000, max_iter=10000,
                                 random_state=42, tol=0.001),
             param_grid={'C': [100, 1000, 10000, 100000],
                         'epsilon': [100, 1000, 10000, 100000]},
             scoring='neg_root_mean_squared_error') train accuracy:  -22905.885
GridSearchCV(cv=3,
             estimator=LinearSVR(C=1000, epsilon=10000,

In [11]:
#Model training
sr = StackingRegressor(estimators=[
    #('sgdr', sgdr), 
    ('lsvr', lsvr), 
    ('rfr', rfr), 
    ('gbr', gbr)
], cv=3)
vr = VotingRegressor(estimators=[
    #('sgdr', sgdr), 
    ('lsvr', lsvr), 
    ('rfr', rfr), 
    ('gbr', gbr), 
    ('sr', sr)
])
sr.fit(X_train, y_train)
print(mean_squared_error(y_valid, sr.predict(X_valid), squared=False))
vr.fit(X_train, y_train)
print(mean_squared_error(y_valid, vr.predict(X_valid), squared=False))

22021.922170483922
21613.579973229167


In [12]:
#Final preparation
vr.fit(X_trans_final, y)
print(mean_squared_error(y, vr.predict(X_trans_final), squared=False))

map_feature(X_test, to_map0, ordinal_mapping0)
map_feature(X_test, to_map1, ordinal_mapping1)
map_feature(X_test, to_map2, ordinal_mapping2)
map_feature(X_test, to_map3, ordinal_mapping3)
map_feature(X_test, to_map4, ordinal_mapping4)
map_feature(X_test, to_map5, ordinal_mapping5)
map_feature(X_test, to_map6, ordinal_mapping6)

X_final = X_test[good_features0].join(X_test[good_features1])
X_final['HouseQualCond'] = (X_test[['OverallCond', 'OverallQual', 'HeatingQC', 'FireplaceQu', 'CentralAir', 'Electrical', 'Functional']].sum(axis=1)) * (X_test.YearBuilt + X_test.YearRemodAdd + X_test.GarageYrBlt.fillna(1400) - X_test.YrSold)
X_final['TotalArea'] = X_test.GrLivArea + ((X_test.GarageCars + X_test.GarageFinish) * X_test.GarageArea)
X_final['TotalSF'] = X_test[['TotalBsmtSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'WoodDeckSF', 'OpenPorchSF']].sum(axis=1) - X_test.LowQualFinSF
X_final['TotalBath'] = X_test[['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']].sum(axis=1)
X_final['TotalRms'] = X_test.TotRmsAbvGrd - X_test.BedroomAbvGr
X_final['OverallQualCond'] = X_test[['ExterQual', 'ExterCond', 'KitchenQual', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'HeatingQC', 'FireplaceQu']].sum(axis=1)

poly_transform(X_final, to_poly_2)
poly_transform(X_final, to_poly_3, 3)

X_final = X_final.join(X_test.select_dtypes([object]))

X_trans = pd.DataFrame(column_transformer.transform(X_final))
X_trans = pd.get_dummies(X_trans, columns=range(25, 46))
X_trans.columns = X_trans.columns.astype(str)

X_trans_final = pd.DataFrame(index=X_test.index)
for col in important_features:
    if col in X_trans.columns:
        X_trans_final[col] = X_trans[col]
    else:
        X_trans_final[col] = 0
print(X_trans_final.shape)

16559.706046201747
(1459, 120)


  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]
  X_trans_final[col] = X_trans[col]


In [13]:
# Save test predictions to file
output = pd.DataFrame({'Id': X_test.index+1461,
                       'SalePrice': vr.predict(X_trans_final)})
output.to_csv('submission.csv', index=False)