In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [2]:
dataset = pd.read_csv('train.csv')
df = dataset.iloc[:, 1:-1]
dataset_test = pd.read_csv('test.csv')
df_test = dataset_test.iloc[:, 1:]

In [3]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [5]:
categorical_columns = []
numerical_columns = []

for i in df.columns:
    if(df[i].dtype == 'object'):
        categorical_columns.append(i)
    else:
        numerical_columns.append(i)

In [6]:
na_columns_numerical = df[numerical_columns].isna().sum()
na_columns_numerical = na_columns_numerical[na_columns_numerical > 0]
na_columns_numerical_test = df_test[numerical_columns].isna().sum()
na_columns_numerical_test = na_columns_numerical_test[na_columns_numerical_test > 0]

In [7]:
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())
for col in na_columns_numerical.index:
    if col != 'LotFrontage':
        df[col] = df[col].fillna(0)
df_test['LotFrontage'] = df_test['LotFrontage'].fillna(df_test['LotFrontage'].mean())
for col in na_columns_numerical_test.index:
    if col != 'LotFrontage':
        df_test[col] = df_test[col].fillna(0)

In [8]:
na_columns_categorical = df[categorical_columns].isna().sum()
na_columns_categorical = na_columns_categorical[na_columns_categorical > 0]
na_columns_categorical_test = df_test[categorical_columns].isna().sum()
na_columns_categorical_test = na_columns_categorical_test[na_columns_categorical_test > 0]

In [9]:
na_columns_categorical

Alley           1369
MasVnrType         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [10]:
for i in na_columns_categorical.index:
    print(i, df[i].unique())

Alley [nan 'Grvl' 'Pave']
MasVnrType ['BrkFace' 'None' 'Stone' 'BrkCmn' nan]
BsmtQual ['Gd' 'TA' 'Ex' nan 'Fa']
BsmtCond ['TA' 'Gd' nan 'Fa' 'Po']
BsmtExposure ['No' 'Gd' 'Mn' 'Av' nan]
BsmtFinType1 ['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' nan 'LwQ']
BsmtFinType2 ['Unf' 'BLQ' nan 'ALQ' 'Rec' 'LwQ' 'GLQ']
Electrical ['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' nan]
FireplaceQu [nan 'TA' 'Gd' 'Fa' 'Ex' 'Po']
GarageType ['Attchd' 'Detchd' 'BuiltIn' 'CarPort' nan 'Basment' '2Types']
GarageFinish ['RFn' 'Unf' 'Fin' nan]
GarageQual ['TA' 'Fa' 'Gd' nan 'Ex' 'Po']
GarageCond ['TA' 'Fa' nan 'Gd' 'Po' 'Ex']
PoolQC [nan 'Ex' 'Fa' 'Gd']
Fence [nan 'MnPrv' 'GdWo' 'GdPrv' 'MnWw']
MiscFeature [nan 'Shed' 'Gar2' 'Othr' 'TenC']


In [11]:
df[na_columns_categorical.index] = df[na_columns_categorical.index].fillna('None')
df_test[na_columns_categorical_test.index] = df_test[na_columns_categorical_test.index].fillna('None')

In [12]:
categorical_columns_ordinal = [
'LandSlope',
'ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'FireplaceQu',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence']

In [13]:
categorical_columns_nominal = [col for col in categorical_columns if col not in categorical_columns_ordinal]

In [14]:
# categorical_columns_ordinal_map = [
#     {
#         'LandSlope'  : {
#             'Gtl' : 2,
#             'Mod' : 1,
#             'Sev' : 0
#         },
#         'ExterQual' : {
#             'Ex' : 4,
#             'Gd' : 3,	
#             'TA' : 2,	
#             'Fa' : 1,	
#             'Po' : 0	
#         },
#         'ExterCond' : {
#             'Ex' : 4,
#             'Gd' : 3,	
#             'TA' : 2,	
#             'Fa' : 1,	
#             'Po' : 0	
#         },
#         'BsmtQual' : {
#             'Ex' : 5,
#             'Gd' : 4,	
#             'TA' : 3,	
#             'Fa' : 2,	
#             'Po' : 1,
#             'None' : 0	
#         },
#         'BsmtCond' : {
#             'Ex' : 5,
#             'Gd' : 4,	
#             'TA' : 3,	
#             'Fa' : 2,	
#             'Po' : 1,
#             'None' : 0	
#         },
#         'BsmtExposure' : {
#             'Gd' : 4,	
#             'Av' : 3,	
#             'Mn' : 2,
#             'No' : 1,	
#             'None' : 0	
#         },
#         'BsmtFinType1' : {
#             'GLQ' : 6,
#             'ALQ' : 5,	
#             'BLQ' : 4,	
#             'Rec' : 3,	
#             'LwQ' : 2,
#             'Unf' : 1,	
#             'None' : 0	
#         },
#         'BsmtFinType2' : {
#             'GLQ' : 6,
#             'ALQ' : 5,	
#             'BLQ' : 4,	
#             'Rec' : 3,	
#             'LwQ' : 2,
#             'Unf' : 1,	
#             'None' : 0	
#         },
#         'HeatingQC' : {
#             'Ex' : 5,
#             'Gd' : 4,	
#             'TA' : 3,	
#             'Fa' : 2,	
#             'Po' : 1,
#             'None' : 0	
#         },
#         'CentralAir' : {
#             'N' : 0,
#             'Y' : 1
#         },
#         'KitchenQual' : {
#             'Ex' : 5,
#             'Gd' : 4,	
#             'TA' : 3,	
#             'Fa' : 2,	
#             'Po' : 1,
#             'None' : 0	
#         },
#         'FireplaceQu' : {
#             'Ex' : 5,
#             'Gd' : 4,	
#             'TA' : 3,	
#             'Fa' : 2,	
#             'Po' : 1,
#             'None' : 0
#         },
#         'GarageFinish' : {	
#             'Fin' : 3,	
#             'RFn' : 2,	
#             'Unf' : 1,
#             'None' : 0
#         },
#         'GarageQual' : {
#             'Ex' : 5,
#             'Gd' : 4,	
#             'TA' : 3,	
#             'Fa' : 2,	
#             'Po' : 1,
#             'None' : 0
#         },
#         'GarageCond' : {
#             'Ex' : 5,
#             'Gd' : 4,	
#             'TA' : 3,	
#             'Fa' : 2,	
#             'Po' : 1,
#             'None' : 0
#         },
#         'PavedDrive' : {	
#             'Y' : 2,	
#             'P' : 1,
#             'N' : 0
#         },
#         'PoolQC' : {
#             'Ex' : 4,
#             'Gd' : 3,	
#             'TA' : 2,	
#             'Fa' : 1,
#             'None' : 0
#         },
#         'Fence' : {
#             'GdPrv' : 4,
#             'MnPrv' : 3,	
#             'GdWo' : 2,	
#             'MnWw' : 1,
#             'None' : 0
#         },
#     }
# ]

In [15]:
# for i in categorical_columns_ordinal:
#     df[i] = df[i].map(categorical_columns_ordinal_map[0][i])
#     df_test[i] = df_test[i].map(categorical_columns_ordinal_map[0][i])

In [16]:
categorical_columns_nominal

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'Foundation',
 'Heating',
 'Electrical',
 'Functional',
 'GarageType',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [17]:
from sklearn.preprocessing import LabelEncoder
for col in categorical_columns_ordinal:
    le = LabelEncoder()
    print(col)
    le.fit(pd.concat([df[col], df_test[col]]))
    df[col] = le.transform(df[col])
    df_test[col] = le.transform(df_test[col])

LandSlope
ExterQual
ExterCond
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
HeatingQC
CentralAir
KitchenQual
FireplaceQu
GarageFinish
GarageQual
GarageCond
PavedDrive
PoolQC
Fence


In [18]:
ct = ColumnTransformer(transformers=[('encoded', OneHotEncoder(), categorical_columns_nominal)], remainder='passthrough')
ct.fit(pd.concat([df, df_test]))

In [19]:
df = ct.transform(df)
df_test = ct.transform(df_test)

In [20]:
y = dataset.iloc[:, -1].values

In [21]:
X = df

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [23]:
# from sklearn.ensemble import RandomForestRegressor
# model = RandomForestRegressor()

In [24]:
from xgboost import XGBRegressor
model = XGBRegressor()

In [25]:
model.fit(X_train, y_train)

In [26]:
y_pred = model.predict(X_test)

In [27]:
from sklearn.metrics import r2_score, mean_squared_error
print(r2_score(y_pred=y_pred, y_true=y_test))
print(mean_squared_error(y_pred=y_pred, y_true=y_test))

0.9110438732333533
496182946.18847007


In [28]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = model, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 84.58 %
Standard Deviation: 9.14 %


In [29]:
from sklearn.model_selection import GridSearchCV
param_grid_random_forest = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False],
    'random_state': [42]
}
# param_grid_xgb = {
#     'learning_rate': [0.001, 0.005, 0.01, 0.05],
#     'n_estimators': [50, 100, 500, 1000],
#     'max_depth': [3, 5, 7, 9],
#     'min_child_weight': [1, 3, 5, 7],
#     'gamma': [0, 0.1, 0.2, 0.3],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0],
#     'reg_alpha': [0, 0.1, 0.5],
#     'reg_lambda': [1, 1.5, 2],
#     'random_state': [42]
# }
param_grid_xgb = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(50, 1000, 100),
    'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001],
    'gamma': [0, 0.1, 0.2, 0.3],
}


grid_search = GridSearchCV(estimator = model,
                           param_grid = param_grid_xgb,
                           scoring = 'r2',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 88.30 %
Best Parameters: {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 950}


In [30]:
# output = pd.DataFrame({'Id': dataset_test['Id'], 'SalePrice': model.predict(df_test)})
# output.to_csv('submission.csv', index=False)
# print("Your submission was successfully saved!")