In [301]:
import gc
import math
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
import matplotlib.pyplot as plt
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [334]:
df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")

In [335]:
test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
df = df.append(test).reset_index()
df.drop("index",axis=1,inplace=True)

In [200]:
df.shape

(2919, 81)

In [250]:
def grab_col_names(dataframe, cat_th=100, car_th=20, num_th = 118):
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O" and col not in "SalePrice" and col not in "Id"]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, cat_but_car, num_cols, num_but_cat


cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df)

Observations: 2919
Variables: 81
cat_cols: 61
num_cols: 17
cat_but_car: 1
num_but_cat: 19


In [336]:
int_cols  = ["MSSubClass","MSZoning", "Street", "Alley", "LotShape","LandContour" ,"Utilities","LotConfig","LandSlope", "Neighborhood","Condition1","Condition2", "BldgType", "HouseStyle","OverallQual","OverallCond","RoofStyle","RoofMatl","Exterior1st","Exterior2nd","MasVnrType","ExterQual","ExterCond","Foundation","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1", "BsmtFinType2","Heating","HeatingQC","CentralAir","Electrical","KitchenQual","Functional","FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond","PavedDrive","PoolQC","Fence","MiscFeature","SaleType","SaleCondition"]
for col in int_cols:
    df[col] = df[col].astype("object")

In [337]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
label_list = ["LotShape","LandContour","Utilities","Condition1","Condition2","OveralQual","OveralCond","ExterQual","ExterCond","BsmtQual","BsmtCond","BsmtFinType1","BsmtFinType2","BsmtExposure","HeatingQC","GarageQual","PoolQC","KitchenQual"]
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

In [310]:
df.isnull().sum()

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       486
LotArea             0
Street              0
Alley               0
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          0
MasVnrArea         23
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual            0
BsmtCond            0
BsmtExposure        0
BsmtFinType1        0
BsmtFinSF1          1
BsmtFinType2        0
BsmtFinSF2          1
BsmtUnfSF           1
TotalBsmtSF         1
Heating             0
HeatingQC           0
CentralAir          0
Electrical          0
1stFlrSF            0
2ndFlrSF            0
LowQualFin

In [322]:
df["GarageYrBlt"].fillna(df["GarageYrBlt"].max(),inplace=True)


In [328]:
col_year = ["YearRemodAdd", "YearBuilt", "YrSold", "GarageYrBlt"]

for col in col_year:
    df[col] = df[col].astype(int)



In [313]:
### NEW FEATURES
df["Floorcross_tersr"]=(df["2ndFlrSF"] + df["1stFlrSF"]) / df["1stFlrSF"]
df["Total_Floor"]=df["1stFlrSF"] + df["2ndFlrSF"]
df["_Floor_cross"] =df["1stFlrSF"] * (df["1stFlrSF"] + df["2ndFlrSF"])
df["Zemin_Alan_Kullanımı"]=(df["GrLivArea"] + df["1stFlrSF"]*df["1stFlrSF"])/df["1stFlrSF"]
df["Alana_düşen_Mutfak"]=df["KitchenAbvGr"] / df["1stFlrSF"]
df["bodrum_banyo_per"]=(df["BsmtFullBath"] + (df["TotalBsmtSF"])/df["TotalBsmtSF"])
df["GrLivArea"] = np.log10(df["GrLivArea"])
df["mean_1stFlrSF_GrLivArea"] = df[['1stFlrSF', 'GrLivArea']].mean(axis=1)
df['std_1stFlrSF_GrLivArea'] = df[['1stFlrSF', 'GrLivArea']].std(axis=1)
df['BsmtQual'].fillna('No_Basement', inplace=True)
df['BsmtCond'].fillna('No_Basement', inplace=True)
df['BsmtExposure'].fillna('No_Basement', inplace=True)
df['BsmtFinType1'].fillna('No_Basement', inplace=True)
df['BsmtFinType2'].fillna('No_Basement', inplace=True)
df['FireplaceQu'].fillna('No_Fireplace', inplace=True)
df['GarageType'].fillna('No_Garage', inplace=True)
df['GarageFinish'].fillna('No_Garage', inplace=True)
df['GarageQual'].fillna('No_Garage', inplace=True)
df['GarageCond'].fillna('No_Garage', inplace=True)
df['PoolQC'].fillna('No_Garage', inplace=True)
df["Lot_divide"] = (df["LotArea"] / df["LotFrontage"]) 
df["MSSubClass"] = df["MSSubClass"].astype("O") 
df["MSZoning"] = df["MSZoning"].astype("O")
df["Home_Age"] = df["YrSold"]- df["YearBuilt"] 
df["Restore_age"] = df["YrSold"] - df["YearRemodAdd"]
drop_year=["YearRemodAdd", "YearBuilt", "YrSold", "GarageYrBlt"]
df.drop(drop_year,axis=1,inplace=True)
### NEW VARIABLES
df["Functional"] = np.where(df.Functional.isin(["Sev", "Maj2", "Maj1","Mod"]), "smmm", df["Functional"])
df["BldgType"] = np.where(df.BldgType.isin(["2fmCon", "Twnhs"]), "2T", df["BldgType"])
df["HouseStyle"] = np.where(df.HouseStyle.isin(["2.5Fin", "1.5Unf" ,"2.5Unf","SFoyer"]), "121", df["HouseStyle"])
df["ExterQual"] = np.where(df.ExterQual.isin(["Fa", "Ex"]), "FaEx", df["ExterQual"])
df["Foundation"] = np.where(df.Foundation.isin(["Wood", "Stone","Slab","BrkTil"]), "2T", df["Foundation"])
df["MSZoning"] = np.where(df.MSZoning.isin(["C (all)", "RH"]), "CRH", df["MSZoning"])
df["Fireplaces"] = np.where(df.Fireplaces.isin(["4", "3"]), "43", df["Fireplaces"])
df["GarageCars"] = np.where(df.GarageCars.isin(["4.000", "5.000"]), "45", df["GarageCars"])
df["Condition1"] = np.where(df.Condition1.isin(["RRNn", "RRNe","RRAn"]), "RRR", df["Condition1"])
df["Exterior2nd"] = np.where(df.Exterior2nd.isin(["AsbShng", "AsphShn"]), "AA", df["Exterior2nd"])
df["Exterior2nd"] = np.where(df.Exterior2nd.isin(["Brk Cmn", "BrkFace"]), "BB", df["Exterior2nd"])
df["Exterior2nd"] = np.where(df.Exterior2nd.isin(["Other", "Stone","CBlock"]), "OSC", df["Exterior2nd"])
del test
gc.collect()


7581

In [None]:
["YearRemodAdd", "YearBuilt", "YrSold", "GarageYrBlt"]

In [345]:
df["YearBuilt"] = pd.to_datetime({'year': df['year'],}, format="%y%M%d%h%m")

ValueError: to assemble mappings requires at least that [year, month, day] be specified: [day,month] is missing

In [338]:
df["YearBuilt"].describe().T

count   2919.000
mean    1971.313
std       30.291
min     1872.000
25%     1953.500
50%     1973.000
75%     2001.000
max     2010.000
Name: YearBuilt, dtype: float64

In [288]:
aggregations_MS = {"LotFrontage":["sum","mean","max","min"],
                    "LotArea": ["min", "mean","sum","max"],
                     "GrLivArea":["mean", "max","min","std","var"]}
MSZoning = df.groupby('MSZoning').agg(aggregations_MS)

MSZoning.columns = pd.Index(['MSZoning' + e[0] + "_" + e[1].upper() for e in MSZoning.columns.tolist()])



    
MSZoning.reset_index()

df = df.join(MSZoning, how='left', on='MSZoning') 




aggregations_BC = {"GarageArea":["sum","mean"],
                   "OpenPorchSF":["min", "mean","sum","max"],
                     "TotRmsAbvGrd": ["mean", "size","min"],
                    "TotalBsmtSF": ["mean","sum","max"],
                     "GrLivArea": ["min", "max", "size"],
                       "1stFlrSF": ["var","mean","max", "std"],
                       "2ndFlrSF" : ["var", "std", "mean","min"],
                       "EnclosedPorch": ["min", "max"],
                        "Home_Age" :["mean","max","sum","min"],
                      "Restore_age":["mean","var","min","max","sum"],
                         "Total_Floor":["mean","sum","max","min"],
                         "Alana_düşen_Mutfak" : ["mean", "sum"]
                                        }

MSSubClass = df.groupby('MSSubClass').agg(aggregations_BC)

MSSubClass.columns = pd.Index(['MSSubClass' + e[0] + "_" + e[1].upper() for e in MSSubClass.columns.tolist()])


MSSubClass.reset_index()

df = df.join(MSSubClass, how='left', on='MSSubClass')

In [273]:
[df["Home_Age"] >= 0

KeyError: "None of [Index([()], dtype='object')] are in the [columns]"

In [145]:
####  DROPPING COLUMNS
df.drop("PoolArea", axis=1, inplace=True)
drop_list = ["BsmtFinSF2","WoodDeckSF","ScreenPorch", "BsmtExposure", "CentralAir","Electrical", "PavedDrive","LandContour", "BldgType", "Heating","Street","Utilities","Condition2", "RoofMatl", "MiscFeature","Fence","PoolQC","GarageCond","Alley", "MiscVal", "3SsnPorch","LowQualFinSF"]
df.drop(drop_list, axis=1, inplace=True)

In [219]:
df.drop(["LotFrontage","LotArea", "LandSlope" ],axis=1, inplace=True)
df.drop(["EnclosedPorch","2ndFlrSF","1stFlrSF", "GarageArea", "OpenPorchSF", "GrLivArea", "TotRmsAbvGrd", "TotalBsmtSF" ],axis=1, inplace=True)

In [289]:
## AFTER THE FEATURE ENGINEERING ALL OVER

df["LotFrontage"] = np.log10(df['LotFrontage'] + 1)
df["LotArea"] = np.log10(df['LotArea'] + 1)
df["MasVnrArea"] = np.log10(df['MasVnrArea'] + 1)
df["BsmtFinSF1"] = np.log10(df['BsmtFinSF1'] + 1)
df["BsmtUnfSF"] = np.log10(df['BsmtUnfSF'] + 1)
df["TotalBsmtSF"] = np.log10(df['TotalBsmtSF'] + 1)
df["1stFlrSF"] = np.log10(df['1stFlrSF'] + 1)
df["2ndFlrSF"] = np.log10(df['2ndFlrSF'] + 1)
df["GarageArea"] = np.log10(df['GarageArea'] + 1)
df["_Floor_cross"] = np.log10(df['_Floor_cross'] + 1)
df["Zemin_Alan_Kullanımı"] = np.log10(df['Zemin_Alan_Kullanımı'] + 1)
df["MSSubClassTotal_Floor_MEAN"] = np.log10(df['MSSubClassTotal_Floor_MEAN'] + 1)
df["mean_1stFlrSF_GrLivArea"] = np.log10(df['mean_1stFlrSF_GrLivArea'] + 1)
df["std_1stFlrSF_GrLivArea"] = np.log10(df['std_1stFlrSF_GrLivArea'] + 1)
df["Lot_divide"] = np.log10(df['Lot_divide'] + 1)
df["MSZoningLotFrontage_SUM"] = np.log10(df['MSZoningLotFrontage_SUM'] + 1)
df["MSZoningLotArea_SUM"] = np.log10(df['MSZoningLotArea_SUM'] + 1)
df["MSSubClassOpenPorchSF_SUM"] = np.log10(df['MSSubClassOpenPorchSF_SUM'] + 1)
df["MSSubClassTotalBsmtSF_SUM"] = np.log10(df['MSSubClassTotalBsmtSF_SUM'] + 1)

In [146]:
def outlier_thresholds(dataframe, variable, low_quantile=0.25, up_quantile=0.75):
    quantile_one = dataframe[variable].quantile(low_quantile)
    quantile_three = dataframe[variable].quantile(up_quantile)
    interquantile_range = quantile_three - quantile_one
    up_limit = quantile_three + 1.5 * interquantile_range
    low_limit = quantile_one - 1.5 * interquantile_range
    return low_limit, up_limit

# Aykırı değer kontrolü
def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False


for col in num_cols:
    if col != "SalePrice":
        print(col, check_outlier(df, col))




LotFrontage True
LotArea True
MasVnrArea True
BsmtFinSF1 True


KeyError: 'BsmtFinSF2'

In [147]:
# Aykırı değerlerin baskılanması
def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit


for col in num_cols:
    if col != "SalePrice":
        replace_with_thresholds(df,col)


KeyError: 'BsmtFinSF2'

count    2919.000
mean    -1969.520
std        30.336
min     -2007.000
25%     -1999.000
50%     -1971.000
75%     -1951.500
max     -1870.000
Name: Home_Age, dtype: float64

In [None]:
########## Fuctional

#aggregations_FC = { "GarageCars" : ["min", "max"],
   #                     "GarageArea" : ["mean"],
    #                 "EnclosedPorch": ["mean"],
     #                "1stFlrSF" : ["mean", "min", "std", "max"],
      #               "KitchenAbvGr": ["min", "max"],
       #             "BedroomAbvGr": ["mean", "std", "var"],
        #             "TotRmsAbvGrd": ["mean"]
         #              }
#Functional = df.groupby('Functional').agg(aggregations_FC)
 
#Functional.columns = pd.Index(['Functional' + e[0] + "_" + e[1].upper() for e in Functional.columns.tolist()])

#df.drop(["GarageCars","GarageArea","EnclosedPorch","1stFlrSF","KitchenAbvGr", "BedroomAbvGr","TotRmsAbvGrd"],axis=1, inplace=True)


#df = df.join(Functional, how='left', on="Functional")   



######## MSZoning

aggregations_MS = {"LotFrontage":["sum","mean"],
                    "LotArea": ["min", "mean","sum","max"],
                    "Street":["nunique", "size"],
                     "LandContour": ["nunique"],
                    "LandSlope": ["nunique"],
                      }
MSZoning = df.groupby('MSZoning').agg(aggregations_MS)

MSZoning.columns = pd.Index(['MSZoning' + e[0] + "_" + e[1].upper() for e in MSZoning.columns.tolist()])
 
df.drop(["LotFrontage","MiscVal", "LotArea", "Street", "LandContour", "LandSlope" ],axis=1, inplace=True)
    
df = df.join(MSZoning, how='left', on='MSZoning')   




####### MSSubClass



aggregations_BC = { "WoodDeckSF": ["mean","sum","max"],
                    "OpenPorchSF":["min", "mean","sum","max"],
                    "TotalBsmtSF": ["mean","sum","max"],
                     "GrLivArea": ["min", "max", "size"],
                      "Build" : ["min","max"],
                       "2ndFlrSF" : ["var", "std", "mean","min"]
                       }
MSSubClass = df.groupby('MSSubClass').agg(aggregations_BC)

MSSubClass.columns = pd.Index(['MSSubClass' + e[0] + "_" + e[1].upper() for e in MSSubClass.columns.tolist()])
    
df.drop(["2ndFlrSF","Build", "WoodDeckSF", "OpenPorchSF", "GrLivArea",  "TotalBsmtSF" ],axis=1, inplace=True)

df = df.join(MSSubClass, how='left', on='MSSubClass')   




df.drop("Std_Build", axis=1, inplace=True)

In [None]:
for col in num_cols:
    df[col] = np.log10(df[col] + 1)

In [290]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,SalePrice,Floorcross_tersr,Total_Floor,_Floor_cross,Zemin_Alan_Kullanımı,Alana_düşen_Mutfak,bodrum_banyo_per,mean_1stFlrSF_GrLivArea,std_1stFlrSF_GrLivArea,Lot_divide,Home_Age,Restore_age,MSZoningLotFrontage_SUM,MSZoningLotFrontage_MEAN,MSZoningLotFrontage_MAX,MSZoningLotFrontage_MIN,MSZoningLotArea_MIN,MSZoningLotArea_MEAN,MSZoningLotArea_SUM,MSZoningLotArea_MAX,MSZoningGrLivArea_MEAN,MSZoningGrLivArea_MAX,MSZoningGrLivArea_MIN,MSZoningGrLivArea_STD,MSZoningGrLivArea_VAR,MSSubClassGarageArea_SUM,MSSubClassGarageArea_MEAN,MSSubClassOpenPorchSF_MIN,MSSubClassOpenPorchSF_MEAN,MSSubClassOpenPorchSF_SUM,MSSubClassOpenPorchSF_MAX,MSSubClassTotRmsAbvGrd_MEAN,MSSubClassTotRmsAbvGrd_SIZE,MSSubClassTotRmsAbvGrd_MIN,MSSubClassTotalBsmtSF_MEAN,MSSubClassTotalBsmtSF_SUM,MSSubClassTotalBsmtSF_MAX,MSSubClassGrLivArea_MIN,MSSubClassGrLivArea_MAX,MSSubClassGrLivArea_SIZE,MSSubClass1stFlrSF_VAR,MSSubClass1stFlrSF_MEAN,MSSubClass1stFlrSF_MAX,MSSubClass1stFlrSF_STD,MSSubClass2ndFlrSF_VAR,MSSubClass2ndFlrSF_STD,MSSubClass2ndFlrSF_MEAN,MSSubClass2ndFlrSF_MIN,MSSubClassEnclosedPorch_MIN,MSSubClassEnclosedPorch_MAX,MSSubClassHome_Age_MEAN,MSSubClassHome_Age_MAX,MSSubClassHome_Age_SUM,MSSubClassHome_Age_MIN,MSSubClassRestore_age_MEAN,MSSubClassRestore_age_VAR,MSSubClassRestore_age_MIN,MSSubClassRestore_age_MAX,MSSubClassRestore_age_SUM,MSSubClassTotal_Floor_MEAN,MSSubClassTotal_Floor_SUM,MSSubClassTotal_Floor_MAX,MSSubClassTotal_Floor_MIN,MSSubClassAlana_düşen_Mutfak_MEAN,MSSubClassAlana_düşen_Mutfak_SUM
0,1,5,3,1.82,3.927,1,2,3,3,0,4,0,CollgCr,2,2,0,5,6,4,1,1,12,13,1,2.294,2,4,2,2,3,3,2,2.849,5,0.0,2.179,2.933,1,0,1,4,2.933,2.932,0,3.233,1,0,2,1,3,1,2,6,6,0,5,1,1,2,2.74,4,4,2,0,61,0,0,0,0,3,4,4,0,1,8,4,208500.0,1.998,1710,6.165,2.934,0.001,2.0,2.634,2.781,2.117,-2001,-51,5.134,74.052,313.0,22.0,1700,11169.472,7.403,215245,3.164,3.751,2.524,0.14,0.02,335996.0,584.341,0,79.897,4.662,570,5.809,575,3,1021.537,5.769,6110.0,3.103,3.751,575,119995.678,1066.33,4692,346.404,52652.697,229.462,929.445,438,0,368,-1993.167,-1908,-1146071,-2006,-45.508,123.344,-57,3,-26167,3.3,1147571,5642,1269,0.001,0.582
1,2,0,3,1.908,3.982,1,2,3,3,0,2,0,Veenker,1,2,0,2,5,7,1,1,8,8,2,0.0,3,4,1,2,3,1,0,2.991,5,0.0,2.455,3.101,1,0,1,4,3.101,0.0,0,3.101,0,1,2,0,3,1,3,4,6,1,4,1,1,2,2.664,4,4,2,298,0,0,0,0,0,3,4,4,0,4,8,4,181500.0,1.0,1262,6.202,3.102,0.001,1.0,2.802,2.95,2.083,-1975,-25,5.134,74.052,313.0,22.0,1700,11169.472,7.403,215245,3.164,3.751,2.524,0.14,0.02,546168.0,506.18,0,46.75,4.703,484,4.027,1079,0,1252.864,6.131,5095.0,2.524,3.707,1079,164898.78,1352.872,5095,406.077,744.037,27.277,0.83,0,0,584,-1976.217,-1935,-2132338,-2007,-32.964,424.866,-58,4,-35568,3.132,1460645,5095,334,0.001,0.865
2,3,5,3,1.839,4.051,1,2,0,3,0,4,0,CollgCr,2,2,0,5,6,4,1,1,12,13,1,2.212,2,4,2,2,3,2,2,2.688,5,0.0,2.638,2.964,1,0,1,4,2.964,2.938,0,3.252,1,0,2,1,3,1,2,4,6,1,4,1,1,2,2.785,4,4,2,0,42,0,0,0,0,3,4,4,0,8,8,4,223500.0,1.941,1786,6.216,2.965,0.001,2.0,2.665,2.812,2.221,-1999,-50,5.134,74.052,313.0,22.0,1700,11169.472,7.403,215245,3.164,3.751,2.524,0.14,0.02,335996.0,584.341,0,79.897,4.662,570,5.809,575,3,1021.537,5.769,6110.0,3.103,3.751,575,119995.678,1066.33,4692,346.404,52652.697,229.462,929.445,438,0,368,-1993.167,-1908,-1146071,-2006,-45.508,123.344,-57,3,-26167,3.3,1147571,5642,1269,0.001,0.582
3,4,6,3,1.785,3.98,1,2,0,3,0,0,0,Crawfor,2,2,0,5,6,4,1,1,13,15,2,0.0,3,4,0,3,1,3,0,2.336,5,0.0,2.733,2.879,1,2,1,4,2.983,2.879,0,3.235,1,0,1,0,3,1,2,5,6,1,2,5,2,3,2.808,4,4,2,0,35,272,0,0,0,3,4,4,0,1,8,0,140000.0,1.787,1717,6.217,2.984,0.001,2.0,2.684,2.831,2.205,-1915,-20,5.134,74.052,313.0,22.0,1700,11169.472,7.403,215245,3.164,3.751,2.524,0.14,0.02,47870.0,376.929,0,42.922,3.74,312,5.227,128,3,734.141,4.973,1370.0,2.937,3.504,128,62435.029,917.203,1940,249.87,48232.464,219.619,773.234,322,0,386,-1916.008,-1870,-245249,-1950,-25.875,565.102,-56,4,-3312,3.228,216376,3194,864,0.001,0.151
4,5,5,3,1.929,4.154,1,2,0,3,0,2,0,NoRidge,2,2,0,5,7,4,1,1,12,13,1,2.545,2,4,2,2,3,0,2,2.817,5,0.0,2.691,3.059,1,0,1,4,3.059,3.023,0,3.342,1,0,2,1,4,1,2,7,6,1,4,1,1,3,2.923,4,4,2,192,84,0,0,0,0,3,4,4,0,11,8,4,250000.0,1.92,2198,6.401,3.06,0.001,2.0,2.76,2.908,2.232,-1998,-48,5.134,74.052,313.0,22.0,1700,11169.472,7.403,215245,3.164,3.751,2.524,0.14,0.02,335996.0,584.341,0,79.897,4.662,570,5.809,575,3,1021.537,5.769,6110.0,3.103,3.751,575,119995.678,1066.33,4692,346.404,52652.697,229.462,929.445,438,0,368,-1993.167,-1908,-1146071,-2006,-45.508,123.344,-57,3,-26167,3.3,1147571,5642,1269,0.001,0.582


In [223]:
df["MSSubClassGarageArea_SUM"] = np.log10(df['MSSubClassGarageArea_SUM'] + 1)
df["MSSubClassGarageArea_MEAN"] = np.log10(df['MSSubClassGarageArea_MEAN'] + 1)
df["MSSubClass2ndFlrSF_VAR"] = np.log10(df['MSSubClass2ndFlrSF_VAR'] + 1)
df["MSSubClass2ndFlrSF_MEAN"] = np.log10(df['MSSubClass2ndFlrSF_MEAN'] + 1)
df["MSSubClass2ndFlrSF_STD"] = np.log10(df['MSSubClass2ndFlrSF_STD'] + 1)
df["MSSubClass2ndFlrSF_VAR"] = np.log10(df['MSSubClass2ndFlrSF_VAR'] + 1)

KeyError: 'LotArea'

In [None]:
for col in num_cols:
    df.hist(col, bins=50)
    plt.show()

In [None]:
na_list = [col for col in df.columns if df[col].isnull().sum() >0 and col not in "SalePrice"]
for col in na_list:
    df.dropna(subset=[col], how = "all", inplace=True)

In [65]:
def one_hot_encoder(df, drop_first=True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, drop_first=drop_first)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

df, cat_colss = one_hot_encoder(df)

In [150]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LotConfig,LandSlope,Condition1,HouseStyle,OverallQual,OverallCond,RoofStyle,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtUnfSF,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageArea,GarageQual,OpenPorchSF,EnclosedPorch,MoSold,SaleType,SaleCondition,SalePrice,Floorcross_tersr,Total_Floor,_Floor_cross,Zemin_Alan_Kullanımı,Alana_düşen_Mutfak,bodrum_banyo_per,mean_1stFlrSF_GrLivArea,std_1stFlrSF_GrLivArea,Lot_divide,Home_Age,Restore_age,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker
0,1,5,2,65.0,8450.0,3,4,0,2,3,6,4,1,12,9,1,196.0,1,4,2,2,4,2,706.0,6,150.0,856.0,0,856,854,3.233,1,0,2,1,3,1,2,6,2,0,3,1,2,2,548.0,5,61,0,1,8,4,208500.0,1.998,1710,1463760,857.998,0.001,1,429.616,602.997,130.0,5,7,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,2,80.0,9600.0,3,2,0,1,2,5,7,1,8,5,2,0.0,2,4,1,2,4,0,978.0,6,284.0,1262.0,0,1262,0,3.101,0,1,2,0,3,1,3,4,2,1,5,1,2,2,460.0,5,0,0,4,8,4,181500.0,1.0,1262,1592644,1263.0,0.001,0,632.551,890.176,120.0,31,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,3,5,2,68.0,11250.0,0,4,0,2,3,6,4,1,12,9,1,162.0,1,4,2,2,4,2,486.0,6,434.0,920.0,0,920,866,3.252,1,0,2,1,3,1,2,4,2,1,5,1,2,2,608.0,5,42,0,8,8,4,223500.0,1.941,1786,1643120,921.941,0.001,1,461.626,648.239,165.441,7,8,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,6,2,60.0,9550.0,0,0,0,2,3,6,4,1,13,11,2,0.0,2,4,0,4,1,0,216.0,6,540.0,756.0,2,961,756,3.235,1,0,1,0,3,1,2,5,2,1,2,5,3,3,642.0,5,35,272,1,8,0,140000.0,1.787,1717,1650037,962.787,0.001,1,482.117,677.242,159.167,91,38,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,5,2,84.0,14260.0,0,2,0,2,3,7,4,1,12,9,1,312.5,1,4,2,2,4,2,655.0,6,490.0,1145.0,0,1145,1053,3.342,1,0,2,1,4,1,2,7,2,1,5,1,2,3,836.0,5,84,0,11,8,4,250000.0,1.92,2198,2516710,1146.92,0.001,1,574.171,807.274,169.762,8,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [149]:
df = pd.get_dummies(df, "Neighborhood", drop_first=True)

In [None]:
variable = [ ]

for col in df.columns:
    if df[col].var()>=0.002:
        variable.append(col)

In [193]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotShape,LotConfig,Condition1,HouseStyle,OverallQual,OverallCond,RoofStyle,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtUnfSF,HeatingQC,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,Functional,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageQual,MoSold,SaleType,SaleCondition,SalePrice,Floorcross_tersr,Total_Floor,_Floor_cross,Zemin_Alan_Kullanımı,Alana_düşen_Mutfak,bodrum_banyo_per,mean_1stFlrSF_GrLivArea,std_1stFlrSF_GrLivArea,Lot_divide,Home_Age,Restore_age,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,MSZoningLotFrontage_SUM,MSZoningLotFrontage_MEAN,MSZoningLotFrontage_MAX,MSZoningLotFrontage_MIN,MSZoningLotArea_MIN,MSZoningLotArea_MEAN,MSZoningLotArea_SUM,MSZoningLotArea_MAX,MSZoningGrLivArea_MEAN,MSZoningGrLivArea_MAX,MSZoningGrLivArea_MIN,MSZoningGrLivArea_STD,MSZoningGrLivArea_VAR,MSSubClassGarageArea_SUM,MSSubClassGarageArea_MEAN,MSSubClassOpenPorchSF_MIN,MSSubClassOpenPorchSF_MEAN,MSSubClassOpenPorchSF_SUM,MSSubClassOpenPorchSF_MAX,MSSubClassTotRmsAbvGrd_MEAN,MSSubClassTotRmsAbvGrd_SIZE,MSSubClassTotalBsmtSF_MEAN,MSSubClassTotalBsmtSF_SUM,MSSubClassTotalBsmtSF_MAX,MSSubClassGrLivArea_MIN,MSSubClassGrLivArea_MAX,MSSubClassGrLivArea_SIZE,MSSubClass1stFlrSF_VAR,MSSubClass1stFlrSF_MEAN,MSSubClass1stFlrSF_MAX,MSSubClass1stFlrSF_STD,MSSubClass2ndFlrSF_VAR,MSSubClass2ndFlrSF_STD,MSSubClass2ndFlrSF_MEAN,MSSubClass2ndFlrSF_MIN,MSSubClassEnclosedPorch_MIN,MSSubClassEnclosedPorch_MAX,MSSubClassHome_Age_MEAN,MSSubClassHome_Age_MAX,MSSubClassHome_Age_SUM,MSSubClassRestore_age_MEAN,MSSubClassRestore_age_VAR,MSSubClassRestore_age_MIN,MSSubClassRestore_age_MAX,MSSubClassRestore_age_SUM,MSSubClassTotal_Floor_MEAN,MSSubClassTotal_Floor_SUM,MSSubClassAlana_düşen_Mutfak_MEAN,MSSubClassAlana_düşen_Mutfak_SUM
0,1,5,2,3,4,2,3,6,4,1,12,9,1,2.294,1,4,2,2,4,2,2.849,6,2.179,0,1,0,2,1,3,1,2,2,0,3,1,2,2,5,1,8,4,208500.0,1.998,1710,6.165,2.934,0.001,1,2.634,2.781,2.117,5,7,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.184,1.85,2.063,1.398,3.356,3.985,3.62,4.232,3.148,3.751,2.524,0.133,0.018,675.696,2.725,0,78.391,4.289,406,5.585,248,2.948,2.865,3.786,3.121,3.751,248,0.011,2.984,3.671,0.103,0.007,0.083,2.943,2.642,0,252,14.629,63,3628,14.573,149.857,1,58,3614,3.277,468531,0.001,0.264
1,2,0,2,3,2,1,2,5,7,1,8,5,2,0.0,2,4,1,2,4,0,2.991,6,2.455,0,0,1,2,0,3,1,3,2,1,5,1,2,2,5,4,8,4,181500.0,1.0,1262,6.202,3.102,0.001,0,2.802,2.95,2.083,31,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3.184,1.85,2.063,1.398,3.356,3.985,3.62,4.232,3.148,3.751,2.524,0.133,0.018,1258.317,2.584,0,38.856,4.277,418,3.889,487,2.973,3.161,3.506,2.524,3.462,487,0.013,3.095,3.462,0.113,0.018,0.134,0.006,0.0,0,280,32.376,71,15767,25.879,401.242,2,62,12603,3.11,626821,0.001,0.406
2,3,5,2,0,4,2,3,6,4,1,12,9,1,2.212,1,4,2,2,4,2,2.688,6,2.638,0,1,0,2,1,3,1,2,2,1,5,1,2,2,5,8,8,4,223500.0,1.941,1786,6.216,2.965,0.001,1,2.665,2.812,2.221,7,8,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.184,1.85,2.063,1.398,3.356,3.985,3.62,4.232,3.148,3.751,2.524,0.133,0.018,675.696,2.725,0,78.391,4.289,406,5.585,248,2.948,2.865,3.786,3.121,3.751,248,0.011,2.984,3.671,0.103,0.007,0.083,2.943,2.642,0,252,14.629,63,3628,14.573,149.857,1,58,3614,3.277,468531,0.001,0.264
3,4,6,2,0,0,2,3,6,4,1,13,11,2,0.0,2,4,0,4,1,0,2.336,6,2.733,2,1,0,1,0,3,1,2,2,1,2,5,3,3,5,1,8,0,140000.0,1.787,1717,6.217,2.984,0.001,1,2.684,2.831,2.205,91,38,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.184,1.85,2.063,1.398,3.356,3.985,3.62,4.232,3.148,3.751,2.524,0.133,0.018,140.322,2.419,0,36.379,3.324,312,5.517,58,2.883,2.226,3.137,3.055,3.422,58,0.011,2.964,3.198,0.103,0.011,0.105,2.894,2.703,0,386,89.5,136,5191,32.603,556.349,3,62,1891,3.244,101637,0.001,0.066
4,5,5,2,0,2,2,3,7,4,1,12,9,1,2.496,1,4,2,2,4,2,2.817,6,2.691,0,1,0,2,1,4,1,2,2,1,5,1,2,3,5,11,8,4,250000.0,1.92,2198,6.401,3.06,0.001,1,2.76,2.908,2.232,8,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3.184,1.85,2.063,1.398,3.356,3.985,3.62,4.232,3.148,3.751,2.524,0.133,0.018,675.696,2.725,0,78.391,4.289,406,5.585,248,2.948,2.865,3.786,3.121,3.751,248,0.011,2.984,3.671,0.103,0.007,0.083,2.943,2.642,0,252,14.629,63,3628,14.573,149.857,1,58,3614,3.277,468531,0.001,0.264


In [190]:
train_df = df[df['SalePrice'].notnull()]
test_df = df[df['SalePrice'].isnull()]

In [195]:
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotShape,LotConfig,Condition1,HouseStyle,OverallQual,OverallCond,RoofStyle,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtUnfSF,HeatingQC,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,Functional,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageQual,MoSold,SaleType,SaleCondition,SalePrice,Floorcross_tersr,Total_Floor,_Floor_cross,Zemin_Alan_Kullanımı,Alana_düşen_Mutfak,bodrum_banyo_per,mean_1stFlrSF_GrLivArea,std_1stFlrSF_GrLivArea,Lot_divide,Home_Age,Restore_age,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,MSZoningLotFrontage_SUM,MSZoningLotFrontage_MEAN,MSZoningLotFrontage_MAX,MSZoningLotFrontage_MIN,MSZoningLotArea_MIN,MSZoningLotArea_MEAN,MSZoningLotArea_SUM,MSZoningLotArea_MAX,MSZoningGrLivArea_MEAN,MSZoningGrLivArea_MAX,MSZoningGrLivArea_MIN,MSZoningGrLivArea_STD,MSZoningGrLivArea_VAR,MSSubClassGarageArea_SUM,MSSubClassGarageArea_MEAN,MSSubClassOpenPorchSF_MIN,MSSubClassOpenPorchSF_MEAN,MSSubClassOpenPorchSF_SUM,MSSubClassOpenPorchSF_MAX,MSSubClassTotRmsAbvGrd_MEAN,MSSubClassTotRmsAbvGrd_SIZE,MSSubClassTotalBsmtSF_MEAN,MSSubClassTotalBsmtSF_SUM,MSSubClassTotalBsmtSF_MAX,MSSubClassGrLivArea_MIN,MSSubClassGrLivArea_MAX,MSSubClassGrLivArea_SIZE,MSSubClass1stFlrSF_VAR,MSSubClass1stFlrSF_MEAN,MSSubClass1stFlrSF_MAX,MSSubClass1stFlrSF_STD,MSSubClass2ndFlrSF_VAR,MSSubClass2ndFlrSF_STD,MSSubClass2ndFlrSF_MEAN,MSSubClass2ndFlrSF_MIN,MSSubClassEnclosedPorch_MIN,MSSubClassEnclosedPorch_MAX,MSSubClassHome_Age_MEAN,MSSubClassHome_Age_MAX,MSSubClassHome_Age_SUM,MSSubClassRestore_age_MEAN,MSSubClassRestore_age_VAR,MSSubClassRestore_age_MIN,MSSubClassRestore_age_MAX,MSSubClassRestore_age_SUM,MSSubClassTotal_Floor_MEAN,MSSubClassTotal_Floor_SUM,MSSubClassAlana_düşen_Mutfak_MEAN,MSSubClassAlana_düşen_Mutfak_SUM


In [167]:
train_df.dropna(inplace=True)

In [168]:
X =train_df.drop(["SalePrice", "Id"], axis=1)

y = np.log1p(train_df['SalePrice'])

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=17)

In [174]:
models = [("XGBoost", XGBRegressor(objective='reg:squarederror')),
          ("LightGBM", LGBMRegressor()),
           ("CatBoost", CatBoostRegressor(verbose=False)) 
                  ]

for name, regressor in models:
    rmse = np.mean(np.sqrt(-cross_val_score(regressor, X, y, cv=10, scoring="neg_mean_squared_error")))
    print(f"RMSE: {round(rmse, 4)} ({name}) ")

RMSE: 0.1225 (LR) 
RMSE: 0.1245 (Ridge) 
RMSE: 0.1989 (Lasso) 
RMSE: 0.1906 (ElasticNet) 
RMSE: 0.1954 (KNN) 
RMSE: 0.1961 (CART) 
RMSE: 0.1361 (RF) 
RMSE: 0.3007 (SVR) 
RMSE: 0.1231 (GBM) 
RMSE: 0.1327 (XGBoost) 
RMSE: 0.1294 (LightGBM) 
RMSE: 0.1192 (CatBoost) 


In [182]:
sorted_feature_importance = model.feature_importances_.argsort()
plt.barh(X.feature_names[sorted_feature_importance], 
        CatBoost.feature_importances_[sorted_feature_importance], 
        color='turquoise')
plt.xlabel("CatBoost Feature Importance")

NameError: name 'model' is not defined

In [131]:
lgbm_model = LGBMRegressor(random_state=46)

rmse = np.mean(np.sqrt(-cross_val_score(lgbm_model,
                                        X, y, cv=5, scoring="neg_mean_squared_error")))


lgbm_params = {"learning_rate": [0.01, 0.1],
               "n_estimators": [500, 1500, 2000, 5000],
               "max_bin":[255],
                'num_leaves': [7, 14, 21, 28, 31, 50],
               'max_depth': [-1, 3, 5],
               "colsample_bytree": [0.5, 0.7, 1]
             }

lgbm_gs_best = GridSearchCV(lgbm_model,
                            lgbm_params,
                            cv=3,
                            n_jobs=-1,
                            verbose=True).fit(X_train, y_train)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


KeyboardInterrupt: 

In [None]:
final_model = lgbm_model.set_params(**lgbm_gs_best.best_params_).fit(X, y)

rmse = np.mean(np.sqrt(-cross_val_score(final_model, X, y, cv=5, scoring="neg_mean_squared_error")))

In [None]:
rmse

In [None]:


catboost = CatBoostRegressor(verbose=False)

rmse = np.mean(np.sqrt(-cross_val_score(catboost,
                                        X, y, cv=10, scoring="neg_mean_squared_error")))
rmse

In [None]:
test_df 

In [185]:

catboost_params ={'iterations': [100, 150, 200,300,400,500,600,700,800,900,1000],
                     'learning_rate': [0.03, 0.1],
                    'depth': [2, 4, 6, 8],
                    'l2_leaf_reg': [0.2,0.3,0.4, 0.5, 1, 3]}

catboost_model = CatBoostRegressor(verbose=False)
catboost_best_grid = GridSearchCV(catboost_model, catboost_params, cv=10, n_jobs=-1, verbose=False).fit(X, y)

catboost_final = catboost_model.set_params(**catboost_best_grid.best_params_, random_state=17).fit(X, y)

rmse = np.mean(np.sqrt(-cross_val_score(catboost_final,
                                        X, y, cv=10, scoring="neg_mean_squared_error")))
rmse

0.11776106807437828

In [207]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal


In [191]:
test_dff = test_df.drop(["SalePrice", "Id"], axis=1)

In [192]:
y_pred = catboost_final.predict(test_dff)

No objects info loaded


In [None]:

# Yapılan LOG dönüşümünün tersinin (inverse'nin) alınması
new_y= np.expm1(y_pred)

In [None]:
np.sqrt(mean_squared_error(new_y_test, new_y))

In [None]:

#final_model = lgbm_model.set_params(**lgbm_gs_best.best_params_).fit(X, y)

#rmse = np.mean(np.sqrt(-cross_val_score(final_model, X, y, cv=10, scoring="neg_mean_squared_error")))
#rmse

In [None]:
 The best score across ALL searched params:
 0.8803054783503172

 The best parameters across ALL searched params:
 {'max_depth': 4, 'n_estimators': 300}