#### Gerekli Kütüphaneler

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from typing import Optional
    
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans

- Pandas ayarlar

In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

#### Veri setlerini yükleme

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

#### Veriyi inceleme

In [None]:
train.head()

In [None]:
train.describe().T

- Fonksiyon tanımlamaları

In [None]:
# Verisetinin değişkenlerini tespit etme                                                                                     
def grab_col_names(df, cat_th=10, car_th=20):

    # Categorical Columns
    cat_cols = [col for col in df.columns if df[col].dtypes == "O"]
    num_but_cat = [col for col in df.columns if df[col].nunique() < cat_th and df[col].dtypes != "O"]
    cat_but_car = [col for col in df.columns if df[col].nunique() > car_th and df[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # Numerical Columns
    num_cols = [col for col in df.columns if df[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    # Results
    print(f"Observations: {df.shape[0]}")
    print(f"Variables: {df.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, cat_but_car, num_cols    

In [None]:
train.Alley.count()

In [4]:
# Kategorik değişken analizi                              
def categorical_value_counts(df, col, target: None, rare: Optional[float] = None):
    temp = df.groupby(col, dropna=False).agg(Count=(col, lambda x: x.isnull().count()), \
                               Ratio=(col, lambda x: x.isnull().count() / len(df)), \
                               Target_Ratio=(target, lambda x: x.sum() / df[target].sum())) \
        .sort_values("Count", ascending=False).reset_index()

    if rare is not None:
        rares = temp.loc[temp["Ratio"] <= float(rare), col].tolist()
        df.loc[df[col].isin(rares), col] = "Rare Category"
        print("---- Done! --- ")
        temp = df.groupby(col).agg(Count=(col, lambda x: x.isnull().count()), \
                                  Ratio=(col, lambda x: x.count() / len(df)), \
                                  Target_Ratio=(target, lambda x: x.sum() / df[target].sum())) \
              .sort_values("Count", ascending=False).reset_index()
    return temp

In [None]:
temp = categorical_value_counts(train, "Alley", "SalePrice")
cats = temp[temp["Ratio"] < 0.07]["Alley"]

In [None]:
def outliers(df, col, low_Quantile = 0.25, high_Quantile = 0.75, adjust = False):  
    Q1 = df[col].quantile(low_Quantile)
    Q3 = df[col].quantile(high_Quantile)
    IQR = Q3 - Q1
    low_Limit = Q1 - (1.5 * IQR)
    up_Limit = Q3 + (1.5 * IQR)
    
    if len(df[df[col] > up_Limit]) > 0:
        print(col, ": Higher Outlier!")
    if len(df[df[col] < low_Limit]) > 0:
        print(col, ": Lower Outlier!")
        
    if adjust:
        df.loc[(df[col] < low_Limit), col] = low_Limit
        df.loc[(df[col] > up_Limit), col] = up_Limit
        print(col, ": Done!")

In [None]:
cat_cols, cat_but_car, num_cols = grab_col_names(train)

In [None]:
for col in cat_cols:
    categorical_value_counts(train, col, "SalePrice")

In [None]:
RS = RobustScaler()

In [None]:
RS = RobustScaler()

train["MSSubClass"] = RS.fit_transform(train[["MSSubClass"]])
train["LotFrontage"] = RS.fit_transform(train[["LotFrontage"]])
train["LotFrontage"] = train["LotFrontage"].fillna(train["LotFrontage"].mean())
train["New_OverallQual"] = pd.cut(train["OverallQual"], bins=[-1, 5, 8, 11], labels=[0, 1, 2])
train["New_OverallCond"] = pd.cut(train["OverallCond"], bins=[0, 5, 8, 10], labels=[0, 1, 2])
train["New_YearBuilt"] = pd.cut(train["YearBuilt"], bins=[1871, 1943, 1990, 2011], labels=[0, 1, 2])
train["New_YearRemodAdd"] = pd.cut(train["YearRemodAdd"], bins=[1949, 1990, 2011], labels=[0, 1])
train["MasVnrType"] = train["MasVnrType"].fillna("None")
train["MasVnrArea"] = train["MasVnrArea"].fillna(0)
train["MasVnrArea"] = RS.fit_transform(train[["MasVnrArea"]])
train["BsmtFinSF1"] = RS.fit_transform(train[["BsmtFinSF1"]])
train["BsmtFinSF2"] = RS.fit_transform(train[["BsmtFinSF2"]])
train["BsmtUnfSF"] = RS.fit_transform(train[["BsmtUnfSF"]])
train["2ndFlrSF"] = train["2ndFlrSF"].apply(lambda x: 0 if x == 0 else 1) 
train["LowQualFinSF"] = train["LowQualFinSF"].apply(lambda x: 0 if x == 0 else 1) 

train["LotArea"] = RS.fit_transform(train[["LotArea"]])
train["Alley"] = train["Alley"].fillna("Unknown")
train["BsmtQual"] = train["BsmtQual"].fillna("Unknown")
train["BsmtCond"] = train["BsmtCond"].fillna("TA")
train["BsmtExposure"] = train["BsmtExposure"].fillna("Unknown")
train["BsmtFinType1"] = train["BsmtFinType1"].fillna("Unknown")
train["BsmtFinType2"] = train["BsmtFinType2"].fillna("Unf")
train["Electrical"] = train["Electrical"].fillna("SBrkr")
train["FireplaceQu"] = train["FireplaceQu"].fillna("Unknown")
train["GarageType"] = train["GarageType"].fillna("None")
train["GarageYrBlt"] = train["GarageYrBlt"].fillna("0")
train["GarageFinish"] = train["GarageFinish"].fillna("None")
train["GarageQual"] = train["GarageQual"].fillna("None")
train["GarageCond"] = train["GarageCond"].fillna("None")
train["PoolQC"] = train["PoolQC"].fillna("None")
train["Fence"] = train["Fence"].fillna("Unknown")
train["MiscFeature"] = train["MiscFeature"].fillna("None")

In [None]:
categorical_value_counts(train, "MSZoning", "SalePrice", 0.05)
# Street
categorical_value_counts(train, "Alley", "SalePrice", 0.05)
categorical_value_counts(train, "LotShape", "SalePrice", 0.05)
categorical_value_counts(train, "LandContour", "SalePrice", 0.05)
#Utilities
categorical_value_counts(train, "LotConfig", "SalePrice", 0.05) # 0.07
categorical_value_counts(train, "LandSlope", "SalePrice", 0.05)
categorical_value_counts(train, "Condition1", "SalePrice", 0.06)
categorical_value_counts(train, "Condition2", "SalePrice", 0.05)
categorical_value_counts(train, "BldgType", "SalePrice", 0.07) # 0.09
categorical_value_counts(train, "HouseStyle", "SalePrice", 0.05)
categorical_value_counts(train, "RoofStyle", "SalePrice", 0.05)
categorical_value_counts(train, "RoofMatl", "SalePrice", 0.05)
categorical_value_counts(train, "Exterior1st", "SalePrice", 0.05)
categorical_value_counts(train, "Exterior2nd", "SalePrice", 0.05)
#MasVnrType
categorical_value_counts(train, "ExterQual", "SalePrice", 0.05)
categorical_value_counts(train, "ExterCond", "SalePrice", 0.05)
categorical_value_counts(train, "Foundation", "SalePrice", 0.05)
categorical_value_counts(train, "BsmtQual", "SalePrice", 0.05)
categorical_value_counts(train, "BsmtCond", "SalePrice", 0.05)
#BsmtExposure
#BsmtFinType1
categorical_value_counts(train, "BsmtFinType2", "SalePrice", 0.05)
categorical_value_counts(train, "Heating", "SalePrice", 0.05)
categorical_value_counts(train, "HeatingQC", "SalePrice", 0.05)
#CentralAir
categorical_value_counts(train, "Electrical", "SalePrice", 0.1)
#KitchenQual
categorical_value_counts(train, "Functional", "SalePrice", 0.05)
categorical_value_counts(train, "FireplaceQu", "SalePrice", 0.05)
categorical_value_counts(train, "GarageType", "SalePrice", 0.1)
#GarageFinish
categorical_value_counts(train, "GarageQual", "SalePrice", 0.05)
categorical_value_counts(train, "GarageCond", "SalePrice", 0.05)
categorical_value_counts(train, "PavedDrive", "SalePrice", 0.1)
categorical_value_counts(train, "PoolQC", "SalePrice", 0.05)
categorical_value_counts(train, "Fence", "SalePrice", 0.05)
categorical_value_counts(train, "MiscFeature", "SalePrice", 0.05)
categorical_value_counts(train, "SaleType", "SalePrice", 0.05)
categorical_value_counts(train, "SaleCondition", "SalePrice", 0.05)
#OverallCond
categorical_value_counts(train, "BsmtFullBath", "SalePrice", 0.05)
categorical_value_counts(train, "BsmtHalfBath", "SalePrice", 0.06)
#categorical_value_counts(train, "FullBath", "SalePrice", 0.05)
#HalfBath
categorical_value_counts(train, "Alley", "SalePrice", 0.05)
#BedroomAbvGr
#KitchenAbvGr
#Fireplaces
#GarageCars
#PoolArea
#YrSold

In [None]:
train.isnull().sum()

In [None]:
train.info()

In [None]:
train["LotFrontage"] = train["LotFrontage"].fillna(0)

In [None]:
col = "LotFrontage"
print("Min:", train[col].min())
print("Max:", train[col].max())
print("Null Count:", train[col].isnull().sum())
print("Mean:", train[col].mean())
plt.ylim(-10, 1500)
plt.hist(train[col], bins=50);

In [24]:
RS = RobustScaler()

In [5]:
col = "GarageFinish"
print("Min:", train[col].min())
print("Max:", train[col].max())
print("Null Count:", train[col].isnull().sum())
print("Mean:", train[col].mean())
plt.ylim(-10, 1500)
plt.hist(train[col], bins=50);

TypeError: '<=' not supported between instances of 'str' and 'float'

In [6]:
categorical_value_counts(train, "GarageFinish", "SalePrice")

Unnamed: 0,GarageFinish,Count,Ratio,Target_Ratio
0,Unf,605,0.414384,0.325596
1,RFn,422,0.289041,0.322827
2,Fin,352,0.241096,0.319895
3,,81,0.055479,0.031682


In [7]:
train.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1.5Fin,5,5,1993,1995,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,Wood,Gd,TA,No,GLQ,732,Unf,0,64,796,GasA,Ex,Y,SBrkr,796,566,0,1362,1,0,1,1,1,1,TA,5,Typ,0,,Attchd,1993.0,Unf,2,480,TA,TA,Y,40,30,0,320,0,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,1Fam,1Story,8,5,2004,2005,Gable,CompShg,VinylSd,VinylSd,Stone,186.0,Gd,TA,PConc,Ex,TA,Av,GLQ,1369,Unf,0,317,1686,GasA,Ex,Y,SBrkr,1694,0,0,1694,1,0,2,0,3,1,Gd,7,Typ,1,Gd,Attchd,2004.0,RFn,2,636,TA,TA,Y,255,57,0,0,0,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,PosN,Norm,1Fam,2Story,7,6,1973,1973,Gable,CompShg,HdBoard,HdBoard,Stone,240.0,TA,TA,CBlock,Gd,TA,Mn,ALQ,859,BLQ,32,216,1107,GasA,Ex,Y,SBrkr,1107,983,0,2090,1,0,2,1,3,1,TA,7,Typ,2,TA,Attchd,1973.0,RFn,2,484,TA,TA,Y,235,204,228,0,0,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Artery,Norm,1Fam,1.5Fin,7,5,1931,1950,Gable,CompShg,BrkFace,Wd Shng,,0.0,TA,TA,BrkTil,TA,TA,No,Unf,0,Unf,0,952,952,GasA,Gd,Y,FuseF,1022,752,0,1774,0,0,2,0,2,2,TA,8,Min1,2,TA,Detchd,1931.0,Unf,2,468,Fa,TA,Y,90,0,205,0,0,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,Artery,Artery,2fmCon,1.5Unf,5,6,1939,1950,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,BrkTil,TA,TA,No,GLQ,851,Unf,0,140,991,GasA,Ex,Y,SBrkr,1077,0,0,1077,1,0,1,0,2,2,TA,5,Typ,2,TA,Attchd,1939.0,RFn,1,205,Gd,TA,Y,0,4,0,0,0,0,,,,0,1,2008,WD,Normal,118000


In [None]:
dummied1=pd.get_dummies(data=train, 
               columns=["MSZoning", "Street", "Alley", 
                        "LotShape", "LandContour", "Utilities", 
                        "LotConfig", "LandSlope", "Neighborhood", 
                        "Condition1", "Condition2", "BldgType", 
                        "HouseStyle", "RoofStyle", "RoofMatl", 
                        "Exterior1st", "Exterior2nd", "MasVnrType", 
                        "ExterQual", "ExterCond", "Foundation", 
                        "BsmtQual", "BsmtCond", "BsmtExposure", 
                        "BsmtFinType1", "BsmtFinType2", "Heating", 
                        "HeatingQC", "CentralAir", "Electrical", 
                        "BsmtHalfBath", "KitchenQual", "Functional", 
                        "FireplaceQu", "GarageType", "GarageFinish", 
                        "GarageQual", "GarageCond", "PavedDrive", 
                        "PoolQC", "Fence", "MiscFeature", 
                        "SaleType", "SaleCondition"],
              drop_first=True)

In [None]:
dummied1.dtypes

In [None]:
dummied1.shape

In [None]:
dummied1.isnull().sum()

In [None]:
dummied2=pd.get_dummies(data=train, drop_first=False)

In [None]:
dummied2.shape

In [None]:
X_train = dummied1.drop("SalePrice", axis=1)
y_train = dummied1["SalePrice"]

In [None]:
CB_model = CatBoostRegressor(verbose=False)
CB_model.fit(X_train, y_train)
print("Train Score:", CB_model.score(X_train, y_train))
y_pred = CB_model.predict(X_train)
print("MAPE:", mean_absolute_percentage_error(y_train, y_pred))
print("MAE:", mean_absolute_error(y_train, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_pred)))

In [None]:
LGBM_model = LGBMRegressor()
LGBM_model.fit(X_train, y_train)
print("Train Score:", LGBM_model.score(X_train, y_train))
y_pred = LGBM_model.predict(X_train)
print("MAPE:", mean_absolute_percentage_error(y_train, y_pred))
print("MAE:", mean_absolute_error(y_train, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_pred)))

In [None]:
RS = RobustScaler()
test["LotFrontage"] = RS.fit_transform(test[["LotFrontage"]])

In [None]:
test["LotFrontage"] = test["LotFrontage"].fillna(test["LotFrontage"].mean())

In [None]:
test["MasVnrArea"] = test["MasVnrArea"].fillna(0)

In [None]:
test["RoofStyle"] = test["RoofStyle"].apply(lambda x: "Rare" if x in ["Shed", "Mansard", "Gambrel", "Flat"] else x)
test = pd.get_dummies(data=test, columns=["RoofStyle"], drop_first=True)

In [None]:
test.isnull().sum()

In [None]:
#y_test = test["SalePrice"]
X_test = test[["MSSubClass", "LotArea", "OverallQual", 
                 "OverallCond", "YearBuilt", "TotalBsmtSF", 
                 "1stFlrSF", "2ndFlrSF", "BsmtFullBath", 
                 "FullBath", "BedroomAbvGr", "KitchenAbvGr", "RoofStyle_Rare",
                 "TotRmsAbvGrd", "GarageCars", "GarageYrBlt", "RoofStyle_Hip",
                  "ScreenPorch", "YrSold", "LotFrontage", "YearRemodAdd",
                 "MiscVal", "3SsnPorch", "WoodDeckSF", "MasVnrArea",
                 "MoSold", "EnclosedPorch", "OpenPorchSF", "GarageArea"]]

In [None]:
y_pred_test = CB_model.predict(X_test)

In [None]:
y_pred_test

In [None]:
ID = test["Id"]

In [None]:
submission = pd.DataFrame({"Id": test["Id"], "SalePrice": y_pred_test})

In [None]:
submission.to_csv("pred3.csv", index=False)

In [None]:
test.head()

In [None]:
sample_submission

In [None]:
LR = LinearRegression()
LR.fit(X_train, y_train)
print("Train Score:", LR.score(X_train, y_train))
y_pred = LR.predict(X_train)
print("MAPE:", mean_absolute_percentage_error(y_train, y_pred))
print("MAE:", mean_absolute_error(y_train, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_pred)))
#print("Test Score:", LR.score(X_test_, y_test_))
#y_pred_test = LR.predict(X_test_)
#print("Test:", mean_absolute_percentage_error(y_test_, y_pred_test))