#### Gerekli Kütüphaneler

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from typing import Optional
    
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans

- Pandas ayarlar

In [44]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

#### Veri setlerini yükleme

In [51]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

#### Veriyi inceleme

In [None]:
train.head()

In [None]:
train.describe().T

- Fonksiyon tanımlamaları

In [4]:
# Verisetinin değişkenlerini tespit etme                                                                                     
def grab_col_names(df, cat_th=10, car_th=20):

    # Categorical Columns
    cat_cols = [col for col in df.columns if df[col].dtypes == "O"]
    num_but_cat = [col for col in df.columns if df[col].nunique() < cat_th and df[col].dtypes != "O"]
    cat_but_car = [col for col in df.columns if df[col].nunique() > car_th and df[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # Numerical Columns
    num_cols = [col for col in df.columns if df[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    # Results
    print(f"Observations: {df.shape[0]}")
    print(f"Variables: {df.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, cat_but_car, num_cols    

In [5]:
# Kategorik değişken analizi                              
def categorical_value_counts(df, col, target, rare: Optional[float]=None):   
    temp = df.groupby(col).agg(Count = (col, lambda x:x.count()), \
                          Ratio = (col, lambda x:x.count() / len(df)), \
                          Target_Ratio = (target, lambda x:x.sum() / df[target].sum())) \
                          .sort_values("Count", ascending = False).reset_index()
    
    if rare is not None:
        rares = temp.loc[temp["Ratio"] <= float(rare), col].tolist()
        df.loc[df[col].isin(rares), col] = "Rare Category"
        print("---- Done! --- ")
        print(df.groupby(col).agg(Count = (col, lambda x:x.count()), \
                          Ratio = (col, lambda x:x.count() / len(df)), \
                          Target_Ratio = (target, lambda x:x.sum() / df[target].sum())) \
                          .sort_values("Count", ascending = False).reset_index(), "\n")
    else:
        print(temp, "\n")

In [6]:
def outliers(df, col, low_Quantile = 0.25, high_Quantile = 0.75, adjust = False):  
    Q1 = df[col].quantile(low_Quantile)
    Q3 = df[col].quantile(high_Quantile)
    IQR = Q3 - Q1
    low_Limit = Q1 - (1.5 * IQR)
    up_Limit = Q3 + (1.5 * IQR)
    
    if len(df[df[col] > up_Limit]) > 0:
        print(col, ": Higher Outlier!")
    if len(df[df[col] < low_Limit]) > 0:
        print(col, ": Lower Outlier!")
        
    if adjust:
        df.loc[(df[col] < low_Limit), col] = low_Limit
        df.loc[(df[col] > up_Limit), col] = up_Limit
        print(col, ": Done!")

In [7]:
cat_cols, cat_but_car, num_cols = grab_col_names(train)

Observations: 1460
Variables: 81
cat_cols: 53
num_cols: 27
cat_but_car: 1
num_but_cat: 11


In [32]:
for col in cat_cols:
    categorical_value_counts(train, col, "SalePrice")

        MSZoning  Count     Ratio  Target_Ratio
0             RL   1151  0.788356      0.832296
1             RM    218  0.149315      0.104250
2  Rare Category     91  0.062329      0.063454 

  Street  Count    Ratio  Target_Ratio
0   Pave   1454  0.99589      0.997043
1   Grvl      6  0.00411      0.002957 

           Alley  Count     Ratio  Target_Ratio
0        Unknown   1369  0.937671      0.950788
1  Rare Category     91  0.062329      0.049212 

        LotShape  Count     Ratio  Target_Ratio
0            Reg    925  0.633562      0.576949
1            IR1    484  0.331507      0.377646
2  Rare Category     51  0.034932      0.045405 

     LandContour  Count     Ratio  Target_Ratio
0            Lvl   1311  0.897945      0.894285
1  Rare Category    149  0.102055      0.105715 

  Utilities  Count     Ratio  Target_Ratio
0    AllPub   1459  0.999315      0.999479
1    NoSeWa      1  0.000685      0.000521 

       LotConfig  Count     Ratio  Target_Ratio
0         Inside   105

In [31]:
categorical_value_counts(train, "MSZoning", "SalePrice", 0.05)
# Street
categorical_value_counts(train, "Alley", "SalePrice", 0.05)
categorical_value_counts(train, "LotShape", "SalePrice", 0.05)
categorical_value_counts(train, "LandContour", "SalePrice", 0.05)
#Utilities
categorical_value_counts(train, "LotConfig", "SalePrice", 0.05) # 0.07
categorical_value_counts(train, "LandSlope", "SalePrice", 0.05)
categorical_value_counts(train, "Condition1", "SalePrice", 0.06)
categorical_value_counts(train, "Condition2", "SalePrice", 0.05)
categorical_value_counts(train, "BldgType", "SalePrice", 0.07) # 0.09
categorical_value_counts(train, "HouseStyle", "SalePrice", 0.05)
categorical_value_counts(train, "RoofStyle", "SalePrice", 0.05)
categorical_value_counts(train, "RoofMatl", "SalePrice", 0.05)
categorical_value_counts(train, "Exterior1st", "SalePrice", 0.05)
categorical_value_counts(train, "Exterior2nd", "SalePrice", 0.05)
#MasVnrType
categorical_value_counts(train, "ExterQual", "SalePrice", 0.05)
categorical_value_counts(train, "ExterCond", "SalePrice", 0.05)
categorical_value_counts(train, "Foundation", "SalePrice", 0.05)
categorical_value_counts(train, "BsmtQual", "SalePrice", 0.05)
categorical_value_counts(train, "BsmtCond", "SalePrice", 0.05)
#BsmtExposure
#BsmtFinType1
categorical_value_counts(train, "BsmtFinType2", "SalePrice", 0.05)
categorical_value_counts(train, "Heating", "SalePrice", 0.05)
categorical_value_counts(train, "HeatingQC", "SalePrice", 0.05)
#CentralAir
categorical_value_counts(train, "Electrical", "SalePrice", 0.1)
#KitchenQual
categorical_value_counts(train, "Functional", "SalePrice", 0.05)
categorical_value_counts(train, "FireplaceQu", "SalePrice", 0.05)
categorical_value_counts(train, "GarageType", "SalePrice", 0.1)
#GarageFinish
categorical_value_counts(train, "GarageQual", "SalePrice", 0.05)
categorical_value_counts(train, "GarageCond", "SalePrice", 0.05)
categorical_value_counts(train, "PavedDrive", "SalePrice", 0.1)
categorical_value_counts(train, "PoolQC", "SalePrice", 0.05)
categorical_value_counts(train, "Fence", "SalePrice", 0.05)
categorical_value_counts(train, "MiscFeature", "SalePrice", 0.05)
categorical_value_counts(train, "SaleType", "SalePrice", 0.05)
categorical_value_counts(train, "SaleCondition", "SalePrice", 0.05)
#OverallCond
categorical_value_counts(train, "BsmtFullBath", "SalePrice", 0.05)
categorical_value_counts(train, "BsmtHalfBath", "SalePrice", 0.06)
#categorical_value_counts(train, "FullBath", "SalePrice", 0.05)
#HalfBath
categorical_value_counts(train, "Alley", "SalePrice", 0.05)
#BedroomAbvGr
#KitchenAbvGr
#Fireplaces
#GarageCars
#PoolArea
#YrSold

---- Done! --- 
        MSZoning  Count     Ratio  Target_Ratio
0             RL   1151  0.788356      0.832296
1             RM    218  0.149315      0.104250
2  Rare Category     91  0.062329      0.063454 

---- Done! --- 
           Alley  Count     Ratio  Target_Ratio
0        Unknown   1369  0.937671      0.950788
1  Rare Category     91  0.062329      0.049212 

---- Done! --- 
        LotShape  Count     Ratio  Target_Ratio
0            Reg    925  0.633562      0.576949
1            IR1    484  0.331507      0.377646
2  Rare Category     51  0.034932      0.045405 

---- Done! --- 
     LandContour  Count     Ratio  Target_Ratio
0            Lvl   1311  0.897945      0.894285
1  Rare Category    149  0.102055      0.105715 

---- Done! --- 
       LotConfig  Count     Ratio  Target_Ratio
0         Inside   1052  0.720548      0.704684
1         Corner    263  0.180137      0.180836
2        CulDSac     94  0.064384      0.079662
3  Rare Category     51  0.034932      0.034817 

In [43]:
train.isnull().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       8
MasVnrArea       8
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [30]:
RS = RobustScaler()
train["LotFrontage"] = RS.fit_transform(train[["LotFrontage"]])
train["LotFrontage"] = train["LotFrontage"].fillna(train["LotFrontage"].mean())
train["Alley"] = train["Alley"].fillna("Unknown")
train["BsmtQual"] = train["BsmtQual"].fillna("Unknown")
train["BsmtCond"] = train["BsmtCond"].fillna("TA")
train["BsmtExposure"] = train["BsmtExposure"].fillna("Unknown")
train["BsmtFinType1"] = train["BsmtFinType1"].fillna("Unknown")
train["BsmtFinType2"] = train["BsmtFinType2"].fillna("Unf")
train["Electrical"] = train["Electrical"].fillna("SBrkr")
train["FireplaceQu"] = train["FireplaceQu"].fillna("Unknown")
train["GarageType"] = train["GarageType"].fillna("Unknown")
#GarageYrBlt
train["GarageFinish"] = train["GarageFinish"].fillna("Unknown")
train["GarageQual"] = train["GarageQual"].fillna("TA")
train["GarageCond"] = train["GarageCond"].fillna("TA")
train["PoolQC"] = train["PoolQC"].fillna("None")
train["Fence"] = train["Fence"].fillna("Unknown")
train["MiscFeature"] = train["MiscFeature"].fillna("Unknown")
indexes = train[train["GarageYrBlt"].isnull()].index
for i in indexes:
    train.loc[i, "GarageYrBlt"] = train.loc[i, "YearBuilt"]

In [53]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [52]:
train["FullBath"].value_counts()

2    768
1    650
3     33
0      9
Name: FullBath, dtype: int64

In [54]:
train[cat_cols].head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,OverallCond,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,Fireplaces,GarageCars,PoolArea,YrSold
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,,Attchd,RFn,TA,TA,Y,,,,WD,Normal,5,1,0,2,1,3,1,0,2,0,2008
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Feedr,Norm,1Fam,1Story,Gable,CompShg,MetalSd,MetalSd,,TA,TA,CBlock,Gd,TA,Gd,ALQ,Unf,GasA,Ex,Y,SBrkr,TA,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal,8,0,1,2,0,3,1,1,2,0,2007
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Mn,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal,5,1,0,2,1,3,1,1,2,0,2008
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,1Fam,2Story,Gable,CompShg,Wd Sdng,Wd Shng,,TA,TA,BrkTil,TA,Gd,No,ALQ,Unf,GasA,Gd,Y,SBrkr,Gd,Typ,Gd,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml,5,1,0,1,0,3,1,1,3,0,2006
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal,5,1,0,2,1,4,1,1,3,0,2008


In [None]:
pd.get_dummies(data=train, 
               columns=["MSZoning", "Street", "Alley", 
                        "LotShape", "LandContour", "Utilities", 
                        "LotConfig", "LandSlope", "Neighborhood", 
                        "Condition1", "Condition2", "BldgType", 
                        "HouseStyle"])

In [35]:
X_train = train.drop("SalePrice", axis=1)
y_train = train["SalePrice"]

In [36]:
CB_model = CatBoostRegressor(verbose=False)
CB_model.fit(X_train, y_train)
print("Train Score:", CB_model.score(X_train, y_train))
y_pred = CB_model.predict(X_train)
print("MAPE:", mean_absolute_percentage_error(y_train, y_pred))
print("MAE:", mean_absolute_error(y_train, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_pred)))

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=2]="RL": Cannot convert 'b'RL'' to float

In [37]:
LGBM_model = LGBMRegressor()
LGBM_model.fit(X_train, y_train)
print("Train Score:", LGBM_model.score(X_train, y_train))
y_pred = LGBM_model.predict(X_train)
print("MAPE:", mean_absolute_percentage_error(y_train, y_pred))
print("MAE:", mean_absolute_error(y_train, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_pred)))

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrType, ExterQual, ExterCond, Foundation, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, Heating, HeatingQC, CentralAir, Electrical, BsmtFullBath, BsmtHalfBath, FullBath, KitchenQual, Functional, FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond, PavedDrive, PoolQC, Fence, MiscFeature, SaleType, SaleCondition

In [None]:
RS = RobustScaler()
test["LotFrontage"] = RS.fit_transform(test[["LotFrontage"]])

In [None]:
test["LotFrontage"] = test["LotFrontage"].fillna(test["LotFrontage"].mean())

In [None]:
test["MasVnrArea"] = test["MasVnrArea"].fillna(0)

In [None]:
test["RoofStyle"] = test["RoofStyle"].apply(lambda x: "Rare" if x in ["Shed", "Mansard", "Gambrel", "Flat"] else x)
test = pd.get_dummies(data=test, columns=["RoofStyle"], drop_first=True)

In [None]:
test.isnull().sum()

In [None]:
#y_test = test["SalePrice"]
X_test = test[["MSSubClass", "LotArea", "OverallQual", 
                 "OverallCond", "YearBuilt", "TotalBsmtSF", 
                 "1stFlrSF", "2ndFlrSF", "BsmtFullBath", 
                 "FullBath", "BedroomAbvGr", "KitchenAbvGr", "RoofStyle_Rare",
                 "TotRmsAbvGrd", "GarageCars", "GarageYrBlt", "RoofStyle_Hip",
                  "ScreenPorch", "YrSold", "LotFrontage", "YearRemodAdd",
                 "MiscVal", "3SsnPorch", "WoodDeckSF", "MasVnrArea",
                 "MoSold", "EnclosedPorch", "OpenPorchSF", "GarageArea"]]

In [None]:
y_pred_test = CB_model.predict(X_test)

In [None]:
y_pred_test

In [None]:
ID = test["Id"]

In [None]:
submission = pd.DataFrame({"Id": test["Id"], "SalePrice": y_pred_test})

In [None]:
submission.to_csv("pred3.csv", index=False)

In [None]:
test.head()

In [None]:
sample_submission

In [None]:
LR = LinearRegression()
LR.fit(X_train, y_train)
print("Train Score:", LR.score(X_train, y_train))
y_pred = LR.predict(X_train)
print("MAPE:", mean_absolute_percentage_error(y_train, y_pred))
print("MAE:", mean_absolute_error(y_train, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_pred)))
#print("Test Score:", LR.score(X_test_, y_test_))
#y_pred_test = LR.predict(X_test_)
#print("Test:", mean_absolute_percentage_error(y_test_, y_pred_test))