#### Gerekli Kütüphaneler

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from typing import Optional
    
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans

- Pandas ayarları

In [2]:
# Lift pandas' display truncation so wide and long frames render in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

#### Veri setlerini yükleme

In [3]:
# Load the CSV files from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Keep untouched snapshots of the loaded frames: the preprocessing section
# restarts from `train_.copy()` / `test_.copy()`, and without these two
# assignments those names do not exist on a fresh kernel.
train_ = train.copy()
test_ = test.copy()

#### Veriyi inceleme

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics, transposed so that each described column is one row.
train.describe().T

- Fonksiyon tanımlamaları

In [None]:
# Identify the variable types of the dataset
def grab_col_names(df, cat_th=10, car_th=20):
    """Split the columns of ``df`` into categorical, cardinal and numerical groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A non-object column with fewer than ``cat_th`` distinct values is
        treated as categorical.
    car_th : int, default 20
        An object column with more than ``car_th`` distinct values is treated
        as cardinal and left out of the categorical list.

    Returns
    -------
    tuple of list
        ``(cat_cols, cat_but_car, num_cols)``. A short size report is printed
        as a side effect.
    """
    # One pass over the columns: object dtype on one side, everything else on the other.
    object_cols = []
    non_object_cols = []
    for name in df.columns:
        if df[name].dtypes == "O":
            object_cols.append(name)
        else:
            non_object_cols.append(name)

    # Low-cardinality non-object columns behave like categories;
    # high-cardinality object columns are set aside as "cardinal".
    num_but_cat = [name for name in non_object_cols if df[name].nunique() < cat_th]
    cat_but_car = [name for name in object_cols if df[name].nunique() > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    report = (
        ("Observations", df.shape[0]),
        ("Variables", df.shape[1]),
        ("cat_cols", len(cat_cols)),
        ("num_cols", len(num_cols)),
        ("cat_but_car", len(cat_but_car)),
        ("num_but_cat", len(num_but_cat)),
    )
    for label, value in report:
        print(f"{label}: {value}")

    return cat_cols, cat_but_car, num_cols

In [4]:
# Categorical variable analysis
def _summarize_category(df, col, target):
    """Return per-category row count, row share and share of the summed ``target``."""
    # dropna=False keeps missing values as their own category.
    summary = df.groupby(col, dropna=False)[target].agg(["size", "sum"])
    summary = summary.rename(columns={"size": "Count", "sum": "Target_Ratio"})
    summary.insert(1, "Ratio", summary["Count"] / len(df))
    summary["Target_Ratio"] = summary["Target_Ratio"] / df[target].sum()
    return summary.sort_values("Count", ascending=False).reset_index()


def categorical_value_counts(df, col, target: str, rare: Optional[float] = None):
    """Summarise the categories of ``col`` and optionally merge the rare ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. When ``rare`` is given it is modified IN PLACE.
    col : str
        Name of the categorical column.
    target : str
        Name of the numeric target column used for ``Target_Ratio``.
    rare : float, optional
        Categories whose row share is ``<= rare`` are overwritten with the
        label ``"Rare Category"`` in ``df[col]``.

    Returns
    -------
    pandas.DataFrame
        Columns ``[col, "Count", "Ratio", "Target_Ratio"]`` sorted by
        ``Count`` descending. When ``rare`` is given the summary is computed
        after the merge. Missing values are reported as their own category in
        both cases.
    """
    summary = _summarize_category(df, col, target)
    if rare is None:
        return summary

    rare_values = summary.loc[summary["Ratio"] <= float(rare), col].tolist()
    if rare_values:
        # A numeric column cannot hold the string label; switch it to object
        # first instead of relying on an implicit upcast during assignment.
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].astype(object)
        df.loc[df[col].isin(rare_values), col] = "Rare Category"
    print("---- Done! --- ")
    return _summarize_category(df, col, target)

In [None]:
def outliers(df, col, low_Quantile=0.25, high_Quantile=0.75, adjust=False):
    """Report and optionally cap values outside the 1.5 * IQR fences of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. Modified IN PLACE when ``adjust`` is true.
    col : str
        Name of the numeric column to inspect.
    low_Quantile, high_Quantile : float
        Quantiles used as Q1 and Q3 when building the fences.
    adjust : bool, default False
        When true, values below the lower fence are set to the lower fence
        and values above the upper fence are set to the upper fence.

    Returns
    -------
    None
        Findings are printed.
    """
    values = df[col]
    q_low = values.quantile(low_Quantile)
    q_high = values.quantile(high_Quantile)
    spread = q_high - q_low
    low_Limit = q_low - (1.5 * spread)
    up_Limit = q_high + (1.5 * spread)

    # `.any()` answers "is there at least one?" without materialising a
    # filtered copy of the whole frame.
    if (values > up_Limit).any():
        print(col, ": Higher Outlier!")
    if (values < low_Limit).any():
        print(col, ": Lower Outlier!")

    if adjust:
        # Series.mask upcasts when needed, so an integer column can receive a
        # fractional fence; writing the float through `.loc` into an int64
        # column is an incompatible-dtype assignment.
        capped = values.mask(values < low_Limit, low_Limit)
        capped = capped.mask(capped > up_Limit, up_Limit)
        df[col] = capped
        print(col, ": Done!")

In [None]:
def on_isleme(col, navalue=None, rare=None, scale=None):
    """Preprocess ``col`` the same way in the global ``train`` and ``test`` frames.

    Both frames are modified in place. Only ``int64`` and object columns of
    ``train`` are handled; any other dtype is left untouched.

    Parameters
    ----------
    col : str
        Column present in both ``train`` and ``test``.
    navalue : scalar, optional
        Replacement for missing values. Nothing is filled when omitted.
    rare : float, optional
        Object columns only: categories whose share of the training rows is
        below ``rare`` become ``"RareCat"`` in both frames. Skipped when omitted.
    scale : {"binary"} or scaler object, optional
        Integer columns only. ``"binary"`` maps 0 to 0 and everything else
        to 1. Any other non-None value is used as a scaler exposing
        ``fit_transform`` / ``transform``. Skipped when omitted.
    """
    if train[col].dtype == "int64":
        if navalue is not None:
            train[col] = train[col].fillna(navalue)
            # Fill the test column from the test frame itself; taking it from
            # `train` would overwrite the test data with training rows.
            test[col] = test[col].fillna(navalue)
        if scale == "binary":
            train[col] = train[col].apply(lambda x: 0 if x == 0 else 1)
            test[col] = test[col].apply(lambda x: 0 if x == 0 else 1)
        elif scale is not None:
            # Fit on train only and reuse those parameters for test so both
            # frames end up on the same scale.
            train[col] = scale.fit_transform(train[[col]])
            test[col] = scale.transform(test[[col]])
    elif train[col].dtype == "O":
        if navalue is not None:
            train[col] = train[col].fillna(navalue)
            test[col] = test[col].fillna(navalue)
        if rare is not None:
            # Rare categories are decided from the training distribution only.
            temp = categorical_value_counts(train, col, "SalePrice")
            cats = temp.loc[temp["Ratio"] < rare, col]
            train.loc[train[col].isin(cats), col] = "RareCat"
            test.loc[test[col].isin(cats), col] = "RareCat"

In [None]:
def plot_importance(model, features, num = 3):
    """Draw a bar chart of the ``num`` largest feature importances of ``model``.

    Note: this definition replaces the ``plot_importance`` imported from
    ``lightgbm`` at the top of the notebook.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``.
    features : pandas.DataFrame
        Frame whose columns label the importances, in the same order.
    num : int, default 3
        Number of top features to show.
    """
    # NOTE(review): `mpl_style` is not imported in any cell visible here, so an
    # unconditional call raises NameError. Apply the dark style only when some
    # other cell has made the function available.
    style_fn = globals().get("mpl_style")
    if callable(style_fn):
        style_fn(dark=True)

    ranking = pd.DataFrame(
        {"Value": model.feature_importances_, "Feature": features.columns}
    ).sort_values(by="Value", ascending=False)

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=ranking.iloc[:num])
    plt.title("Features")
    plt.tight_layout()
    plt.show()

In [None]:
# Work on copies of `train_` / `test_` so the in-place edits made in the
# following cells do not modify those frames.
train = train_.copy()
test = test_.copy()
# Column groups of the fresh training frame (also prints a size report).
cat_cols, cat_but_car, num_cols = grab_col_names(train)
# Scaler instance bound to `RS`.
RS = RobustScaler()

In [None]:
# Summarise every categorical column. The call returns a frame, and a bare
# expression inside a loop is never rendered, so print each summary explicitly.
for col in cat_cols:
    print(f"--- {col} ---")
    print(categorical_value_counts(train, col, "SalePrice"))

In [None]:
# MasVnrArea has to be filled before it is scaled.
train["MasVnrArea"] = train["MasVnrArea"].fillna(0)

# Robust-scale the continuous columns one at a time, in the original order,
# so `RS` is last fitted on LotArea exactly as before.
for scaled_col in ["MSSubClass", "LotFrontage", "MasVnrArea", "BsmtFinSF1",
                   "BsmtFinSF2", "BsmtUnfSF", "LotArea"]:
    train[scaled_col] = RS.fit_transform(train[[scaled_col]])

# LotFrontage is filled after scaling, with the mean of the scaled values.
train["LotFrontage"] = train["LotFrontage"].fillna(train["LotFrontage"].mean())

# Bucketed versions of the quality / condition / year columns.
train["New_OverallQual"] = pd.cut(train["OverallQual"], bins=[-1, 5, 8, 11], labels=[0, 1, 2])
train["New_OverallCond"] = pd.cut(train["OverallCond"], bins=[0, 5, 8, 10], labels=[0, 1, 2])
train["New_YearBuilt"] = pd.cut(train["YearBuilt"], bins=[1871, 1943, 1990, 2011], labels=[0, 1, 2])
train["New_YearRemodAdd"] = pd.cut(train["YearRemodAdd"], bins=[1949, 1990, 2011], labels=[0, 1])

# Area columns reduced to a 0/1 flag: 0 stays 0, anything else becomes 1.
for flag_col in ["2ndFlrSF", "LowQualFinSF"]:
    train[flag_col] = train[flag_col].apply(lambda x: 0 if x == 0 else 1)

# Replacement values for missing entries, one per column.
fill_values = {
    "MasVnrType": "None",
    "Alley": "Unknown",
    "BsmtQual": "Unknown",
    "BsmtCond": "TA",
    "BsmtExposure": "Unknown",
    "BsmtFinType1": "Unknown",
    "BsmtFinType2": "Unf",
    "Electrical": "SBrkr",
    "FireplaceQu": "Unknown",
    "GarageType": "None",
    # Numeric 0, not the string "0": a string would turn this numeric year
    # column into a mixed object column that is never encoded afterwards.
    "GarageYrBlt": 0,
    "GarageFinish": "None",
    "GarageQual": "None",
    "GarageCond": "None",
    "PoolQC": "None",
    "Fence": "Unknown",
    "MiscFeature": "None",
}
for filled_col, fill_value in fill_values.items():
    train[filled_col] = train[filled_col].fillna(fill_value)

In [None]:
# Rare-category threshold per column: categories whose row share is at or
# below the threshold are merged into "Rare Category" inside `train`.
rare_thresholds = {
    "MSZoning": 0.05,
    "Alley": 0.05,
    "LotShape": 0.05,
    "LandContour": 0.05,
    "LotConfig": 0.05,       # original inline note: 0.07
    "LandSlope": 0.05,
    "Condition1": 0.06,
    "Condition2": 0.05,
    "BldgType": 0.07,        # original inline note: 0.09
    "HouseStyle": 0.05,
    "RoofStyle": 0.05,
    "RoofMatl": 0.05,
    "Exterior1st": 0.05,
    "Exterior2nd": 0.05,
    "ExterQual": 0.05,
    "ExterCond": 0.05,
    "Foundation": 0.05,
    "BsmtQual": 0.05,
    "BsmtCond": 0.05,
    "BsmtFinType2": 0.05,
    "Heating": 0.05,
    "HeatingQC": 0.05,
    "Electrical": 0.1,
    "Functional": 0.05,
    "FireplaceQu": 0.05,
    "GarageType": 0.1,
    "GarageQual": 0.05,
    "GarageCond": 0.05,
    "PavedDrive": 0.1,
    "PoolQC": 0.05,
    "Fence": 0.05,
    "MiscFeature": 0.05,
    "SaleType": 0.05,
    "SaleCondition": 0.05,
    "BsmtFullBath": 0.05,
    "BsmtHalfBath": 0.06,
}
# Columns deliberately left without rare encoding: Street, Utilities,
# MasVnrType, BsmtExposure, BsmtFinType1, CentralAir, KitchenQual,
# GarageFinish, OverallCond, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr,
# Fireplaces, GarageCars, PoolArea, YrSold.

for rare_col, threshold in rare_thresholds.items():
    categorical_value_counts(train, rare_col, "SalePrice", threshold)

# Spot-check one encoded column. No threshold is passed, so this call only
# reports and does not modify `train`.
categorical_value_counts(train, "Alley", "SalePrice")

In [None]:
# Quick look at one column: range, missing values, mean and a histogram.
col = "LotFrontage"
column_values = train[col]
print("Min:", column_values.min())
print("Max:", column_values.max())
print("Null Count:", column_values.isnull().sum())
print("Mean:", column_values.mean())
# Fix the y-axis range before drawing; the trailing semicolon hides the
# return value of plt.hist in the cell output.
plt.ylim(-10, 1500)
plt.hist(column_values, bins=50);

In [None]:
# Columns that are always one-hot encoded.
dummy_cols = [
    "MSZoning", "Street", "Alley",
    "LotShape", "LandContour", "Utilities",
    "LotConfig", "LandSlope", "Neighborhood",
    "Condition1", "Condition2", "BldgType",
    "HouseStyle", "RoofStyle", "RoofMatl",
    "Exterior1st", "Exterior2nd", "MasVnrType",
    "ExterQual", "ExterCond", "Foundation",
    "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "Heating",
    "HeatingQC", "CentralAir", "Electrical",
    "BsmtHalfBath", "KitchenQual", "Functional",
    "FireplaceQu", "GarageType", "GarageFinish",
    "GarageQual", "GarageCond", "PavedDrive",
    "PoolQC", "Fence", "MiscFeature",
    "SaleType", "SaleCondition",
]

# Rare encoding can write the string "Rare Category" into a column that is
# not listed above (it is applied to BsmtFullBath, for example). Any column
# that still has object dtype would reach the regressors as raw strings, so
# encode those as well.
leftover_text_cols = [
    name for name in train.select_dtypes(include=["object"]).columns
    if name not in dummy_cols
]

dummied1 = pd.get_dummies(
    data=train,
    columns=dummy_cols + leftover_text_cols,
    drop_first=True,
)

In [None]:
# Separate the target from the features. "Id" is a row identifier, not a
# property of the house, so it is removed from the feature matrix as well
# (errors="ignore" keeps the cell working if the column is already gone).
X_train = dummied1.drop("SalePrice", axis=1).drop(columns="Id", errors="ignore")
y_train = dummied1["SalePrice"]

In [None]:
# Fit CatBoost and report how well it reproduces the rows it was trained on
# (these are in-sample numbers, not an estimate of generalisation).
CB_model = CatBoostRegressor(verbose=False)
CB_model.fit(X_train, y_train)
y_pred = CB_model.predict(X_train)
cb_train_metrics = {
    "Train Score:": CB_model.score(X_train, y_train),
    "MAPE:": mean_absolute_percentage_error(y_train, y_pred),
    "MAE:": mean_absolute_error(y_train, y_pred),
    "RMSE:": np.sqrt(mean_squared_error(y_train, y_pred)),
}
for metric_label, metric_value in cb_train_metrics.items():
    print(metric_label, metric_value)

In [None]:
# Fit LightGBM with default settings and report its in-sample metrics.
LGBM_model = LGBMRegressor()
LGBM_model.fit(X_train, y_train)
y_pred = LGBM_model.predict(X_train)
lgbm_train_metrics = {
    "Train Score:": LGBM_model.score(X_train, y_train),
    "MAPE:": mean_absolute_percentage_error(y_train, y_pred),
    "MAE:": mean_absolute_error(y_train, y_pred),
    "RMSE:": np.sqrt(mean_squared_error(y_train, y_pred)),
}
for metric_label, metric_value in lgbm_train_metrics.items():
    print(metric_label, metric_value)

In [None]:
# Fit an ordinary least-squares baseline and report its in-sample metrics.
LR = LinearRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_train)
lr_train_metrics = {
    "Train Score:": LR.score(X_train, y_train),
    "MAPE:": mean_absolute_percentage_error(y_train, y_pred),
    "MAE:": mean_absolute_error(y_train, y_pred),
    "RMSE:": np.sqrt(mean_squared_error(y_train, y_pred)),
}
for metric_label, metric_value in lr_train_metrics.items():
    print(metric_label, metric_value)
# TODO: score the model on a held-out split as well. The earlier draft
# referred to X_test_ / y_test_, which are not defined in the visible cells.

In [1]:
import pandas as pd

In [2]:
# Read train.csv into `train`, replacing whatever the name held before.
train = pd.read_csv("train.csv")

In [9]:
# DataFrame.info() writes its report to a stream and returns None, so
# assigning its return value leaves `info` empty. Route the report through
# a text buffer to keep it, then echo it so the cell still shows the table.
import io

info_buffer = io.StringIO()
train.info(memory_usage=False, buf=info_buffer)
info = info_buffer.getvalue()
print(info, end="")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [10]:
# Display the object bound to `info` by the previous cell.
info