In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
from lightgbm import LGBMRegressor
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeRegressor


In [3]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
pip install matplotlib --upgrade

Collecting matplotlib
  Downloading matplotlib-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.7.1
    Uninstalling matplotlib-3.7.1:


In [None]:
test_data_path=r'houseprice_test.csv'
train_data_path=r'houseprice_train.csv'

In [None]:
test_data=pd.read_csv("houseprice_test.csv")
train_data=pd.read_csv("houseprice_train.csv")

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
train_data.columns

In [None]:
test_data.columns

In [None]:
train_data.shape

In [None]:
test_data.shape

In [None]:
train_data.info()

In [None]:
test_data.info()

In [None]:
train_data.describe().T

In [None]:
def missing_value(df):
    missing_values = df.isnull().sum()
    for i, satir in missing_values.items():
        if satir != 0:
            print(f"{satir} {i}")

print("########### TRAIN_DATA  ###########" )
missing_value(train_data)
print("########### TEST_DATA   ##########")
missing_value(test_data)

In [None]:
sns.set_theme()
sns.set(rc={"figure.dpi" : 300})
sns.set(rc={"figure.figsize": (8,4)})

In [None]:
sns.histplot(train_data['SalePrice'],kde=True, color='b', bins=30)
plt.title('Distribution of SalePrice')
plt.xlabel('SalePrice')
plt.ylabel('Count')
plt.show()

In [None]:
#çarpıklık (skewness) ve basıklık (kurtosis) özellikleri
print("Skewness: %f" % train_data['SalePrice'].skew())
print("Kurtosis: %f" % train_data['SalePrice'].kurt())

In [None]:
median = train_data["SalePrice"].median()
median

In [None]:
mean = train_data["SalePrice"].mean()
mean

In [None]:
plt.plot([median, mean], [0, 1], marker="o", label="Median ve Mean")
sns.set(rc={"figure.figsize": (6,4)})
plt.xlabel("Değer")
plt.ylabel("Sıklık")
plt.legend()

In [None]:
list(set(train_data.dtypes.tolist()))

In [None]:
df_num = train_data.select_dtypes(include = ['float64','int64'])

In [None]:
df_num.head()

In [None]:
df_num.isnull().sum()

In [None]:
correlation_matrix=df_num.corr()

In [None]:
plt.figure(figsize=(40,15))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.6)
plt.title('Correlation')
plt.show

In [None]:
categorical_features = train_data.select_dtypes(include=['object']).columns
for feature in categorical_features:
    plt.figure(figsize=(20,6))
    sns.boxplot(x=feature, y='SalePrice', data=train_data)
    plt.title(f'SalePrice vs {feature}')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
plt.figure(figsize=(15, 9))
sns.histplot(x='LotArea', y='SalePrice', data=train_data, bins=50, cmap='coolwarm')
plt.title('SalePrice vs LotArea')
plt.xlabel('LotArea')
plt.ylabel('SalePrice')
plt.show()

In [None]:
#Değişkenleri birleştirme işlemi
df=train_data.append(test_data, ignore_index= False).reset_index()


In [None]:
df=df.drop("index",axis=1)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# NUMERİK VE KATEGORİK DEĞİŞKENLERİN YAKALANMASI
def grab_col_names(dataframe, cat_th=10, car_th=20):


    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]

    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]

    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]

    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')



    return cat_cols, cat_but_car, num_cols
cat_cols,cat_but_car, num_cols=grab_col_names(df)

In [None]:
#Kategorik Değişken Analizi

def cat_summary (dataframe, col_name,plot=False):
  print(pd.DataFrame({col_name: dataframe[col_name].value_counts(),
                      "Ratio":100*dataframe[col_name].value_counts()/len(dataframe)}))

  if plot:
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show()

for col in cat_cols:
    cat_summary(df,col)

In [None]:
#Sayısal değişken analizi
def num_summary(dataframe, numerical_col, plot=False):
    quantiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    print(dataframe[numerical_col].describe(quantiles).T)

    if plot:
        dataframe[numerical_col].hist(bins=50)
        plt.xlabel(numerical_col)
        plt.title(numerical_col)
        plt.show()

    print("#####################################")


for col in num_cols:
    num_summary(df, col, True)

In [None]:
#Target Analysis

def target_summary_with_cat(dataframe, target, categorical_col):
  print(pd.DataFrame({"Target_mean": dataframe.groupby(categorical_col)[target].mean()}), end="\n\n\n")

for col in cat_cols:
  target_summary_with_cat(df,"SalePrice",col)

In [None]:
# Korelasyonların gösterilmesi

corr = df[num_cols].corr()
corr

sns.set(rc={'figure.figsize': (20, 12)})
sns.heatmap(corr,annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.6)
plt.show()

In [None]:
#yüksek korelasyonlu değişkenler
def high_correlated_cols(dataframe, plot=False, corr_th=0.70):
    corr = dataframe.corr()
    cor_matrix = corr.abs()
    upper_triangle_matrix = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(np.bool))
    drop_list = [col for col in upper_triangle_matrix.columns if any(upper_triangle_matrix[col] > corr_th)]
    if plot:
        import seaborn as sns
        import matplotlib.pyplot as plt
        sns.set(rc={'figure.figsize': (15, 15)})
        sns.heatmap(corr, cmap="RdBu")
        plt.show()
    return drop_list

high_correlated_cols(df, plot=False)

  corr = dataframe.corr()
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_triangle_matrix = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(np.bool))


['1stFlrSF', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea', 'SalePrice']

In [None]:
# Aykırı Değer Analizi


# Aykırı değerlerin baskılanması
def outlier_thresholds(dataframe, variable, low_quantile=0.10, up_quantile=0.90):
    quantile_one = dataframe[variable].quantile(low_quantile)
    quantile_three = dataframe[variable].quantile(up_quantile)
    interquantile_range = quantile_three - quantile_one
    up_limit = quantile_three + 1.5 * interquantile_range
    low_limit = quantile_one - 1.5 * interquantile_range
    return low_limit, up_limit

In [None]:
def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False

In [None]:
for col in num_cols:
  if col !="SalePrice":
    print(col ,check_outlier(df,col))

Id False
MSSubClass False
LotFrontage True
LotArea True
OverallQual False
YearBuilt False
YearRemodAdd False
MasVnrArea True
BsmtFinSF1 True
BsmtFinSF2 True
BsmtUnfSF False
TotalBsmtSF True
1stFlrSF True
2ndFlrSF False
LowQualFinSF True
GrLivArea True
TotRmsAbvGrd True
GarageYrBlt True
GarageArea False
WoodDeckSF True
OpenPorchSF True
EnclosedPorch True
3SsnPorch True
ScreenPorch True
PoolArea True
MiscVal True
MoSold False


In [None]:
def replace(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

In [None]:
for col in num_cols:
    if col != "SalePrice":
        replace(df,col)

In [None]:
for col in num_cols:
  if col !="SalePrice":
    print(col ,check_outlier(df,col))

Id False
MSSubClass False
LotFrontage False
LotArea False
OverallQual False
YearBuilt False
YearRemodAdd False
MasVnrArea False
BsmtFinSF1 False
BsmtFinSF2 False
BsmtUnfSF False
TotalBsmtSF False
1stFlrSF False
2ndFlrSF False
LowQualFinSF False
GrLivArea False
TotRmsAbvGrd False
GarageYrBlt False
GarageArea False
WoodDeckSF False
OpenPorchSF False
EnclosedPorch False
3SsnPorch False
ScreenPorch False
PoolArea False
MiscVal False
MoSold False


In [None]:
######################################
# Eksik Değer Analizi
######################################


def missing_values(dataframe, na_name=False):
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]

    n_miss = dataframe[na_columns].isnull().sum().sort_values(ascending=False)

    ratio = (dataframe[na_columns].isnull().sum() / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])

    print(missing_df, end="\n")

    if na_name:
        return na_columns

missing_values(df)

              n_miss  ratio
PoolQC          2909  99.66
MiscFeature     2814  96.40
Alley           2721  93.22
Fence           2348  80.44
SalePrice       1459  49.98
FireplaceQu     1420  48.65
LotFrontage      486  16.65
GarageCond       159   5.45
GarageYrBlt      159   5.45
GarageFinish     159   5.45
GarageQual       159   5.45
GarageType       157   5.38
BsmtExposure      82   2.81
BsmtCond          82   2.81
BsmtQual          81   2.77
BsmtFinType2      80   2.74
BsmtFinType1      79   2.71
MasVnrType        24   0.82
MasVnrArea        23   0.79
MSZoning           4   0.14
Functional         2   0.07
BsmtHalfBath       2   0.07
BsmtFullBath       2   0.07
Utilities          2   0.07
KitchenQual        1   0.03
TotalBsmtSF        1   0.03
BsmtUnfSF          1   0.03
GarageCars         1   0.03
GarageArea         1   0.03
BsmtFinSF2         1   0.03
BsmtFinSF1         1   0.03
Exterior2nd        1   0.03
Exterior1st        1   0.03
SaleType           1   0.03
Electrical         1

In [None]:
no_cols = ["Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu",
           "GarageType","GarageFinish","GarageQual","GarageCond","PoolQC","Fence","MiscFeature"]

In [None]:
for col in no_cols:
  df[col].fillna("No", inplace=True)

In [None]:
missing_values(df)

              n_miss  ratio
SalePrice       1459  49.98
LotFrontage      486  16.65
GarageYrBlt      159   5.45
MasVnrType        24   0.82
MasVnrArea        23   0.79
MSZoning           4   0.14
BsmtFullBath       2   0.07
BsmtHalfBath       2   0.07
Functional         2   0.07
Utilities          2   0.07
Exterior2nd        1   0.03
BsmtFinSF1         1   0.03
BsmtFinSF2         1   0.03
BsmtUnfSF          1   0.03
Exterior1st        1   0.03
Electrical         1   0.03
KitchenQual        1   0.03
GarageCars         1   0.03
GarageArea         1   0.03
SaleType           1   0.03
TotalBsmtSF        1   0.03


In [None]:
#eksik değerleri mod,medyan ve ortalama ile doldurmak.
def quick_missing_imp(data, num_method="median", cat_length=20, target="SalePrice"):
    variables_with_na = [col for col in data.columns if data[col].isnull().sum() > 0]  # Eksik değere sahip olan değişkenler listelenir

    temp_target = data[target]

    print("# BEFORE")
    print(data[variables_with_na].isnull().sum(), "\n\n")  # Uygulama öncesi değişkenlerin eksik değerlerinin sayısı

    # değişken object ve sınıf sayısı cat_lengthe eşit veya altındaysa boş değerleri mode ile doldur
    data = data.apply(lambda x: x.fillna(x.mode()[0]) if (x.dtype == "O" and len(x.unique()) <= cat_length) else x, axis=0)

    # num_method mean ise tipi object olmayan değişkenlerin boş değerleri ortalama ile dolduruluyor
    if num_method == "mean":
        data = data.apply(lambda x: x.fillna(x.mean()) if x.dtype != "O" else x, axis=0)
    # num_method median ise tipi object olmayan değişkenlerin boş değerleri median ile dolduruluyor
    elif num_method == "median":
        data = data.apply(lambda x: x.fillna(x.median()) if x.dtype != "O" else x, axis=0)

    data[target] = temp_target

    print("# AFTER \n Imputation method is 'MODE' for categorical variables!")
    print(" Imputation method is '" + num_method.upper() + "' for numeric variables! \n")
    print(data[variables_with_na].isnull().sum(), "\n\n")

    return data

In [None]:
df = quick_missing_imp(df, num_method="median", cat_length=17)

# BEFORE
MSZoning           4
LotFrontage      486
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
MasVnrArea        23
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
GarageYrBlt      159
GarageCars         1
GarageArea         1
SaleType           1
SalePrice       1459
dtype: int64 


# AFTER 
 Imputation method is 'MODE' for categorical variables!
 Imputation method is 'MEDIAN' for numeric variables! 

MSZoning           0
LotFrontage        0
Utilities          0
Exterior1st        0
Exterior2nd        0
MasVnrType         0
MasVnrArea         0
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
Electrical         0
BsmtFullBath       0
BsmtHalfBath       0
KitchenQual        0
Functional         0
GarageYrBlt        0
GarageCars         0
GarageArea         0
SaleType      

In [None]:
df["SalePrice"].fillna(df["SalePrice"].median(), inplace=True)

In [None]:
df["SalePrice"].isnull().sum()

0

In [None]:
df.drop(["Id"],axis=1, inplace=True)

In [None]:
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

In [None]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,3,65.0,8450.0,1,1,3,3,0,4,...,0,3,4,1,0,2.0,2008,8,4,208500.0
1,20,3,80.0,9600.0,1,1,3,3,0,2,...,0,3,4,1,0,5.0,2007,8,4,181500.0
2,60,3,68.0,11250.0,1,1,0,3,0,4,...,0,3,4,1,0,9.0,2008,8,4,223500.0
3,70,3,60.0,9550.0,1,1,0,3,0,0,...,0,3,4,1,0,2.0,2006,8,0,140000.0
4,60,3,84.0,14260.0,1,1,0,3,0,2,...,0,3,4,1,0,12.0,2008,8,4,250000.0


In [None]:
X= df.drop(["SalePrice"], axis=1)
y= df["SalePrice"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=26)

In [None]:
rf_model= RandomForestRegressor()

In [None]:
rmse = np.mean(np.sqrt(-cross_val_score(rf_model, X ,y ,cv=5, scoring="neg_mean_squared_error")))

In [None]:
print(rmse)

In [None]:
#Feature Extraction
df["New_YearsBR"] = df["YearRemodAdd"] - df["YearBuilt"]

In [None]:
df["NEW_TotalFlrSF"] = df["1stFlrSF"] + df["2ndFlrSF"]

In [None]:
df["NEW_TotalHouseArea"] = df["NEW_TotalFlrSF"] + df["TotalBsmtSF"]

In [None]:
df["New_Area"] = df["GrLivArea"] + df["GarageArea"]

In [None]:
df["New_Overall"] = df["OverallCond"]* df["OverallQual"]

In [None]:
df["New_RoofQuality"] = df["RoofStyle"] + df["RoofMatl"]

In [None]:
df["New_ExterQual"] = df["ExterQual"] + df["ExterCond"]

In [None]:
df["NEW_TotalBath"]=df["BsmtFullBath"] + df["BsmtHalfBath"] + df["FullBath"] + df["HalfBath"]

In [None]:
df["New_GarageQuality"] = df["GarageQual"] + df["GarageCond"]

In [None]:
df["New_RmsLiveArea"] = df["TotRmsAbvGrd"] + df["GrLivArea"]

In [None]:
df["TotalQual"] = df[["OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtCond", "BsmtFinType1",
                      "BsmtFinType2", "HeatingQC", "KitchenQual", "Functional", "FireplaceQu", "GarageQual", "GarageCond", "Fence"]].sum(axis = 1)

In [None]:
df.head()

In [None]:
drop_list=["Street", "Alley", "Neighborhood","PoolQC","Heating","LandSlope","Utilities","LandContour"]

In [None]:
df.drop(drop_list, axis=1 , inplace=True)

In [None]:
df.head()

In [None]:
# MODELLEME #

train_df=df[df['SalePrice'].notnull()]
test_df=df[df['SalePrice'].isnull()]

In [None]:
y=train_df['SalePrice']

X=train_df.drop(["SalePrice"],axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=26)

In [None]:
models = [('LR', LinearRegression()),
          ('KNN', KNeighborsRegressor()),
          ('RF', RandomForestRegressor()),
          ('GBM',GradientBoostingRegressor()),
          ('XGBoost', XGBRegressor(objective='reg:squarederror')),
          ("CART", DecisionTreeRegressor()),
          ("LightGBM", LGBMRegressor())]


In [None]:
for name, regressor in models:
    rmse = np.mean(np.sqrt(-cross_val_score(regressor, X ,y ,cv=5, scoring="neg_mean_squared_error")))
    print(f"RMSE: {round(rmse, 4)} ({name})")

In [None]:
model_rf = RandomForestRegressor(random_state=26)


In [None]:
model_rf.get_params()

In [None]:
rf_params={"max_depth":[12,13,14],
          "max_features":[3,4,5],
          "n_estimators":[200,250],
          "min_samples_split":[3,4,5]}

In [None]:
best_grid=GridSearchCV(model_rf,rf_params,cv=5 ,n_jobs=-1,verbose=True).fit(X,y)


In [None]:
best_grid.best_params_

In [None]:
rf_final=model_rf.set_params(**best_grid.best_params_, random_state=26).fit(X,y)


In [None]:
rmse = np.mean(np.sqrt(-cross_val_score(rf_final, X, y, cv=5, scoring="neg_mean_squared_error")))

In [None]:
print(rmse)