In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect every file path found under the Kaggle input directory, then echo
# them one per line (same traversal order as os.walk yields them).
input_file_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

from catboost import CatBoostRegressor
from sklearn.model_selection import KFold

In [None]:
# Labelled training rows (contain the SalePrice column used later as target).
train_csv_path = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
data_train = pd.read_csv(train_csv_path)
print(data_train.shape)

# Competition test rows; the notebook refers to this frame as "val" throughout.
test_csv_path = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
data_val = pd.read_csv(test_csv_path)
print(data_val.shape)

In [None]:
def preprocess(data_train, data_val):
    """Standardize numeric features and impute their missing values.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Labelled rows; must contain the "SalePrice" and "Id" columns.
    data_val : pandas.DataFrame
        Unlabelled rows with the same feature columns plus "Id".

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, val)``. ``train`` has "SalePrice" as its first column
        followed by the processed features; ``val`` has the processed
        features only. "Id" is dropped from both. Non-numeric columns are
        passed through unchanged (their missing values are NOT filled here).

    Notes
    -----
    * The inputs are not modified.
    * Standardization statistics (mean / std) are computed over the train
      and val rows together, as before.
    * Missing numeric values in BOTH outputs are replaced by the per-column
      mean of the standardized training rows, so the two sets are imputed
      identically. Previously only the training rows were imputed, which
      left val NaNs to be handled differently by the downstream model.
    """
    label = "SalePrice"
    target = data_train[label]
    n_train = data_train.shape[0]

    # Stack train (without the label) on top of val and discard the row id.
    features = pd.concat([data_train.drop(columns=[label]), data_val]).drop(columns=["Id"])

    # Select numeric columns by kind rather than by "not object dtype" so that
    # dedicated string dtypes are never mistaken for numeric data.
    num_features_names = features.select_dtypes(include="number").columns
    numeric = features[num_features_names]
    features[num_features_names] = (numeric - numeric.mean()) / numeric.std()

    # Explicit copies: the imputation below must write to independent frames,
    # not to slices of `features` (avoids SettingWithCopy ambiguity).
    data_train_clean = features.iloc[:n_train].copy()
    data_val_clean = features.iloc[n_train:].copy()

    # Impute with statistics of the training rows only, applied to both sets.
    train_means = data_train_clean[num_features_names].mean()
    data_train_clean[num_features_names] = data_train_clean[num_features_names].fillna(train_means)
    data_val_clean[num_features_names] = data_val_clean[num_features_names].fillna(train_means)

    return pd.concat([target, data_train_clean], axis=1), data_val_clean

In [None]:
# Run preprocessing on the raw frames. data_train_norm carries "SalePrice" as
# its first column followed by the features; data_val_norm holds features only.
data_train_norm, data_val_norm = preprocess(data_train, data_val) 

In [None]:
# Categorical feature names: every column of the raw training frame that is
# neither numeric nor boolean (object / string columns in this dataset).
cat_features_names = data_train.select_dtypes(exclude=["number", "bool"]).columns

# Replace missing categories with the literal string "nan" so the columns
# contain only strings when handed to CatBoost as cat_features.
# The fill is done on the frame itself and re-bound: the previous
# `frame[col].fillna(..., inplace=True)` form operates on a temporary Series,
# which pandas deprecates and which does not modify the frame at all when
# Copy-on-Write is active.
cat_fill_values = {feature: "nan" for feature in cat_features_names}
data_train_norm = data_train_norm.fillna(cat_fill_values)
data_val_norm = data_val_norm.fillna(cat_fill_values)

In [None]:
# Target is modelled on the natural-log scale; predictions are mapped back
# with np.exp before submission.
y_train = np.log(data_train_norm["SalePrice"])
# Feature matrix: everything except the target column.
X_train = data_train_norm.drop(columns=["SalePrice"])

In [None]:
# 10-fold shuffled cross-validation. random_state pins the shuffle so that the
# fold assignment (and therefore every reported CV score) is reproducible
# across kernel restarts; without it each run evaluates on different folds.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
from tqdm.auto import tqdm

val_errors = []    # one validation RMSE per fold (same scale as y_train)
train_errors = []  # one per-iteration training RMSE curve per fold

# Cross-validate a fixed CatBoost configuration over the folds produced by kf.
# `total` is passed explicitly because kf.split() is a generator and tqdm
# cannot infer its length.
for train_index, val_index in tqdm(kf.split(X_train), total=kf.get_n_splits()):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    model = CatBoostRegressor(iterations=300,
                              depth=8,
                              loss_function="RMSE",
                              cat_features=cat_features_names.tolist(),
                              nan_mode='Min',
                              verbose=0)

    # use_best_model=False keeps all trained trees. When an eval_set is given
    # CatBoost otherwise truncates the model at the iteration that scores best
    # on that eval_set, i.e. on the very fold used for scoring below, which
    # makes the reported validation RMSE optimistic and unlike a plain fit
    # without an eval_set.
    model.fit(X_train_fold, y_train_fold,
              eval_set=(X_val_fold, y_val_fold),
              use_best_model=False)

    # Training curve: RMSE on the training fold after each boosting iteration.
    evals_result = model.get_evals_result()
    train_loss = evals_result["learn"]["RMSE"]
    train_errors.append(train_loss)

    # Validation score: RMSE of the full model on the held-out fold.
    y_val_pred = model.predict(X_val_fold)

    val_rmse = np.sqrt(np.mean((y_val_fold - y_val_pred) ** 2))
    val_errors.append(val_rmse)
print(f"Average RMSE: {np.mean(val_errors)}")

In [None]:
# One marker per CV fold: x is the fold's position in val_errors, y is the
# validation RMSE recorded for that fold.
fig, ax = plt.subplots()
fold_positions = range(len(val_errors))
ax.scatter(fold_positions, val_errors, c="orange")
ax.set_xlabel("Number of fold")
ax.set_ylabel("Loss (RMSE)")
ax.grid(True)
plt.show()


In [None]:
# train_errors holds one training-RMSE curve per fold (all of equal length);
# averaging over the fold axis gives the mean curve per boosting iteration.
mean_loss = np.mean(np.asarray(train_errors), axis=0)

plt.figure(figsize=(10, 6))
plt.plot(range(len(mean_loss)), mean_loss, c="orange", label="Train loss")
plt.xlabel("Number of trees")
plt.ylabel("Loss (RMSE)")
plt.title("Mean train loss across CV folds")
plt.legend()  # the line's label is only rendered when a legend is drawn
plt.grid(True)
plt.show()

In [None]:
# Hyperparameters of the model that is fitted on the full training set.
# NOTE(review): depth is 10 here while the cross-validation loop uses depth=8,
# so the CV RMSE was not measured for this exact configuration - confirm
# whether the difference is intentional.
final_model_params = dict(
    iterations=300,
    depth=10,
    loss_function="RMSE",
    cat_features=cat_features_names.tolist(),
    nan_mode='Min',
    verbose=0,
)
model = CatBoostRegressor(**final_model_params)

# Fit on all labelled rows (no eval_set); the fitted estimator is the cell output.
model.fit(X_train, y_train)

In [None]:
# Per-iteration training RMSE recorded while fitting the current `model`.
train_loss = model.get_evals_result()["learn"]["RMSE"]

fig, ax = plt.subplots(figsize=(10, 6))
iteration_axis = range(len(train_loss))
ax.plot(iteration_axis, train_loss, c="orange", label="Train loss")
ax.set_xlabel("Number of trees")
ax.set_ylabel("Loss (RMSE)")
ax.set_title("Train Loss")
ax.grid(True)
plt.show()

In [None]:
import seaborn as sns


# Pair each feature name with the importance reported by the fitted model and
# rank them from most to least important.
feats_importance = (
    pd.DataFrame({
        "Importance": model.feature_importances_,
        "Feat_Name": X_train.columns,
    })
    .sort_values(by="Importance", ascending=False)
)

# Horizontal bar chart of the ten highest-ranked features.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x="Importance",
    y="Feat_Name",
    data=feats_importance.head(10),
    palette="icefire_r",
    ax=ax,
)
ax.set_title("Feature Importances")
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature name")
plt.show()


In [None]:
import shap

# Initialise SHAP's JavaScript support for notebook output.
shap.initjs()


# Build a tree explainer for `model` (the estimator fitted on the full
# training set, assuming the cells are run in order) and evaluate it on the
# training features; `shap_values` is consumed by the summary plot cell.
explainer = shap.TreeExplainer(model)
shap_values = explainer(X_train)

In [None]:
# Summary plot of the SHAP values computed for the training features.
shap.summary_plot(shap_values, X_train)

In [None]:
# The model was trained on log(SalePrice), so exponentiate to get prices back.
log_price_pred = model.predict(data_val_norm)
y_pred = np.exp(log_price_pred)

data_submission = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv")

# RMSE (price scale) between the sample submission's SalePrice column and our
# predictions, shown as the cell output.
# NOTE(review): presumably the sample file holds placeholder/baseline values
# rather than true prices, so treat this as a sanity check only - confirm.
price_gap = data_submission["SalePrice"] - y_pred
np.sqrt(np.mean(price_gap ** 2))

In [None]:
# Overwrite the sample prices with our predictions (rows are matched purely by
# position) and save the file to the notebook's output directory.
submission_path = '/kaggle/working/submission.csv'
data_submission["SalePrice"] = y_pred
data_submission.to_csv(submission_path, index=False)