In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


In [2]:
# Shared worst-to-best scales reused by several ordinal features below.
_qual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
_qual_na = ['NA'] + _qual                      # same scale with a leading 'NA' level
_bsmt_fin = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']

# Numeric feature names, in the order the cleaning loops iterate over them.
numerical_cols = [
    # lot, masonry and basement areas
    'LotFrontage', 'LotArea', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    # living area
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    # garage
    'GarageArea', 'GarageCars',
    # outdoor areas and miscellaneous value
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal',
    # room / fireplace / bathroom counts
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    # sale and construction dates
    'MoSold', 'YrSold', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt',
    # overall ratings
    'OverallQual', 'OverallCond',
]

# Ordinal feature -> its category levels, lowest first.
# Every entry gets its own list object so no two features share state.
ordinal_map = {
    'ExterQual': list(_qual),
    'ExterCond': list(_qual),
    'BsmtQual': list(_qual_na),
    'BsmtCond': list(_qual_na),
    'BsmtExposure': ['NA', 'No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': list(_bsmt_fin),
    'BsmtFinType2': list(_bsmt_fin),
    'HeatingQC': list(_qual),
    'KitchenQual': list(_qual),
    'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    'FireplaceQu': list(_qual_na),
    'GarageFinish': ['NA', 'Unf', 'RFn', 'Fin'],
    'GarageQual': list(_qual_na),
    'GarageCond': list(_qual_na),
    'PavedDrive': ['N', 'P', 'Y'],
    'PoolQC': ['NA'] + _qual[1:],              # this scale has no 'Po' level
    'Fence': ['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
}
ordinal_cols = [*ordinal_map]

# The scale helpers are only needed while building ordinal_map.
del _qual, _qual_na, _bsmt_fin

# Unordered (nominal) categorical feature names.
nominal_cols = [
    # zoning and lot description
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope',
    # location
    'Neighborhood', 'Condition1', 'Condition2',
    # building and exterior
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    # utilities and garage
    'Heating', 'CentralAir', 'Electrical', 'GarageType', 'MiscFeature',
    # sale
    'SaleType', 'SaleCondition',
]


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder
from catboost import CatBoostRegressor

# === Step 1: Load Data ===
# keep_default_na=False keeps tokens such as "NA" as literal strings instead of
# converting them to NaN at read time; they are handled explicitly per column
# group below.
df = pd.read_csv("data/train.csv", keep_default_na=False)

# === Step 2: Identify target ===
# The model is trained on log1p(SalePrice); predictions therefore have to be
# mapped back with np.expm1 before they are reported.
y = np.log1p(df["SalePrice"])
# errors="ignore" makes the Id drop genuinely optional ("if present").
df = df.drop(columns=["SalePrice"]).drop(columns=["Id"], errors="ignore")

# === Step 3: Setup categorical column lists ===
# nominal_cols and ordinal_map should be defined before this code block
cat_cols = nominal_cols  # Nominal features only for CatBoost

# === Step 4: Data Cleaning & Transformation ===

# Tokens treated as "missing" in categorical columns.
missing_like = ['NA', 'N/A', 'na', 'n/a', 'NaN', 'nan', 'None', 'none', '', ' ']
# Nominal levels with a relative frequency below this are pooled into 'Rare'.
RARE_THRESHOLD = 0.01

# --- Clean and impute numerical columns ---
# errors='coerce' turns every non-numeric token (including all missing-like
# strings) into NaN, so no separate replace step is needed before imputing.
for col in numerical_cols:
    numeric = pd.to_numeric(df[col], errors='coerce')
    df[col] = numeric.fillna(numeric.median())

# --- Encode ordinal columns ---
ordinal_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    categories=[[str(cat) for cat in ordinal_map[col]] for col in ordinal_cols]
)

# NOTE: missing-like tokens (including 'NA') are rewritten to 'missing', which
# is not a declared level in ordinal_map, so those rows are encoded as the
# encoder's unknown_value (-1). The test-set cell applies the same rewrite.
ordinal_clean = (
    df[ordinal_cols]
    .replace(missing_like, 'missing')
    .fillna('missing')
    .astype(str)
)
df[ordinal_cols] = ordinal_encoder.fit_transform(ordinal_clean)

# --- Nominal features: fill missing, pool rare levels, cast to string categories ---
# rare_label_dict[col] holds the rare labels in their raw (pre-string-cast)
# form so they can be matched against raw test-set values later.
rare_label_dict = {}
for col in nominal_cols:
    if col not in df.columns:
        continue
    cleaned = df[col].replace(missing_like, 'missing')
    freqs = cleaned.value_counts(normalize=True)
    rare_labels = set(freqs[freqs < RARE_THRESHOLD].index)
    rare_label_dict[col] = rare_labels

    # Keep frequent levels, replace rare ones with the pooled 'Rare' label.
    pooled = cleaned.where(~cleaned.isin(rare_labels), 'Rare')
    # Cast to str first so integer-coded columns (e.g. MSSubClass) do not end
    # up as a category mixing ints with the string 'Rare'.
    df[col] = pooled.astype(str).astype('category')

In [4]:
import pandas as pd
import numpy as np

# Apply the training-time preprocessing to the test set.
# Relies on objects created by the earlier cells:
# - `df`: the preprocessed training frame (SalePrice and Id already dropped)
# - `numerical_cols`, `ordinal_cols`, `nominal_cols`
# - `ordinal_encoder`: already fitted on the training data
# - `rare_label_dict`: rare nominal labels found in the training data
# `df.columns` gives the column order expected at prediction time.

df_test = pd.read_csv("data/test.csv", keep_default_na=False)

# Independent copy: a SalePrice column is added to this frame later, and it
# must not be a slice of df_test when that happens.
final_df = df_test[["Id"]].copy()

# Step 1: Replace missing-like strings
missing_like = ['NA', 'N/A', 'na', 'n/a', 'NaN', 'nan', 'None', 'none', '', ' ']

# Numerical columns
for col in numerical_cols:
    if col in df_test.columns:
        cleaned = df_test[col].replace(missing_like, np.nan)
        cleaned = pd.to_numeric(cleaned, errors='coerce')
        # Impute with the training median, never with test statistics.
        df_test[col] = cleaned.fillna(df[col].median())

# Ordinal columns
# The encoder is only *applied* here. It must not be re-fitted on `df`: the
# training ordinal columns are already numeric codes, so running fit_transform
# on them again would match none of the declared levels and overwrite every
# training ordinal value with the unknown code (-1).
ordinal_clean = (
    df_test[ordinal_cols]
    .replace(missing_like, 'missing')
    .fillna('missing')
    .astype(str)
)
df_test[ordinal_cols] = ordinal_encoder.transform(ordinal_clean)

# Nominal categorical columns (used by CatBoost)
for col in nominal_cols:
    if col in df_test.columns:
        cleaned = df_test[col].replace(missing_like, 'missing')
        # Pool the labels that were rare in training; matching is done on the
        # raw values, before the string cast below.
        pooled = cleaned.where(~cleaned.isin(rare_label_dict[col]), 'Rare')
        # Uniform string categories (avoids mixing ints with 'Rare').
        df_test[col] = pooled.astype(str).astype('category')

# Step 2: Reorder columns to match the preprocessed training frame `df`
# (this also drops Id, which was saved in final_df above).
df_test = df_test[df.columns]

In [11]:
from catboost import CatBoostRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# === Step 5: Define CatBoost Model and Search Space ===
model = CatBoostRegressor(silent=True, random_state=42)

# Hyper-parameter ranges explored by the Bayesian search.
search_spaces = {
    'depth': Integer(4, 10),
    'learning_rate': Real(0.01, 0.2, prior='log-uniform'),
    'iterations': Integer(300, 1500),
    'l2_leaf_reg': Integer(1, 9),
    'border_count': Integer(32, 128),
    'bagging_temperature': Real(0.0, 1.0),
    'random_strength': Integer(1, 10),
}

# Make sure cat_features uses column names
cat_features = [col for col in cat_cols if col in df.columns]

# === Step 6: Run Bayesian Search ===
bayes_search = BayesSearchCV(
    estimator=model,
    search_spaces=search_spaces,
    scoring='neg_root_mean_squared_error',
    n_iter=20,  # Increase to 50+ for better optimization
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# Fit the model (CatBoost will internally handle categorical columns).
# `y` is log1p(SalePrice), so the reported RMSE is on the log scale.
bayes_search.fit(df, y, cat_features=cat_features)

# === Step 7: Best Model ===
# The scorer is negated RMSE, hence the sign flip when printing.
print("Best parameters:", bayes_search.best_params_)
print("Best RMSE score:", -bayes_search.best_score_)

best_model = bayes_search.best_estimator_

# `preds` stays on the log1p scale the model was trained on;
# `predictions` is the same output mapped back to sale prices.
preds = best_model.predict(df_test)
predictions = np.expm1(preds)


Fitting 3 folds for each of 1 candidates, totalling 3 fits


KeyboardInterrupt: 

In [None]:
# `preds` holds the model output on the log1p scale (the target was
# np.log1p(SalePrice)), so invert it with expm1 to obtain sale prices.
# assign() returns a new frame, so nothing is written into a slice of df_test.
final_df = final_df.assign(SalePrice=np.expm1(preds))
final_df.to_csv("submissions/final_submission4.csv", index=False)