In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [47]:
def clean(df, cat_columns=None, numerical_columns=None):
    """Coerce and impute the feature columns of ``df`` in place and return it.

    Numerical columns are converted with ``pd.to_numeric(errors='coerce')``,
    so any value that cannot be parsed as a number (for example ``'NA'`` or
    ``''``) becomes NaN; the NaNs are then replaced by the column median.
    Categorical columns are cast to ``str``, with genuinely missing values
    replaced by ``"missing"``. Listed columns that are absent from ``df`` are
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    cat_columns : iterable of str, optional
        Names of the categorical columns. Defaults to the notebook-level
        ``cat_cols``.
    numerical_columns : iterable of str, optional
        Names of the numerical columns. Defaults to the notebook-level
        ``numerical_cols``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    if cat_columns is None:
        cat_columns = cat_cols
    if numerical_columns is None:
        numerical_columns = numerical_cols

    for col in numerical_columns:
        if col not in df.columns:
            continue
        # errors='coerce' already maps every non-numeric placeholder to NaN,
        # so no separate string replacement pass is needed.
        numeric = pd.to_numeric(df[col], errors='coerce')
        df[col] = numeric.fillna(numeric.median())

    for col in cat_columns:
        if col not in df.columns:
            continue
        values = df[col]
        # Record which entries are missing *before* stringifying: once cast to
        # str a missing value is an ordinary string and fillna cannot see it.
        df[col] = values.astype(str).where(values.notna(), "missing")
    return df

In [48]:
# Columns handled as categorical features, in the order they are listed here.
# NOTE(review): MiscVal sounds like it could be a numeric amount rather than a
# category — confirm against the data description.
cat_cols = [
    "MSSubClass", "MSZoning", "Street", "Alley",
    "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope",
    "Neighborhood", "Condition1", "Condition2",
    "BldgType", "HouseStyle", "OverallQual", "OverallCond",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
    "ExterQual", "ExterCond", "Foundation",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "Heating", "HeatingQC", "CentralAir", "Electrical",
    "KitchenQual", "Functional", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PavedDrive",
    "PoolQC", "Fence", "MiscFeature", "MiscVal",
    "SaleType", "SaleCondition",
]

In [49]:
# Load the raw (uncleaned) training and test frames.
# keep_default_na=False stops pandas from parsing strings such as "NA" as NaN,
# so every value arrives exactly as written in the CSV.

df = pd.read_csv("data/train.csv", keep_default_na=False)
df_test = pd.read_csv("data/test.csv", keep_default_na=False)

# Take an explicit copy of the Id column so that adding the prediction column
# later writes to an independent frame rather than to a slice of df_test.
final_df = df_test[["Id"]].copy()

In [50]:
# Every test-set column that is not listed in cat_cols is treated as numerical.
numerical_cols = list(filter(lambda name: name not in cat_cols, df_test.columns))

In [51]:
# Run the training frame and then the test frame through the same cleaning routine.
df, df_test = clean(df), clean(df_test)

In [52]:
# Features: every column except the row identifier and the target.
X = df.drop(columns=["Id", "SalePrice"])
# Target: the sale price column.
y = df.loc[:, "SalePrice"]

In [64]:
# Drop the identifier so the test features match the columns used for training.
# A new frame is bound instead of mutating in place, and errors="ignore" makes
# the cell safe to re-run after "Id" has already been removed.
df_test = df_test.drop(columns="Id", errors="ignore")

In [53]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error

# Step 1: Hold out 20% of the training rows for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Wrap both splits in CatBoost Pools together with the categorical column names
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
test_pool = Pool(X_test, y_test, cat_features=cat_cols)

# Step 3: Initialize and train CatBoostRegressor
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    cat_features=cat_cols,
    verbose=100,
    early_stopping_rounds=50
)

# The hold-out pool doubles as the early-stopping set, so the scores printed
# below are a somewhat optimistic estimate of out-of-sample error.
model.fit(train_pool, eval_set=test_pool)

# Step 4: Evaluate on the hold-out rows.
# The square root of mean_squared_log_error is the RMSLE (error on a log scale),
# not an RMSE in price units, so it is named and labelled accordingly.
y_pred = model.predict(X_test)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"\nTest RMSLE: {rmsle:.4f}")
print(f"Test R²: {r2:.4f}")


0:	learn: 74869.1893709	test: 85731.1692489	best: 85731.1692489 (0)	total: 8.83ms	remaining: 8.82s
100:	learn: 20848.9693577	test: 29793.6818527	best: 29793.6818527 (100)	total: 577ms	remaining: 5.14s
200:	learn: 16603.6823587	test: 27303.6101971	best: 27303.6101971 (200)	total: 1.17s	remaining: 4.63s
300:	learn: 13982.9674562	test: 26550.5261468	best: 26550.3671621 (299)	total: 1.8s	remaining: 4.19s
400:	learn: 12128.7179000	test: 26278.3355023	best: 26278.3355023 (400)	total: 2.47s	remaining: 3.69s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 26201.16552
bestIteration = 440

Shrink model to first 441 iterations.

Test RMSE: 0.14
Test R²: 0.9105


In [54]:
# Refit the model on the entire training set

train_pool = Pool(X, y, cat_features=cat_cols)

# Same hyperparameters as the validation run, minus early_stopping_rounds:
# fit() below receives no eval_set, so the overfitting detector has nothing to
# monitor and all 1000 iterations are trained either way.
# NOTE(review): consider capping iterations at the best iteration found in the
# validation run instead of always training the full 1000.
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    cat_features=cat_cols,
    verbose=100,
)

model.fit(train_pool)

# Sanity check only: X_test was split off from X, so the model has already been
# trained on these rows. The scores are in-sample, not a generalization estimate.
# The square root of mean_squared_log_error is the RMSLE, not an RMSE.
y_pred = model.predict(X_test)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"\nIn-sample RMSLE: {rmsle:.4f}")
print(f"In-sample R²: {r2:.4f}")

0:	learn: 76988.4712201	total: 11.9ms	remaining: 11.9s
100:	learn: 21435.8041434	total: 570ms	remaining: 5.07s
200:	learn: 17058.5484466	total: 1.13s	remaining: 4.5s
300:	learn: 15080.1716663	total: 1.7s	remaining: 3.95s
400:	learn: 13429.0259707	total: 2.29s	remaining: 3.42s
500:	learn: 12195.1291931	total: 2.96s	remaining: 2.95s
600:	learn: 11167.7822228	total: 3.57s	remaining: 2.37s
700:	learn: 10170.2804623	total: 4.14s	remaining: 1.77s
800:	learn: 9362.8669947	total: 4.73s	remaining: 1.17s
900:	learn: 8618.3619262	total: 5.32s	remaining: 584ms
999:	learn: 7965.0304011	total: 5.89s	remaining: 0us

Test RMSE: 0.07
Test R²: 0.9853


In [65]:
# Predict sale prices for the cleaned test-set features.
# NOTE(review): this rebinds y_test, which the training cells use for the
# hold-out targets; re-running their evaluation lines after this cell would
# score against these predictions instead.
y_test = model.predict(df_test)

In [66]:
# Attach the predictions as the SalePrice column. assign() builds a new frame
# instead of writing into final_df, which was created by selecting from df_test.
final_df = final_df.assign(SalePrice=y_test)

In [73]:
# Write the submission table to disk; index=False keeps the row index out of the file.
final_df.to_csv("final_submission.csv", index=False)

In [74]:
# Display the submission frame as the cell output.
final_df

Unnamed: 0,Id,SalePrice
0,1461,121402.176783
1,1462,170220.415029
2,1463,185175.387094
3,1464,194510.988379
4,1465,185341.180459
...,...,...
1454,2915,78725.237347
1455,2916,82270.954017
1456,2917,167172.920472
1457,2918,118114.048921
