In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


In [17]:
# Numerical features (names match the dataset's column headers).
# Group labels below are inferred from the column names and are only a reading
# aid; the order of the list itself is unchanged.
numerical_cols = [
    # lot / masonry veneer
    'LotFrontage', 'LotArea', 'MasVnrArea',
    # basement areas
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    # floor / living areas
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    # garage
    'GarageArea', 'GarageCars',
    # deck / porches
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    # pool / miscellaneous
    'PoolArea', 'MiscVal',
    # rooms / fireplaces
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    # bathrooms
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    # sale month / year
    'MoSold', 'YrSold',
    # year columns
    'YearBuilt', 'YearRemodAdd', 'GarageYrBlt',
]

# Ordinal columns and their levels. Each list is later handed to the
# OrdinalEncoder as that column's categories, so a level's position in its
# list is the value it is encoded to.
# Level lists shared by several columns are written once and copied per column
# so every entry stays an independent list.
_quality = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
_quality_with_na = ['NA'] + _quality
_bsmt_finish = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']

ordinal_map = {
    'ExterQual': list(_quality),
    'ExterCond': list(_quality),
    'BsmtQual': list(_quality_with_na),
    'BsmtCond': list(_quality_with_na),
    'BsmtExposure': ['NA', 'No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': list(_bsmt_finish),
    'BsmtFinType2': list(_bsmt_finish),
    'HeatingQC': list(_quality),
    'KitchenQual': list(_quality),
    'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    'FireplaceQu': list(_quality_with_na),
    'GarageFinish': ['NA', 'Unf', 'RFn', 'Fin'],
    'GarageQual': list(_quality_with_na),
    'GarageCond': list(_quality_with_na),
    'PavedDrive': ['N', 'P', 'Y'],
    'PoolQC': ['NA', 'Fa', 'TA', 'Gd', 'Ex'],
    'Fence': ['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
    # Integer-valued columns: levels 1 through 10.
    'OverallQual': list(range(1, 11)),
    'OverallCond': list(range(1, 11)),
}

# Column names in the dict's insertion order.
ordinal_cols = [*ordinal_map]

# Nominal (unordered) categorical features, using the dataset's column names.
# Group labels are inferred from the column names and are only a reading aid;
# the order of the list itself is unchanged.
nominal_cols = [
    # dwelling class / zoning / access
    'MSSubClass', 'MSZoning', 'Street', 'Alley',
    # lot and land
    'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    # location
    'Neighborhood', 'Condition1', 'Condition2',
    # building type / style
    'BldgType', 'HouseStyle',
    # roof and exterior
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    # foundation and systems
    'Foundation', 'Heating', 'CentralAir', 'Electrical',
    # garage / miscellaneous
    'GarageType', 'MiscFeature',
    # sale
    'SaleType', 'SaleCondition',
]


In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from catboost import CatBoostRegressor, Pool

# === Step 1: Load Data ===
# keep_default_na=False stops pandas from turning strings such as "NA" into NaN
# while parsing; the cleaning step decides per column type what "NA" means.
df = pd.read_csv("data/train.csv", keep_default_na=False)

# === Step 2: Identify target ===
y = df["SalePrice"]
# SalePrice is required (it is the target). Id is dropped only when present:
# errors="ignore" makes the code match that intent instead of raising KeyError.
df = df.drop(columns=["SalePrice"]).drop(columns=["Id"], errors="ignore")

# === Step 3: Collect object-dtype columns ===
# Every column pandas parsed as object at this point. Ordinal columns are NOT
# filtered out here, and numeric columns that contain text markers may also be
# included (TODO confirm against the data before relying on this list).
cat_cols = df.select_dtypes(include='object').columns.tolist()


# === Step 4: Data Cleaning & Transformation ===

# Text values treated as "missing". Defined once, before first use, and shared
# by the numerical, ordinal and nominal passes so they all agree on the markers.
missing_like = ['NA', 'N/A', 'na', 'n/a', 'NaN', 'nan', 'None', 'none', '', ' ']

# --- Clean and impute numerical columns ---
# Missing markers and any other non-numeric text become NaN, then NaN is
# filled with the column median.
# NOTE(review): the median is taken over the whole frame, i.e. before the
# train/validation split, so validation rows contribute to the fill value.
for col in numerical_cols:
    numeric = pd.to_numeric(df[col].replace(missing_like, np.nan), errors='coerce')
    df[col] = numeric.fillna(numeric.median())

# --- Encode ordinal columns ---
# Levels come from ordinal_map (stringified, because the data is cast to str
# before encoding). A level's list position is its code; any value that is not
# listed is encoded as -1.
ordinal_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    categories=[[str(level) for level in ordinal_map[col]] for col in ordinal_cols]
)

# Missing markers (including 'NA') are rewritten to 'missing', which is not a
# listed level, so they are encoded as -1. As a consequence the 'NA' entries
# declared in ordinal_map are never matched.
for col in ordinal_cols:
    df[col] = df[col].replace(missing_like, 'missing').fillna('missing')

df[ordinal_cols] = ordinal_encoder.fit_transform(df[ordinal_cols].astype(str))

# --- Mark missing values in nominal columns and cast them to category dtype ---
for col in nominal_cols:
    if col in df.columns:
        df[col] = df[col].replace(missing_like, 'missing').astype('category')
    
        
# === Step 5: Train-Test Split ===
# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(df, y, test_size=0.2, random_state=42)

# === Step 6: CatBoost Model Training ===
# Positional indices of the nominal columns. Only columns that exist in the
# frame are used, which matches the `if col in df.columns` guard applied when
# the nominal columns were cast to category; a listed-but-absent column is
# skipped here too instead of raising KeyError.
cat_features = [
    X_train.columns.get_loc(col)
    for col in nominal_cols
    if col in X_train.columns
]

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=42,
    early_stopping_rounds=50,  # evaluated against the eval_set passed to fit()
    verbose=100                # log every 100th iteration
)

train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)

# NOTE(review): the validation pool is the eval_set here, and the same rows are
# scored again in the evaluation step, so that RMSE is likely optimistic.
model.fit(train_pool, eval_set=val_pool)

# === Step 7: Evaluate ===
# Root-mean-squared error of the model's predictions on the validation split.
preds = model.predict(X_val)
squared_errors = np.square(preds - y_val)
rmse = np.sqrt(np.mean(squared_errors))
print(f"\nValidation RMSE: {rmse:.2f}")

0:	learn: 74809.3608346	test: 85046.2254447	best: 85046.2254447 (0)	total: 5.17ms	remaining: 5.16s
100:	learn: 20708.9409861	test: 29981.6481655	best: 29981.6481655 (100)	total: 324ms	remaining: 2.88s
200:	learn: 16270.8895995	test: 27446.4889387	best: 27446.4889387 (200)	total: 645ms	remaining: 2.56s
300:	learn: 13739.6431446	test: 26459.7033043	best: 26459.2868553 (299)	total: 1.03s	remaining: 2.39s
400:	learn: 12084.5051635	test: 26124.5744669	best: 26124.5744669 (400)	total: 1.4s	remaining: 2.09s
500:	learn: 10655.5543781	test: 25735.9250319	best: 25733.2332930 (498)	total: 1.76s	remaining: 1.76s
600:	learn: 9530.8924739	test: 25590.9371221	best: 25590.9371221 (600)	total: 2.14s	remaining: 1.42s
700:	learn: 8641.3362976	test: 25525.5262413	best: 25521.8343136 (694)	total: 2.49s	remaining: 1.06s
800:	learn: 7922.2583625	test: 25428.2158915	best: 25414.2919209 (789)	total: 2.84s	remaining: 706ms
900:	learn: 7153.6713153	test: 25366.5569065	best: 25344.9860632 (885)	total: 3.21s	remai

In [30]:
import pandas as pd
import numpy as np

# Prerequisites (set up by earlier cells):
# - `df`: the processed training features (the training cell has already
#   dropped 'SalePrice' and 'Id' and imputed the numerical columns)
# - `numerical_cols`, `ordinal_cols`, `nominal_cols`
# - `ordinal_encoder` and `model`, both already fitted on the training data
# - `df.columns` is used below as the column order for prediction

# keep_default_na=False: strings such as "NA" stay as text, as in the training load.
df_test = pd.read_csv("data/test.csv", keep_default_na=False)

# Take an explicit copy: a column is added to final_df later, and assigning
# into a bare column selection of df_test triggers pandas'
# SettingWithCopyWarning (silenced in this notebook by the global warnings filter).
final_df = df_test[["Id"]].copy()

# Step 1: Replace missing-like strings
missing_like = ['NA', 'N/A', 'na', 'n/a', 'NaN', 'nan', 'None', 'none', '', ' ']

# Numerical columns: coerce to numbers, then fill gaps with the median of the
# same column in the training frame `df`.
for col in numerical_cols:
    if col not in df_test.columns:
        continue
    df_test[col] = pd.to_numeric(
        df_test[col].replace(missing_like, np.nan), errors='coerce'
    )
    median_val = df[col].median()  # use median from training data
    df_test[col] = df_test[col].fillna(median_val)

# Ordinal columns: missing markers become 'missing' before encoding.
for col in ordinal_cols:
    if col not in df_test.columns:
        continue
    df_test[col] = df_test[col].replace(missing_like, 'missing').fillna('missing')

# NOTE(review): unlike the loops, this transform is not guarded; it needs every
# ordinal column to be present in df_test.
df_test[ordinal_cols] = ordinal_encoder.transform(df_test[ordinal_cols].astype(str))

# Nominal categorical columns (used by CatBoost): mark missing, cast to category.
for col in nominal_cols:
    if col not in df_test.columns:
        continue
    df_test[col] = df_test[col].replace(missing_like, 'missing').astype('category')

# Step 2: Select the columns of the processed training frame `df`, in its order
df_test = df_test.loc[:, df.columns]

# Step 3: Predict using trained model
predictions = model.predict(df_test)

# Step 4: Show the predictions as this cell's output
predictions

array([123714.32881146, 163229.51110484, 187564.18803941, ...,
       162825.30425278, 122741.4683695 , 208956.89079668])

In [31]:
# Attach the predictions as the SalePrice column. `assign` returns a new frame
# instead of writing into final_df in place; final_df originates from a column
# selection of df_test, and in-place column assignment on such a selection is
# what pandas flags with SettingWithCopyWarning.
final_df = final_df.assign(SalePrice=predictions)

In [32]:
# Write the submission frame to CSV; index=False leaves the row index out of the file.
submission_path = "final_submission2.csv"
final_df.to_csv(submission_path, index=False)

In [33]:
# Display the submission frame (Id plus the predicted SalePrice) as the cell output.
final_df

Unnamed: 0,Id,SalePrice
0,1461,123714.328811
1,1462,163229.511105
2,1463,187564.188039
3,1464,188852.937783
4,1465,199269.758146
...,...,...
1454,2915,85572.939348
1455,2916,83157.250335
1456,2917,162825.304253
1457,2918,122741.468370
