In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Gather the full path of every file found below the Kaggle input directory,
# then print them one per line in the order os.walk yields them.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/iml-challenge-2-russian-cities-housing-challenge/sample_submission.csv
/kaggle/input/iml-challenge-2-russian-cities-housing-challenge/test/test.csv
/kaggle/input/iml-challenge-2-russian-cities-housing-challenge/train/train.csv


In [2]:
# ================================
# IMPORTS
# ================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

# GPU Boosting Models
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


In [3]:
# ================================
# LOAD DATA
# ================================
train_path = "/kaggle/input/iml-challenge-2-russian-cities-housing-challenge/train/train.csv"
test_path  = "/kaggle/input/iml-challenge-2-russian-cities-housing-challenge/test/test.csv"

# Read both CSV files with default parser settings: train first, then test.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Show (rows, columns) for each frame as a quick sanity check.
print(train_df.shape, test_df.shape)


(181507, 279) (77789, 278)


In [5]:
# ================================
# TRAIN / VALIDATION SPLIT
# ================================

TARGET = "price_doc"

# Separate the regression target from every other column of the training frame.
y = train_df[TARGET]
X = train_df.drop(TARGET, axis=1)

# Hold out 30% of the rows for validation; the fixed random_state makes the
# split repeatable.
# (Original author's note: same proportions as Task 1, but NOT the same seed.)
split_kwargs = {"test_size": 0.30, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_kwargs)


In [6]:
# ================================
# PREPROCESSING PIPELINE
# ================================

# Column groups are chosen by dtype; any column matching neither group is
# discarded by the ColumnTransformer below (remainder="drop").
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Object columns: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen at fit time are ignored at transform time instead of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own sub-pipeline, using all available cores.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(
    transformers=column_routes,
    remainder="drop",
    n_jobs=-1,
)


In [9]:
# ================================
# MODEL DEFINITIONS
# ================================

# Candidate regressors keyed by the display name used in the results table.
# Every estimator that takes a seed is pinned to random_state=42.
# NOTE: the three boosters below request GPU training (device="cuda",
# device="gpu", task_type="GPU"), so this cell presumes a GPU-enabled session.
models = {
    "RegressionTree": DecisionTreeRegressor(random_state=42),

    "LinearRegression": LinearRegression(),

    "GradientBoostingRegressor": GradientBoostingRegressor(
        random_state=42
    ),

    # GPU Boosters
    "XGBoost": xgb.XGBRegressor(
        tree_method="hist",
        device="cuda",
        eval_metric="rmse",
        random_state=42,
        enable_categorical=False
    ),

    "LightGBM": LGBMRegressor(
        device="gpu",
        random_state=42,
        # Only fatal messages are logged; this keeps LightGBM's per-fit
        # [Info]/[Warning] lines from flooding the cell output.
        verbose=-1
    ),

    "CatBoost": CatBoostRegressor(
        task_type="GPU",
        devices="0",
        verbose=False,
        random_state=42
    )
}


In [10]:
# ================================
# TRAIN + VALIDATE ALL MODELS
# ================================

# One [model name, validation RMSE] row per candidate, in `models` order.
results = []

for name, model in models.items():
    # Bundle preprocessing with the estimator so the transformers are fitted
    # on the training split only and then re-applied to the validation split.
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)

    # RMSE is taken as the square root of the MSE. The previous
    # `mean_squared_error(..., squared=False)` form is deprecated in
    # scikit-learn 1.4 and removed in 1.6, whereas this works on every version.
    rmse = np.sqrt(mean_squared_error(y_val, preds))

    results.append([name, rmse])
    print(f"{name} RMSE: {rmse:.4f}")


RegressionTree RMSE: 17.8898
LinearRegression RMSE: 13.6710
GradientBoostingRegressor RMSE: 12.7993
XGBoost RMSE: 13.2251
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 67227
[LightGBM] [Info] Number of data points in the train set: 127054, number of used features: 596
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 264 dense feature groups (31.99 MB) transferred to GPU in 0.030732 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 14.852695
LightGBM RMSE: 12.8538
CatBoost RMSE: 12.8787


In [11]:
# ================================
# RESULTS TABLE
# ================================
# Tabulate the collected rows and order them by ascending RMSE (best first).
# The original row labels are kept, so the index shows each model's insertion order.
results_df = pd.DataFrame(results, columns=["Model", "RMSE"]).sort_values("RMSE")
results_df


Unnamed: 0,Model,RMSE
2,GradientBoostingRegressor,12.799301
4,LightGBM,12.853772
5,CatBoost,12.87873
3,XGBoost,13.225103
1,LinearRegression,13.67096
0,RegressionTree,17.889833


In [12]:
# ================================
# TRAIN BEST MODEL (FULL DATA)
# ================================

# Pick the model with the lowest validation RMSE explicitly, rather than
# trusting that `results_df` is still sorted ascending when this cell runs.
best_model_name = results_df.loc[results_df["RMSE"].idxmin(), "Model"]
best_model = models[best_model_name]

print("Training BEST model:", best_model_name)

# Same preprocessing as during validation, with the winning estimator on top.
final_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", best_model)
])

# Refit on all labelled rows (train + validation) before scoring the test set.
final_pipe.fit(X, y)


Training BEST model: GradientBoostingRegressor


In [15]:
# ================================
# TEST PREDICTIONS + SUBMISSION
# ================================

# Score the test frame with the fitted final pipeline.
test_preds = final_pipe.predict(test_df)

# Build the two-column submission: the test row id, then its predicted price.
# Change "id" here if the identifier column has a different name.
submission = pd.DataFrame({"id": test_df["id"]})
submission["price_doc"] = test_preds

# Write the submission to the working directory without the index column.
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)

submission.head()


Unnamed: 0,id,price_doc
0,243467,6.995674
1,230180,11.709462
2,256036,3.997885
3,1848,4.042517
4,68720,14.411258


In [16]:
from IPython.display import FileLink

# Show a download link for the submission file. Reusing `submission_path`
# (set where the CSV is written) keeps the link on the file actually saved,
# instead of repeating the literal file name.
FileLink(submission_path)
