In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/iml-challenge-2-russian-cities-housing-challenge/sample_submission.csv
/kaggle/input/iml-challenge-2-russian-cities-housing-challenge/test/test.csv
/kaggle/input/iml-challenge-2-russian-cities-housing-challenge/train/train.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
import warnings
import time
warnings.filterwarnings('ignore')

In [3]:
# ================================
# LOAD DATA
# ================================
# Both CSVs are read once here; every later cell works from train_df / test_df.
train_path, test_path = (
    "/kaggle/input/iml-challenge-2-russian-cities-housing-challenge/train/train.csv",
    "/kaggle/input/iml-challenge-2-russian-cities-housing-challenge/test/test.csv",
)

train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Quick sanity check on the (rows, columns) of each frame.
print(train_df.shape, test_df.shape)


(181507, 279) (77789, 278)


In [5]:
# ============================================================================
# CONFIGURATION
# ============================================================================
# Every tunable value for the run is defined here so later cells never hard-code it.
ERP_ID = 27857
RANDOM_STATE = 42
VALIDATION_SIZE = 0.25  # fraction of the training rows held out for validation
USE_FEATURE_SELECTION = True
MAX_FEATURES = 50  # number of numeric features kept by the feature-selection step

np.random.seed(RANDOM_STATE)

# Build the human-readable split label from VALIDATION_SIZE instead of typing it
# by hand, so the banner can never disagree with the split that is actually used.
_train_pct = round((1 - VALIDATION_SIZE) * 100)
SPLIT_LABEL = f"{_train_pct}/{100 - _train_pct}"

print("="*80)
print("TASK 2: KAGGLE SUBMISSION - FULL TRAINING SET")
print("="*80)
print(f"ERP ID: {ERP_ID}")
print(f"Random State: {RANDOM_STATE}")
print(f"Validation Split: {SPLIT_LABEL} (NEW - different from Task 1's 70/30)")
print(f"Max Features: {MAX_FEATURES}")

TASK 2: KAGGLE SUBMISSION - FULL TRAINING SET
ERP ID: 27857
Random State: 42
Validation Split: 75/25 (NEW - different from Task 1's 70/30)
Max Features: 50


In [6]:
# ============================================================================
# HELPER FUNCTIONS (SAME AS TASK 1)
# ============================================================================

def create_features(df):
    """Return a copy of ``df`` extended with derived housing features.

    Each derived column is added only when all of its source columns are
    present, so the function can be applied to frames with differing schemas.
    The input frame is never modified.

    Derived columns (in insertion order):
        building_age        2015 - build_year, floored at 0
        sqm_per_room        full_sq / (num_room + 1)
        living_area_ratio   life_sq / (full_sq + 1)
        kitchen_area_ratio  kitch_sq / (full_sq + 1)
        floor_ratio         floor / (max_floor + 1)
        is_first_floor      1 where floor == 1, else 0
        is_last_floor       1 where floor == max_floor, else 0
    """
    out = df.copy()
    # Snapshot of the source columns; none of the derived names collide with them.
    available = set(out.columns)

    def has(*names):
        return all(name in available for name in names)

    def ratio(numerator, denominator):
        # The +1 in the denominator avoids division by zero.
        return out[numerator] / (out[denominator] + 1)

    if has('build_year'):
        out['building_age'] = (2015 - out['build_year']).clip(lower=0)

    if has('full_sq', 'num_room'):
        out['sqm_per_room'] = ratio('full_sq', 'num_room')

    for new_col, source in (('living_area_ratio', 'life_sq'),
                            ('kitchen_area_ratio', 'kitch_sq')):
        if has(source, 'full_sq'):
            out[new_col] = ratio(source, 'full_sq')

    if has('floor', 'max_floor'):
        floor, top_floor = out['floor'], out['max_floor']
        out['floor_ratio'] = floor / (top_floor + 1)
        out['is_first_floor'] = (floor == 1).astype(int)
        out['is_last_floor'] = (floor == top_floor).astype(int)

    return out

def quick_feature_selection(X, y, n_features=50, random_state=None):
    """Pick the ``n_features`` numeric columns with the highest F-statistic.

    Only numeric columns of ``X`` are considered. If there are no more than
    ``n_features`` of them, all numeric column names are returned unchanged.
    Otherwise missing values are median-imputed and ``SelectKBest`` with
    ``f_regression`` ranks the columns against ``y``.

    Args:
        X: Feature DataFrame (may contain non-numeric columns, which are ignored).
        y: Target values aligned with the rows of ``X``.
        n_features: Maximum number of column names to return.
        random_state: Accepted for interface compatibility; not used by this
            function.

    Returns:
        list: Names of the selected numeric columns, in their original order.
    """
    print(f"\n[Feature Selection] Selecting top {n_features} features...")
    numeric_X = X.select_dtypes(include=[np.number])

    if numeric_X.shape[1] <= n_features:
        print(f"  Only {numeric_X.shape[1]} numeric features available, using all")
        return numeric_X.columns.tolist()

    # A median SimpleImputer silently discards columns that are entirely NaN,
    # which would make the selector's support mask shorter than the column
    # index and misalign (or crash) the name lookup below. Remove such columns
    # up front so the imputed matrix and the column index always line up.
    usable_X = numeric_X.loc[:, numeric_X.notna().any(axis=0)]
    n_empty = numeric_X.shape[1] - usable_X.shape[1]
    if n_empty:
        print(f"  Ignoring {n_empty} numeric features with no observed values")

    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(usable_X)

    # F-statistic selection (fast and effective). k is capped because dropping
    # empty columns may leave fewer candidates than requested.
    k = min(n_features, usable_X.shape[1])
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X_imputed, y)
    selected = usable_X.columns[selector.get_support()].tolist()

    print(f"  Selected {len(selected)} features")
    return selected

def evaluate_model(name, model, X_train, X_val, y_train, y_val, results_dict):
    """Fit ``model``, score it on both splits and record the outcome.

    The model is fitted on the training split and then predicts the training
    and validation splits. MSE, RMSE, R^2 and MAE are computed for each split
    and stored under ``results_dict[name]`` together with the wall-clock
    fit time, the combined prediction time for both splits, and the fitted
    model object. A short validation summary is printed.

    Args:
        name: Key used for the entry in ``results_dict`` and in the banner.
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Training features and targets.
        X_val, y_val: Validation features and targets.
        results_dict: Dict that is updated in place with this model's entry.

    Returns:
        The same ``model`` object, after fitting.
    """
    def _score(y_true, y_pred):
        # One pass per split: (mse, rmse, r2, mae).
        mse = mean_squared_error(y_true, y_pred)
        return (
            mse,
            np.sqrt(mse),
            r2_score(y_true, y_pred),
            mean_absolute_error(y_true, y_pred),
        )

    print(f"\n{'='*70}")
    print(f"Training: {name}")
    print(f"{'='*70}")

    # Fit, timing only the call to fit().
    fit_started = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - fit_started

    # Predict both splits; the timing covers the two predict() calls together.
    predict_started = time.time()
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    inference_time = time.time() - predict_started

    train_mse, train_rmse, train_r2, train_mae = _score(y_train, y_train_pred)
    val_mse, val_rmse, val_r2, val_mae = _score(y_val, y_val_pred)

    # Key order is kept stable because the comparison table is built from it.
    results_dict[name] = {
        'train_mse': train_mse,
        'val_mse': val_mse,
        'train_rmse': train_rmse,
        'val_rmse': val_rmse,
        'train_r2': train_r2,
        'val_r2': val_r2,
        'train_mae': train_mae,
        'val_mae': val_mae,
        'train_time': train_time,
        'inference_time': inference_time,
        'model': model
    }

    print(f"\nResults:")
    print(f"  Validation MSE:  {val_mse:.4f}")
    print(f"  Validation RMSE: {val_rmse:.4f} million RUB")
    print(f"  Validation R¬≤:   {val_r2:.4f}")
    print(f"  Training time:   {train_time:.2f}s")

    return model

In [9]:
# ============================================================================
# STEP 1: LOAD FULL DATASET (REQUIREMENT #1)
# ============================================================================
# No file I/O happens in this cell: train_df and test_df were already read by
# the LOAD DATA cell above. This cell only reports their shapes and row count.
print("\n" + "="*80)
print("STEP 1: LOADING FULL DATASET")
print("="*80)


print(f"\n‚úì Full training data loaded: {train_df.shape}")
print(f"‚úì Test data loaded: {test_df.shape}")
print(f"‚úì Using ALL {len(train_df):,} training samples (NO sampling)")


STEP 1: LOADING FULL DATASET

‚úì Full training data loaded: (181507, 279)
‚úì Test data loaded: (77789, 278)
‚úì Using ALL 181,507 training samples (NO sampling)


In [10]:
# ============================================================================
# STEP 2: FEATURE ENGINEERING (SAME AS TASK 1)
# ============================================================================
print("\n" + "="*80)
print("STEP 2: FEATURE ENGINEERING")
print("="*80)

# Apply the identical feature recipe to both frames so their derived columns match.
train_full, test_full = map(create_features, (train_df, test_df))

print(f"\n‚úì Feature engineering completed")
print(f"  Training features: {train_full.shape[1]}")
print(f"  Test features: {test_full.shape[1]}")


STEP 2: FEATURE ENGINEERING

‚úì Feature engineering completed
  Training features: 280
  Test features: 279


In [11]:
# ============================================================================
# STEP 3: PREPARE DATA
# ============================================================================
print("\n" + "="*80)
print("STEP 3: DATA PREPARATION")
print("="*80)

# Target vector and feature matrix from the engineered training frame.
y_full = train_full['price_doc']
X_full = train_full.drop(columns='price_doc')

# Identifier/timestamp columns are excluded from the features; columns that
# are absent from a frame are simply skipped.
cols_to_drop = ['id', 'timestamp']
X_full = X_full.drop(columns=cols_to_drop, errors='ignore')
X_test = test_full.drop(columns=cols_to_drop, errors='ignore')

# The ids are taken from test_full (not X_test) because X_test no longer has them.
test_ids = test_full['id'].values

print(f"\n‚úì Features prepared: {X_full.shape[1]}")
print(f"‚úì Test IDs stored: {len(test_ids)}")


STEP 3: DATA PREPARATION

‚úì Features prepared: 278
‚úì Test IDs stored: 77789


In [12]:
# ============================================================================
# STEP 4: NEW TRAIN-VALIDATION SPLIT (75/25 for Task 2)
# ============================================================================
print("\n" + "="*80)
print("STEP 4: TRAIN-VALIDATION SPLIT (NEW 75/25)")
print("="*80)

# The held-out fraction comes from VALIDATION_SIZE and the shuffle is fixed by
# RANDOM_STATE, both defined in the configuration cell.
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, 
    test_size=VALIDATION_SIZE, 
    random_state=RANDOM_STATE
)

print(f"\n‚úì Training set: {X_train.shape} ({len(X_train):,} samples)")
print(f"‚úì Validation set: {X_val.shape} ({len(X_val):,} samples)")
print(f"‚úì Test set: {X_test.shape}")
print(f"‚úì Random state: {RANDOM_STATE}")
print(f"\n‚ö†Ô∏è  NOTE: This is DIFFERENT from Task 1's 70/30 split!")


STEP 4: TRAIN-VALIDATION SPLIT (NEW 75/25)

‚úì Training set: (136130, 278) (136,130 samples)
‚úì Validation set: (45377, 278) (45,377 samples)
‚úì Test set: (77789, 278)
‚úì Random state: 42

‚ö†Ô∏è  NOTE: This is DIFFERENT from Task 1's 70/30 split!


In [22]:
# ============================================================================
# STEP 5: FEATURE SELECTION
# ============================================================================
print("\n" + "="*80)
print("STEP 5: FEATURE SELECTION")
print("="*80)

# The column list is always decided from the TRAINING frame only and then
# applied verbatim to the validation and test frames. Selecting numeric columns
# separately per frame could yield different column sets if a column's dtype
# differs between frames, which would break the models downstream.
if USE_FEATURE_SELECTION:
    selected_features = quick_feature_selection(X_train, y_train, MAX_FEATURES, RANDOM_STATE)
    print(f"\n‚úì Using {len(selected_features)} selected features")
else:
    selected_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

X_train_selected = X_train[selected_features]
X_val_selected = X_val[selected_features]
X_test_selected = X_test[selected_features]


STEP 5: FEATURE SELECTION

[Feature Selection] Selecting top 50 features...
  Selected 50 features

‚úì Using 50 selected features


In [23]:
# Inspect the numeric feature names chosen by the feature-selection step.
print(selected_features)

['full_sq', 'life_sq', 'floor', 'culture_objects_top_25_raion', 'industrial_km', 'church_synagogue_km', 'office_count_500', 'office_sqm_500', 'trc_sqm_500', 'cafe_count_500', 'cafe_count_500_na_price', 'cafe_count_500_price_500', 'cafe_count_500_price_1000', 'cafe_count_500_price_1500', 'cafe_count_500_price_2500', 'cafe_count_500_price_4000', 'cafe_count_500_price_high', 'big_church_count_500', 'church_count_500', 'mosque_count_500', 'leisure_count_500', 'market_count_500', 'office_count_1000', 'office_sqm_1000', 'trc_sqm_1000', 'cafe_count_1000', 'cafe_count_1000_na_price', 'cafe_count_1000_price_500', 'cafe_count_1000_price_1000', 'cafe_count_1000_price_1500', 'cafe_count_1000_price_2500', 'cafe_count_1000_price_4000', 'cafe_count_1000_price_high', 'big_church_count_1000', 'church_count_1000', 'leisure_count_1000', 'cafe_count_1500', 'cafe_count_1500_na_price', 'cafe_count_1500_price_500', 'cafe_count_1500_price_1500', 'cafe_count_1500_price_2500', 'cafe_count_1500_price_4000', 'caf

In [24]:
# ============================================================================
# STEP 5b: ADD CATEGORICAL FEATURES (SAME AS TASK 1)
# ============================================================================
print("\n" + "="*80)
print("STEP 5b: ADDING CATEGORICAL FEATURES")
print("="*80)

categorical_features_all = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"\n‚úì Found {len(categorical_features_all)} categorical features")

if len(categorical_features_all) > 0:
    categorical_selected = [col for col in categorical_features_all 
                            if 2 <= X_train[col].nunique() <= 50]
    print(f"‚úì Selected {len(categorical_selected)} categorical features (2-50 categories)")
else:
    categorical_selected = []

final_features = selected_features + categorical_selected
X_train_combined = X_train[final_features]
X_val_combined = X_val[final_features]
X_test_combined = X_test[final_features]

print(f"\n‚úì FINAL FEATURE SET:")
print(f"  Numeric:      {len(selected_features)}")
print(f"  Categorical:  {len(categorical_selected)}")
print(f"  Total:        {len(final_features)}")


STEP 5b: ADDING CATEGORICAL FEATURES

‚úì Found 15 categorical features
‚úì Selected 14 categorical features (2-50 categories)

‚úì FINAL FEATURE SET:
  Numeric:      50
  Categorical:  14
  Total:        64


In [25]:
# ============================================================================
# STEP 6: PREPROCESSING PIPELINE (SAME AS TASK 1)
# ============================================================================
print("\n" + "="*80)
print("STEP 6: PREPROCESSING PIPELINE")
print("="*80)

numeric_features = X_train_combined.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train_combined.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"\n‚úì Numeric: {len(numeric_features)}, Categorical: {len(categorical_features)}")

# Numeric transformer
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical transformer
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, max_categories=20))
])

# Combined
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder='drop')

print("‚úì Preprocessing: Numeric (Median‚ÜíScale) | Categorical (Mode‚ÜíOneHot)")


STEP 6: PREPROCESSING PIPELINE

‚úì Numeric: 50, Categorical: 14
‚úì Preprocessing: Numeric (Median‚ÜíScale) | Categorical (Mode‚ÜíOneHot)


In [28]:
# Inspect the combined feature list (selected numeric + selected categorical).
print(len(final_features))
print(final_features)

64
['full_sq', 'life_sq', 'floor', 'culture_objects_top_25_raion', 'industrial_km', 'church_synagogue_km', 'office_count_500', 'office_sqm_500', 'trc_sqm_500', 'cafe_count_500', 'cafe_count_500_na_price', 'cafe_count_500_price_500', 'cafe_count_500_price_1000', 'cafe_count_500_price_1500', 'cafe_count_500_price_2500', 'cafe_count_500_price_4000', 'cafe_count_500_price_high', 'big_church_count_500', 'church_count_500', 'mosque_count_500', 'leisure_count_500', 'market_count_500', 'office_count_1000', 'office_sqm_1000', 'trc_sqm_1000', 'cafe_count_1000', 'cafe_count_1000_na_price', 'cafe_count_1000_price_500', 'cafe_count_1000_price_1000', 'cafe_count_1000_price_1500', 'cafe_count_1000_price_2500', 'cafe_count_1000_price_4000', 'cafe_count_1000_price_high', 'big_church_count_1000', 'church_count_1000', 'leisure_count_1000', 'cafe_count_1500', 'cafe_count_1500_na_price', 'cafe_count_1500_price_500', 'cafe_count_1500_price_1500', 'cafe_count_1500_price_2500', 'cafe_count_1500_price_4000', '

In [29]:
# ============================================================================
# STEP 7: TRAIN ALL REQUIRED MODELS (REQUIREMENT #2)
# ============================================================================
print("\n" + "="*80)
print("STEP 7: TRAINING ALL 6 REQUIRED MODELS")
print("="*80)

# Shared registry: each model cell below stores its metrics, timings and fitted
# model here under the model's display name. Re-running this cell clears it.
results = {}


STEP 7: TRAINING ALL 6 REQUIRED MODELS


In [30]:
# ============================================================================
# MODEL 1/6: REGRESSION TREE (REQUIRED)
# ============================================================================
print("\n" + "="*80)
print("MODEL 1/6: REGRESSION TREE")
print("="*80)

# The pipeline bundles preprocessing with the tree, so it is fitted on (and
# later must be given) the raw combined DataFrame, not a pre-transformed array.
tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(
        random_state=RANDOM_STATE,
        max_depth=15,
        min_samples_leaf=10,
        min_samples_split=20,
    )),
])

evaluate_model(
    name='Regression Tree',
    model=tree_pipeline,
    X_train=X_train_combined,
    X_val=X_val_combined,
    y_train=y_train,
    y_val=y_val,
    results_dict=results,
)


MODEL 1/6: REGRESSION TREE

Training: Regression Tree

Results:
  Validation MSE:  178.2366
  Validation RMSE: 13.3505 million RUB
  Validation R¬≤:   0.6128
  Training time:   13.32s


In [32]:
# ============================================================================
# MODEL 2/6: LASSO - BEST FROM TASK 1 (REQUIRED)
# ============================================================================
print("\n" + "="*80)
print("MODEL 2/6: LASSO (BEST FROM TASK 1)")
print("="*80)

print("\nNote: Using alpha=0.1 (best from Task 1)")
print("      Now training on 75% of FULL dataset")

# As with the tree, preprocessing lives inside the pipeline, so the raw
# combined DataFrame is what gets passed to fit/predict.
lasso_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Lasso(random_state=RANDOM_STATE, max_iter=10000, alpha=0.1)),
])

evaluate_model(
    name='Lasso (Task 1 Best)',
    model=lasso_pipeline,
    X_train=X_train_combined,
    X_val=X_val_combined,
    y_train=y_train,
    y_val=y_val,
    results_dict=results,
)


MODEL 2/6: LASSO (BEST FROM TASK 1)

Note: Using alpha=0.1 (best from Task 1)
      Now training on 75% of FULL dataset

Training: Lasso (Task 1 Best)

Results:
  Validation MSE:  173.0294
  Validation RMSE: 13.1541 million RUB
  Validation R¬≤:   0.6241
  Training time:   4.58s


In [33]:
# ============================================================================
# MODEL 3/6: GRADIENT BOOSTING (SKLEARN) (REQUIRED)
# ============================================================================
print("\n" + "="*80)
print("MODEL 3/6: GRADIENT BOOSTING (sklearn)")
print("="*80)

# Unlike the tree/Lasso pipelines, this model is trained on arrays that were
# transformed up front: the shared preprocessor is (re-)fitted on the training
# frame and then applied to the validation frame. The stored model therefore
# expects already-preprocessed input at prediction time.
X_train_gb = preprocessor.fit_transform(X_train_combined)
X_val_gb = preprocessor.transform(X_val_combined)

gb_model = GradientBoostingRegressor(
    random_state=RANDOM_STATE,
    n_estimators=200,
    learning_rate=0.05,
    subsample=0.8,
    max_depth=5,
    min_samples_leaf=15,
    min_samples_split=20,
    verbose=0,
)

evaluate_model(
    name='GradientBoosting',
    model=gb_model,
    X_train=X_train_gb,
    X_val=X_val_gb,
    y_train=y_train,
    y_val=y_val,
    results_dict=results,
)


MODEL 3/6: GRADIENT BOOSTING (sklearn)

Training: GradientBoosting

Results:
  Validation MSE:  162.9015
  Validation RMSE: 12.7633 million RUB
  Validation R¬≤:   0.6461
  Training time:   779.43s


In [34]:
# ============================================================================
# MODEL 4/6: LIGHTGBM (REQUIRED)
# ============================================================================
print("\n" + "="*80)
print("MODEL 4/6: LIGHTGBM")
print("="*80)

# Only the numeric columns are used here (the categorical columns are left
# out), with missing values replaced by the sentinel -999.
X_train_lgb, X_val_lgb = (
    frame.select_dtypes(include=[np.number]).fillna(-999)
    for frame in (X_train_combined, X_val_combined)
)

print("‚úì Training with early stopping...")

start_time = time.time()

lgb_model = lgb.LGBMRegressor(
    objective='regression',
    metric='rmse',
    random_state=RANDOM_STATE,
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=7,
    min_child_samples=20,
    colsample_bytree=0.8,
    subsample=0.8,
    verbose=-1,
)

# NOTE(review): the validation fold drives early stopping AND is scored below,
# so the reported validation metrics are somewhat optimistic.
lgb_model.fit(
    X_train_lgb,
    y_train,
    eval_set=[(X_val_lgb, y_val)],
    callbacks=[lgb.early_stopping(50, verbose=False)],
)

train_time = time.time() - start_time

# Evaluate: predictions for both splits are timed together.
start_time = time.time()
y_train_pred = lgb_model.predict(X_train_lgb)
y_val_pred = lgb_model.predict(X_val_lgb)
inference_time = time.time() - start_time

# MSE is computed once per split and reused for the RMSE.
train_mse_lgb = mean_squared_error(y_train, y_train_pred)
val_mse_lgb = mean_squared_error(y_val, y_val_pred)

results['LightGBM'] = {
    'train_mse': train_mse_lgb,
    'val_mse': val_mse_lgb,
    'train_rmse': np.sqrt(train_mse_lgb),
    'val_rmse': np.sqrt(val_mse_lgb),
    'train_r2': r2_score(y_train, y_train_pred),
    'val_r2': r2_score(y_val, y_val_pred),
    'train_mae': mean_absolute_error(y_train, y_train_pred),
    'val_mae': mean_absolute_error(y_val, y_val_pred),
    'train_time': train_time,
    'inference_time': inference_time,
    'model': lgb_model
}

print(f"\n‚úì Best iteration: {lgb_model.best_iteration_}")
print(f"  Validation MSE:  {results['LightGBM']['val_mse']:.4f}")
print(f"  Validation RMSE: {results['LightGBM']['val_rmse']:.4f}")
print(f"  Training time:   {train_time:.2f}s")


MODEL 4/6: LIGHTGBM
‚úì Training with early stopping...

‚úì Best iteration: 114
  Validation MSE:  161.2937
  Validation RMSE: 12.7001
  Training time:   2.73s


In [35]:
# ============================================================================
# MODEL 5/6: CATBOOST (REQUIRED)
# ============================================================================
print("\n" + "="*80)
print("MODEL 5/6: CATBOOST")
print("="*80)

# Numeric columns only, with missing values replaced by the sentinel -999.
X_train_cb, X_val_cb = (
    frame.select_dtypes(include=[np.number]).fillna(-999)
    for frame in (X_train_combined, X_val_combined)
)

print("‚úì Training with early stopping...")

start_time = time.time()

catboost_model = CatBoostRegressor(
    random_state=RANDOM_STATE,
    iterations=1000,
    early_stopping_rounds=50,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    verbose=False,
)

# NOTE(review): the validation fold drives early stopping AND is scored below,
# so the reported validation metrics are somewhat optimistic.
catboost_model.fit(X_train_cb, y_train, eval_set=(X_val_cb, y_val))

train_time = time.time() - start_time

# Evaluate: predictions for both splits are timed together.
start_time = time.time()
y_train_pred = catboost_model.predict(X_train_cb)
y_val_pred = catboost_model.predict(X_val_cb)
inference_time = time.time() - start_time

# MSE is computed once per split and reused for the RMSE.
train_mse_cb = mean_squared_error(y_train, y_train_pred)
val_mse_cb = mean_squared_error(y_val, y_val_pred)

results['CatBoost'] = {
    'train_mse': train_mse_cb,
    'val_mse': val_mse_cb,
    'train_rmse': np.sqrt(train_mse_cb),
    'val_rmse': np.sqrt(val_mse_cb),
    'train_r2': r2_score(y_train, y_train_pred),
    'val_r2': r2_score(y_val, y_val_pred),
    'train_mae': mean_absolute_error(y_train, y_train_pred),
    'val_mae': mean_absolute_error(y_val, y_val_pred),
    'train_time': train_time,
    'inference_time': inference_time,
    'model': catboost_model
}

print(f"\n‚úì Best iteration: {catboost_model.best_iteration_}")
print(f"  Validation MSE:  {results['CatBoost']['val_mse']:.4f}")
print(f"  Validation RMSE: {results['CatBoost']['val_rmse']:.4f}")
print(f"  Training time:   {train_time:.2f}s")


MODEL 5/6: CATBOOST
‚úì Training with early stopping...

‚úì Best iteration: 155
  Validation MSE:  161.7382
  Validation RMSE: 12.7176
  Training time:   4.68s


In [39]:
# ============================================================================
# MODEL 6/6: XGBOOST (REQUIRED)
# ============================================================================
print("\n" + "="*80)
print("MODEL 6/6: XGBOOST")
print("="*80)

# Numeric columns only, with missing values replaced by the sentinel -999.
X_train_xgb = X_train_combined.select_dtypes(include=[np.number]).fillna(-999)
X_val_xgb = X_val_combined.select_dtypes(include=[np.number]).fillna(-999)

print("‚úì Training with early stopping...")

start_time = time.time()

# early_stopping_rounds is set on the estimator rather than passed to fit():
# the fit() keyword was deprecated and later removed from the scikit-learn
# wrapper, so configuring it here keeps the cell working on current XGBoost.
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    early_stopping_rounds=50,
    verbosity=0
)

# NOTE(review): the validation fold drives early stopping AND is scored below,
# so the reported validation metrics are somewhat optimistic.
xgb_model.fit(
    X_train_xgb, y_train,
    eval_set=[(X_val_xgb, y_val)],
    verbose=False
)


train_time = time.time() - start_time

# Evaluate: predictions for both splits are timed together.
start_time = time.time()
y_train_pred = xgb_model.predict(X_train_xgb)
y_val_pred = xgb_model.predict(X_val_xgb)
inference_time = time.time() - start_time

results['XGBoost'] = {
    'train_mse': mean_squared_error(y_train, y_train_pred),
    'val_mse': mean_squared_error(y_val, y_val_pred),
    'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
    'val_rmse': np.sqrt(mean_squared_error(y_val, y_val_pred)),
    'train_r2': r2_score(y_train, y_train_pred),
    'val_r2': r2_score(y_val, y_val_pred),
    'train_mae': mean_absolute_error(y_train, y_train_pred),
    'val_mae': mean_absolute_error(y_val, y_val_pred),
    'train_time': train_time,
    'inference_time': inference_time,
    'model': xgb_model
}

print(f"\n‚úì Best iteration: {xgb_model.best_iteration}")
print(f"  Validation MSE:  {results['XGBoost']['val_mse']:.4f}")
print(f"  Validation RMSE: {results['XGBoost']['val_rmse']:.4f}")
print(f"  Training time:   {train_time:.2f}s")


MODEL 6/6: XGBOOST
‚úì Training with early stopping...

‚úì Best iteration: 72
  Validation MSE:  161.2559
  Validation RMSE: 12.6987
  Training time:   4.50s


In [40]:
# ============================================================================
# STEP 8: RESULTS COMPARISON (REQUIREMENT #5a)
# ============================================================================
print("\n" + "="*80)
print("STEP 8: COMPREHENSIVE MODEL COMPARISON")
print("="*80)

# One row per model, ordered from lowest to highest validation MSE. Because
# each entry also holds the fitted model object, the frame has object dtype.
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('val_mse')

print("\n‚úì DETAILED PERFORMANCE COMPARISON:")
print("="*80)
comparison_cols = ['val_mse', 'val_rmse', 'val_r2', 'val_mae', 'train_time', 'inference_time']
print(results_df[comparison_cols].to_string())

# Identify best model. The column is cast to float first so the reduction runs
# on numeric data instead of the frame's object dtype.
best_model_name = results_df['val_mse'].astype(float).idxmin()
best_model = results[best_model_name]['model']
best_val_mse = results_df.loc[best_model_name, 'val_mse']
best_val_rmse = results_df.loc[best_model_name, 'val_rmse']

print(f"\n" + "="*80)
print(f"üèÜ BEST MODEL: {best_model_name}")
print("="*80)
print(f"  Validation MSE:  {best_val_mse:.4f}")
print(f"  Validation RMSE: {best_val_rmse:.4f} million RUB")
print(f"  Validation R¬≤:   {results_df.loc[best_model_name, 'val_r2']:.4f}")

# Save results. The 'model' column is left out of the file: it holds estimator
# objects whose text repr is not useful in a metrics table.
results_df.drop(columns='model', errors='ignore').to_csv('task2_model_comparison.csv')
print(f"\n‚úì Results saved to 'task2_model_comparison.csv'")


STEP 8: COMPREHENSIVE MODEL COMPARISON

‚úì DETAILED PERFORMANCE COMPARISON:
                        val_mse   val_rmse    val_r2   val_mae  train_time inference_time
XGBoost               161.25595  12.698659  0.649688  6.052683    4.503516       0.199901
LightGBM             161.293731  12.700147  0.649606  6.095551    2.733824       0.531341
CatBoost             161.738196  12.717633   0.64864   6.13665    4.679461       0.050371
GradientBoosting     162.901466  12.763286  0.646113    6.1096  779.434361       0.896976
Lasso (Task 1 Best)  173.029436  13.154065  0.624111  6.686694    4.580143        0.89229
Regression Tree      178.236617  13.350529  0.612799  6.171607   13.319069       0.893535

üèÜ BEST MODEL: XGBoost
  Validation MSE:  161.2559
  Validation RMSE: 12.6987 million RUB
  Validation R¬≤:   0.6497

‚úì Results saved to 'task2_model_comparison.csv'


In [41]:
# ============================================================================
# STEP 9: GENERATE KAGGLE SUBMISSION (REQUIREMENT #4)
# ============================================================================
print("\n" + "="*80)
print("STEP 9: GENERATING KAGGLE SUBMISSION")
print("="*80)

print(f"\n‚úì Using best model: {best_model_name}")

# Prepare test data exactly the way each model's TRAINING input was prepared.
if best_model_name in ['Regression Tree', 'Lasso (Task 1 Best)']:
    # These models are full Pipelines that already contain the preprocessor, so
    # they must receive the raw DataFrame. Transforming first would run the
    # preprocessing twice and hand a plain array to a transformer that selects
    # columns by name.
    X_test_prepared = X_test_combined
elif best_model_name == 'GradientBoosting':
    # Trained on pre-transformed arrays, so the test frame is transformed here.
    X_test_prepared = preprocessor.transform(X_test_combined)
elif best_model_name in ['LightGBM', 'CatBoost', 'XGBoost']:
    # Trained on numeric columns only with -999 as the missing-value sentinel.
    X_test_prepared = X_test_combined.select_dtypes(include=[np.number]).fillna(-999)
else:
    # Fail loudly instead of silently reusing a stale X_test_prepared from an
    # earlier run (or hitting a NameError further down).
    raise ValueError(f"No test-data preparation defined for model '{best_model_name}'")

# Generate predictions
print(f"‚úì Generating predictions for {len(test_ids):,} test samples...")
test_predictions = best_model.predict(X_test_prepared)

# Create submission
submission = pd.DataFrame({
    'id': test_ids,
    'price_doc': test_predictions
})

submission.to_csv('submission.csv', index=False)

print(f"\n‚úì SUCCESS: Submission file created!")
print(f"  File: submission.csv")
print(f"  Rows: {len(submission):,}")
print(f"  Prediction range: [{test_predictions.min():.2f}, {test_predictions.max():.2f}]")
print(f"  Mean prediction: {test_predictions.mean():.2f} million RUB")

print("\n‚úì Sample predictions:")
print(submission.head(10).to_string(index=False))


STEP 9: GENERATING KAGGLE SUBMISSION

‚úì Using best model: XGBoost
‚úì Generating predictions for 77,789 test samples...

‚úì SUCCESS: Submission file created!
  File: submission.csv
  Rows: 77,789
  Prediction range: [2.39, 73.27]
  Mean prediction: 14.82 million RUB

‚úì Sample predictions:
    id  price_doc
243467   7.055458
230180  12.927562
256036   4.098922
  1848   4.114960
 68720  14.443431
163181   5.903276
161538   6.006316
 15029   4.969466
 12928   6.363547
193799  11.967943


In [45]:
# ============================================================================
# STEP 10: FEATURE IMPORTANCE
# ============================================================================
print("\n" + "="*80)
print("STEP 10: FEATURE IMPORTANCE ANALYSIS")
print("="*80)

# Importances are reported only when the winning model is one of the boosted
# tree libraries and exposes feature_importances_. Those models were trained on
# the numeric columns of the combined frame, so the names come from there.
if (best_model_name in ('LightGBM', 'XGBoost', 'CatBoost')
        and hasattr(best_model, 'feature_importances_')):
    importances = best_model.feature_importances_
    numeric_cols = X_train_combined.select_dtypes(include=[np.number]).columns

    feature_importance_df = pd.DataFrame(
        {'feature': numeric_cols, 'importance': importances}
    ).sort_values('importance', ascending=False)

    print(f"\n‚úì Top 20 Most Important Features:")
    print(feature_importance_df.head(20).to_string(index=False))

    feature_importance_df.to_csv('feature_importance.csv', index=False)
    print(f"\n‚úì Feature importance saved to 'feature_importance.csv'")


STEP 10: FEATURE IMPORTANCE ANALYSIS

‚úì Top 20 Most Important Features:
                     feature  importance
                     full_sq    0.455747
            mosque_count_500    0.220517
             large_apartment    0.053697
culture_objects_top_25_raion    0.018196
  cafe_count_2000_price_2500    0.010293
              rooms_inferred    0.009350
                 trc_sqm_500    0.007599
          leisure_count_1000    0.007270
  cafe_count_1500_price_high    0.007126
  cafe_count_1000_price_high    0.006865
             cafe_count_1500    0.006756
         church_synagogue_km    0.006725
  cafe_count_1000_price_1500    0.006620
             cafe_count_1000    0.006406
  cafe_count_1500_price_2500    0.006361
  cafe_count_2000_price_high    0.006266
           leisure_count_500    0.005897
               industrial_km    0.005758
   cafe_count_500_price_high    0.005558
  cafe_count_1500_price_4000    0.005544

‚úì Feature importance saved to 'feature_importance.csv'


In [47]:
# ============================================================================
# STEP 9: ENSEMBLE SUBMISSION (XGBoost + LightGBM + CatBoost)
# ============================================================================
print("\n" + "="*80)
print("STEP 9: GENERATING ENSEMBLE SUBMISSION")
print("="*80)

print("\n‚úì Creating ensemble from 3 best gradient boosting models:")
print("  ‚Ä¢ XGBoost")
print("  ‚Ä¢ LightGBM")
print("  ‚Ä¢ CatBoost")

# Prepare test data for gradient boosting models
X_test_prepared = X_test_combined.select_dtypes(include=[np.number]).fillna(-999)
print(f"\n‚úì Test data prepared: {X_test_prepared.shape}")

# Run each base model on the prepared test features, in a fixed order, and
# report the range and mean of what it predicts.
print("\n‚úì Generating predictions from each model...")

test_predictions = {}
base_model_names = ('XGBoost', 'LightGBM', 'CatBoost')
for position, base_name in enumerate(base_model_names, start=1):
    print(f"  [{position}/3] {base_name} predicting...")
    base_preds = results[base_name]['model'].predict(X_test_prepared)
    print(f"      Range: [{base_preds.min():.2f}, {base_preds.max():.2f}]")
    print(f"      Mean: {base_preds.mean():.2f}")
    test_predictions[base_name] = base_preds

# Expose the per-model arrays under the names the rest of the notebook uses.
xgb_predictions = test_predictions['XGBoost']
lgb_predictions = test_predictions['LightGBM']
catboost_predictions = test_predictions['CatBoost']

# Blend the three models with equal weights (plain arithmetic mean).
print("\n‚úì Creating ensemble predictions (simple average)...")
summed_predictions = xgb_predictions + lgb_predictions + catboost_predictions
ensemble_predictions = summed_predictions / 3

# Summary statistics of the blended predictions.
ensemble_low = ensemble_predictions.min()
ensemble_high = ensemble_predictions.max()
print("\n‚úì Ensemble Statistics:")
print(f"  Range: [{ensemble_low:.2f}, {ensemble_high:.2f}]")
print(f"  Mean: {ensemble_predictions.mean():.2f} million RUB")
print(f"  Std: {ensemble_predictions.std():.2f}")

# Calculate ensemble validation performance (for reference).
# The validation feature matrix (numeric columns only, missing values filled
# with -999) is built once and shared by all three models, instead of
# repeating the same select_dtypes/fillna pass for every predict call.
print("\n‚úì Ensemble Validation Performance:")
X_val_prepared = X_val_combined.select_dtypes(include=[np.number]).fillna(-999)
xgb_val_pred = results['XGBoost']['model'].predict(X_val_prepared)
lgb_val_pred = results['LightGBM']['model'].predict(X_val_prepared)
catboost_val_pred = results['CatBoost']['model'].predict(X_val_prepared)

# Equal-weight average of the three models' validation predictions.
ensemble_val_pred = (xgb_val_pred + lgb_val_pred + catboost_val_pred) / 3

# Score the blended predictions against the held-out targets.
ensemble_val_mse = mean_squared_error(y_val, ensemble_val_pred)
ensemble_val_rmse = np.sqrt(ensemble_val_mse)
ensemble_val_r2 = r2_score(y_val, ensemble_val_pred)
ensemble_val_mae = mean_absolute_error(y_val, ensemble_val_pred)

print(f"  Validation MSE:  {ensemble_val_mse:.4f}")
print(f"  Validation RMSE: {ensemble_val_rmse:.4f} million RUB")
print(f"  Validation R¬≤:   {ensemble_val_r2:.4f}")
print(f"  Validation MAE:  {ensemble_val_mae:.4f}")

# Side-by-side validation metrics: one row per base model, followed by the
# averaged ensemble as the last row.
print("\n‚úì Comparison with Individual Models:")
print("=" * 70)
compared_models = ['XGBoost', 'LightGBM', 'CatBoost']
comparison_data = {
    'Model': compared_models + ['Ensemble (Average)'],
    'Val RMSE': [results[name]['val_rmse'] for name in compared_models] + [ensemble_val_rmse],
    'Val R¬≤': [results[name]['val_r2'] for name in compared_models] + [ensemble_val_r2],
    'Val MAE': [results[name]['val_mae'] for name in compared_models] + [ensemble_val_mae],
}
comparison_df = pd.DataFrame(comparison_data)
comparison_text = comparison_df.to_string(index=False)
print(comparison_text)

# Assemble the submission frame (test ids paired with the blended
# predictions) and write it to disk without the index column.
ensemble_submission_path = 'submission_ensemble.csv'
submission_columns = {'id': test_ids, 'price_doc': ensemble_predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv(ensemble_submission_path, index=False)

# Confirmation banner with the key facts about the file just written.
divider = "=" * 80
print("\n" + divider)
print("‚úÖ ENSEMBLE SUBMISSION GENERATED")
print(divider)
print(f"  File: {ensemble_submission_path}")
print(f"  Rows: {len(submission):,}")
print("  Models: XGBoost + LightGBM + CatBoost (average)")
print(f"  Validation RMSE: {ensemble_val_rmse:.4f} million RUB")

# Show the first ten rows as a quick sanity check.
print("\n‚úì Sample predictions:")
first_rows = submission.head(10)
print(first_rows.to_string(index=False))

# Write one submission file per base model so each can be compared with the
# ensemble; the file name is derived from the lower-cased model name.
print("\n‚úì Saving individual model submissions for reference...")

predictions_by_model = {
    'XGBoost': xgb_predictions,
    'LightGBM': lgb_predictions,
    'CatBoost': catboost_predictions,
}
for model_name, preds in predictions_by_model.items():
    filename = f'submission_{model_name.lower()}.csv'
    individual_submission = pd.DataFrame({'id': test_ids, 'price_doc': preds})
    individual_submission.to_csv(filename, index=False)
    model_val_rmse = results[model_name]['val_rmse']
    print(f"  ‚Ä¢ {filename} (Val RMSE: {model_val_rmse:.4f})")

# Closing summary: list every file written above and point at the ensemble.
print("\n‚úì ALL FILES GENERATED:")
generated_files = (
    "submission_ensemble.csv ‚≠ê (RECOMMENDED)",
    "submission_xgboost.csv",
    "submission_lightgbm.csv",
    "submission_catboost.csv",
)
for generated_file in generated_files:
    print(f"  ‚Ä¢ {generated_file}")

print("\nüéØ RECOMMENDED: Upload submission_ensemble.csv to Kaggle!")
print("   Ensemble models often perform better by combining strengths of multiple models.")


STEP 9: GENERATING ENSEMBLE SUBMISSION

‚úì Creating ensemble from 3 best gradient boosting models:
  ‚Ä¢ XGBoost
  ‚Ä¢ LightGBM
  ‚Ä¢ CatBoost

‚úì Test data prepared: (77789, 50)

‚úì Generating predictions from each model...
  [1/3] XGBoost predicting...
      Range: [2.39, 73.27]
      Mean: 14.82
  [2/3] LightGBM predicting...
      Range: [3.34, 67.48]
      Mean: 14.82
  [3/3] CatBoost predicting...
      Range: [2.91, 61.61]
      Mean: 14.82

‚úì Creating ensemble predictions (simple average)...

‚úì Ensemble Statistics:
  Range: [3.07, 62.71]
  Mean: 14.82 million RUB
  Std: 17.13

‚úì Ensemble Validation Performance:
  Validation MSE:  160.9321
  Validation RMSE: 12.6859 million RUB
  Validation R¬≤:   0.6504
  Validation MAE:  6.0790

‚úì Comparison with Individual Models:
             Model  Val RMSE   Val R¬≤  Val MAE
           XGBoost 12.698659 0.649688 6.052683
          LightGBM 12.700147 0.649606 6.095551
          CatBoost 12.717633 0.648640 6.136650
Ensemble (Aver