In [1]:
"""
Russian Cities Housing Challenge 2025 - Task 2
Kaggle Submission with Multiple Models - CORRECTED VERSION
Author: [Your Name]
ERP ID: 27857

This version includes ALL requirement documentation, especially Requirement #3
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
import warnings
import time
warnings.filterwarnings('ignore')

In [2]:
# ============================================================================
# CONFIGURATION
# ============================================================================
ERP_ID = 27857
RANDOM_STATE = 42
VALIDATION_SIZE = 0.10  # Use 90% for training, 10% for validation

np.random.seed(RANDOM_STATE)

print("="*80)
print("TASK 2: KAGGLE SUBMISSION - FULL TRAINING SET")
print("="*80)
print(f"Configuration:")
print(f"  ERP ID: {ERP_ID}")
print(f"  Random State: {RANDOM_STATE}")
print(f"  Validation Split: {int((1-VALIDATION_SIZE)*100)}/{int(VALIDATION_SIZE*100)}")

TASK 2: KAGGLE SUBMISSION - FULL TRAINING SET
Configuration:
  ERP ID: 27857
  Random State: 42
  Validation Split: 90/10


In [3]:
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def create_features(df):
    """Feature engineering - creates domain-relevant features"""
    df = df.copy()
    
    # 1. Building age (temporal feature)
    if 'build_year' in df.columns:
        df['building_age'] = (2015 - df['build_year']).clip(lower=0)
    
    # 2. Area per room (space efficiency)
    if 'full_sq' in df.columns and 'num_room' in df.columns:
        df['sqm_per_room'] = df['full_sq'] / (df['num_room'] + 1)
    
    # 3. Living area ratio (quality indicator)
    if 'life_sq' in df.columns and 'full_sq' in df.columns:
        df['living_area_ratio'] = df['life_sq'] / (df['full_sq'] + 1)
    
    # 4. Kitchen area ratio (quality indicator)
    if 'kitch_sq' in df.columns and 'full_sq' in df.columns:
        df['kitchen_area_ratio'] = df['kitch_sq'] / (df['full_sq'] + 1)
    
    # 5. Floor position features (location within building)
    if 'floor' in df.columns and 'max_floor' in df.columns:
        df['floor_ratio'] = df['floor'] / (df['max_floor'] + 1)
        df['is_first_floor'] = (df['floor'] == 1).astype(int)
        df['is_last_floor'] = (df['floor'] == df['max_floor']).astype(int)
    
    return df

def evaluate_model(name, model, X_train, X_val, y_train, y_val, results_dict):
    """Train model and collect comprehensive metrics"""
    print(f"\n{'='*70}")
    print(f"Training: {name}")
    print(f"{'='*70}")
    
    # Training
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    # Predictions
    start_time = time.time()
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    inference_time = time.time() - start_time
    
    # Calculate metrics
    train_mse = mean_squared_error(y_train, y_train_pred)
    val_mse = mean_squared_error(y_val, y_val_pred)
    train_rmse = np.sqrt(train_mse)
    val_rmse = np.sqrt(val_mse)
    train_r2 = r2_score(y_train, y_train_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    
    # Store results
    results_dict[name] = {
        'train_mse': train_mse,
        'val_mse': val_mse,
        'train_rmse': train_rmse,
        'val_rmse': val_rmse,
        'train_r2': train_r2,
        'val_r2': val_r2,
        'train_mae': train_mae,
        'val_mae': val_mae,
        'train_time': train_time,
        'inference_time': inference_time,
        'model': model
    }
    
    # Print results
    print(f"\nResults:")
    print(f"  Validation MSE:  {val_mse:.4f}")
    print(f"  Validation RMSE: {val_rmse:.4f} million RUB")
    print(f"  Validation R²:   {val_r2:.4f}")
    print(f"  Validation MAE:  {val_mae:.4f} million RUB")
    print(f"  Training time:   {train_time:.2f}s")
    print(f"  Inference time:  {inference_time:.4f}s")
    
    # Overfitting check
    overfitting_gap = train_rmse - val_rmse
    if overfitting_gap < -1:
        print(f"  ⚠️  Potential overfitting (train-val gap: {overfitting_gap:.2f})")
    elif abs(overfitting_gap) < 1:
        print(f"  ✓ Good generalization (train-val gap: {overfitting_gap:.2f})")
    
    return model

In [5]:
# ============================================================================
# STEP 1: LOAD FULL DATA (NO SAMPLING - REQUIREMENT #1)
# ============================================================================
print("\n" + "="*80)
print("STEP 1: LOADING FULL DATASET (REQUIREMENT #1)")
print("="*80)

train_full = pd.read_csv('train.csv')
test_full = pd.read_csv('test.csv')

print(f"\n✓ Full training data loaded: {train_full.shape}")
print(f"✓ Test data loaded: {test_full.shape}")
print(f"✓ Using ALL {len(train_full):,} training samples (no sampling)")
print(f"\n✓ REQUIREMENT #1: COMPLETE - Full dataset loaded")


STEP 1: LOADING FULL DATASET (REQUIREMENT #1)


MemoryError: Unable to allocate 186. MiB for an array with shape (134, 181507) and data type float64