In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
import warnings
import time
warnings.filterwarnings('ignore')

In [2]:
# ============================================================================
# CONFIGURATION
# ============================================================================
ERP_ID = 27857
RANDOM_STATE = 42
VALIDATION_SIZE = 0.10  # Use 90% for training, 10% for validation

np.random.seed(RANDOM_STATE)

print("="*80)
print("TASK 2: KAGGLE SUBMISSION - FULL TRAINING SET")
print("="*80)
print(f"Configuration:")
print(f"  ERP ID: {ERP_ID}")
print(f"  Random State: {RANDOM_STATE}")
print(f"  Validation Split: {int((1-VALIDATION_SIZE)*100)}/{int(VALIDATION_SIZE*100)}")

TASK 2: KAGGLE SUBMISSION - FULL TRAINING SET
Configuration:
  ERP ID: 27857
  Random State: 42
  Validation Split: 90/10


In [3]:
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def create_features(df):
    """Feature engineering (same as Task 1)"""
    df = df.copy()
    
    # Building age
    if 'build_year' in df.columns:
        df['building_age'] = (2015 - df['build_year']).clip(lower=0)
    
    # Area ratios
    if 'full_sq' in df.columns and 'num_room' in df.columns:
        df['sqm_per_room'] = df['full_sq'] / (df['num_room'] + 1)
    
    if 'life_sq' in df.columns and 'full_sq' in df.columns:
        df['living_area_ratio'] = df['life_sq'] / (df['full_sq'] + 1)
    
    if 'kitch_sq' in df.columns and 'full_sq' in df.columns:
        df['kitchen_area_ratio'] = df['kitch_sq'] / (df['full_sq'] + 1)
    
    # Floor features
    if 'floor' in df.columns and 'max_floor' in df.columns:
        df['floor_ratio'] = df['floor'] / (df['max_floor'] + 1)
        df['is_first_floor'] = (df['floor'] == 1).astype(int)
        df['is_last_floor'] = (df['floor'] == df['max_floor']).astype(int)
    
    return df

def evaluate_model(name, model, X_train, X_val, y_train, y_val, results_dict):
    """Train model and collect metrics"""
    print(f"\n{'='*70}")
    print(f"Training: {name}")
    print(f"{'='*70}")
    
    # Training
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    # Predictions
    start_time = time.time()
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    inference_time = time.time() - start_time
    
    # Metrics
    train_mse = mean_squared_error(y_train, y_train_pred)
    val_mse = mean_squared_error(y_val, y_val_pred)
    train_rmse = np.sqrt(train_mse)
    val_rmse = np.sqrt(val_mse)
    train_r2 = r2_score(y_train, y_train_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    
    # Store results
    results_dict[name] = {
        'train_mse': train_mse,
        'val_mse': val_mse,
        'train_rmse': train_rmse,
        'val_rmse': val_rmse,
        'train_r2': train_r2,
        'val_r2': val_r2,
        'train_mae': train_mae,
        'val_mae': val_mae,
        'train_time': train_time,
        'inference_time': inference_time,
        'model': model
    }
    
    # Print results
    print(f"\nResults:")
    print(f"  Validation MSE:  {val_mse:.4f}")
    print(f"  Validation RMSE: {val_rmse:.4f}")
    print(f"  Validation R²:   {val_r2:.4f}")
    print(f"  Validation MAE:  {val_mae:.4f}")
    print(f"  Training time:   {train_time:.2f}s")
    print(f"  Inference time:  {inference_time:.4f}s")
    
    return model

In [4]:
# ============================================================================
# STEP 1: LOAD FULL DATA (NO SAMPLING)
# ============================================================================
print("\n" + "="*80)
print("STEP 1: LOADING FULL DATASET")
print("="*80)

train_full = pd.read_csv('train.csv')
test_full = pd.read_csv('test.csv')

print(f"\n✓ Full training data loaded: {train_full.shape}")
print(f"✓ Test data loaded: {test_full.shape}")
print(f"✓ Using ALL {len(train_full):,} training samples (no sampling)")


STEP 1: LOADING FULL DATASET

✓ Full training data loaded: (181507, 279)
✓ Test data loaded: (77789, 278)
✓ Using ALL 181,507 training samples (no sampling)


In [5]:
# ============================================================================
# STEP 2: FEATURE ENGINEERING
# ============================================================================
print("\n" + "="*80)
print("STEP 2: FEATURE ENGINEERING")
print("="*80)

train_full = create_features(train_full)
test_full = create_features(test_full)

print(f"\n✓ Feature engineering completed")
print(f"  Training features: {train_full.shape[1]}")
print(f"  Test features: {test_full.shape[1]}")


STEP 2: FEATURE ENGINEERING

✓ Feature engineering completed
  Training features: 280
  Test features: 279


In [6]:
# ============================================================================
# STEP 3: PREPARE DATA
# ============================================================================
print("\n" + "="*80)
print("STEP 3: DATA PREPARATION")
print("="*80)

# Separate features and target
X_full = train_full.drop('price_doc', axis=1)
y_full = train_full['price_doc']

# Remove non-predictive columns
cols_to_drop = ['id', 'timestamp']
X_full = X_full.drop([c for c in cols_to_drop if c in X_full.columns], axis=1)
X_test = test_full.drop([c for c in cols_to_drop if c in test_full.columns], axis=1)

# Store test IDs for submission
test_ids = test_full['id'].values

print(f"\n✓ Features prepared: {X_full.shape[1]}")


STEP 3: DATA PREPARATION

✓ Features prepared: 278


In [7]:
# ============================================================================
# STEP 4: TRAIN-VALIDATION SPLIT (NEW SPLIT FOR TASK 2)
# ============================================================================
print("\n" + "="*80)
print("STEP 4: TRAIN-VALIDATION SPLIT")
print("="*80)

X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, 
    test_size=VALIDATION_SIZE, 
    random_state=RANDOM_STATE
)

print(f"\n✓ Training set: {X_train.shape} ({len(X_train):,} samples)")
print(f"✓ Validation set: {X_val.shape} ({len(X_val):,} samples)")
print(f"✓ Test set: {X_test.shape}")
print(f"✓ Random state: {RANDOM_STATE}")


STEP 4: TRAIN-VALIDATION SPLIT

✓ Training set: (163356, 278) (163,356 samples)
✓ Validation set: (18151, 278) (18,151 samples)
✓ Test set: (77789, 278)
✓ Random state: 42


In [8]:
# ============================================================================
# STEP 5: FEATURE SELECTION (OPTIONAL BUT RECOMMENDED)
# ============================================================================
print("\n" + "="*80)
print("STEP 5: FEATURE SELECTION")
print("="*80)

# Select top features (you can adjust this)
from sklearn.feature_selection import SelectKBest, f_regression

# Select numeric features only for initial selection
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
X_train_numeric = X_train[numeric_features]
X_val_numeric = X_val[numeric_features]
X_test_numeric = X_test[numeric_features]

# Quick feature selection (top 100 features)
n_features = min(100, len(numeric_features))

imputer_temp = SimpleImputer(strategy='median')
X_train_imputed = imputer_temp.fit_transform(X_train_numeric)

selector = SelectKBest(score_func=f_regression, k=n_features)
selector.fit(X_train_imputed, y_train)

selected_features = X_train_numeric.columns[selector.get_support()].tolist()

print(f"\n✓ Selected {len(selected_features)} features from {len(numeric_features)} numeric features")
print(f"✓ Top 10 features: {selected_features[:10]}")

# Apply selection
X_train_selected = X_train[selected_features]
X_val_selected = X_val[selected_features]
X_test_selected = X_test[selected_features]


STEP 5: FEATURE SELECTION

✓ Selected 100 features from 263 numeric features
✓ Top 10 features: ['full_sq', 'life_sq', 'floor', 'university_top_20_raion', 'culture_objects_top_25_raion', 'office_raion', 'build_count_foam', 'build_count_slag', 'build_count_before_1920', 'build_count_after_1995']


In [9]:
#top 100 features
print(selected_features)

['full_sq', 'life_sq', 'floor', 'university_top_20_raion', 'culture_objects_top_25_raion', 'office_raion', 'build_count_foam', 'build_count_slag', 'build_count_before_1920', 'build_count_after_1995', 'kindergarten_km', 'school_km', 'industrial_km', 'public_transport_station_km', 'public_transport_station_min_walk', 'fitness_km', 'public_healthcare_km', 'additional_education_km', 'preschool_km', 'church_synagogue_km', 'prom_part_500', 'office_count_500', 'office_sqm_500', 'trc_count_500', 'trc_sqm_500', 'cafe_count_500', 'cafe_count_500_na_price', 'cafe_count_500_price_500', 'cafe_count_500_price_1000', 'cafe_count_500_price_1500', 'cafe_count_500_price_2500', 'cafe_count_500_price_4000', 'cafe_count_500_price_high', 'big_church_count_500', 'church_count_500', 'mosque_count_500', 'leisure_count_500', 'sport_count_500', 'market_count_500', 'office_count_1000', 'office_sqm_1000', 'trc_count_1000', 'trc_sqm_1000', 'cafe_count_1000', 'cafe_count_1000_na_price', 'cafe_count_1000_price_500', 

In [10]:
# ============================================================================
# STEP 6: PREPROCESSING PIPELINE
# ============================================================================
print("\n" + "="*80)
print("STEP 6: PREPROCESSING PIPELINE")
print("="*80)

# Simple preprocessing for numeric features
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

print("\n✓ Preprocessing pipeline created:")
print("  - Median imputation for missing values")
print("  - Standard scaling")


STEP 6: PREPROCESSING PIPELINE

✓ Preprocessing pipeline created:
  - Median imputation for missing values
  - Standard scaling


In [11]:
# ============================================================================
# STEP 7: TRAIN ALL REQUIRED MODELS
# ============================================================================
print("\n" + "="*80)
print("STEP 7: TRAINING ALL REQUIRED MODELS")
print("="*80)

results = {}


STEP 7: TRAINING ALL REQUIRED MODELS


In [12]:
# ============================================================================
# MODEL 1: REGRESSION TREE (REQUIRED)
# ============================================================================
print("\n" + "="*80)
print("MODEL 1/6: REGRESSION TREE")
print("="*80)

tree_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(
        max_depth=15,
        min_samples_split=20,
        min_samples_leaf=10,
        random_state=RANDOM_STATE
    ))
])

evaluate_model('Regression Tree', tree_pipeline, 
               X_train_selected, X_val_selected, y_train, y_val, results)


MODEL 1/6: REGRESSION TREE

Training: Regression Tree

Results:
  Validation MSE:  192.1782
  Validation RMSE: 13.8628
  Validation R²:   0.5830
  Validation MAE:  6.3102
  Training time:   82.73s
  Inference time:  1.5842s


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,15
,min_samples_split,20
,min_samples_leaf,10
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [13]:
# ============================================================================
# MODEL 2: LASSO (BEST FROM TASK 1) (REQUIRED)
# ============================================================================
print("\n" + "="*80)
print("MODEL 2/6: LASSO (BEST FROM TASK 1)")
print("="*80)

lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=0.1, max_iter=10000, random_state=RANDOM_STATE))
])

evaluate_model('Lasso (Task 1 Best)', lasso_pipeline,
               X_train_selected, X_val_selected, y_train, y_val, results)


MODEL 2/6: LASSO (BEST FROM TASK 1)

Training: Lasso (Task 1 Best)

Results:
  Validation MSE:  178.0053
  Validation RMSE: 13.3419
  Validation R²:   0.6137
  Validation MAE:  6.8479
  Training time:   35.06s
  Inference time:  0.9169s


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,10000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [15]:
# ============================================================================
# MODEL 3: GRADIENT BOOSTING (SKLEARN) (REQUIRED)
# ============================================================================
print("\n" + "="*80)
print("MODEL 3/6: GRADIENT BOOSTING (SKLEARN)")
print("="*80)

# Preprocess for gradient boosting
X_train_gb = preprocessor.fit_transform(X_train_selected)
X_val_gb = preprocessor.transform(X_val_selected)

gb_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    min_samples_split=20,
    min_samples_leaf=15,
    subsample=0.8,
    random_state=RANDOM_STATE,
    verbose=0
)

evaluate_model('GradientBoosting', gb_model,
               X_train_gb, X_val_gb, y_train, y_val, results)


MODEL 3/6: GRADIENT BOOSTING (SKLEARN)

Training: GradientBoosting

Results:
  Validation MSE:  167.6722
  Validation RMSE: 12.9488
  Validation R²:   0.6361
  Validation MAE:  6.1535
  Training time:   3500.26s
  Inference time:  5.7418s


0,1,2
,loss,'squared_error'
,learning_rate,0.05
,n_estimators,200
,subsample,0.8
,criterion,'friedman_mse'
,min_samples_split,20
,min_samples_leaf,15
,min_weight_fraction_leaf,0.0
,max_depth,5
,min_impurity_decrease,0.0


In [20]:
# ============================================================================
# MODEL 4: LIGHTGBM (REQUIRED)
# ============================================================================
print("\n" + "="*80)
print("MODEL 4/6: LIGHTGBM")
print("="*80)

lgb_model = lgb.LGBMRegressor(
    objective='regression',
    metric='rmse',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    verbose=1
)

# LightGBM can handle NaN values directly
X_train_lgb = X_train_selected.values
X_val_lgb = X_val_selected.values

lgb_model.fit(
    X_train_lgb, y_train,
    eval_set=[(X_val_lgb, y_val)],
    callbacks=[lgb.early_stopping(50, verbose=False)]
)

# Manual evaluation for LightGBM
start_time = time.time()
y_train_pred = lgb_model.predict(X_train_lgb)
y_val_pred = lgb_model.predict(X_val_lgb)
inference_time = time.time() - start_time

results['LightGBM'] = {
    'train_mse': mean_squared_error(y_train, y_train_pred),
    'val_mse': mean_squared_error(y_val, y_val_pred),
    'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
    'val_rmse': np.sqrt(mean_squared_error(y_val, y_val_pred)),
    'train_r2': r2_score(y_train, y_train_pred),
    'val_r2': r2_score(y_val, y_val_pred),
    'train_mae': mean_absolute_error(y_train, y_train_pred),
    'val_mae': mean_absolute_error(y_val, y_val_pred),
    'train_time': 0,  # Calculated during fit
    'inference_time': inference_time,
    'model': lgb_model
}

print(f"\nResults:")
print(f"  Validation MSE:  {results['LightGBM']['val_mse']:.4f}")
print(f"  Validation RMSE: {results['LightGBM']['val_rmse']:.4f}")
print(f"  Validation R²:   {results['LightGBM']['val_r2']:.4f}")


MODEL 4/6: LIGHTGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.191057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24994
[LightGBM] [Info] Number of data points in the train set: 163356, number of used features: 100
[LightGBM] [Info] Start training from score 14.844620

Results:
  Validation MSE:  165.8967
  Validation RMSE: 12.8801
  Validation R²:   0.6400


In [21]:
# ============================================================================
# MODEL 5: CATBOOST (REQUIRED)
# ============================================================================
print("\n" + "="*80)
print("MODEL 5/6: CATBOOST")
print("="*80)

# Fill NaN for CatBoost
X_train_cb = X_train_selected.fillna(-999)
X_val_cb = X_val_selected.fillna(-999)

catboost_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    random_state=RANDOM_STATE,
    verbose=False,
    early_stopping_rounds=50
)

catboost_model.fit(
    X_train_cb, y_train,
    eval_set=(X_val_cb, y_val)
)

# Manual evaluation
start_time = time.time()
y_train_pred = catboost_model.predict(X_train_cb)
y_val_pred = catboost_model.predict(X_val_cb)
inference_time = time.time() - start_time

results['CatBoost'] = {
    'train_mse': mean_squared_error(y_train, y_train_pred),
    'val_mse': mean_squared_error(y_val, y_val_pred),
    'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
    'val_rmse': np.sqrt(mean_squared_error(y_val, y_val_pred)),
    'train_r2': r2_score(y_train, y_train_pred),
    'val_r2': r2_score(y_val, y_val_pred),
    'train_mae': mean_absolute_error(y_train, y_train_pred),
    'val_mae': mean_absolute_error(y_val, y_val_pred),
    'train_time': 0,
    'inference_time': inference_time,
    'model': catboost_model
}

print(f"\nResults:")
print(f"  Validation MSE:  {results['CatBoost']['val_mse']:.4f}")
print(f"  Validation RMSE: {results['CatBoost']['val_rmse']:.4f}")
print(f"  Validation R²:   {results['CatBoost']['val_r2']:.4f}")


MODEL 5/6: CATBOOST

Results:
  Validation MSE:  166.0650
  Validation RMSE: 12.8866
  Validation R²:   0.6396


In [22]:
# ============================================================================
# MODEL 6: XGBOOST (REQUIRED)
# ============================================================================
print("\n" + "="*80)
print("MODEL 6/6: XGBOOST")
print("="*80)

# Fill NaN for XGBoost
X_train_xgb = X_train_selected.fillna(-999)
X_val_xgb = X_val_selected.fillna(-999)

xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    verbosity=0
)

xgb_model.fit(
    X_train_xgb, y_train,
    eval_set=[(X_val_xgb, y_val)],
    verbose=False
)

# Manual evaluation
start_time = time.time()
y_train_pred = xgb_model.predict(X_train_xgb)
y_val_pred = xgb_model.predict(X_val_xgb)
inference_time = time.time() - start_time

results['XGBoost'] = {
    'train_mse': mean_squared_error(y_train, y_train_pred),
    'val_mse': mean_squared_error(y_val, y_val_pred),
    'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
    'val_rmse': np.sqrt(mean_squared_error(y_val, y_val_pred)),
    'train_r2': r2_score(y_train, y_train_pred),
    'val_r2': r2_score(y_val, y_val_pred),
    'train_mae': mean_absolute_error(y_train, y_train_pred),
    'val_mae': mean_absolute_error(y_val, y_val_pred),
    'train_time': 0,
    'inference_time': inference_time,
    'model': xgb_model
}

print(f"\nResults:")
print(f"  Validation MSE:  {results['XGBoost']['val_mse']:.4f}")
print(f"  Validation RMSE: {results['XGBoost']['val_rmse']:.4f}")
print(f"  Validation R²:   {results['XGBoost']['val_r2']:.4f}")


MODEL 6/6: XGBOOST

Results:
  Validation MSE:  170.7877
  Validation RMSE: 13.0686
  Validation R²:   0.6294


In [23]:
# ============================================================================
# STEP 8: RESULTS COMPARISON
# ============================================================================
print("\n" + "="*80)
print("STEP 8: MODEL COMPARISON")
print("="*80)

# Create comparison DataFrame
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('val_mse')

print("\n✓ COMPREHENSIVE MODEL COMPARISON TABLE:")
print("="*80)
comparison_cols = ['val_mse', 'val_rmse', 'val_r2', 'val_mae', 'train_time', 'inference_time']
print(results_df[comparison_cols].to_string())

# Identify best model
best_model_name = results_df['val_mse'].idxmin()
best_model = results[best_model_name]['model']
best_val_mse = results_df.loc[best_model_name, 'val_mse']
best_val_rmse = results_df.loc[best_model_name, 'val_rmse']

print(f"\n" + "="*80)
print(f"🏆 BEST MODEL: {best_model_name}")
print("="*80)
print(f"  Validation MSE:  {best_val_mse:.4f}")
print(f"  Validation RMSE: {best_val_rmse:.4f}")
print(f"  Validation R²:   {results_df.loc[best_model_name, 'val_r2']:.4f}")

# Save results
results_df.to_csv('task2_model_comparison.csv')
print(f"\n✓ Results saved to 'task2_model_comparison.csv'")


STEP 8: MODEL COMPARISON

✓ COMPREHENSIVE MODEL COMPARISON TABLE:
                        val_mse   val_rmse    val_r2   val_mae   train_time inference_time
LightGBM             165.896663  12.880088  0.639993  6.159574            0        0.51756
CatBoost             166.064977   12.88662  0.639627  6.185576            0       0.501582
GradientBoosting     167.672234  12.948831   0.63614  6.153505  3500.257476        5.74175
XGBoost              170.787746  13.068579  0.629379   6.00201            0       3.698084
Lasso (Task 1 Best)  178.005282  13.341862  0.613716  6.847873    35.061857       0.916889
Regression Tree      192.178234  13.862836   0.58296  6.310182    82.734847       1.584171

🏆 BEST MODEL: LightGBM
  Validation MSE:  165.8967
  Validation RMSE: 12.8801
  Validation R²:   0.6400

✓ Results saved to 'task2_model_comparison.csv'


In [24]:
# ============================================================================
# STEP 9: GENERATE KAGGLE SUBMISSION
# ============================================================================
print("\n" + "="*80)
print("STEP 9: GENERATING KAGGLE SUBMISSION")
print("="*80)

print(f"\n✓ Using best model: {best_model_name}")

# Prepare test data based on model type
if best_model_name in ['Regression Tree', 'Lasso (Task 1 Best)']:
    X_test_prepared = preprocessor.transform(X_test_selected)
elif best_model_name == 'GradientBoosting':
    X_test_prepared = preprocessor.transform(X_test_selected)
elif best_model_name == 'LightGBM':
    X_test_prepared = X_test_selected.values
elif best_model_name in ['CatBoost', 'XGBoost']:
    X_test_prepared = X_test_selected.fillna(-999)

# Generate predictions
print("✓ Generating predictions on test set...")
test_predictions = best_model.predict(X_test_prepared)

# Create submission file
submission = pd.DataFrame({
    'id': test_ids,
    'price_doc': test_predictions
})

# Save submission
submission.to_csv('submission.csv', index=False)

print(f"\n✓ Submission file created: submission.csv")
print(f"  Number of predictions: {len(submission)}")
print(f"  Prediction range: [{test_predictions.min():.2f}, {test_predictions.max():.2f}]")
print(f"  Mean prediction: {test_predictions.mean():.2f}")

# Show sample
print(f"\n✓ Sample predictions:")
print(submission.head(10).to_string(index=False))


STEP 9: GENERATING KAGGLE SUBMISSION

✓ Using best model: LightGBM
✓ Generating predictions on test set...

✓ Submission file created: submission.csv
  Number of predictions: 77789
  Prediction range: [3.61, 64.99]
  Mean prediction: 14.83

✓ Sample predictions:
    id  price_doc
243467   6.613294
230180  12.191080
256036   4.051415
  1848   4.340369
 68720  12.320714
163181   5.342346
161538   5.736049
 15029   4.402367
 12928   6.223980
193799  10.756773


In [25]:
# ============================================================================
# STEP 10: FEATURE IMPORTANCE (FOR BEST MODEL)
# ============================================================================
print("\n" + "="*80)
print("STEP 10: FEATURE IMPORTANCE ANALYSIS")
print("="*80)

if best_model_name in ['LightGBM', 'XGBoost', 'CatBoost', 'GradientBoosting']:
    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        feature_importance_df = pd.DataFrame({
            'feature': selected_features,
            'importance': importances
        }).sort_values('importance', ascending=False)
        
        print(f"\n✓ Top 20 Most Important Features for {best_model_name}:")
        print(feature_importance_df.head(20).to_string(index=False))
        
        # Save to CSV
        feature_importance_df.to_csv('feature_importance.csv', index=False)
        print(f"\n✓ Feature importance saved to 'feature_importance.csv'")


STEP 10: FEATURE IMPORTANCE ANALYSIS

✓ Top 20 Most Important Features for LightGBM:
                     feature  importance
                     full_sq         168
                       floor          77
culture_objects_top_25_raion          57
                     life_sq          51
               industrial_km          50
 public_transport_station_km          46
                office_raion          44
           church_count_1500          43
  cafe_count_2000_price_high          42
           church_count_3000          41
      build_count_after_1995          40
  cafe_count_2000_price_2500          39
   cafe_count_500_price_2500          39
             office_sqm_1000          39
    cafe_count_2000_na_price          38
            build_count_slag          37
  cafe_count_3000_price_2500          36
                   school_km          34
          leisure_count_1500          34
             cafe_count_1500          34

✓ Feature importance saved to 'feature_importance.cs

In [26]:
# ============================================================================
# FINAL SUMMARY
# ============================================================================
print("\n" + "="*80)
print("TASK 2 COMPLETED SUCCESSFULLY")
print("="*80)

print("\n✓ All required models trained:")
print("  [✓] Regression Tree")
print("  [✓] Lasso (Best from Task 1)")
print("  [✓] Gradient Boosting")
print("  [✓] LightGBM")
print("  [✓] CatBoost")
print("  [✓] XGBoost")

print("\n✓ Deliverables generated:")
print("  [✓] Model comparison table: task2_model_comparison.csv")
print("  [✓] Kaggle submission file: submission.csv")
print("  [✓] Feature importance: feature_importance.csv")

print("\n✓ Next steps:")
print("  1. Submit 'submission.csv' to Kaggle")
print("  2. Record your public leaderboard score")
print("  3. Take screenshot of submission")
print("  4. Complete Report 2 with:")
print("     - Comparison table (already generated)")
print("     - Kaggle score (after submission)")
print("     - Analysis of best model performance")

print("\n" + "="*80)
print("Ready for Kaggle submission!")
print("="*80)


TASK 2 COMPLETED SUCCESSFULLY

✓ All required models trained:
  [✓] Regression Tree
  [✓] Lasso (Best from Task 1)
  [✓] Gradient Boosting
  [✓] LightGBM
  [✓] CatBoost
  [✓] XGBoost

✓ Deliverables generated:
  [✓] Model comparison table: task2_model_comparison.csv
  [✓] Kaggle submission file: submission.csv
  [✓] Feature importance: feature_importance.csv

✓ Next steps:
  1. Submit 'submission.csv' to Kaggle
  2. Record your public leaderboard score
  3. Take screenshot of submission
  4. Complete Report 2 with:
     - Comparison table (already generated)
     - Kaggle score (after submission)
     - Analysis of best model performance

Ready for Kaggle submission!


Retraining all models for submission files

In [27]:
print("\n" + "="*80)
print("STEP 9: RETRAINING ALL MODELS ON FULL DATA")
print("="*80)

X_full_selected = X_full[selected_features]
X_test_selected = X_test[selected_features]

# Preprocessing
X_full_preprocessed = preprocessor.fit_transform(X_full_selected)
X_test_preprocessed = preprocessor.transform(X_test_selected)

# Containers for final models + predictions
final_predictions = {}

for model_name, info in results.items():
    print(f"\nRetraining model on full data: {model_name}")

    model = info['model']

    # Handle models differently based on NA support
    if model_name in ["LightGBM", "CatBoost", "XGBoost"]:
        if model_name == "CatBoost" or model_name == "XGBoost":
            Xf = X_full_selected.fillna(-999)
            Xt = X_test_selected.fillna(-999)
        else:
            Xf = X_full_selected.values
            Xt = X_test_selected.values

        model.fit(Xf, y_full)
        preds = model.predict(Xt)

    else:
        # Tree, Lasso, GradientBoosting → use preprocessed
        model.fit(X_full_preprocessed, y_full)
        preds = model.predict(X_test_preprocessed)

    final_predictions[model_name] = preds



STEP 9: RETRAINING ALL MODELS ON FULL DATA

Retraining model on full data: Regression Tree

Retraining model on full data: Lasso (Task 1 Best)

Retraining model on full data: GradientBoosting

Retraining model on full data: LightGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.193284 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24994
[LightGBM] [Info] Number of data points in the train set: 181507, number of used features: 100
[LightGBM] [Info] Start training from score 14.845599

Retraining model on full data: CatBoost

Retraining model on full data: XGBoost


In [28]:

print("\n" + "="*80)
print("STEP 10: GENERATING SUBMISSION FILES FOR ALL MODELS")
print("="*80)

for model_name, preds in final_predictions.items():
    file_name = f"submission_{model_name.replace(' ', '_')}_{ERP_ID}.csv"
    
    submission_df = pd.DataFrame({
        'id': test_ids,
        'price_doc': preds
    })

    submission_df.to_csv(file_name, index=False)
    print(f"✓ Saved: {file_name}")



STEP 10: GENERATING SUBMISSION FILES FOR ALL MODELS
✓ Saved: submission_Regression_Tree_27857.csv
✓ Saved: submission_Lasso_(Task_1_Best)_27857.csv
✓ Saved: submission_GradientBoosting_27857.csv
✓ Saved: submission_LightGBM_27857.csv
✓ Saved: submission_CatBoost_27857.csv
✓ Saved: submission_XGBoost_27857.csv
