Imports

In [1]:
"""
Russian Cities Housing Challenge 2025 — Task 1
Linear models + one Gradient Boosting model
Split uses random_state = 27857 (user ERP)
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import warnings
import time
warnings.filterwarnings('ignore')


Configuration

In [2]:
# ============================================================================
# CONFIGURATION
# ============================================================================
ERP_ID = 27857 

# SAMPLING CONFIGURATION
USE_SAMPLING = True  # Set to False to use full dataset
SAMPLE_SIZE = 50000  # Recommended: 30k-60k for speed, full dataset for final run

# FEATURE SELECTION CONFIGURATION
USE_FEATURE_SELECTION = True
MAX_FEATURES_LINEAR = 30  # For linear models (prevents polynomial explosion)
MAX_FEATURES_BOOSTING = 80  # For gradient boosting (can handle more)

np.random.seed(ERP_ID)

print("="*80)
print("RUSSIAN HOUSING PRICE PREDICTION - OPTIMIZED VERSION")
print("="*80)
print(f"Configuration:")
print(f"  ERP_ID: {ERP_ID}")
print(f"  Sampling: {USE_SAMPLING} (size={SAMPLE_SIZE if USE_SAMPLING else 'FULL'})")
print(f"  Feature Selection: {USE_FEATURE_SELECTION}")

RUSSIAN HOUSING PRICE PREDICTION - OPTIMIZED VERSION
Configuration:
  ERP_ID: 27857
  Sampling: True (size=50000)
  Feature Selection: True


Helper Functions

In [3]:
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def stratified_sample(df, target_col='price_doc', sample_size=50000, random_state=None):
    """Sample data while preserving target distribution"""
    if len(df) <= sample_size:
        return df
    
    # Create price bins
    df['price_bin'] = pd.qcut(df[target_col], q=10, labels=False, duplicates='drop')
    
    # Stratified sampling
    sampled = df.groupby('price_bin', group_keys=False).apply(
        lambda x: x.sample(frac=sample_size/len(df), random_state=random_state)
    )
    
    return sampled.drop('price_bin', axis=1)


def quick_feature_selection(X, y, n_features=30, random_state=None):
    """
    Fast feature selection combining multiple methods
    Returns: list of selected feature names
    """
    print(f"\n[Feature Selection] Selecting top {n_features} features...")
    
    numeric_X = X.select_dtypes(include=[np.number])
    
    if numeric_X.shape[1] <= n_features:
        print(f"  Only {numeric_X.shape[1]} numeric features, using all")
        return numeric_X.columns.tolist()
    
    # Impute for feature selection
    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(numeric_X)
    
    # Method 1: Correlation with target (fast)
    correlations = numeric_X.corrwith(y).abs()
    top_corr = correlations.nlargest(n_features).index.tolist()
    
    # Method 2: F-statistic (fast, statistical)
    selector = SelectKBest(score_func=f_regression, k=n_features)
    selector.fit(X_imputed, y)
    top_fstat = numeric_X.columns[selector.get_support()].tolist()
    
    # Method 3: Random Forest importance (slower but captures non-linearity)
    print("  Training Random Forest for feature importance...")
    rf = RandomForestRegressor(
        n_estimators=50, max_depth=8, 
        random_state=random_state, n_jobs=1
    )
    rf.fit(X_imputed, y)
    importances = pd.Series(rf.feature_importances_, index=numeric_X.columns)
    top_rf = importances.nlargest(n_features).index.tolist()
    
    # Combine: features appearing in at least 2 methods
    from collections import Counter
    all_features = top_corr + top_fstat + top_rf
    feature_votes = Counter(all_features)
    
    # Get features with 2+ votes, then fill with top RF features if needed
    selected = [f for f, votes in feature_votes.most_common() if votes >= 2]
    
    if len(selected) < n_features:
        # Add top RF features until we reach n_features
        for f in top_rf:
            if f not in selected:
                selected.append(f)
            if len(selected) >= n_features:
                break
    
    selected = selected[:n_features]
    
    print(f"  Selected {len(selected)} features")
    print(f"  Top 10: {selected[:10]}")
    
    return selected


def create_features(df):
    """Feature engineering"""
    df = df.copy()
    
    if 'build_year' in df.columns:
        current_year = 2015
        df['building_age'] = (current_year - df['build_year']).clip(lower=0)
    
    if 'full_sq' in df.columns and 'num_room' in df.columns:
        df['sqm_per_room'] = df['full_sq'] / (df['num_room'] + 1)
    
    if 'life_sq' in df.columns and 'full_sq' in df.columns:
        df['living_area_ratio'] = df['life_sq'] / (df['full_sq'] + 1)
    
    if 'kitch_sq' in df.columns and 'full_sq' in df.columns:
        df['kitchen_area_ratio'] = df['kitch_sq'] / (df['full_sq'] + 1)
    
    if 'floor' in df.columns and 'max_floor' in df.columns:
        df['floor_ratio'] = df['floor'] / (df['max_floor'] + 1)
        df['is_first_floor'] = (df['floor'] == 1).astype(int)
        df['is_last_floor'] = (df['floor'] == df['max_floor']).astype(int)
    
    return df

Data Loading and Sampling

In [4]:
# ============================================================================
# STEP 1: DATA LOADING & SAMPLING
# ============================================================================
print("\n[STEP 1] Loading Data...")

train_df = pd.read_csv('train.csv')
print(f"Original training data: {train_df.shape}")
print(f"Memory usage: {train_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Apply sampling if enabled
if USE_SAMPLING and len(train_df) > SAMPLE_SIZE:
    print(f"\n[STEP 1b] Applying stratified sampling to {SAMPLE_SIZE} rows...")
    train_df = stratified_sample(train_df, 'price_doc', SAMPLE_SIZE, ERP_ID)
    print(f"Sampled data: {train_df.shape}")
    print(f"New memory usage: {train_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    print(f"Reduction: {100*(1-len(train_df)/180000):.1f}%")


[STEP 1] Loading Data...
Original training data: (181507, 279)
Memory usage: 521.4 MB

[STEP 1b] Applying stratified sampling to 50000 rows...
Sampled data: (50000, 279)
New memory usage: 144.0 MB
Reduction: 72.2%


In [5]:
# ============================================================================
# STEP 2: FEATURE ENGINEERING
# ============================================================================
print("\n[STEP 2] Feature Engineering...")

train_df = create_features(train_df)

# Separate features and target
X = train_df.drop('price_doc', axis=1)
y = train_df['price_doc']

# Remove non-predictive columns
cols_to_drop = ['id', 'timestamp']
X = X.drop([c for c in cols_to_drop if c in X.columns], axis=1)

print(f"Features after engineering: {X.shape[1]}")


[STEP 2] Feature Engineering...
Features after engineering: 278


Train-test split

In [6]:
# ============================================================================
# STEP 3: TRAIN-VALIDATION SPLIT
# ============================================================================
print("\n[STEP 3] Creating Train-Validation Split (70-30)...")

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=ERP_ID
)

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")


[STEP 3] Creating Train-Validation Split (70-30)...
Training set: (35000, 278)
Validation set: (15000, 278)


In [7]:
# ============================================================================
# STEP 4: FEATURE SELECTION
# ============================================================================

if USE_FEATURE_SELECTION:
    print("\n[STEP 4] Applying Feature Selection...")
    
    # Select features for linear models (aggressive reduction)
    selected_features_linear = quick_feature_selection(
        X_train, y_train, 
        n_features=MAX_FEATURES_LINEAR,
        random_state=ERP_ID
    )
    
    # Select features for boosting (less aggressive)
    selected_features_boosting = quick_feature_selection(
        X_train, y_train,
        n_features=MAX_FEATURES_BOOSTING,
        random_state=ERP_ID
    )
    
    X_train_linear = X_train[selected_features_linear]
    X_val_linear = X_val[selected_features_linear]
    
    X_train_boosting = X_train[selected_features_boosting]
    X_val_boosting = X_val[selected_features_boosting]
    
    print(f"\n✓ Linear models using: {X_train_linear.shape[1]} features")
    print(f"✓ Boosting using: {X_train_boosting.shape[1]} features")
    
else:
    # Use all features
    X_train_linear = X_train.select_dtypes(include=[np.number])
    X_val_linear = X_val.select_dtypes(include=[np.number])
    X_train_boosting = X_train_linear
    X_val_boosting = X_val_linear
    
    print(f"\n[STEP 4] Using all {X_train_linear.shape[1]} numeric features")


[STEP 4] Applying Feature Selection...

[Feature Selection] Selecting top 30 features...
  Training Random Forest for feature importance...
  Selected 30 features
  Top 10: ['leisure_count_1000', 'full_sq', 'leisure_count_500', 'trc_sqm_500', 'cafe_count_1000_price_high', 'mosque_count_500', 'cafe_count_1000_price_1500', 'big_church_count_1000', 'cafe_count_1000_price_1000', 'cafe_count_2000_price_high']

[Feature Selection] Selecting top 80 features...
  Training Random Forest for feature importance...
  Selected 80 features
  Top 10: ['leisure_count_1000', 'full_sq', 'leisure_count_500', 'trc_sqm_500', 'cafe_count_1000_price_high', 'mosque_count_500', 'cafe_count_500_price_high', 'cafe_count_500_price_1000', 'leisure_count_1500', 'cafe_count_1000_price_1500']

✓ Linear models using: 30 features
✓ Boosting using: 80 features


Pre-processing

In [8]:
# ============================================================================
# STEP 5: PREPROCESSING PIPELINE
# ============================================================================
print("\n[STEP 5] Building Preprocessing Pipeline...")

# For linear models: simple pipeline
preprocessor_linear = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

print("✓ Preprocessor ready")


[STEP 5] Building Preprocessing Pipeline...
✓ Preprocessor ready


Model Training

In [12]:
# ============================================================================
# STEP 6: MODEL TRAINING
# ============================================================================
print("\n[STEP 6] Training Models...")
print("="*80)

results = {}

def evaluate_model(name, model, X_train, X_val, y_train, y_val):
    """Train and evaluate model"""
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    start_time = time.time()
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    inference_time = time.time() - start_time
    
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    train_r2 = r2_score(y_train, y_train_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    
    results[name] = {
        'train_rmse': train_rmse,
        'val_rmse': val_rmse,
        'train_r2': train_r2,
        'val_r2': val_r2,
        'train_time': train_time,
        'inference_time': inference_time
    }
    
    print(f"\n{name}")
    print(f"  Val RMSE: {val_rmse:.4f} | Train RMSE: {train_rmse:.4f}")
    print(f"  Val R²: {val_r2:.4f} | Train R²: {train_r2:.4f}")
    print(f"  Time: {train_time:.2f}s training, {inference_time:.4f}s inference")
    
    return model

# --- 6.1: Baseline ---
print("\n--- Baseline Linear Regression ---")
baseline = Pipeline([
    ('preprocessor', preprocessor_linear),
    ('regressor', LinearRegression())
])
evaluate_model('Baseline Linear', baseline, 
               X_train_linear, X_val_linear, y_train, y_val)

# --- 6.2: Polynomial (Degree 2 only - manageable with feature selection) ---
print("\n--- Polynomial Regression (Degree 2, interaction_only=False) ---")
poly2 = Pipeline([
    ('preprocessor', preprocessor_linear),
    ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)),
    ('regressor', LinearRegression())
])
evaluate_model('Polynomial (deg=2, interaction_only=False)', poly2,
               X_train_linear, X_val_linear, y_train, y_val)



print("\n--- Polynomial Regression (Degree 2, interaction_only=True) ---")
poly2_inter = Pipeline([
    ('preprocessor', preprocessor_linear),
    ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
    ('regressor', LinearRegression())
])
evaluate_model('Polynomial (deg=2, interaction_only=True)', poly2_inter,
               X_train_linear, X_val_linear, y_train, y_val)



print("\n--- Polynomial Regression (Degree 3, interaction_only=False) ---")
poly3_full = Pipeline([
    ('preprocessor', preprocessor_linear),
    ('poly', PolynomialFeatures(degree=3, include_bias=False, interaction_only=False)),
    ('regressor', LinearRegression())
])
evaluate_model('Polynomial (deg=3, interaction_only=False)', poly3_full,
               X_train_linear, X_val_linear, y_train, y_val)



print("\n--- Polynomial Regression (Degree 3, interaction_only=True) ---")
poly3_inter = Pipeline([
    ('preprocessor', preprocessor_linear),
    ('poly', PolynomialFeatures(degree=3, include_bias=False, interaction_only=True)),
    ('regressor', LinearRegression())
])
evaluate_model('Polynomial (deg=3, interaction_only=True)', poly3_inter,
               X_train_linear, X_val_linear, y_train, y_val)


# --- 6.3: Ridge ---
print("\n--- Ridge Regression ---")
ridge_pipe = Pipeline([
    ('preprocessor', preprocessor_linear),
    ('regressor', Ridge())
])

param_grid = {'regressor__alpha': [0.05, 0.1, 1, 10, 100]}
ridge_search = GridSearchCV(ridge_pipe, param_grid, cv=3, 
                            scoring='neg_root_mean_squared_error', n_jobs=1)
ridge_search.fit(X_train_linear, y_train)

print(f"Best alpha: {ridge_search.best_params_['regressor__alpha']}")
evaluate_model('Ridge (Best)', ridge_search.best_estimator_,
               X_train_linear, X_val_linear, y_train, y_val)

# --- 6.4: Lasso ---
print("\n--- Lasso Regression ---")
lasso_pipe = Pipeline([
    ('preprocessor', preprocessor_linear),
    ('regressor', Lasso(max_iter=5000))
])

param_grid = {'regressor__alpha': [0.01, 0.1, 1, 10]}
lasso_search = GridSearchCV(lasso_pipe, param_grid, cv=3,
                            scoring='neg_root_mean_squared_error', n_jobs=1)
lasso_search.fit(X_train_linear, y_train)

print(f"Best alpha: {lasso_search.best_params_['regressor__alpha']}")
evaluate_model('Lasso (Best)', lasso_search.best_estimator_,
               X_train_linear, X_val_linear, y_train, y_val)

# --- 6.5: Elastic Net ---
print("\n--- Elastic Net ---")
elastic_pipe = Pipeline([
    ('preprocessor', preprocessor_linear),
    ('regressor', ElasticNet(max_iter=5000))
])

param_grid = {
    'regressor__alpha': [0.01, 0.1, 1, 10],
    'regressor__l1_ratio': [0.3, 0.5, 0.7]
}
elastic_search = GridSearchCV(elastic_pipe, param_grid, cv=3,
                              scoring='neg_root_mean_squared_error', n_jobs=1)
elastic_search.fit(X_train_linear, y_train)

print(f"Best params: {elastic_search.best_params_}")
evaluate_model('Elastic Net (Best)', elastic_search.best_estimator_,
               X_train_linear, X_val_linear, y_train, y_val)


[STEP 6] Training Models...

--- Baseline Linear Regression ---

Baseline Linear
  Val RMSE: 13.5546 | Train RMSE: 13.2758
  Val R²: 0.6157 | Train R²: 0.6157
  Time: 1.28s training, 0.3995s inference

--- Polynomial Regression (Degree 2, interaction_only=False) ---

Polynomial (deg=2, interaction_only=False)
  Val RMSE: 13.7110 | Train RMSE: 12.4429
  Val R²: 0.6068 | Train R²: 0.6624
  Time: 3.76s training, 0.5667s inference

--- Polynomial Regression (Degree 2, interaction_only=True) ---

Polynomial (deg=2, interaction_only=True)
  Val RMSE: 13.6708 | Train RMSE: 12.4780
  Val R²: 0.6091 | Train R²: 0.6605
  Time: 3.93s training, 0.5674s inference

--- Polynomial Regression (Degree 3, interaction_only=False) ---

Polynomial (deg=3, interaction_only=False)
  Val RMSE: 52.2360 | Train RMSE: 4.7082
  Val R²: -4.7067 | Train R²: 0.9517
  Time: 336.58s training, 6.3344s inference

--- Polynomial Regression (Degree 3, interaction_only=True) ---

Polynomial (deg=3, interaction_only=True)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,0.1
,l1_ratio,0.7
,fit_intercept,True
,precompute,False
,max_iter,5000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,


Gradient Boosting (LightGBM)

In [10]:
# ============================================================================
# STEP 7: GRADIENT BOOSTING
# ============================================================================
print("\n[STEP 7] Training LightGBM...")
print("="*80)

# Prepare boosting features (numeric only)
X_train_lgb = X_train_boosting.select_dtypes(include=[np.number])
X_val_lgb = X_val_boosting.select_dtypes(include=[np.number])

# Impute missing values (LightGBM can handle them, but let's be safe)
imputer = SimpleImputer(strategy='median')
X_train_lgb_imputed = pd.DataFrame(
    imputer.fit_transform(X_train_lgb),
    columns=X_train_lgb.columns,
    index=X_train_lgb.index
)
X_val_lgb_imputed = pd.DataFrame(
    imputer.transform(X_val_lgb),
    columns=X_val_lgb.columns,
    index=X_val_lgb.index
)

start_time = time.time()

lgb_model = lgb.LGBMRegressor(
    objective='regression',
    metric='rmse',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=ERP_ID,
    verbose=1
)

lgb_model.fit(
    X_train_lgb_imputed, y_train,
    eval_set=[(X_val_lgb_imputed, y_val)],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

lgb_train_time = time.time() - start_time

# Evaluate
start_time = time.time()
y_train_pred_lgb = lgb_model.predict(X_train_lgb_imputed)
y_val_pred_lgb = lgb_model.predict(X_val_lgb_imputed)
lgb_inference_time = time.time() - start_time

results['LightGBM'] = {
    'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred_lgb)),
    'val_rmse': np.sqrt(mean_squared_error(y_val, y_val_pred_lgb)),
    'train_r2': r2_score(y_train, y_train_pred_lgb),
    'val_r2': r2_score(y_val, y_val_pred_lgb),
    'train_time': lgb_train_time,
    'inference_time': lgb_inference_time
}

print(f"\nLightGBM")
print(f"  Val RMSE: {results['LightGBM']['val_rmse']:.4f} | "
      f"Train RMSE: {results['LightGBM']['train_rmse']:.4f}")
print(f"  Val R²: {results['LightGBM']['val_r2']:.4f} | "
      f"Train R²: {results['LightGBM']['train_r2']:.4f}")
print(f"  Time: {lgb_train_time:.2f}s training, {lgb_inference_time:.4f}s inference")


[STEP 7] Training LightGBM...


  File "C:\Users\SHEIKHANI LAPTOP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\SHEIKHANI LAPTOP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.062579 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19894
[LightGBM] [Info] Number of data points in the train set: 35000, number of used features: 80
[LightGBM] [Info] Start training from score 14.825496
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 13.0653
Early stopping, best iteration is:
[80]	valid_0's rmse: 13.0619

LightGBM
  Val RMSE: 13.0619 | Train RMSE: 11.0803
  Val R²: 0.6432 | Train R²: 0.7323
  Time: 3.70s training, 0.1557s inference


In [11]:
# ============================================================================
# STEP 8: RESULTS SUMMARY
# ============================================================================
print("\n" + "="*80)
print("FINAL RESULTS")
print("="*80)

results_df = pd.DataFrame(results).T.sort_values('val_rmse')

print("\nModel Performance (sorted by Validation RMSE):")
print(results_df[['val_rmse', 'train_rmse', 'val_r2', 'train_time']])

best_linear = results_df.iloc[:-1]['val_rmse'].idxmin()
best_linear_rmse = results_df.loc[best_linear, 'val_rmse']
lgb_rmse = results_df.loc['LightGBM', 'val_rmse']

print(f"\n--- KEY FINDINGS ---")
print(f"Best Linear Model: {best_linear}")
print(f"  Validation RMSE: {best_linear_rmse:.4f} million RUB")
print(f"\nLightGBM:")
print(f"  Validation RMSE: {lgb_rmse:.4f} million RUB")
print(f"  Improvement: {((best_linear_rmse - lgb_rmse) / best_linear_rmse * 100):.2f}%")

print(f"\n--- EFFICIENCY GAINS ---")
if USE_SAMPLING:
    print(f"✓ Data reduced: 180k → {len(train_df):,} rows ({100*(1-len(train_df)/180000):.1f}% reduction)")
if USE_FEATURE_SELECTION:
    print(f"✓ Features reduced: {X.shape[1]} → {X_train_linear.shape[1]} for linear models")
    print(f"✓ Polynomial features estimated: {X_train_linear.shape[1]**2//2:,} (degree 2)")

print("\n" + "="*80)
print("✓ TASK 1 COMPLETE")
print("="*80)

# Save selected features for documentation
if USE_FEATURE_SELECTION:
    print(f"\nSelected features for linear models:")
    print(selected_features_linear)
    
    with open('selected_features.txt', 'w') as f:
        f.write("Linear Model Features:\n")
        f.write('\n'.join(selected_features_linear))
    print("\n✓ Features saved to 'selected_features.txt'")


FINAL RESULTS

Model Performance (sorted by Validation RMSE):
                     val_rmse  train_rmse    val_r2  train_time
LightGBM            13.061885   11.080303  0.643174    3.702622
Lasso (Best)        13.540897   13.282709  0.616523    1.844970
Elastic Net (Best)  13.541983   13.284645  0.616462    1.930001
Ridge (Best)        13.553878   13.275836  0.615787    0.326002
Baseline Linear     13.554622   13.275803  0.615745    0.551002
Polynomial (deg=2)  13.711016   12.442941  0.606827    4.546981

--- KEY FINDINGS ---
Best Linear Model: LightGBM
  Validation RMSE: 13.0619 million RUB

LightGBM:
  Validation RMSE: 13.0619 million RUB
  Improvement: 0.00%

--- EFFICIENCY GAINS ---
✓ Data reduced: 180k → 50,000 rows (72.2% reduction)
✓ Features reduced: 278 → 30 for linear models
✓ Polynomial features estimated: 450 (degree 2)

✓ TASK 1 COMPLETE

Selected features for linear models:
['leisure_count_1000', 'full_sq', 'leisure_count_500', 'trc_sqm_500', 'cafe_count_1000_price_high'