In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import warnings
import time
warnings.filterwarnings('ignore')

In [2]:
# ============================================================================
# CONFIGURATION
# ============================================================================
# Tunable knobs for the whole notebook, gathered in one place.
ERP_ID = 27857                 # identifier that is also used as the random seed
USE_SAMPLING = False           # True -> work on a stratified subsample of the data
SAMPLE_SIZE = 50000            # requested number of rows when sampling is on
USE_FEATURE_SELECTION = True   # True -> keep only the voted-in numeric features
MAX_FEATURES = 30              # upper bound on the number of selected features

# Seed NumPy's global generator so stochastic steps repeat across runs.
np.random.seed(ERP_ID)

# Run banner followed by the active configuration.
print(
    "=" * 80,
    "RUSSIAN HOUSING PRICE PREDICTION - TASK 1",
    "=" * 80,
    sep="\n",
)
print(f"ERP ID: {ERP_ID}")
print(f"Configuration: Sampling={USE_SAMPLING}, Feature Selection={USE_FEATURE_SELECTION}")

RUSSIAN HOUSING PRICE PREDICTION - TASK 1
ERP ID: 27857
Configuration: Sampling=False, Feature Selection=True


In [3]:
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def stratified_sample(df, target_col='price_doc', sample_size=50000, random_state=None):
    """Draw a sample of roughly ``sample_size`` rows, stratified on the target.

    The target column is cut into up to 10 quantile bins and the same
    fraction ``sample_size / len(df)`` is drawn from every bin, so the sample
    keeps approximately the target distribution of ``df``. Each bin's row
    count is rounded on its own, so the result can differ from
    ``sample_size`` by a few rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is never modified.
    target_col : str
        Name of the column whose distribution should be preserved.
    sample_size : int
        Requested number of rows in the sample.
    random_state : int or None
        Seed passed to every per-bin ``DataFrame.sample`` call.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when it has at most ``sample_size`` rows; otherwise the
        sampled rows with exactly the columns of ``df``. Rows whose target is
        missing get no bin and are left out, because ``groupby`` skips NaN
        keys by default.
    """
    if len(df) <= sample_size:
        return df

    # Keep the bin labels in a separate array instead of writing a helper
    # column into ``df``: the caller's frame stays untouched and the result
    # needs no clean-up, regardless of whether ``groupby.apply`` passes
    # grouping columns to the applied function.
    price_bin = pd.qcut(
        df[target_col], q=10, labels=False, duplicates='drop'
    ).to_numpy()
    frac = sample_size / len(df)

    return df.groupby(price_bin, group_keys=False).apply(
        lambda grp: grp.sample(frac=frac, random_state=random_state)
    )

def create_features(df, reference_year=2015):
    """Return a copy of ``df`` extended with derived housing features.

    Each feature is added only when the raw columns it needs are present, so
    the function is safe to call on frames that lack some of them. Ratio
    denominators are offset by 1, so a raw value of 0 does not divide by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied and never modified.
    reference_year : int, default 2015
        Year against which ``build_year`` is compared to compute the
        building age. The default reproduces the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with, where applicable, the extra columns
        ``building_age``, ``sqm_per_room``, ``living_area_ratio``,
        ``kitchen_area_ratio``, ``floor_ratio``, ``is_first_floor`` and
        ``is_last_floor``.
    """
    df = df.copy()
    available = set(df.columns)

    if 'build_year' in available:
        # Build years later than the reference year would give a negative
        # age; those are clipped to 0.
        df['building_age'] = (reference_year - df['build_year']).clip(lower=0)

    if {'full_sq', 'num_room'} <= available:
        df['sqm_per_room'] = df['full_sq'] / (df['num_room'] + 1)

    if {'life_sq', 'full_sq'} <= available:
        df['living_area_ratio'] = df['life_sq'] / (df['full_sq'] + 1)

    if {'kitch_sq', 'full_sq'} <= available:
        df['kitchen_area_ratio'] = df['kitch_sq'] / (df['full_sq'] + 1)

    if {'floor', 'max_floor'} <= available:
        df['floor_ratio'] = df['floor'] / (df['max_floor'] + 1)
        df['is_first_floor'] = (df['floor'] == 1).astype(int)
        df['is_last_floor'] = (df['floor'] == df['max_floor']).astype(int)

    return df

def quick_feature_selection(X, y, n_features=30, random_state=None):
    """Pick up to ``n_features`` numeric columns of ``X`` by a three-way vote.

    Three rankings are computed, each contributing its top ``n_features``
    columns:

    1. absolute correlation of each column with ``y``,
    2. univariate F-statistic (``SelectKBest`` with ``f_regression``),
    3. impurity importance of a small random forest.

    Columns named by at least two rankings are kept, most-voted first. If
    that yields fewer than ``n_features`` columns, the list is topped up from
    the random-forest ranking.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; non-numeric columns are ignored.
    y : pandas.Series
        Target values aligned with ``X``.
    n_features : int, default 30
        Maximum number of column names to return.
    random_state : int or None
        Seed for the random forest.

    Returns
    -------
    list of str
        Selected column names, at most ``n_features`` of them. When ``X`` has
        no more than ``n_features`` numeric columns they are all returned
        without any ranking.
    """
    from collections import Counter

    print(f"\n[Feature Selection] Selecting top {n_features} features...")
    numeric_X = X.select_dtypes(include=[np.number])

    if numeric_X.shape[1] <= n_features:
        return numeric_X.columns.tolist()

    # SimpleImputer(strategy='median') discards columns that have no observed
    # value at all. That would leave X_imputed narrower than
    # numeric_X.columns and break every name lookup below, so such columns
    # are removed up front. They carry no information and could never be
    # ranked anyway.
    all_missing = numeric_X.count() == 0
    if all_missing.any():
        numeric_X = numeric_X.loc[:, ~all_missing]
        if numeric_X.shape[1] <= n_features:
            return numeric_X.columns.tolist()

    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(numeric_X)

    # Method 1: Correlation (computed on the un-imputed values)
    correlations = numeric_X.corrwith(y).abs()
    top_corr = correlations.nlargest(n_features).index.tolist()

    # Method 2: F-statistic
    selector = SelectKBest(score_func=f_regression, k=n_features)
    selector.fit(X_imputed, y)
    top_fstat = numeric_X.columns[selector.get_support()].tolist()

    # Method 3: Random Forest
    print("  Training Random Forest...")
    rf = RandomForestRegressor(n_estimators=50, max_depth=8, random_state=random_state, n_jobs=-1)
    rf.fit(X_imputed, y)
    importances = pd.Series(rf.feature_importances_, index=numeric_X.columns)
    top_rf = importances.nlargest(n_features).index.tolist()

    # Consensus: keep every feature that at least two methods agree on.
    feature_votes = Counter(top_corr + top_fstat + top_rf)
    selected = [f for f, votes in feature_votes.most_common() if votes >= 2]

    # Too few consensus features: fill the remaining slots in random-forest
    # order.
    if len(selected) < n_features:
        for f in top_rf:
            if f not in selected:
                selected.append(f)
            if len(selected) >= n_features:
                break

    selected = selected[:n_features]
    print(f"  Selected {len(selected)} features")
    return selected

In [4]:
# ============================================================================
# STEP 1: DATA LOADING & SAMPLING
# ============================================================================
print(
    "\n" + "=" * 80,
    "STEP 1: DATA LOADING & SAMPLING",
    "=" * 80,
    sep="\n",
)

# Read the training data from the working directory.
train_df = pd.read_csv('train.csv')
print(f"Original data: {train_df.shape}")

# Optionally shrink the data to a price-stratified subsample (see USE_SAMPLING).
if USE_SAMPLING:
    train_df = stratified_sample(
        train_df,
        target_col='price_doc',
        sample_size=SAMPLE_SIZE,
        random_state=ERP_ID,
    )
    print(f"Sampled data: {train_df.shape}")


STEP 1: DATA LOADING & SAMPLING
Original data: (181507, 279)


In [5]:
# ============================================================================
# STEP 2: FEATURE ENGINEERING
# ============================================================================
print(
    "\n" + "=" * 80,
    "STEP 2: FEATURE ENGINEERING",
    "=" * 80,
    sep="\n",
)

# Add the derived columns, then separate the target from the predictors.
train_df = create_features(train_df)
y = train_df['price_doc']
X = train_df.drop(columns='price_doc')

# Remove non-predictive columns (silently skipped when a column is absent)
cols_to_drop = ['id', 'timestamp']
X = X.drop(columns=cols_to_drop, errors='ignore')


STEP 2: FEATURE ENGINEERING


In [6]:
# ============================================================================
# STEP 3: TRAIN-VALIDATION SPLIT (70-30)
# ============================================================================
print(
    "\n" + "=" * 80,
    "STEP 3: TRAIN-VALIDATION SPLIT (70-30)",
    "=" * 80,
    sep="\n",
)

# Hold out 30% of the rows for validation; the split is seeded with ERP_ID.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=ERP_ID,
)

print(
    f"Training set: {X_train.shape}",
    f"Validation set: {X_val.shape}",
    f"Random state: {ERP_ID}",
    sep="\n",
)


STEP 3: TRAIN-VALIDATION SPLIT (70-30)
Training set: (127054, 278)
Validation set: (54453, 278)
Random state: 27857


In [7]:
# ============================================================================
# STEP 4: FEATURE SELECTION
# ============================================================================
print("\n" + "="*80)
print("STEP 4: FEATURE SELECTION")
print("="*80)

if USE_FEATURE_SELECTION:
    # The selection is fitted on the training split only; the validation
    # split is merely restricted to the same columns.
    selected_features = quick_feature_selection(X_train, y_train, MAX_FEATURES, ERP_ID)
    X_train_selected = X_train[selected_features]
    X_val_selected = X_val[selected_features]
    # The check mark was previously stored as mis-decoded bytes ("âœ“").
    print(f"\n✓ Using {len(selected_features)} selected features")
else:
    # Without selection, fall back to every numeric column.
    X_train_selected = X_train.select_dtypes(include=[np.number])
    X_val_selected = X_val.select_dtypes(include=[np.number])
    selected_features = X_train_selected.columns.tolist()


STEP 4: FEATURE SELECTION

[Feature Selection] Selecting top 30 features...


MemoryError: Unable to allocate 255. MiB for an array with shape (127054, 263) and data type int64