In [2]:
import pandas as pd
import numpy as np
from math import radians
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import mean_squared_error

# Define an advanced RMSE function for logging
def print_rmse(y_true, y_pred, model_name="Model"):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{model_name} RMSE: {rmse}")
    return rmse

# Load, process data, and add advanced features
def load_and_process_data(train_path, target_path, test_path, test_size=0.2, random_state=42):
    # Load the data
    X_train = pd.read_csv(train_path, low_memory=False).drop(columns=['ID'], errors='ignore')
    y_train = pd.read_csv(target_path)['TARGET']
    X_test = pd.read_csv(test_path, low_memory=False)
    
    # Drop the 'ID' column from test data if it exists
    test_ids = X_test.pop('ID') if 'ID' in X_test.columns else np.arange(len(X_test))

    # Process both train and test datasets
    for df in [X_train, X_test]:
        # Convert 'tradeTime' to datetime
        if 'tradeTime' in df.columns:
            df['tradeTime'] = pd.to_datetime(df['tradeTime'], errors='coerce')
        
        # Replace categorical values as needed
        df.replace({
            'elevator': {1: 'has elevator', 0: 'no elevator'},
            'subway': {1: 'has subway', 0: 'no subway'},
            'buildingStructure': {1: 'unknown', 2: 'mixed', 3: 'brick and wood', 4: 'concrete', 5: 'steel', 6: 'steel-concrete composite'},
            'renovationCondition': {1: 'other', 2: 'rough', 3: 'Simplicity', 4: 'hardcover'},
            'buildingType': {1: 'tower', 2: 'bungalow', 3: 'combination of plate and tower', 4: 'plate'}
        }, inplace=True)
        
        # Ensure 'constructionTime' is properly numeric and clean up invalid values
        if 'constructionTime' in df.columns:
            df['constructionTime'] = pd.to_numeric(df['constructionTime'], errors='coerce')  # Force invalid to NaN

        # Convert floor information to numeric
        if 'floor' in df.columns:
            df['floor'] = df['floor'].astype(str).str.extractall('(\d+)').unstack().fillna('').sum(axis=1).astype(int)

        # Convert Lat and Lng to numeric and calculate distance to the capital
        df['Lat'] = pd.to_numeric(df['Lat'], errors='coerce')
        df['Lng'] = pd.to_numeric(df['Lng'], errors='coerce')

        capital_Lng = np.radians(116.4074)
        capital_Lat = np.radians(39.9042)
        df['distanceToCapital'] = np.arccos(
            np.sin(np.radians(df['Lat'])) * np.sin(capital_Lat) +
            np.cos(np.radians(df['Lat'])) * np.cos(capital_Lat) *
            np.cos(capital_Lng - np.radians(df['Lng']))
        ) * 6371.0088  # Earth's radius in kilometers

        # Add age of the building
        if 'tradeTime' in df.columns and 'constructionTime' in df.columns:
            df['ageOfBuilding'] = df['tradeTime'].dt.year - df['constructionTime']

    # Handle missing values for 'elevator' and 'subway'
    for df in [X_train, X_test]:
        df['elevator'] = df['elevator'].fillna(0)  # Assuming no elevator if missing
        df['subway'] = df['subway'].fillna(0)  # Assuming no subway access if missing

    # Ensure all columns are numeric where expected and handle NaNs
    X_train = X_train.apply(pd.to_numeric, errors='coerce')  # Force conversion of all columns to numeric
    X_test = X_test.apply(pd.to_numeric, errors='coerce')
    
    print(f"X_train columns before processing: {X_train.shape}")
    print(f"X_test columns before processing: {X_test.shape}")
    
    # Align columns between X_train and X_test to avoid mismatches
    common_cols = X_train.columns.intersection(X_test.columns)
    X_train, X_test = X_train[common_cols], X_test[common_cols]
    
    # Add imputation and scaling
    imputer = SimpleImputer(strategy="median")
    scaler = PowerTransformer()

    # Impute and scale both training and test sets
    X_train = pd.DataFrame(scaler.fit_transform(imputer.fit_transform(X_train)), columns=common_cols)
    X_test = pd.DataFrame(scaler.transform(imputer.transform(X_test)), columns=common_cols)
    
    print(f"X_train shape after scaling: {X_train.shape}")
    print(f"X_test shape after scaling: {X_test.shape}")

    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=test_size, random_state=random_state
    )

    return X_train_split, X_val_split, y_train_split, y_val_split, X_test, test_ids

def build_mlp_model(input_shape):
    model = Sequential([
        Dense(512, activation='relu', input_shape=(input_shape,)),
        BatchNormalization(),
        Dropout(0.4),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(1)
    ])
    return model

# Compile and train MLP model
def compile_and_train_mlp(model, X_train, y_train, X_val, y_val, epochs=50, batch_size=32):
    model.compile(optimizer='adam', loss='mse')
    early_stopping = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, min_lr=1e-6)
    
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )
    return model

# Train a RandomForest model
def train_random_forest(X_train, y_train, X_val, y_val):
    rf_model = RandomForestRegressor(
        n_estimators=500, 
        max_depth=15, 
        min_samples_split=5, 
        min_samples_leaf=2, 
        random_state=42, 
        n_jobs=-1
    )

    # Fit the model
    rf_model.fit(X_train, y_train)

    # Predict on validation set
    y_pred_val = rf_model.predict(X_val)

    # Print and return RMSE
    val_rmse = print_rmse(y_val, y_pred_val, model_name="RandomForest")
    
    return rf_model, val_rmse

# Save predictions for submission
def save_submission(y_pred, test_ids, filename='Latest_submission.csv'):
    submission = pd.DataFrame({
        "ID": test_ids,
        "TARGET": y_pred
    })
    submission.to_csv(filename, index=False)

# Load, process, and train
X_train_split, X_val_split, y_train_split, y_val_split, X_test_scaled, test_ids = load_and_process_data(
    'data/X_train.csv', 'data/y_train.csv', 'data/X_test.csv'
)

# Train Random Forest model
rf_model, rf_rmse = train_random_forest(X_train_split, y_train_split, X_val_split, y_val_split)

# Use Random Forest predictions as a new feature for MLP
rf_train_predictions = rf_model.predict(X_train_split).reshape(-1, 1)
rf_val_predictions = rf_model.predict(X_val_split).reshape(-1, 1)
rf_test_predictions = rf_model.predict(X_test_scaled).reshape(-1, 1)

# Append RF predictions to training and validation sets
X_train_split_with_rf = np.hstack((X_train_split, rf_train_predictions))
X_val_split_with_rf = np.hstack((X_val_split, rf_val_predictions))
X_test_with_rf = np.hstack((X_test_scaled, rf_test_predictions))

# Train MLP model using RandomForest predictions as an additional feature
mlp_model = build_mlp_model(X_train_split_with_rf.shape[1])
mlp_model = compile_and_train_mlp(mlp_model, X_train_split_with_rf, y_train_split, X_val_split_with_rf, y_val_split)

# Predict using the trained MLP model on the validation set
y_pred_val_mlp = mlp_model.predict(X_val_split_with_rf).flatten()
mlp_rmse = print_rmse(y_val_split, y_pred_val_mlp, model_name="MLP with RF Feature")

# Predict on the test set using the trained MLP model
y_pred_test_mlp = mlp_model.predict(X_test_with_rf).flatten()

# Save predictions (final submission)
save_submission(y_pred_test_mlp, test_ids, filename='Latest_submission.csv')

X_train columns before processing: (255080, 22)
X_test columns before processing: (63771, 22)


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  x = um.multiply(x, x, out=x)


X_train shape after scaling: (255080, 22)
X_test shape after scaling: (63771, 22)
RandomForest RMSE: 53.67793232245581
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
MLP with RF Feature RMSE: 59.368402394551794
