# Machine Learning Models for House Price Prediction
This notebook implements and compares different machine learning models for predicting house prices using our preprocessed data.

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

# Data Splitting and Model Setup
Prepare our preprocessed data for modeling by:
1. Loading the scaled and encoded data
2. Splitting into features (X) and target (y)
3. Creating training and validation sets
4. Setting up cross-validation

In [None]:
# Read the preprocessed training and test tables from disk
df_train = pd.read_csv('processed_train.csv')
df_test = pd.read_csv('processed_test.csv')

# The models are fit on the log-transformed price; both price columns are
# removed from the feature matrix so the target cannot leak into it
y = df_train['SalePrice_Log']
X = df_train.drop(columns=['SalePrice', 'SalePrice_Log'])

# Hold out 20% of the rows for validation (seeded so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Seeded, shuffled 5-fold splitter reused by the cross-validated models
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Linear Regression Model
Implement a basic linear regression model as our baseline:
1. Train the model
2. Perform cross-validation
3. Make predictions
4. Calculate RMSE and R² scores

In [None]:
# Baseline: ordinary least squares fitted on the training split
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# The scorer yields negated MSE per fold; negate the fold average and take
# the square root to express the cross-validated error as an RMSE
lr_cv_scores = cross_val_score(
    lr_model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=cv,
)
lr_rmse_cv = np.sqrt(-lr_cv_scores.mean())

# Score the fitted model on the held-out validation rows
lr_pred = lr_model.predict(X_val)
lr_r2 = r2_score(y_val, lr_pred)
lr_rmse = np.sqrt(mean_squared_error(y_val, lr_pred))

# One print call, one line per metric
print(
    'Linear Regression Results:',
    f'Cross-validation RMSE: {lr_rmse_cv:.4f}',
    f'Validation RMSE: {lr_rmse:.4f}',
    f'Validation R²: {lr_r2:.4f}',
    sep='\n',
)

# Random Forest Model
Implement a Random Forest regressor with:
1. Fixed, hand-picked hyperparameters (no automated search is run here)
2. Cross-validation
3. Feature importance analysis

In [None]:
# Random Forest with fixed hyperparameters, fitted on the training split
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# The scorer yields negated MSE per fold; negate the fold average and take
# the square root to express the cross-validated error as an RMSE
rf_cv_scores = cross_val_score(
    rf_model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=cv,
)
rf_rmse_cv = np.sqrt(-rf_cv_scores.mean())

# Score the fitted forest on the held-out validation rows
rf_pred = rf_model.predict(X_val)
rf_r2 = r2_score(y_val, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_val, rf_pred))

print(
    'Random Forest Results:',
    f'Cross-validation RMSE: {rf_rmse_cv:.4f}',
    f'Validation RMSE: {rf_rmse:.4f}',
    f'Validation R²: {rf_r2:.4f}',
    sep='\n',
)

# Pair every training column with the forest's importance score for it and
# rank the pairs from most to least important
feature_importance = pd.DataFrame(
    {
        'feature': X_train.columns,
        'importance': rf_model.feature_importances_,
    }
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

print('\nTop 10 Most Important Features:')
print(feature_importance.head(10))

# XGBoost Model
Implement XGBoost regressor with:
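1. Fixed, hand-picked hyperparameters (no automated search is run here)
2. Early stopping monitored on the validation set
3. A small constant learning rate (0.01) with up to 1000 boosting rounds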

In [None]:
# Initialize XGBoost.
# eval_metric and early_stopping_rounds are configured on the estimator
# instead of being passed to fit(): the fit() keywords were deprecated in
# XGBoost 1.6 and are rejected with a TypeError by XGBoost 2.x.
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,        # upper bound; early stopping may end training sooner
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='rmse',
    early_stopping_rounds=50,  # stop after 50 rounds without RMSE improvement
)

# Train with early stopping, monitoring RMSE on the validation split
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Predictions and metrics.
# NOTE: the same validation split drives early stopping above, so the metrics
# below are optimistic compared with a split the model has never seen.
xgb_pred = xgb_model.predict(X_val)
xgb_rmse = np.sqrt(mean_squared_error(y_val, xgb_pred))
xgb_r2 = r2_score(y_val, xgb_pred)

print('XGBoost Results:')
print(f'Validation RMSE: {xgb_rmse:.4f}')
print(f'Validation R²: {xgb_r2:.4f}')

# Model Comparison and Evaluation
Compare the performance of all models and visualize:
1. RMSE and R² scores
2. Predictions vs actual values
3. Residual analysis

In [None]:
# Tabulate the validation metrics of the three fitted models, one row each
model_comparison = pd.DataFrame(
    [
        ('Linear Regression', lr_rmse, lr_r2),
        ('Random Forest', rf_rmse, rf_r2),
        ('XGBoost', xgb_rmse, xgb_r2),
    ],
    columns=['Model', 'RMSE', 'R²'],
)

print("Model Performance Comparison:")
display(model_comparison)

# One predicted-vs-actual panel per model, side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

models = {
    'Linear Regression': lr_pred,
    'Random Forest': rf_pred,
    'XGBoost': xgb_pred,
}

# End points of the dashed "perfect prediction" diagonal drawn in every panel
diagonal = [y_val.min(), y_val.max()]

for ax, (name, pred) in zip(axes, models.items()):
    ax.scatter(y_val, pred, alpha=0.5)
    ax.plot(diagonal, diagonal, 'r--')
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.set_title(f'{name}\nR² = {r2_score(y_val, pred):.4f}')

fig.tight_layout()
plt.show()

# Final Prediction Generation
Generate predictions on the test set using the best performing model and prepare the submission file.

In [None]:
# Use best model (XGBoost) for final predictions.
# Select the training feature columns by name so the test matrix has exactly
# the columns the model was fitted on, in the same order; fail loudly if the
# test file lacks any of them instead of predicting on mismatched input.
feature_columns = list(X_train.columns)
missing_columns = [col for col in feature_columns if col not in df_test.columns]
if missing_columns:
    raise ValueError(
        f"Test data is missing features used in training: {missing_columns}"
    )
final_predictions = xgb_model.predict(df_test[feature_columns])

# Transform predictions back to original scale.
# expm1 computes exp(x) - 1 without the precision loss of the two-step form.
# NOTE(review): assumes SalePrice_Log was created with log1p — confirm against
# the preprocessing notebook.
final_predictions = np.expm1(final_predictions)

# Prefer the real house identifiers when the test file carries an 'Id' column;
# the positional index (0..n-1) is only a fallback and will not match ids
# assigned upstream.
submission_ids = df_test['Id'] if 'Id' in df_test.columns else df_test.index

# Create submission dataframe
submission = pd.DataFrame({
    'Id': submission_ids,
    'SalePrice': final_predictions
})

# Save predictions
submission.to_csv('house_price_predictions.csv', index=False)
print("Predictions saved to 'house_price_predictions.csv'")