# Housing Price Prediction - Model Training & YAML Export

This notebook trains a Random Forest model for housing price prediction and exports the configuration to YAML format for deployment with Streamlit.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import yaml
from datetime import datetime
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn's white-grid theme and a
# 12x6 inch default figure size
sns.set_style(style='whitegrid')
plt.rc('figure', figsize=(12, 6))

## 2. Load and Explore Data

In [None]:
# Read the training CSV from the working directory
print("Loading data...")
training_csv = 'train_100k.csv'
train_data = pd.read_csv(training_csv)

# Report the frame's dimensions and column names, then preview the first
# rows (the trailing expression is what the notebook renders)
print(f"\nTraining data shape: {train_data.shape}")
print(f"\nColumns: {list(train_data.columns)}")
train_data.head()

In [None]:
# Data info: print pandas' concise summary of the frame (column dtypes,
# non-null counts and memory usage)
train_data.info()

In [None]:
# Statistical summary: descriptive statistics from DataFrame.describe();
# as the cell's last expression the result is rendered by the notebook
train_data.describe()

## 3. Data Preprocessing

In [None]:
# Identify the target column.
# Names are matched case-insensitively. Columns whose name contains 'price'
# are ranked ahead of those containing only 'sale', and numeric columns are
# ranked ahead of non-numeric ones, so a descriptive column whose name merely
# contains 'sale' is not chosen as the regression target just because it
# appears earlier in the file.
lowered_names = {col: str(col).lower() for col in train_data.columns}
price_named = [col for col in train_data.columns if 'price' in lowered_names[col]]
sale_named = [
    col for col in train_data.columns
    if 'sale' in lowered_names[col] and col not in price_named
]
name_matches = price_named + sale_named

numeric_names = set(train_data.select_dtypes(include='number').columns)
numeric_matches = [col for col in name_matches if col in numeric_names]

# Prefer numeric matches; fall back to any name match if none is numeric
target_cols = numeric_matches or name_matches
if target_cols:
    target_column = target_cols[0]
else:
    # No column name hints at the target: assume it is the last column
    target_column = train_data.columns[-1]

print(f"Target column: {target_column}")

# Separate features and target
X = train_data.drop(columns=[target_column])
y = train_data[target_column]

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Identify categorical and numerical columns.
# 'number' covers every numeric width (int32, float32, unsigned, ...) rather
# than only int64/float64, and 'category' columns are treated as categorical
# alongside object columns so they are one-hot encoded instead of reaching
# the scaler as raw labels.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include='number').columns

print(f"Categorical columns ({len(categorical_cols)}): {list(categorical_cols)}")
print(f"\nNumerical columns ({len(numerical_cols)}): {list(numerical_cols)}")

In [None]:
# Count the nulls per feature column and report only the columns that
# actually contain gaps
missing_values = X.isna().sum()
columns_with_gaps = missing_values[missing_values > 0]
if columns_with_gaps.empty:
    print("No missing values found!")
else:
    print("Missing values:")
    print(columns_with_gaps)

In [None]:
# Expand each categorical column into indicator columns; drop_first omits
# the first level of every column
print("Encoding categorical variables...")
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Replace remaining nulls with each column's median.
# NOTE(review): the medians are taken over the full data set, before the
# train/test split, so held-out rows contribute to the imputation values.
column_medians = X_encoded.median()
X_encoded = X_encoded.fillna(column_medians)

print(f"\nEncoded features shape: {X_encoded.shape}")
print(f"Total features after encoding: {len(X_encoded.columns)}")

## 4. Train-Test Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition reproducible between runs
split_parts = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

## 5. Feature Scaling

In [None]:
# Learn the scaling statistics from the training split only, then apply
# the same transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled using StandardScaler")
print(f"Mean of scaled training data: {X_train_scaled.mean():.6f}")
print(f"Std of scaled training data: {X_train_scaled.std():.6f}")

## 6. Model Training

In [None]:
# Fit the Random Forest regressor on the scaled training split
print("Training Random Forest model...\n")

# Forest configuration, gathered in one place before construction
forest_settings = dict(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,   # fixed seed for reproducible trees
    n_jobs=-1,         # use every available core
    verbose=1,
)
model = RandomForestRegressor(**forest_settings)

model.fit(X_train_scaled, y_train)
print("\n‚úì Model training completed!")

## 7. Model Evaluation

In [None]:
# Predict on the held-out split
y_pred = model.predict(X_test_scaled)

# Error metrics on the held-out split
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Assemble the report and emit it in one go.
# NOTE(review): the "Accuracy" line is simply R² expressed as a percentage,
# not a classification accuracy.
divider = "="*60
report_lines = [
    divider,
    "MODEL PERFORMANCE METRICS",
    divider,
    f"Root Mean Squared Error (RMSE): ${rmse:,.2f}",
    f"Mean Absolute Error (MAE):      ${mae:,.2f}",
    f"R¬≤ Score:                       {r2:.4f}",
    f"Accuracy:                       {r2*100:.2f}%",
    divider,
]
print("\n".join(report_lines))

In [None]:
# Side-by-side diagnostics: predicted-vs-actual scatter and residual plot
fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: points on the dashed red diagonal are perfect predictions
price_range = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, y_pred, alpha=0.5)
ax_fit.plot(price_range, price_range, 'r--', lw=2)
ax_fit.set_xlabel('Actual Price ($)')
ax_fit.set_ylabel('Predicted Price ($)')
ax_fit.set_title('Actual vs Predicted Prices')
ax_fit.grid(True, alpha=0.3)

# Right panel: residuals (actual minus predicted) against the prediction,
# with a zero reference line
residuals = y_test - y_pred
ax_resid.scatter(y_pred, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--', lw=2)
ax_resid.set_xlabel('Predicted Price ($)')
ax_resid.set_ylabel('Residuals ($)')
ax_resid.set_title('Residual Plot')
ax_resid.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 8. Feature Importance Analysis

In [None]:
# Pair every encoded feature name with the forest's importance score and
# rank the pairs from most to least important
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# Print the 15 highest-ranked features with their importance as a percentage
rule = "="*60
top_15 = feature_importance.head(15)
print("\nTop 15 Most Important Features:")
print(rule)
for feature_name, weight in zip(top_15['feature'], top_15['importance']):
    print(f"{feature_name:30s} {weight*100:6.2f}%")
print(rule)

In [None]:
# Horizontal bar chart of the 20 highest-ranked features
fig, ax = plt.subplots(figsize=(12, 8))
top_features = feature_importance.head(20)
bar_positions = range(len(top_features))

ax.barh(bar_positions, top_features['importance'], color='steelblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 20 Most Important Features')
ax.invert_yaxis()  # put the most important feature at the top
ax.grid(True, alpha=0.3, axis='x')

fig.tight_layout()
plt.show()

## 9. Save Model and Export to YAML

In [None]:
# Persist the fitted estimator and scaler with joblib
print("Saving model artifacts...")
for artifact, target_path in ((model, 'housing_price_model.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, target_path)

# Record the encoded feature names, in training column order, as a JSON list
feature_list = list(X_train.columns)
with open('feature_names.json', 'w') as f:
    json.dump(feature_list, f)

print("‚úì Model saved to: housing_price_model.pkl")
print("‚úì Scaler saved to: scaler.pkl")
print("‚úì Feature names saved to: feature_names.json")

In [None]:
# Create YAML configuration describing the trained model, the data it was
# fitted on, its hold-out metrics and the artifact files written earlier.

# Read the hyperparameters back from the estimator itself so the exported
# values cannot drift from what the model was actually built with.
fitted_params = model.get_params()
exported_hyperparameters = (
    'n_estimators',
    'max_depth',
    'min_samples_split',
    'min_samples_leaf',
    'random_state',
)

# The 20 highest-ranked features (feature_importance is already sorted)
top_ranked = feature_importance.head(20)

model_config = {
    'model_info': {
        'name': 'Housing Price Prediction Model',
        'type': 'RandomForestRegressor',
        'version': '1.0',
        'created_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'description': 'Random Forest model for predicting housing prices'
    },
    'hyperparameters': {
        name: fitted_params[name] for name in exported_hyperparameters
    },
    'data_info': {
        'training_samples': len(X_train),
        'test_samples': len(X_test),
        'total_features': len(X_train.columns),
        'categorical_features': len(categorical_cols),
        'numerical_features': len(numerical_cols),
        'target_column': target_column
    },
    # Metrics are cast to plain floats so YAML serialises them as scalars.
    # 'accuracy_percentage' is R² expressed as a percentage.
    'performance_metrics': {
        'rmse': float(rmse),
        'mae': float(mae),
        'r2_score': float(r2),
        'accuracy_percentage': float(r2 * 100)
    },
    'feature_importance': {
        feature_name: float(weight)
        for feature_name, weight in zip(top_ranked['feature'], top_ranked['importance'])
    },
    'preprocessing': {
        'scaling': 'StandardScaler',
        'categorical_encoding': 'One-Hot Encoding',
        'missing_value_strategy': 'Median Imputation'
    },
    'files': {
        'model_file': 'housing_price_model.pkl',
        'scaler_file': 'scaler.pkl',
        'feature_names_file': 'feature_names.json'
    }
}

# Save YAML configuration; sort_keys=False keeps the section order above
print("\nExporting model configuration to YAML...")
with open('model_config.yaml', 'w') as f:
    yaml.dump(model_config, f, default_flow_style=False, sort_keys=False)

print("‚úì Configuration saved to: model_config.yaml")

In [None]:
# Render the configuration to a YAML string and echo it between two rules
banner = "="*60
yaml_preview = yaml.dump(model_config, default_flow_style=False, sort_keys=False)

print("\n" + banner)
print("YAML CONFIGURATION PREVIEW")
print(banner)
print(yaml_preview)
print(banner)

## 10. Summary

In [None]:
# Closing summary: headline metrics, the files written by this notebook and
# how to launch the Streamlit app. The lines are collected first and printed
# one per line.
rule = "="*60
summary_lines = [
    "\n" + rule,
    "‚úì MODEL TRAINING AND EXPORT COMPLETED!",
    rule,
    "\nüìä Model Performance:",
    f"   - Accuracy: {r2*100:.2f}%",
    f"   - RMSE: ${rmse:,.2f}",
    f"   - MAE: ${mae:,.2f}",
    "\nüìÅ Generated Files:",
    "   - housing_price_model.pkl (trained model)",
    "   - scaler.pkl (feature scaler)",
    "   - feature_names.json (feature list)",
    "   - model_config.yaml (configuration)",
    "\nüöÄ Next Steps:",
    "   Run the Streamlit app: streamlit run app.py",
    "   Open browser at: http://localhost:8501",
    rule,
]
for line in summary_lines:
    print(line)

## 11. Test Predictions (Optional)

In [None]:
# Load the external test file (if present) and score it with the trained model.
# Only the CSV read is guarded: a FileNotFoundError raised by any later step
# should surface as an error instead of being reported as a missing test file.
try:
    test_data = pd.read_csv('test_100k.csv')
except FileNotFoundError:
    print("Test file 'test_100k.csv' not found. Skipping test predictions.")
else:
    print(f"Loaded test data: {test_data.shape}")

    # One-hot encode, then conform to the training feature layout in one
    # step: reindex drops columns the model never saw, creates absent
    # indicator columns filled with 0, and restores the training order.
    X_test_encoded = pd.get_dummies(test_data)
    X_test_encoded = X_test_encoded.reindex(columns=X_train.columns, fill_value=0)

    # Impute with medians from the training split rather than from the test
    # file, so new data is preprocessed with training-derived statistics
    # (this also fills columns that are entirely missing in the test file).
    X_test_encoded = X_test_encoded.fillna(X_train.median())

    # Scale and predict. A dedicated name is used so the hold-out matrix
    # `X_test_scaled` from the evaluation section is not overwritten.
    X_external_scaled = scaler.transform(X_test_encoded)
    test_predictions = model.predict(X_external_scaled)

    # Create results dataframe; Id is the zero-based row position
    results = pd.DataFrame({
        'Id': range(len(test_predictions)),
        'Predicted_Price': test_predictions
    })

    print(f"\n‚úì Generated {len(test_predictions):,} predictions")
    print(f"\nPrediction Statistics:")
    print(f"   Average: ${test_predictions.mean():,.2f}")
    print(f"   Median:  ${np.median(test_predictions):,.2f}")
    print(f"   Min:     ${test_predictions.min():,.2f}")
    print(f"   Max:     ${test_predictions.max():,.2f}")

    # Save predictions
    results.to_csv('housing_price_predictions.csv', index=False)
    print(f"\n‚úì Predictions saved to: housing_price_predictions.csv")

    # Display first few predictions (display is provided by the notebook)
    print("\nFirst 10 predictions:")
    display(results.head(10))