**Redfin Housing Market Predictor**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder


In [None]:
print("Loading housing data...")
# Look for the CSVs in the working directory first, then fall back to the
# ../input/ layout. Both files are always read from the same location.
try:
    df = pd.read_csv('train.csv', index_col='Id')
    df_test = pd.read_csv('test.csv', index_col='Id')
except FileNotFoundError:
    # Only a missing file triggers the fallback; parse errors, a missing 'Id'
    # column or a keyboard interrupt now surface instead of being masked.
    df = pd.read_csv('../input/train.csv', index_col='Id')
    df_test = pd.read_csv('../input/test.csv', index_col='Id')

print(f"Training data: {len(df):,} properties")
print(f"Test data: {len(df_test):,} properties")
# Report the real number of rows loaded rather than a hard-coded figure.
print(f"Processing {len(df) + len(df_test):,} property records total")

In [None]:
# Regression target: the sale price column of the training frame.
y = df['SalePrice']
average_price = y.mean()
print(f"Average price: ${average_price:,.0f}")

In [None]:
print("\nExploratory Data Analysis...")
# 2x2 grid of panels; `axes` is indexed as axes[row, column].
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
fig.suptitle('Housing Data Analysis')

In [None]:
# Top-left panel: histogram of the target prices.
price_ax = axes[0, 0]
price_ax.hist(y, bins=30, alpha=0.7)
price_ax.set_title('Price Distribution')
price_ax.set_xlabel('Price ($)')

In [None]:
# Top-right panel: price plotted against construction year.
year_ax = axes[0, 1]
year_ax.scatter(df['YearBuilt'], y, alpha=0.5)
year_ax.set_title('Price vs Year Built')
year_ax.set_xlabel('Year Built')

In [None]:
# Bottom-left panel: price plotted against the sum of both floor areas.
total_sf = df['1stFlrSF'].add(df['2ndFlrSF'])
size_ax = axes[1, 0]
size_ax.scatter(total_sf, y, alpha=0.5)
size_ax.set_title('Price vs Total SF')
size_ax.set_xlabel('Square Feet')

In [None]:
# Bottom-right panel: pairwise correlations between a handful of numeric
# columns and the sale price, drawn as an annotated heatmap.
key_cols = [
    'LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF',
    'FullBath', 'BedroomAbvGr', 'SalePrice',
]
corr = df.loc[:, key_cols].corr()
heatmap_ax = axes[1, 1]
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Feature Correlations')

In [None]:
# Lay out and render the 2x2 EDA figure.
# `fig` is addressed explicitly: with Jupyter's inline backend a figure is
# rendered and closed as soon as the cell that created it finishes, so the
# pyplot-level `plt.tight_layout()` would act on a brand-new empty figure and
# `plt.show()` alone would display nothing useful from this cell.
fig.tight_layout()
plt.show()  # still shows the figure when run as a script / with a GUI backend
fig  # last expression: rich display of the finished figure in the notebook

In [None]:
print("\nData cleaning and feature engineering...")
# Columns requested as model inputs. The order given here is the column order
# of the feature frames built from this list.
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
    'OverallQual',
    'OverallCond',
    'Neighborhood',
    'BldgType',
    'HouseStyle',
]

In [None]:
# Drop any requested column that the training frame does not contain.
available_columns = set(df.columns)
features = [name for name in features if name in available_columns]
print(f"Using {len(features)} features")

# Work on copies so later edits never write back into df / df_test.
X = df[features].copy()
X_test = df_test[features].copy()

In [None]:
# Fill missing values column by column. The fill value is always derived from
# the training frame and then applied to both frames, so the test data never
# influences the imputation.
for col in X.columns:
    if X[col].dtype == 'object':
        # Categorical: most frequent training value, or 'Unknown' when the
        # column has no non-missing values at all (mode() is then empty).
        modes = X[col].mode()
        fill_value = modes[0] if len(modes) > 0 else 'Unknown'
    else:
        # Numerical: training median
        fill_value = X[col].median()

    # Assign the filled column back instead of `X[col].fillna(..., inplace=True)`:
    # the in-place form mutates a temporary column selection, which raises a
    # FutureWarning on pandas 2.x and leaves the frame unchanged under
    # Copy-on-Write.
    X[col] = X[col].fillna(fill_value)
    X_test[col] = X_test[col].fillna(fill_value)

In [None]:
print("Creating new features...")
# TotalSF = first-floor area + second-floor area, added to both frames and
# only when both source columns were selected.
if {'1stFlrSF', '2ndFlrSF'}.issubset(X.columns):
    for frame in (X, X_test):
        frame['TotalSF'] = frame['1stFlrSF'] + frame['2ndFlrSF']

In [None]:
# HouseAge = years between construction and a fixed reference year.
if 'YearBuilt' in X.columns:
    reference_year = 2024
    for frame in (X, X_test):
        frame['HouseAge'] = reference_year - frame['YearBuilt']

In [None]:
# QualityScore = OverallQual x OverallCond, added to both frames and only
# when both rating columns were selected.
if {'OverallQual', 'OverallCond'}.issubset(X.columns):
    for frame in (X, X_test):
        frame['QualityScore'] = frame['OverallQual'] * frame['OverallCond']

In [None]:
# Replace every object-dtype column with integer codes.
le = LabelEncoder()
text_columns = X.select_dtypes(include=['object']).columns
for col in text_columns:
    # Fit on the train and test values together so a category gets the same
    # code in both frames and no test-only category is unseen at transform time.
    all_values = pd.concat([X[col], X_test[col]])
    le.fit(all_values)
    X[col], X_test[col] = le.transform(X[col]), le.transform(X_test[col])

n_final_features = X.shape[1]
print(f"Final features: {n_final_features}")

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)
print(f"Training: {len(X_train)}, Validation: {len(X_val)}")

In [None]:
print("\nTraining models...")

# Candidate models: one linear baseline plus random forests that differ only
# in the number of trees. Insertion order is the order they are trained in.
models = {'Linear Regression': LinearRegression()}
for n_trees in (50, 100, 200):
    models[f'Random Forest {n_trees}'] = RandomForestRegressor(
        n_estimators=n_trees, random_state=42
    )

In [None]:
# Fit every candidate on the training split and score it on the validation
# split. `results` maps model name -> fitted model plus its validation metrics.
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)

    # Only validation predictions are needed: every metric below is computed
    # on the hold-out split, so no prediction pass over the training rows is made.
    pred_val = model.predict(X_val)

    # Metrics
    mae = mean_absolute_error(y_val, pred_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred_val))
    r2 = r2_score(y_val, pred_val)
    # "Accuracy" = percentage of validation rows predicted within 10% of the
    # true price (relative absolute error <= 0.1).
    accuracy = np.mean(np.abs(pred_val - y_val) / y_val <= 0.1) * 100

    results[name] = {
        'model': model,
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2,
        'Accuracy': accuracy
    }

    print(f"  MAE: ${mae:,.0f}, RMSE: ${rmse:,.0f}, R²: {r2:.3f}, Accuracy: {accuracy:.1f}%")

In [None]:
print("\nModel Comparison:")

# The winner is the model with the lowest validation MAE (first one on ties).
best_name = min(results, key=lambda candidate: results[candidate]['MAE'])
best_mae = results[best_name]['MAE']
print(f"Best Model: {best_name} (MAE: ${best_mae:,.0f})")

# Two side-by-side panels for the comparison charts.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

In [None]:
# Left panel: one bar per model showing its validation MAE.
names = list(results)
maes = [metrics['MAE'] for metrics in results.values()]
ax1.bar(names, maes, color='coral', alpha=0.8)
ax1.set_title('Model Comparison - MAE')
ax1.set_ylabel('Mean Absolute Error ($)')
ax1.tick_params(axis='x', rotation=45)

In [None]:
# Right panel: one bar per model showing its validation R², in the same model
# order as `names`.
r2s = [results[model_name]['R2'] for model_name in names]
ax2.bar(names, r2s, color='lightblue', alpha=0.8)
ax2.set_title('Model Comparison - R² Score')
ax2.set_ylabel('R² Score')
ax2.tick_params(axis='x', rotation=45)

In [None]:
# Lay out and render the model-comparison figure.
# `fig` is addressed explicitly: with Jupyter's inline backend a figure is
# rendered and closed as soon as the cell that created it finishes, so the
# pyplot-level `plt.tight_layout()` would act on a brand-new empty figure and
# `plt.show()` alone would display nothing useful from this cell.
fig.tight_layout()
plt.show()  # still shows the figure when run as a script / with a GUI backend
fig  # last expression: rich display of the finished figure in the notebook

In [None]:
# Feature importances are reported only when a random forest won the
# comparison (the check is on the model's display name).
if 'Random Forest' in best_name:
    print(f"\nTop 10 Important Features ({best_name}):")
    best_model = results[best_name]['model']

    # One row per input column, highest importance first
    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    top10 = importance.head(10)

    for feature_name, score in zip(top10['feature'], top10['importance']):
        print(f"  {feature_name}: {score:.3f}")

    # Bar chart of the same ten features
    plt.figure(figsize=(10, 6))
    plt.bar(top10['feature'], top10['importance'])
    plt.title('Top 10 Feature Importance')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
print("\nGenerating final predictions...")

# Take the winning model and refit it on every labelled row (training plus
# validation) before predicting the test set.
best_model = results[best_name]['model']
best_model.fit(X, y)
predictions = best_model.predict(X_test)

# Write one row per test property: its Id (the frame index) and predicted price.
submission_columns = {
    'Id': X_test.index,
    'SalePrice': predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('housing_predictions.csv', index=False)

In [None]:
# Short summary of the predictions that were just written to disk.
n_predictions = len(predictions)
lowest, highest = predictions.min(), predictions.max()
mean_prediction = predictions.mean()

print("Predictions saved to housing_predictions.csv")
print(f"Predicted {n_predictions:,} house prices")
print(f"Price range: ${lowest:,.0f} - ${highest:,.0f}")
print(f"Average prediction: ${mean_prediction:,.0f}")

In [None]:
# Closing report. The MAE shown is the validation MAE stored in `results`
# for the winning model.
final_mae = results[best_name]['MAE']
separator = "=" * 50

print("Generated final predictions")
print(f"Best model: {best_name}")
print(f"Final MAE: ${final_mae:,.0f}")
print(separator)