In [1]:
# --- PART A: Model Building Script ---
import os

import joblib
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# 1. Robust Data Loading (Fixes FileNotFoundError)
# Probe the usual upload locations in priority order and keep the first one
# that exists on disk; `df` stays None when none of them do.
possible_paths = ['train.csv', '/content/train.csv', '/train.csv']
df = None

found_path = next(
    (candidate for candidate in possible_paths if os.path.exists(candidate)),
    None,
)

# Guard clause: without the CSV nothing below can run, so stop right here.
if found_path is None:
    print("❌ Error: 'train.csv' not found. Please drag and drop it into the files sidebar!")
    raise FileNotFoundError("Upload train.csv first")

df = pd.read_csv(found_path)
print(f"✅ Data loaded successfully from: {found_path}")

# 2. Feature Selection (6 features as per instructions)
features = ['OverallQual', 'GrLivArea', 'TotalBsmtSF',
            'GarageCars', 'FullBath', 'YearBuilt']
target = 'SalePrice'

# Fail early with a readable message if the CSV is not the expected dataset.
missing_columns = [col for col in features + [target] if col not in df.columns]
if missing_columns:
    raise KeyError(f"train.csv is missing required columns: {missing_columns}")

X = df[features]
y = df[target]

# 3. Data Preprocessing (Handle Missing Values)
# Missing values are filled with the median by the 'imputer' step of the
# pipeline built in step 5, NOT here on the full dataset. Imputing before the
# split would compute medians that include test rows (data leakage), and the
# saved model would not know how to fill gaps in new data.

# 4. Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Build Pipeline (Imputer + Scaler + Model)
# Every preprocessing step lives inside the pipeline, so each one is fitted on
# the training split only and is stored together with the model when the
# pipeline is pickled.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Step 1: Fill missing values with training medians
    ('scaler', StandardScaler()),  # Step 2: Scale data
    ('model', RandomForestRegressor(n_estimators=100, random_state=42)) # Step 3: Predict
])

# 6. Train
# Fits every step of the pipeline on the training split.
pipeline.fit(X_train, y_train)

# 7. Evaluate
# Score predictions on the held-out split with three complementary metrics.
y_pred = pipeline.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # back to the target's units (dollars)
mae = mean_absolute_error(y_test, y_pred)

print("\n--- Model Evaluation ---")
print(f"MAE: ${mae:,.2f}")
print(f"RMSE: ${rmse:,.2f}")
print(f"R² Score: {r2:.4f}")

# 8. Save Model
# Persist the whole fitted pipeline so preprocessing travels with the model.
model_path = 'house_price_model.pkl'
joblib.dump(pipeline, model_path)
print("\n✅ Model saved as 'house_price_model.pkl'")

# 9. Download
# Hand the saved artifact to the Colab download helper.
files.download(model_path)

✅ Data loaded successfully from: train.csv

--- Model Evaluation ---
MAE: $19,227.88
RMSE: $28,997.36
R² Score: 0.8904

✅ Model saved as 'house_price_model.pkl'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>