# Model Training

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Load the raw training data. It is kept untouched in `raw_df` so that
# re-running this cell always cleans from the original values.
raw_df = pd.read_csv(r"train.csv")

# Simple cleaning: fill NA categorical with "None", numeric with 0.
# Columns are classified with is_numeric_dtype rather than by comparing the
# dtype to "object": text columns are not guaranteed to be object-backed
# (pandas' dedicated string dtype is not `object`), and a failed comparison
# would send text columns down the numeric branch.
fill_values = {
    name: 0 if pd.api.types.is_numeric_dtype(raw_df[name]) else "None"
    for name in raw_df.columns
}

# One fillna call with a per-column mapping returns a new, cleaned frame.
df = raw_df.fillna(value=fill_values)

# Separate the prediction target from the features; the "Id" column is
# dropped from the features as well.
target_name = "SalePrice"
y = df[target_name]
X = df.drop(columns=[target_name, "Id"])

# Hold out 20% of the rows for evaluation, using a fixed seed for the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Preprocess: one hot encode categorical columns, pass numeric ones through.
# The numeric/categorical partition is derived from is_numeric_dtype instead of
# selecting on the "object" dtype, so text columns are still treated as
# categorical when pandas stores them with its dedicated string dtype.
numeric_mask = X.dtypes.map(pd.api.types.is_numeric_dtype).to_numpy(dtype=bool)
cat_cols = X.columns[~numeric_mask]
num_cols = X.columns[numeric_mask]

preprocess = ColumnTransformer([
    # handle_unknown="ignore": categories unseen during fit do not raise at
    # transform time.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", "passthrough", num_cols)
])

# Regressor: a 300-tree random forest seeded with random_state=42
rf = RandomForestRegressor(n_estimators=300, random_state=42)

# Chain the column preprocessing and the regressor into a single estimator
pipeline_steps = [("prep", preprocess), ("model", rf)]
pipe = Pipeline(steps=pipeline_steps)

# Fit preprocessing and model together on the training split
pipe.fit(X_train, y_train)
# Predict on the held-out split and compute the evaluation metrics
pred = pipe.predict(X_test)
r2 = r2_score(y_test, pred)
mse = mean_squared_error(y_test, pred)

# Report each metric on its own line
for label, value in (("R² Score:", r2), ("MSE:", mse)):
    print(label, value)

import joblib

# Persist the fitted pipeline (preprocessing + model) to disk
model_file = r"house_price_prediction.pkl"
joblib.dump(pipe, model_file)
print("\nModel saved as house_price_prediction.pkl")


R² Score: 0.8902734388026132
MSE: 841638517.976838

Model saved as house_price_prediction.pkl


# Model Loading & Prediction using User Input

In [2]:
import joblib
import pandas as pd

# Load the fitted pipeline saved by the training cell.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
model = joblib.load(r"house_price_prediction.pkl")

# Load the training CSV to recover the feature columns and their kinds,
# dropping the columns that were not used as features.
train = pd.read_csv(r"train.csv").drop(columns=["SalePrice", "Id"])

# Default value for every feature, mirroring the training-time cleaning:
# "None" for categorical columns and 0 for numeric ones. Columns are classified
# with is_numeric_dtype rather than `dtype == object`, so text columns are
# still recognised when pandas stores them with its dedicated string dtype.
feature_row = {
    col: 0 if pd.api.types.is_numeric_dtype(train[col]) else "None"
    for col in train.columns
}

# ---- USER INPUTS ----
GrLivArea = float(input("Enter above-ground living area (sq ft): "))
OverallQual = int(input("Enter overall material quality (1-10): "))
GarageCars = int(input("Enter number of garage cars: "))
TotalBsmtSF = float(input("Enter total basement area: "))
FullBath = int(input("Enter number of full baths: "))
YearBuilt = int(input("Enter year built: "))
# Surrounding whitespace would make an otherwise valid category look unknown.
Neighborhood = input("Enter neighborhood: ").strip()
HouseStyle = input("Enter house style: ").strip()

# The encoder is configured in the training cell with handle_unknown="ignore",
# so a category that never appeared in training is accepted without error.
# Warn instead of letting a typo pass unnoticed.
for feature, value in (("Neighborhood", Neighborhood), ("HouseStyle", HouseStyle)):
    known_values = set(train[feature].dropna().unique())
    if value not in known_values:
        print(
            f"Warning: {feature} '{value}' was not seen in training and will "
            f"be ignored by the encoder. Known values: {sorted(known_values)}"
        )

# Insert the user-provided values over the defaults
feature_row.update(
    GrLivArea=GrLivArea,
    OverallQual=OverallQual,
    GarageCars=GarageCars,
    TotalBsmtSF=TotalBsmtSF,
    FullBath=FullBath,
    YearBuilt=YearBuilt,
    Neighborhood=Neighborhood,
    HouseStyle=HouseStyle,
)

# Build the single-row frame in one step. Each column's dtype is inferred from
# its value, which avoids writing floats or strings into pre-typed cells with
# .loc (an incompatible-dtype assignment).
template = pd.DataFrame([feature_row], columns=train.columns)

# ---- Predict ----
prediction = model.predict(template)[0]
print("\nPredicted House Price:", prediction)



Predicted House Price: 206646.64333333334
