# House Price Prediction
Target column used: **SalePrice**

In [None]:
# Sanity check: read the training CSV and evaluate its shape
# (in a notebook cell the trailing expression is what gets displayed).
import pandas as pd
pd.read_csv("train.csv").shape

## Preprocessing and Model Training

In [None]:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, f1_score
import pickle
# Load the training data and separate the target column from the features.
train = pd.read_csv("train.csv")
target = "SalePrice"
# "Id" is dropped when present so it is not fed to the models as a feature.
X = train.drop(columns=[target] + (["Id"] if "Id" in train.columns else []))
y = train[target]

# Columns are routed to a transformer by dtype: numeric vs. everything else.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with a literal "Missing" category, then
# one-hot encode. Categories unseen during fit are ignored at predict time.
# The `sparse=False` keyword is intentionally not passed: it was deprecated in
# scikit-learn 1.2 and removed in 1.4 (raising TypeError). The encoder may
# therefore emit a sparse matrix, but the ColumnTransformer below is built
# with sparse_threshold=0, which always returns a dense array.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,  # force dense output regardless of encoder output
)

# Hold out 20% of the rows for validation; fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate regressors, keyed by the name used in `results` / `pipelines`.
models = {
    "LinearRegression": LinearRegression(),
    "RidgeRegression": Ridge(alpha=1.0, random_state=42),
    "RandomForest": RandomForestRegressor(
        n_estimators=100, random_state=42, n_jobs=-1
    ),
}
# Per-model validation metrics and the fitted pipelines, keyed by model name.
results = {}
pipelines = {}

# Cut points for turning prices into three buckets, taken from the 33% and
# 66% quantiles of the training target. They do not depend on the model, so
# they are computed once instead of on every loop iteration.
bins = np.quantile(y_train, [0, 0.33, 0.66, 1.0])


def to_cat(arr):
    """Map each price in *arr* to bucket 0, 1 or 2 using the interior cut points."""
    return np.digitize(arr, bins[1:-1])


# The bucketed ground truth is likewise identical for every model.
y_val_cat = to_cat(y_val.values)

# NOTE: every pipeline wraps the same `preprocessor` object, so each fit
# re-fits it in place. This is only safe because all fits use the same X_train.
for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)

    # Regression metrics on the held-out split.
    r2 = r2_score(y_val, preds)
    # RMSE is taken as sqrt(MSE) rather than via `squared=False`, which was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_val, preds))

    # Classification-style metrics on the bucketed prices.
    preds_cat = to_cat(preds)
    acc = accuracy_score(y_val_cat, preds_cat)
    f1 = f1_score(y_val_cat, preds_cat, average="macro")

    results[name] = dict(r2=float(r2), rmse=float(rmse), accuracy=float(acc), f1_macro=float(f1))
    pipelines[name] = pipe
results


## Save best model to model.pkl

In [None]:

# The winning model is the one with the smallest validation RMSE
# (on ties, the first one in `results` insertion order wins).
best_name = min(results, key=lambda candidate: results[candidate]["rmse"])
best_pipeline = pipelines[best_name]

# Bundle the fitted pipeline with the column metadata recorded at training time.
artifact = {
    "model_name": best_name,
    "pipeline": best_pipeline,
    "feature_columns": X.columns.tolist(),
    "numeric_features": numeric_features,
    "categorical_features": categorical_features,
    "target_col": target,
}

# NOTE: pickle files must only ever be loaded from trusted sources.
with open("model.pkl", "wb") as f:
    pickle.dump(artifact, f)

print("Saved", best_name)
