In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import joblib


In [6]:
# Read the training CSV into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
train_csv_path = "data/train.csv"
data = pd.read_csv(train_csv_path)

# Preview the leading five rows; being the cell's last expression, the
# result is rendered as the cell output.
data.head(5)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
# Predictor columns fed to the model.
features = [
    "OverallQual", "GrLivArea", "TotalBsmtSF",
    "GarageCars", "BedroomAbvGr", "FullBath",
]

# Column the model is trained to predict.
target_column = "SalePrice"

# Split the loaded frame into the predictor matrix X and the target series y.
X, y = data[features], data[target_column]

# Show the dimensions of both; the last expression is rendered as output.
X.shape, y.shape


((1460, 6), (1460,))

In [11]:
# Count missing values per selected feature BEFORE imputation.
# The counts are printed explicitly: a notebook only renders a cell's last
# expression, so a bare `X.isnull().sum()` placed here would be computed and
# then silently discarded.
missing_before = X.isnull().sum()
print("Missing values before imputation:")
print(missing_before)

# Replace every missing entry with the median of its column.
# NOTE(review): the medians are computed over all rows of X, i.e. before the
# train/test split performed later in the notebook, so test rows contribute
# to the fill values. Consider computing the medians on the training rows
# only if any of these columns ever contains missing data.
X = X.fillna(X.median())

# Count missing values again AFTER imputation; as the cell's last expression
# this result is displayed.
X.isnull().sum()




OverallQual     0
GrLivArea       0
TotalBsmtSF     0
GarageCars      0
BedroomAbvGr    0
FullBath        0
dtype: int64

In [None]:
# These imports repeat the ones in the notebook's first cell. They are kept,
# grouped at the top of this cell instead of scattered through it, so the
# cell still runs when executed on its own.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Train/test split --------------------------------------------------
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print the shapes explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print("Split shapes:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# --- Feature scaling ---------------------------------------------------
scaler = StandardScaler()

# Fit the scaler on the training rows only, then apply the same fitted
# transform to the test rows.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaled shapes:", X_train_scaled.shape, X_test_scaled.shape)

# --- Model training ----------------------------------------------------
model = LinearRegression()
model.fit(X_train_scaled, y_train)

print("Model training completed.")

# --- Evaluation on the held-out rows -----------------------------------
y_pred = model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, computed from the value above
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")


Model training completed.


In [17]:
import os
import joblib

# Create the output directory. exist_ok=True makes this a no-op when the
# directory already exists and avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs('model', exist_ok=True)

# Persist the fitted estimator and the fitted scaler, once each. The scaler
# is saved alongside the model because inputs are passed through it before
# being handed to the model for prediction.
joblib.dump(model, 'model/house_price_model.pkl')
joblib.dump(scaler, 'model/scaler.pkl')

print("Model and scaler saved successfully!")


Model and scaler saved successfully!


['model/scaler.pkl']

In [None]:
import joblib

# Artifact file names, relative to the current working directory. These are
# separate files from the copies written under model/ earlier in the notebook.
model_file = 'house_price_model.pkl'
scaler_file = 'scaler.pkl'

# Persist the estimator together with its scaler; the scaler is required so
# that new input can be scaled the same way as the training data.
joblib.dump(model, model_file)
joblib.dump(scaler, scaler_file)

# Round-trip: read both artifacts back from disk.
loaded_model = joblib.load(model_file)
loaded_scaler = joblib.load(scaler_file)

# Smoke test: scale the first held-out row with the reloaded scaler and run
# it through the reloaded model.
first_test_row = X_test.iloc[:1]
sample_scaled = loaded_scaler.transform(first_test_row)
predicted_price = loaded_model.predict(sample_scaled)
print("Predicted price:", predicted_price)
