In [88]:
!pip install pandas
!pip install numpy
!pip install scikit-learn



In [89]:
import numpy as np
import pandas as pd
from pathlib import Path

In [90]:
def load_dataset(path="Housing.csv"):
    """Read the housing CSV file into a DataFrame.

    Args:
        path: Location of the CSV file, as a string or ``Path``. Defaults to
            ``"Housing.csv"``, which is resolved relative to the current
            working directory.

    Returns:
        pandas.DataFrame: The file contents parsed with ``pd.read_csv``
        default settings.

    Raises:
        FileNotFoundError: If no file exists at ``path``.
    """
    return pd.read_csv(Path(path))

In [91]:
# Read the housing data once; later cells reuse this `dataset` frame.
dataset = load_dataset()

# Preview the first five rows to check the columns and their values.
dataset.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [111]:
def get_mean_house_price(dataset=None):
    """Return the mean of the ``price`` column, truncated to an integer.

    Args:
        dataset: Optional DataFrame containing a ``price`` column. When
            omitted, the data is read with ``load_dataset()``.

    Returns:
        int: The mean price with its fractional part discarded.

    Raises:
        KeyError: If the frame has no ``price`` column.
        ValueError: If the mean is undefined because there are no price
            values to average.
    """
    if dataset is None:
        dataset = load_dataset()

    mean_price = dataset["price"].mean()
    if pd.isna(mean_price):
        # int(NaN) would also raise ValueError, but with a cryptic message.
        raise ValueError("cannot compute the mean price: no price values available")

    # int() truncates, so the result is a whole number like the prices in the dataset.
    return int(mean_price)

In [112]:
# display the mean house price (the function truncates it to an integer)
get_mean_house_price()

4766729

In [94]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Separate the predictors from the target variable.
X = dataset.drop(columns="price")
y = dataset["price"]

# Split the raw feature frame `X`. The original referenced `X_encoded`, which is
# never defined, so the cell failed with NameError on a fresh kernel. Encoding
# is left to the preprocessing pipeline, which selects columns from X_train by
# name and dtype.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [113]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Partition the feature columns: object-dtype columns are treated as
# categorical, every remaining column as numeric.
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()
numeric_features = X_train.columns.difference(categorical_features).tolist()

# Standardize the numeric columns; one-hot encode the categorical ones,
# dropping the first level of each.
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(drop="first")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numeric_features),
        ("cat", cat_transformer, categorical_features),
    ]
)

# Chain preprocessing and an ordinary least-squares regressor so the same
# transformations are applied at fit and predict time.
model = LinearRegression()
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# Fit on the training split only, then predict the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Evaluate on the test split. The original passed `squared=False`, which yields
# the ROOT mean squared error yet was stored and printed as "Mean Squared
# Error"; that argument is also deprecated in scikit-learn 1.4 and removed in
# 1.6. Compute the MSE and take its square root explicitly instead.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Mean Squared Error: 1324506.96009144


In [115]:
import joblib

# Persist the fitted preprocessing + regression pipeline to disk.
joblib.dump(value=pipeline, filename='pipeline.pkl')

['pipeline.pkl']