In [1]:
import pandas as pd
import numpy as np

# Location of the pickled dataset, relative to the notebook's working directory.
DATASET_PATH = "../pickle/clean_dataset.pkl"

# NOTE(review): unpickling executes arbitrary code; only load files produced
# by a trusted step of this project.
df = pd.read_pickle(DATASET_PATH)

In [2]:
# Preview the first seven rows (positional slice, same rows as head(7)).
df.iloc[:7]

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,2
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,2
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,2
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,2
5,10850000,7500,3,3,1,1,0,1,0,1,2,1,1
6,10150000,8580,4,3,4,1,0,0,0,1,2,1,1


### Splitting the dataset

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Feature matrix: every column except the target.
# NOTE(review): the preview output lists an "Unnamed: 0" column; if that is a
# real column rather than the displayed index it would end up in X — confirm.
X = df.drop(columns=["price"])

# Target: price on a log1p scale, so all downstream metrics are in log units.
price = df["price"]
y = np.log1p(price)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### Building and comparing models

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, root_mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [7]:
# Feature standardizer: centers each column and scales it to unit variance
# (both options are the library defaults, spelled out here for clarity).
scaler = StandardScaler(with_mean=True, with_std=True)

In [8]:
# Candidate regressors, keyed by the label used in the printed report.
# NOTE(review): in the recorded run Lasso scores R2 ~ 0 with its default
# alpha, which suggests the penalty is too strong for the log-scale target
# and the coefficients are shrunk to zero — consider tuning alpha.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42, objective='reg:squarederror')
}

# Maps model label -> (R2, RMSE), both measured on the held-out split and in
# the units of y_test (the log1p-transformed price built in the cells above).
results = {}

for name, reg in models.items():
    # Use a fresh scaler per pipeline. Pipeline fits its steps in place (it
    # does not clone them), so sharing one module-level StandardScaler would
    # make every pipeline alias the same object and each fit would overwrite
    # the state the previous pipeline relied on.
    model = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("regressor", reg)
    ])

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    results[name] = (r2, rmse)
    print(f"\n{name}")
    print(f"R2 Score: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")


LinearRegression
R2 Score: 0.6559
RMSE: 0.2578

Ridge
R2 Score: 0.6558
RMSE: 0.2578

Lasso
R2 Score: -0.0045
RMSE: 0.4404

RandomForest
R2 Score: 0.6330
RMSE: 0.2662

XGBoost
R2 Score: 0.6355
RMSE: 0.2653


### Building and exporting the pipeline

In [10]:
# Final pipeline: standardize the features, then fit an ordinary
# least-squares regressor on them.
final_steps = [
    ("scaler", StandardScaler()),
    ("regressor", LinearRegression()),
]
pipeline = Pipeline(final_steps)

In [11]:
# Fit the final pipeline on the training split; as the cell's last
# expression, the fitted pipeline is also displayed.
pipeline.fit(X=X_train, y=y_train)