In [46]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer


In [47]:
df = pd.read_csv("jiji_housing_cleaned.csv")
df.head()

Unnamed: 0,title,region,region_name,price_title,property_size,bedrooms,bathrooms,furnishing,boosted,state,price_m2
0,"4bdrm Duplex in Abuja Estate, Owerri for sale","Imo State, Owerri",Owerri,170000000.0,600,4,5,Unfurnished,True,Imo,283333.33
1,"Furnished 5bdrm Bungalow in Prime Property, Be...","Edo State, Benin City",Benin City,45000000.0,1500,5,4,Furnished,True,Edo,30000.0
2,2bdrm Block of Flats in Uyo for sale,"Akwa Ibom State, Uyo",Uyo,30000000.0,400,2,1,Unfurnished,False,Akwa Ibom,75000.0
3,"Furnished 6bdrm Duplex in Port Harcourt, Obio-...","Rivers State, Obio-Akpor",Obio-Akpor,150000000.0,1162,6,6,Furnished,False,Rivers,129087.78
4,"12bdrm Block of Flats in Kapua, FHA for sale","Lugbe District, FHA",FHA,250000000.0,1300,12,16,Semi-furnished,False,Abuja,192307.69


In [48]:
df["boosted"] = df["boosted"].astype(int)
df.head()

Unnamed: 0,title,region,region_name,price_title,property_size,bedrooms,bathrooms,furnishing,boosted,state,price_m2
0,"4bdrm Duplex in Abuja Estate, Owerri for sale","Imo State, Owerri",Owerri,170000000.0,600,4,5,Unfurnished,1,Imo,283333.33
1,"Furnished 5bdrm Bungalow in Prime Property, Be...","Edo State, Benin City",Benin City,45000000.0,1500,5,4,Furnished,1,Edo,30000.0
2,2bdrm Block of Flats in Uyo for sale,"Akwa Ibom State, Uyo",Uyo,30000000.0,400,2,1,Unfurnished,0,Akwa Ibom,75000.0
3,"Furnished 6bdrm Duplex in Port Harcourt, Obio-...","Rivers State, Obio-Akpor",Obio-Akpor,150000000.0,1162,6,6,Furnished,0,Rivers,129087.78
4,"12bdrm Block of Flats in Kapua, FHA for sale","Lugbe District, FHA",FHA,250000000.0,1300,12,16,Semi-furnished,0,Abuja,192307.69


In [49]:
df.to_csv("jiji_housing_cleaned.csv", index=False)

In [14]:
features =["bathrooms", "furnishing", "region_name", "state", "boosted", "property_size"]
target= "price_title"

X = df[features]
y = df[target]

print(X.shape)
print(y.shape)

(1245, 6)
(1245,)


In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("X_train shape", X_train.shape)
print("X_test shape", X_test.shape)
print("y_train shape", y_train.shape)
print("y_test shape", y_test.shape)

X_train shape (996, 6)
X_test shape (249, 6)
y_train shape (996,)
y_test shape (249,)


In [16]:
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.fit_transform(X_test)
    
# print("\nFeatures scaled successfully")
# print("Training scaled set", X_train_scaled.shape)
# print("Testing scaled set", X_test_scaled.shape)

In [17]:
# baseline
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)

mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
print("Baseline MAE: ",round(mae_baseline, 2))

Baseline MAE:  114883556.84


In [43]:
cat_cols = ["furnishing", "region_name", "state", "boosted"]
num_cols = ["bathrooms", "property_size"]

preprocessor = ColumnTransformer(
    transformers=[
    ("cats",
    OneHotEncoder(handle_unknown="ignore", drop="first"),
    cat_cols),
    ("num",
    StandardScaler(), num_cols)
    ]
)

model = make_pipeline(
    preprocessor,
    # OneHotEncoder(handle_unknown="ignore"),
    Ridge(alpha=10)
    # LinearRegression()
)

model.fit(X_train, y_train)
print("\nModel train succesfully")


Model train succesfully


In [22]:
# lr = model.named_steps["linearregression"]
# encoder = model.named_steps["onehotencoder"]
# print("model coefficient")
# for features, coef in zip(features, lr.coef_.ravel()): 
#     print(f" {features}: {coef:.2f}") 
# print(f"\nModel intercept: {float(lr.intercept_):.2f}")

coefficient = model.named_steps["ridge"].coef_
print("Model coefficient :", coefficient[5])

Model coefficient : 74357019.52700306


In [44]:
pd.set_option('display.float_format', '{:,.2f}'.format)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("\n" + "=" * 60)
print("MODEL PERFORMANCE")
print("=" * 60)
print("\ntTraining set")
print(f" R2 score: {train_r2:.4}")
print(f" RMSE: {train_rmse:.2}")
print(f" MAE: {train_mae:.2}")
print("\ntTesting set")
print(f" R2 score: {test_r2:.4}")
print(f" RMSE: {test_rmse:.2}")
print(f" MAE: {test_mae:.2}")


MODEL PERFORMANCE

tTraining set
 R2 score: 0.354
 RMSE: 1.1e+08
 MAE: 8.8e+07

tTesting set
 R2 score: 0.3358
 RMSE: 1.1e+08
 MAE: 9.4e+07


