In [254]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings("ignore")

In [255]:
def load_and_explore_dataset(filepath):
    """Read the housing CSV and print a quick exploratory summary.

    Args:
        filepath: Path (or any other source ``pd.read_csv`` accepts) of the
            CSV file. The data must contain ``state`` and ``furnishing``
            columns, whose value counts are printed.

    Returns:
        pandas.DataFrame: the data exactly as read; nothing is modified.

    Raises:
        KeyError: if the ``state`` or ``furnishing`` column is missing.
    """
    print("=" * 60)
    print("LOAD AND  EXPLORE DATASET")
    print("=" * 60)

    df = pd.read_csv(filepath)

    print("Shape of the dataset")
    print(df.shape)
    print("\nCheck for missing values")
    print(df.isnull().sum())
    print("\nFirst five rows:")
    print(df.head())
    print("\nDescriptive stats")
    print(df.describe())
    print("\nDataset Info")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() used to append a stray "None" line.
    df.info()
    print("\nState Distribution:")
    print(df["state"].value_counts())
    print("\nFurnishing Distribution:")
    print(df["furnishing"].value_counts())

    return df

In [256]:
def preprocessing_data(df):
    """Drop incomplete rows and label-encode the categorical columns.

    Works on a copy, so the frame passed in is left untouched. For each of
    ``furnishing``, ``region_name`` and ``state`` a new ``<column>_encoded``
    column is appended and the fitted encoder is kept so the same mapping can
    be reused at prediction time.

    Args:
        df: Raw listings frame containing the three categorical columns.

    Returns:
        tuple: ``(df_processed, label_encoder)`` where ``df_processed`` is the
        cleaned copy with the extra ``*_encoded`` columns and
        ``label_encoder`` maps each original column name to its fitted
        ``LabelEncoder``.

    Raises:
        KeyError: if one of the categorical columns is missing.
    """
    print("\n" + "=" * 60)
    print("PREPROCESSING LOADED DATA")
    print("=" * 60)

    df_processed = df.dropna().copy()

    label_encoder = {}
    # A tuple, not a set: set iteration order for strings changes between
    # interpreter runs (hash randomisation), which made the order of the new
    # columns, of the encoder dict and of the log output non-reproducible.
    categorical_columns = ("furnishing", "region_name", "state")
    for col in categorical_columns:
        le = LabelEncoder()
        df_processed[col + "_encoded"] = le.fit_transform(df_processed[col])
        label_encoder[col] = le
        print(f"\n{col} encoded")
        # The position of a label in classes_ is the integer code it received.
        for i, label in enumerate(le.classes_):
            print(f" {label} : {i}")
    print("preprocessed Dataset shape", df_processed.shape)

    return df_processed, label_encoder

In [257]:
def features_data(df_processed):
    """Pick the model inputs and the target out of the preprocessed frame.

    Args:
        df_processed: Frame that already holds the numeric columns and the
            ``*_encoded`` categorical columns listed below, plus the
            ``price_title`` target column.

    Returns:
        tuple: ``(X, y, feature_columns)`` - the feature frame, the target
        series and the ordered list of feature names used to build ``X``.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("FEATURES DATA")
    print(rule)

    target_column = "price_title"
    feature_columns = [
        "property_size",
        "bathrooms",
        "furnishing_encoded",
        "region_name_encoded",
        "state_encoded",
        "boosted",
    ]

    X, y = df_processed[feature_columns], df_processed[target_column]

    print("\nFeatures Shape", X.shape)
    print("\nTarget Shape", y.shape)
    print("\nFeatures", feature_columns)

    return X, y, feature_columns

In [258]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test parts and log a summary.

    Args:
        X: Feature table; must expose ``.shape``.
        y: Target values; must expose ``.min()`` and ``.max()``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("SPLITING DATA")
    print(rule)

    parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = parts

    print("\nTraining set size", X_train.shape[0])
    print("Testing set size", X_test.shape[0])

    # Report the target range of each part so a lopsided split is visible.
    train_low, train_high = float(y_train.min()), float(y_train.max())
    print(f"\nTraining set price range {train_low:.2f} - {train_high:.2f}")
    test_low, test_high = float(y_test.min()), float(y_test.max())
    print(f"\nTesting set price range {test_low:.2f} - {test_high:.2f}")

    return X_train, X_test, y_train, y_test

In [259]:
def scale_features(X_train, X_test):
    """Standardise the features with a scaler fitted on the training part only.

    The scaler is fitted on ``X_train`` and then merely applied to ``X_test``.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled, scaler)`` - both transformed
        sets and the fitted ``StandardScaler``.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("SCALE FEATURES DATA")
    print(rule)

    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    print("\nFeatures scaled successfully")
    for caption, scaled in (
        ("Training scaled set", train_scaled),
        ("Testing scaled set", test_scaled),
    ):
        print(caption, scaled.shape)

    return train_scaled, test_scaled, scaler

In [260]:
def train_model(X_train_scaled, y_train, feature_columns):
    """Fit a linear regression and print the learned parameters.

    Args:
        X_train_scaled: Scaled training features.
        y_train: Training target values.
        feature_columns: Feature names, in the column order of
            ``X_train_scaled``; used only to label the printed coefficients.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("TRAINING MODEL")
    print(rule)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    print("\nModel train succesfully")
    print("\nmodel coefficient")
    # ravel() flattens coef_ so it can be paired one-to-one with the names.
    weights = model.coef_.ravel()
    for name, weight in zip(feature_columns, weights):
        print(f" {name}: {weight:.2f}")
    intercept = float(model.intercept_)
    print(f"\nModel intercept: {intercept:.2f}")

    return model

In [261]:
def evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test):
    """Score the fitted model on both splits, print a report and return the metrics.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train_scaled: Scaled training features.
        X_test_scaled: Scaled test features.
        y_train: Training target values; must expose ``.mean()``.
        y_test: Test target values.

    Returns:
        dict: keys ``train_r2``, ``test_r2``, ``train_rmse``, ``test_rmse``,
        ``train_mae``, ``test_mae``, ``y_train_pred``, ``y_test_pred`` and
        ``cv_scores`` (the per-fold R2 values of the 5-fold cross validation).
    """
    print("\n" + "=" * 60)
    print("EVALUATING MODEL")
    print("=" * 60)

    # Predictions for both splits.
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Fit quality on both splits.
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    # Naive reference: always predict the mean training target.
    y_mean = y_train.mean()
    mae_baseline = mean_absolute_error(y_train, np.full(len(y_train), y_mean))

    print("\n" + "=" * 60)
    print("MODEL PERFORMANCE")
    print("=" * 60)
    print(f"Baseline MAE: {mae_baseline:,.2f}")
    # Fixed-point formats are used on purpose: a bare ".2" precision means
    # "two significant digits" and rendered price-scale errors as 1.2e+08.
    for split_name, r2, rmse, mae in (
        ("Training", train_r2, train_rmse, train_mae),
        ("Testing", test_r2, test_rmse, test_mae),
    ):
        print(f"\n{split_name} set")
        print(f" R2 score: {r2:.4f}")
        print(f" RMSE: {rmse:,.2f}")
        print(f" MAE: {mae:,.2f}")

    # NOTE(review): if X_train_scaled came from a scaler fitted on the whole
    # training set, each validation fold has already influenced the scaling,
    # so these scores may be slightly optimistic - confirm whether that matters.
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring="r2")
    print("\nCross Validation (5-folds)")
    print(f" R2 scores: {cv_scores}")
    print(f" Cross validation mean: {cv_scores.mean():.4f}")
    print(f" Cross validation STD: {cv_scores.std():.4f}")

    metrics = {
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
        "train_mae": train_mae,
        "test_mae": test_mae,
        "y_train_pred": y_train_pred,
        "y_test_pred": y_test_pred,
        "cv_scores": cv_scores
    }

    return metrics

In [262]:
def save_model_artifact(model, test_rmse, scaler, label_encoders, feature_columns):
    """Persist everything needed for later predictions with ``joblib.dump``.

    Five files are written under fixed relative names, in this order: the
    model, the scaler, the label encoders, the feature column list and the
    test RMSE. A confirmation line is printed after each one.

    Args:
        model: Fitted estimator.
        test_rmse: RMSE measured on the test split.
        scaler: Fitted feature scaler.
        label_encoders: Mapping of column name to fitted encoder.
        feature_columns: Ordered list of feature names.

    Returns:
        None.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("SAVING MODEL ARTIFACT")
    print(rule)

    # (object, target file, confirmation message) - written in this order.
    artifacts = (
        (model, "house_price_prediction.pkl", "House price prediction model saved successfully"),
        (scaler, "scaler_features.pkl", "Scaler features saved successfully"),
        (label_encoders, "label_encoders.pkl", "Label encoders saved successfully"),
        (feature_columns, "feature_columns.pkl", "Feature columns saved successfully"),
        (test_rmse, "model_rmse.pkl", "RMSE saved successfully"),
    )
    for payload, filename, confirmation in artifacts:
        joblib.dump(payload, filename)
        print(confirmation)

    print(f"\n{rule}")
    print("ALL MODEL ARTIFACT SAVED SUCCESSFULLY")
    print(rule)

In [263]:
def predict_house_price(bathrooms, property_size, state, region_name, furnishing, boosted,
                        confidence=0.60):
    """Predict a price range for one listing from the saved model artifacts.

    The model, scaler, label encoders, feature column list and RMSE are read
    with ``joblib.load`` from the fixed relative file names
    ``house_price_prediction.pkl``, ``scaler_features.pkl``,
    ``label_encoders.pkl``, ``feature_columns.pkl`` and ``model_rmse.pkl``.
    The range is ``prediction +/- z * RMSE``, with the lower end clipped at 0;
    this is an approximate interval, not an exact statistical guarantee.

    Args:
        bathrooms: Number of bathrooms.
        property_size: Property size, in the unit used by the training data.
        state: State name; must be a label known to the saved ``state`` encoder.
        region_name: Region name; must be known to the ``region_name`` encoder.
        furnishing: Furnishing label; must be known to the ``furnishing`` encoder.
        boosted: Value for the ``boosted`` feature.
        confidence: Interval level, one of 0.60, 0.80, 0.90 or 0.95.
            Defaults to 0.60, the level that used to be hard-coded.

    Returns:
        dict | str: On success a dict with ``lower_bound``, ``upper_bound``
        and ``predicted_price`` (all floats). If a category is unknown to its
        encoder, the string ``"Unknown category - <details>"`` is returned
        instead, so callers should check the type before indexing.

    Raises:
        ValueError: if ``confidence`` is not one of the supported levels.
    """
    # z multipliers for the supported interval levels.
    z_scores = {
        0.60: 0.84,
        0.80: 1.28,
        0.90: 1.64,
        0.95: 1.96
    }
    if confidence not in z_scores:
        supported = ", ".join(str(level) for level in sorted(z_scores))
        raise ValueError(
            f"Unsupported confidence {confidence!r}; choose one of: {supported}"
        )

    # NOTE(security): joblib.load unpickles arbitrary Python objects. Only
    # load artifact files produced by this project, never untrusted ones.
    model = joblib.load("house_price_prediction.pkl")
    scaler = joblib.load("scaler_features.pkl")
    label_encoders = joblib.load("label_encoders.pkl")
    feature_columns = joblib.load("feature_columns.pkl")
    rmse = joblib.load("model_rmse.pkl")

    try:
        state_encoded = label_encoders["state"].transform([state])[0]
        region_name_encoded = label_encoders["region_name"].transform([region_name])[0]
        furnishing_encoded = label_encoders["furnishing"].transform([furnishing])[0]
    except ValueError as e:
        # Kept as a returned message (not an exception) for backward compatibility.
        return f"Unknown category - {e}"

    features_dict = {
        "property_size": property_size,
        "bathrooms": bathrooms,
        "boosted": boosted,
        "furnishing_encoded": furnishing_encoded,
        "region_name_encoded": region_name_encoded,
        "state_encoded": state_encoded
    }

    # A one-row DataFrame carrying the saved column names, in the saved order.
    # The training pipeline fits the scaler on a DataFrame, so a bare ndarray
    # here makes scikit-learn complain about missing feature names.
    features = pd.DataFrame(
        [[features_dict[col] for col in feature_columns]],
        columns=feature_columns,
    )
    features_scale = scaler.transform(features)

    # ravel() so a (1,) and a (1, 1) prediction are both reduced to a scalar.
    predicted_price = float(np.ravel(model.predict(features_scale))[0])

    margin = z_scores[confidence] * rmse

    # A price cannot be negative, so the lower end is clipped at zero.
    lower_bound = max(0.0, predicted_price - margin)
    upper_bound = predicted_price + margin

    return {
        "lower_bound": lower_bound,
        "upper_bound": upper_bound,
        "predicted_price": predicted_price
    }


In [268]:
def test_prediction():
    """Print predicted price ranges for a few sample listings.

    Each description line is built from the very arguments passed to
    ``predict_house_price`` so the text cannot drift from what was predicted
    (the second example used to be labelled "Wuse" while predicting for
    "Life Camp"). ``predict_house_price`` reports a category it does not know
    as a message string instead of a dict; that message is printed rather than
    indexed, which used to fail with a ``TypeError``.

    Returns:
        None.
    """
    print("\n" + "=" * 60)
    print("TESTING PREIDICTION")
    print("=" * 60)

    # (bathrooms, property_size, state, region_name, furnishing, boosted)
    examples = [
        (5, 500, "Lagos", "Lekki", "Furnished", 1),
        (7, 1200, "Abuja", "Life Camp", "Semi-furnished", 0),
        (3, 350, "Oyo", "Ibadan", "Unfurnished", 1),
    ]

    for bathrooms, property_size, state, region_name, furnishing, boosted in examples:
        result = predict_house_price(
            bathrooms, property_size, state, region_name, furnishing, boosted
        )
        print(f"\n {furnishing} {property_size}m2 house in {region_name}, {state}")
        if isinstance(result, str):
            # Unknown state/region/furnishing: show the reason and move on.
            print(f" {result}")
            continue
        print(f" ₦{float(result['lower_bound']):,.2f} – ₦{float(result['upper_bound']):,.2f}")

    print("=" * 60)

In [265]:
def main(filepath="jiji_housing_cleaned.csv"):
    """Run the whole pipeline: load, preprocess, split, scale, train, evaluate, save, demo.

    Args:
        filepath: CSV file to train on. Defaults to the previously hard-coded
            ``"jiji_housing_cleaned.csv"``, so a bare ``main()`` behaves as
            before while other datasets can now be passed in.

    Returns:
        None. Artifacts are written by ``save_model_artifact`` and sample
        predictions are printed by ``test_prediction``.
    """
    df = load_and_explore_dataset(filepath)

    df_processed, label_encoders = preprocessing_data(df)

    X, y, feature_columns = features_data(df_processed)

    X_train, X_test, y_train, y_test = split_data(X, y)

    X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)

    model = train_model(X_train_scaled, y_train, feature_columns)

    metrics = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)

    # The test-split RMSE is stored with the model; the prediction step reads
    # it back to size the price range it reports.
    save_model_artifact(model, metrics["test_rmse"], scaler, label_encoders, feature_columns)

    test_prediction()


In [269]:
# Run the full pipeline only when this code is executed directly
# (a notebook cell or a script run), not when it is imported as a module.
if __name__ == "__main__":
    main()

LOAD AND  EXPLORE DATASET
Shape of the dataset
(1245, 11)

Check for missing values
title            0
region           0
region_name      0
price_title      0
property_size    0
bedrooms         0
bathrooms        0
furnishing       0
boosted          0
state            0
price_m2         0
dtype: int64

First five rows:
                                               title  \
0      4bdrm Duplex in Abuja Estate, Owerri for sale   
1  Furnished 5bdrm Bungalow in Prime Property, Be...   
2               2bdrm Block of Flats in Uyo for sale   
3  Furnished 6bdrm Duplex in Port Harcourt, Obio-...   
4       12bdrm Block of Flats in Kapua, FHA for sale   

                     region region_name  price_title  property_size  bedrooms  \
0         Imo State, Owerri      Owerri  170000000.0            600         4   
1     Edo State, Benin City  Benin City   45000000.0           1500         5   
2      Akwa Ibom State, Uyo         Uyo   30000000.0            400         2   
3  Rivers State