In [78]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings("ignore")

In [61]:
def load_and_explore_dataset(filepath):
    """Read a CSV file into a DataFrame and print an exploratory summary.

    The summary consists of the shape, per-column missing-value counts, the
    first five rows, descriptive statistics, the ``DataFrame.info()`` report
    and the value counts of the ``state`` and ``furnishing`` columns.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data, unmodified.

    Raises
    ------
    KeyError
        If the file has no ``state`` or ``furnishing`` column.
    """
    print("=" * 60)
    print("LOAD AND  EXPLORE DATASET")
    print("=" * 60)

    df = pd.read_csv(filepath)

    print("Shape of the dataset")
    print(df.shape)
    print("\nCheck for missing values")
    print(df.isnull().sum())
    print("\nFirst five rows:")
    print(df.head())
    print("\nDescriptive stats")
    print(df.describe())
    print("\nDataset Info")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    df.info()
    print("\nState Distribution:")
    print(df["state"].value_counts())
    print("\nFurnishing Distribution:")
    print(df["furnishing"].value_counts())

    return df

In [62]:
def preprocessing_data(df):
    """Drop every row that contains a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; it is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy of ``df`` without the incomplete rows. The
        original index labels of the surviving rows are kept.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("PREPROCESSING LOADED DATA")
    print(divider)

    complete_rows = df.dropna()
    df_processed = complete_rows.copy()

    print("Preprocessed Dataset shape:", df_processed.shape)

    return df_processed

In [63]:
def features_data(df_processed):
    """Select the model inputs and the target from the cleaned data.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Must contain the columns ``property_size``, ``bathrooms``,
        ``boosted``, ``state``, ``region_name``, ``furnishing`` and
        ``price_title``.

    Returns
    -------
    tuple
        ``(X, y, num_feats, cat_feats)``: the feature frame (numeric columns
        first, then categorical ones), the ``price_title`` series, and the
        two lists of column names.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("FEATURES DATA")
    print(divider)

    target_column = "price_title"
    num_feats = ["property_size", "bathrooms", "boosted"]
    cat_feats = ["state", "region_name", "furnishing"]
    selected_columns = [*num_feats, *cat_feats]

    X = df_processed[selected_columns]
    y = df_processed[target_column]

    print("Features Shape:", X.shape)
    print("Target Shape:", y.shape)

    return X, y, num_feats, cat_feats

In [64]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Partition features and target into training and testing subsets.

    Parameters
    ----------
    X, y
        Feature table and target values, forwarded to ``train_test_split``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("SPLITING DATA")
    print(divider)

    parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = parts

    print("\nTraining set size", X_train.shape[0])
    print("Testing set size", X_test.shape[0])

    # Report the min/max target value of each subset, training first.
    for template, target in (
        ("\nTraining set price range {:.2f} - {:.2f}", y_train),
        ("\nTesting set price range {:.2f} - {:.2f}", y_test),
    ):
        print(template.format(float(target.min()), float(target.max())))

    return X_train, X_test, y_train, y_test

In [65]:
# def scale_features(X_train, X_test):
#     print("\n" + "=" * 60)
#     print("SCALE FEATURES DATA")
#     print("=" * 60)

#     scaler = StandardScaler()
#     X_train_scaled = scaler.fit_transform(X_train)
#     X_test_scaled = scaler.transform(X_test)
    
#     print("\nFeatures scaled successfully")
#     print("Training scaled set", X_train_scaled.shape)
#     print("Testing scaled set", X_test_scaled.shape)

#     return X_train_scaled, X_test_scaled, scaler

In [66]:
# def train_model(X_train_scaled, y_train, feature_columns): 
#     print("\n" + "=" * 60) 
#     print("TRAINING MODEL") 
#     print("=" * 60) 
    
#     model = LinearRegression() 
#     model.fit(X_train_scaled, y_train) 
    
#     print("\nModel train succesfully") 
#     print("\nmodel coefficient") 
#     for features, coef in zip(feature_columns, model.coef_.ravel()): 
#         print(f" {features}: {coef:.2f}") 
#     print(f"\nModel intercept: {float(model.intercept_):.2f}") 
        
#     return model

In [90]:
def build_and_train_model(X_train, y_train, numerical_features, categorical_features):
    """Build a preprocessing + linear-regression pipeline and fit it.

    Numerical columns are standardised with ``StandardScaler``; categorical
    columns are one-hot encoded, with categories unseen during fitting
    ignored at prediction time (``handle_unknown="ignore"``).

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain every column named in
        ``numerical_features`` and ``categorical_features``.
    y_train : array-like
        Training target values.
    numerical_features : sequence of str
        Names of the columns to scale.
    categorical_features : sequence of str
        Names of the columns to one-hot encode.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``"preprocessor"`` and
        ``"linearregressor"``.
    """
    print("\n" + "=" * 60)
    print("BUILDING & TRAINING PIPELINE")
    print("=" * 60)

    # Honour the column lists supplied by the caller instead of shadowing
    # them with hard-coded names; copy so the caller's lists are not shared.
    num_feats = list(numerical_features)
    cat_feats = list(categorical_features)

    num_transformer = Pipeline(steps=[("scaler", StandardScaler())])
    cat_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_transformer, num_feats),
            ("cat", cat_transformer, cat_feats)
        ]
    )

    model = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("linearregressor", LinearRegression())
    ])

    model.fit(X_train, y_train)

    print("Model trained successfully")

    return model


In [96]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Score a fitted model on the train and test sets and print a report.

    The report contains a mean-prediction baseline MAE (computed on the
    training target), R2 / RMSE / MAE for both subsets, and 5-fold
    cross-validated R2 on the training data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``; it is also handed to
        ``cross_val_score``.
    X_train, X_test
        Feature tables for the two subsets.
    y_train, y_test
        Matching target values; ``y_train`` must provide ``.mean()``.

    Returns
    -------
    dict
        Keys ``train_r2``, ``test_r2``, ``train_rmse``, ``test_rmse``,
        ``train_mae``, ``test_mae``, ``y_train_pred``, ``y_test_pred`` and
        ``cv_scores``.
    """
    print("\n" + "=" * 60)
    print("EVALUATING MODEL")
    print("=" * 60)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Baseline: always predict the mean of the training target
    y_mean = y_train.mean()
    y_pred_baseline = [y_mean] * len(y_train)
    mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print("\n" + "=" * 60)
    print("MODEL PERFORMANCE")
    print("=" * 60)
    print("Baseline MAE: ", round(mae_baseline, 2))
    # Fixed-point specifiers (".2f"/".4f"): a bare ".2" means two significant
    # digits and renders large errors in scientific notation (e.g. 1.2e+08).
    print("\nTraining set")
    print(f" R2 score: {train_r2:.4f}")
    print(f" RMSE: {train_rmse:.2f}")
    print(f" MAE: {train_mae:.2f}")
    print("\nTesting set")
    print(f" R2 score: {test_r2:.4f}")
    print(f" RMSE: {test_rmse:.2f}")
    print(f" MAE: {test_mae:.2f}")

    # Cross validation on the training data only
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="r2")
    print("\nCross Validation (5-folds)")
    print(f" R2 scores: {cv_scores}")
    print(f" Cross validation mean: {cv_scores.mean():.4f}")
    print(f" Cross validation STD: {cv_scores.std():.4f}")

    metrics = {
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
        "train_mae": train_mae,
        "test_mae": test_mae,
        "y_train_pred": y_train_pred,
        "y_test_pred": y_test_pred,
        "cv_scores": cv_scores
    }

    return metrics

In [69]:
def save_model_artifact(model):
    """Persist the model to ``house_price_pipeline.pkl`` with joblib.

    Parameters
    ----------
    model
        Object to serialise; written to the current working directory.
    """
    divider = "=" * 60

    print("\n" + divider)
    print("SAVING MODEL ARTIFACT")
    print(divider)

    artifact_path = "house_price_pipeline.pkl"
    joblib.dump(model, artifact_path)
    print("Pipeline model saved successfully")

    print("\n" + divider)
    print("ALL MODEL ARTIFACT SAVED SUCCESSFULLY")
    print(divider)

In [70]:
def predict_house_price(bathrooms, property_size, state, region_name, furnishing, boosted):
    """Predict the price of one property with the saved pipeline.

    The pipeline is loaded from ``house_price_pipeline.pkl`` on every call.
    ``joblib.load`` unpickles the file, so only load an artifact you trust.

    Parameters
    ----------
    bathrooms, property_size, boosted
        Values for the numeric feature columns of the same names.
    state, region_name, furnishing
        Values for the categorical feature columns of the same names.

    Returns
    -------
    The first (and only) element of ``model.predict`` for the one-row input.
    """
    pipeline = joblib.load("house_price_pipeline.pkl")

    # One record keyed by the feature column names the input frame needs.
    record = {
        "property_size": property_size,
        "bathrooms": bathrooms,
        "state": state,
        "region_name": region_name,
        "furnishing": furnishing,
        "boosted": boosted,
    }
    input_df = pd.DataFrame([record])

    predictions = pipeline.predict(input_df)
    return predictions[0]

In [71]:
def test_prediction():
    """Print price predictions for three sample properties.

    Each example calls ``predict_house_price`` (which loads the saved
    pipeline from disk) and prints a short description followed by the
    predicted price formatted in naira.
    """
    print("\n" + "=" * 60)
    print("TESTING PREIDICTION")
    print("=" * 60)

    # example 1
    price1 = predict_house_price(5, 500, "Lagos", "Lekki", "Furnished", 1)
    print("\n Furnished 500m2 house in Lekki, Lagos ")
    print(f" ₦{float(price1):,.2f}")

    # example 2
    price2 = predict_house_price(7, 1200, "Abuja", "Life Camp", "Semi-furnished", 0)
    # The label must name the region actually passed to the model
    # ("Life Camp"), otherwise the printed price is attributed to the wrong area.
    print("\n Semi-furnished 1200m2 house in Life Camp, Abuja")
    print(f" ₦{float(price2):,.2f}")

    # example 3
    price3 = predict_house_price(3, 350, "Oyo", "Ibadan", "Unfurnished", 1)
    print("\n Unfurnished 350m2 house in Ibadan, Oyo")
    print(f" ₦{float(price3):,.2f}")

    print("=" * 60)

In [92]:
def main():
    """Run the whole workflow: load, clean, split, train, evaluate, save, demo.

    Reads ``jiji_housing_cleaned.csv`` from the current working directory and
    calls each stage function in order. Returns nothing.
    """
    filepath = "jiji_housing_cleaned.csv"

    # Load and clean
    raw_df = load_and_explore_dataset(filepath)
    clean_df = preprocessing_data(raw_df)

    # Features, target and train/test partition
    X, y, numeric_cols, categorical_cols = features_data(clean_df)
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Fit and score the pipeline; the metrics are printed by evaluate_model
    model = build_and_train_model(X_train, y_train, numeric_cols, categorical_cols)
    metrics = evaluate_model(model, X_train, X_test, y_train, y_test)

    # Persist first: test_prediction() reloads the pipeline from disk
    save_model_artifact(model)
    test_prediction()

In [97]:
# Entry point: run the full workflow when this code is executed directly
# (script or notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()

LOAD AND  EXPLORE DATASET
Shape of the dataset
(1245, 11)

Check for missing values
title            0
region           0
region_name      0
price_title      0
property_size    0
bedrooms         0
bathrooms        0
furnishing       0
boosted          0
state            0
price_m2         0
dtype: int64

First five rows:
                                               title  \
0      4bdrm Duplex in Abuja Estate, Owerri for sale   
1  Furnished 5bdrm Bungalow in Prime Property, Be...   
2               2bdrm Block of Flats in Uyo for sale   
3  Furnished 6bdrm Duplex in Port Harcourt, Obio-...   
4       12bdrm Block of Flats in Kapua, FHA for sale   

                     region region_name  price_title  property_size  bedrooms  \
0         Imo State, Owerri      Owerri  170000000.0            600         4   
1     Edo State, Benin City  Benin City   45000000.0           1500         5   
2      Akwa Ibom State, Uyo         Uyo   30000000.0            400         2   
3  Rivers State