# Machine Learning Modelling

In [48]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math
import time

In [49]:
data = pd.read_csv('./data/Cleaned_data.csv')

## *Feature Engineering*

In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21745 entries, 0 to 21744
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   State           21745 non-null  object 
 1   City            21745 non-null  object 
 2   Street          21745 non-null  object 
 3   Zipcode         21745 non-null  int64  
 4   Bedroom         21745 non-null  float64
 5   Bathroom        21745 non-null  float64
 6   Area            21745 non-null  float64
 7   PPSq            21745 non-null  float64
 8   LotArea         21745 non-null  float64
 9   MarketEstimate  21745 non-null  float64
 10  RentEstimate    21745 non-null  float64
 11  Latitude        21745 non-null  float64
 12  Longitude       21745 non-null  float64
 13  ListedPrice     21745 non-null  float64
dtypes: float64(10), int64(1), object(3)
memory usage: 2.3+ MB


In [51]:
def split_zipcode(zipcode):
    str_zip = str(zipcode).zfill(5)
    return [int(str_zip[0]), int(str_zip[1]), int(str_zip[2]), int(str_zip[3]), int(str_zip[4])]

In [52]:
zipcode_components = data['Zipcode'].apply(split_zipcode)
zipcode_df = pd.DataFrame(zipcode_components.tolist(), columns=['NationalArea', 'CityPO1', 'CityPO2', 'AssociatePO1', 'AssociatePO2'])

df = pd.concat([data, zipcode_df], axis=1)

In [53]:
df.sort_values(by='Zipcode')

Unnamed: 0,State,City,Street,Zipcode,Bedroom,Bathroom,Area,PPSq,LotArea,MarketEstimate,RentEstimate,Latitude,Longitude,ListedPrice,NationalArea,CityPO1,CityPO2,AssociatePO1,AssociatePO2
9237,MA,Amherst,Sunderland Rd,1002,4.0,3.0,3120.0,181.089744,2.700000,526800.000000,3499.0,42.422096,-72.538150,565000.0,0,1,0,0,2
8926,MA,Belchertown,Gold St,1007,3.0,2.0,1296.0,270.061728,1.730000,355000.000000,2949.0,42.342020,-72.410630,350000.0,0,1,0,0,7
8804,MA,Belchertown,Mill Valley Rd,1007,3.0,2.0,1512.0,210.978836,1.760000,324600.000000,2274.0,42.263100,-72.374750,319000.0,0,1,0,0,7
8812,MA,Chester,Maple St,1011,5.0,2.0,2136.0,46.816479,0.450000,88900.000000,2800.0,42.278910,-72.979530,100000.0,0,1,0,1,1
8915,MA,Chicopee,Fairmont St,1013,2.0,1.0,1008.0,237.996032,0.511387,242000.000000,1800.0,42.185658,-72.598274,239900.0,0,1,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
636,AK,Ketchikan,Schoenbar Rd,99901,4.0,3.0,2112.0,233.428030,0.340000,475400.000000,1949.0,55.347317,-131.637570,493000.0,9,9,9,0,1
516,AK,Coffman Cove,Coho Dr,99918,3.0,3.0,1850.0,539.459459,0.400000,942680.641922,2585.0,56.021515,-132.827740,998000.0,9,9,9,1,8
685,AK,Coffman Cove,NE Minke,99918,1.0,1.0,1200.0,290.833333,3.900000,346900.000000,1900.0,56.012170,-132.810640,349000.0,9,9,9,1,8
751,AK,Wrangell,.2 Mile Zimovia Hwy,99929,3.0,2.0,1800.0,206.666667,0.980000,370600.000000,2500.0,56.363064,-132.356580,372000.0,9,9,9,2,9


In [68]:
X = df[['Bedroom', 'Bathroom', 'Area', 'PPSq', 'MarketEstimate', 'Longitude', 'Latitude', 'NationalArea', 'CityPO1', 'CityPO2', 'AssociatePO1', 'AssociatePO2']]
y_listed = df['ListedPrice']
y_rent = df['RentEstimate']

# Split the data into training and testing sets
X_train, X_test, y_train_listed, y_test_listed = train_test_split(X, y_listed, test_size=0.2, random_state=42)
X_train, X_test, y_train_rent, y_test_rent = train_test_split(X, y_rent, test_size=0.2, random_state=42)

# Define the models to train
# Define the models to train
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(random_state=42),
    'Ridge': Ridge(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'XGBRegressor': XGBRegressor(),
    'CatBoostRegressor': CatBoostRegressor(random_state=42, verbose=0),
    'LightGBM': lgb.LGBMRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'KNeighborsRegressor': KNeighborsRegressor(),
}

# Function to train and evaluate models
def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    results = {}
    for name, model in models.items():
        print(f"Training {name}...")
        # Train the model
        model.fit(X_train, y_train)
        # Make predictions
        predictions = model.predict(X_test)
        # Evaluate the model
        mae = mean_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        # Store results
        results[name] = {'MAE': mae, 'R²': r2}
    return results

# Train and evaluate models for ListedPrice
print("Evaluating models for ListedPrice:")
results_listed = train_and_evaluate(models, X_train, y_train_listed, X_test, y_test_listed)
for model_name, metrics in results_listed.items():
    print(f"\n\nModel: {model_name} - ListedPrice")
    print(f"  MAE: {metrics['MAE']}, R²: {metrics['R²']}\n")

# Train and evaluate models for RentEstimate
print("Evaluating models for RentEstimate:")
results_rent = train_and_evaluate(models, X_train, y_train_rent, X_test, y_test_rent)
for model_name, metrics in results_rent.items():
    print(f"\n\nModel: {model_name} - RentEstimate")
    print(f"  MAE: {metrics['MAE']}, R²: {metrics['R²']}\n")

Evaluating models for ListedPrice:
Training LinearRegression...
Training Lasso...
Training Ridge...
Training RandomForestRegressor...
Training GradientBoostingRegressor...
Training XGBRegressor...
Training CatBoostRegressor...
Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1365
[LightGBM] [Info] Number of data points in the train set: 17396, number of used features: 12
[LightGBM] [Info] Start training from score 536621.625489
Training AdaBoost...
Training KNeighborsRegressor...


Model: LinearRegression - ListedPrice
  MAE: 26663.28501208368, R²: 0.9885854982587012



Model: Lasso - ListedPrice
  MAE: 26662.776099675306, R²: 0.988585471991989



Model: Ridge - ListedPrice
  MAE: 26663.12162154295, R²: 0.9885854907110087



Model: RandomForestRegressor - ListedPrice
  MAE: 20227.071149689586, R²: 0.9872292506253892



Model:

In [54]:
# Define a function to preprocess the data
def preprocess_data(df):
    features = df[['Bedroom', 'Bathroom', 'Area', 'PPSq', 'Longitude', 'Latitude', 
                   'NationalArea', 'CityPO1', 'CityPO2', 'AssociatePO1', 'AssociatePO2']]
    features_for_listed_price = features.copy()
    features_for_rent_estimate = features.copy()

    # Add cross-target variables
    features_for_listed_price['RentEstimate'] = df['RentEstimate']
    features_for_rent_estimate['ListedPrice'] = df['ListedPrice']

    return features_for_listed_price, features_for_rent_estimate, df['ListedPrice'], df['RentEstimate']

features_for_listed_price, features_for_rent_estimate, y_listed, y_rent = preprocess_data(df)


In [55]:
# Split for ListedPrice model
X_train_listed, X_test_listed, y_train_listed, y_test_listed = train_test_split(features_for_listed_price, y_listed, test_size=0.2, random_state=42)

# Split for RentEstimate model
X_train_rent, X_test_rent, y_train_rent, y_test_rent = train_test_split(features_for_rent_estimate, y_rent, test_size=0.2, random_state=42)

# Scaling the features
scaler = MinMaxScaler()

X_train_listed_scaled = scaler.fit_transform(X_train_listed)
X_test_listed_scaled = scaler.transform(X_test_listed)

X_train_rent_scaled = scaler.fit_transform(X_train_rent)
X_test_rent_scaled = scaler.transform(X_test_rent)


In [67]:
models = {
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'XGBRegressor': XGBRegressor(),
    'CatBoostRegressor': CatBoostRegressor(random_state=42, verbose=0),
    'LightGBM': lgb.LGBMRegressor(random_state=42),
}

def evaluate_model(model, name, X_train, y_train, X_test, y_test):
    print(f"Evaluating {name}...")
    start_time = time.time()
    cv_rmse_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_r2_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    cv_mae_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
    
    cv_rmse = [math.sqrt(-score) for score in cv_rmse_scores]
    cv_rmse_mean = np.mean(cv_rmse)
    cv_r2_mean = np.mean(cv_r2_scores)
    cv_mae_mean = -np.mean(cv_mae_scores)
    
    model.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    start_time = time.time()
    y_pred = model.predict(X_test)
    prediction_time = time.time() - start_time
    
    test_rmse = math.sqrt(mean_squared_error(y_test, y_pred))
    test_r2 = r2_score(y_test, y_pred)
    test_mae = mean_absolute_error(y_test, y_pred)
    
    return [name, cv_rmse_mean, cv_r2_mean, cv_mae_mean, test_rmse, test_r2, test_mae, train_time, prediction_time]

In [68]:
def run_evaluation(X_train, y_train, X_test, y_test):
    results = []
    for name, model in models.items():
        result = evaluate_model(model, name, X_train, y_train, X_test, y_test)
        results.append(result)
    return pd.DataFrame(results, columns=['Model', 'CV RMSE', 'CV R-squared', 'CV MAE', 'Test RMSE', 'Test R-squared', 'Test MAE', 'Train Time', 'Prediction Time'])

print("Evaluating models for ListedPrice...")
results_listed = run_evaluation(X_train_listed_scaled, y_train_listed, X_test_listed_scaled, y_test_listed)

print("\nEvaluating models for RentEstimate...")
results_rent = run_evaluation(X_train_rent_scaled, y_train_rent, X_test_rent_scaled, y_test_rent)

Evaluating models for ListedPrice...
Evaluating LinearRegression...
Evaluating RandomForestRegressor...
Evaluating GradientBoostingRegressor...
Evaluating XGBRegressor...
Evaluating CatBoostRegressor...
Evaluating LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001562 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1363
[LightGBM] [Info] Number of data points in the train set: 13916, number of used features: 12
[LightGBM] [Info] Start training from score 546054.813596
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1364
[LightGBM] [Info] Number of data points in the train set: 13917, number of used features: 12
[LightGBM] [Info] Start training from score 526935.680032
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testin

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000376 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1364
[LightGBM] [Info] Number of data points in the train set: 13917, number of used features: 12
[LightGBM] [Info] Start training from score 2671.763032
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000383 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1362
[LightGBM] [Info] Number of data points in the train set: 13917, number of used features: 12
[LightGBM] [Info] Start training from score 2721.319597
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

In [76]:
# Function to display and sort the results
def display_results(results, target_name):
    results_df = pd.DataFrame(results, columns=['Model', 'CV RMSE', 'CV R-squared', 'CV MAE', 'Test RMSE', 'Test R-squared', 'Test MAE', 'Train Time', 'Prediction Time'])
    results_df_sorted = results_df.sort_values(by='Test R-squared')
    return results_df_sorted

# Display and sort the results
results_listed_df = display_results(results_listed, 'ListedPrice')
results_rent_df = display_results(results_rent, 'RentEstimate')


In [77]:
results_listed_df

Unnamed: 0,Model,CV RMSE,CV R-squared,CV MAE,Test RMSE,Test R-squared,Test MAE,Train Time,Prediction Time
0,LinearRegression,765891.25912,0.735226,247681.928687,551601.939272,0.793846,234850.054502,0.185829,0.0
1,RandomForestRegressor,724654.107843,0.757381,46073.720115,414084.390851,0.883823,33673.638144,339.542819,0.205689
3,XGBRegressor,701800.608041,0.787011,54023.7388,403317.535366,0.889786,36680.392433,4.881755,0.005117
4,CatBoostRegressor,635394.922561,0.825964,46868.108619,358579.166259,0.912881,35368.341405,93.843225,0.007293
5,LightGBM,649436.785644,0.818501,51942.882505,318285.272981,0.931361,34689.960615,5.666233,0.012007
2,GradientBoostingRegressor,576717.071355,0.850357,68751.096804,263274.470578,0.953037,59535.685163,87.191667,0.007992


In [78]:
results_rent_df

Unnamed: 0,Model,CV RMSE,CV R-squared,CV MAE,Test RMSE,Test R-squared,Test MAE,Train Time,Prediction Time
3,XGBRegressor,2485.501148,0.55999,490.268513,1766.059093,0.743613,464.687966,4.844705,0.005998
0,LinearRegression,2470.727067,0.575334,642.078254,1748.800095,0.7486,613.305989,0.31769,0.001995
2,GradientBoostingRegressor,2416.871671,0.57466,476.782595,1673.871555,0.769681,456.804897,81.141717,0.006989
1,RandomForestRegressor,2396.309769,0.597717,465.556923,1531.338844,0.807235,425.638754,346.652446,0.176925
4,CatBoostRegressor,2217.003418,0.652614,463.137492,1457.397736,0.825401,427.787685,108.325489,0.007885
5,LightGBM,2091.759767,0.688229,468.75189,1442.9518,0.828845,434.273486,5.244207,0.016002
