In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

Here are the links to the datasets used in this project:

California Housing Dataset : https://www.kaggle.com/datasets/harrywang/housing \
Audit Data : https://archive.ics.uci.edu/dataset/475/audit+data \
Bike Sharing Dataset : https://www.kaggle.com/code/tomvoss/london-bike-sharing-prediction

# California Housing Dataset

The task is to predict median house values in Californian districts, given a number of features from these districts.

In [2]:
# Load the California housing table and discard rows with missing values.
# Dropping them keeps estimators such as KNN regression from running into
# issues with NaNs; a more thorough treatment would use other preprocessing
# techniques (e.g. imputation) instead of discarding data.
housing_df = pd.read_csv("housing.csv").dropna()

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# identical across kernel restarts.
train_df, test_df = train_test_split(housing_df, test_size=0.2, random_state=123)

# Print column dtypes and non-null counts of the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16346 entries, 2851 to 20168
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16346 non-null  float64
 1   latitude            16346 non-null  float64
 2   housing_median_age  16346 non-null  float64
 3   total_rooms         16346 non-null  float64
 4   total_bedrooms      16346 non-null  float64
 5   population          16346 non-null  float64
 6   households          16346 non-null  float64
 7   median_income       16346 non-null  float64
 8   median_house_value  16346 non-null  float64
 9   ocean_proximity     16346 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.4+ MB


In [3]:
def _add_per_household_features(df):
    """Return a new frame with district totals replaced by per-household ratios.

    Three columns are appended, in this order, each being the corresponding
    district total divided by ``households``:

    * ``rooms_per_household``      (from ``total_rooms``)
    * ``bedrooms_per_household``   (from ``total_bedrooms``)
    * ``population_per_household`` (from ``population``)

    The raw ``population``, ``total_rooms`` and ``total_bedrooms`` columns are
    then dropped; ``households`` itself is kept. The input frame is left
    untouched because ``assign`` and ``drop`` both return new objects.
    """
    households = df["households"]
    with_ratios = df.assign(
        rooms_per_household=df["total_rooms"] / households,
        bedrooms_per_household=df["total_bedrooms"] / households,
        population_per_household=df["population"] / households,
    )
    return with_ratios.drop(columns=["population", "total_rooms", "total_bedrooms"])


# Run both splits through the same function so the train and test feature
# sets are guaranteed to be built the same way.
train_df = _add_per_household_features(train_df)
test_df = _add_per_household_features(test_df)

In [4]:
# The categorical ocean_proximity column is dropped from the feature matrices,
# and median_house_value is separated out as the regression target.
non_feature_cols = ["median_house_value", "ocean_proximity"]

X_train, y_train = train_df.drop(columns=non_feature_cols), train_df["median_house_value"]
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df["median_house_value"]

In [5]:
## Linear-regression pipelines: one per feature scaler.
# Each pipeline is a two-step chain -- the named scaler followed by a fresh
# LinearRegression registered under the step name "lin_regressor".
pipe_linreg_ss, pipe_linreg_mm, pipe_linreg_rs = (
    Pipeline(steps=[(scaler_step, scaler), ("lin_regressor", LinearRegression())])
    for scaler_step, scaler in (
        ("standard_scaler", StandardScaler()),
        ("minmax_scaler", MinMaxScaler()),
        ("robust_scaler", RobustScaler()),
    )
)

In [6]:
## KNN-regression pipelines: one per feature scaler.
# Each pipeline is a two-step chain -- the named scaler followed by a fresh
# KNeighborsRegressor (default settings) registered as "knn_regressor".
pipe_knn_ss, pipe_knn_mm, pipe_knn_rs = (
    Pipeline(steps=[(scaler_step, scaler), ("knn_regressor", KNeighborsRegressor())])
    for scaler_step, scaler in (
        ("standard_scaler", StandardScaler()),
        ("minmax_scaler", MinMaxScaler()),
        ("robust_scaler", RobustScaler()),
    )
)

In [7]:
## Decision-tree-regressor pipelines: one per feature scaler.
# Each pipeline is a two-step chain -- the named scaler followed by a fresh
# DecisionTreeRegressor registered under the step name "dt_regressor".
#
# random_state pins the tree's internal randomness so that repeated runs of
# the notebook produce the same cross-validation scores; without it the
# decision-tree rows change from run to run. The value 123 is the same seed
# this notebook passes to train_test_split.
pipe_dt_ss, pipe_dt_mm, pipe_dt_rs = (
    Pipeline(
        steps=[
            (scaler_step, scaler),
            ("dt_regressor", DecisionTreeRegressor(random_state=123)),
        ]
    )
    for scaler_step, scaler in (
        ("standard_scaler", StandardScaler()),
        ("minmax_scaler", MinMaxScaler()),
        ("robust_scaler", RobustScaler()),
    )
)


In [8]:
# Registry of every estimator/scaler combination, keyed first by model family
# and then by the name of the scaler placed in front of the estimator.
scaler_names = ("StandardScaler", "MinMaxScaler", "RobustScaler")
models = {
    "Linear Regression": dict(zip(scaler_names, (pipe_linreg_ss, pipe_linreg_mm, pipe_linreg_rs))),
    "KNN": dict(zip(scaler_names, (pipe_knn_ss, pipe_knn_mm, pipe_knn_rs))),
    "Decision Tree": dict(zip(scaler_names, (pipe_dt_ss, pipe_dt_mm, pipe_dt_rs))),
}

# 5-fold cross-validation of every pipeline on the housing training split.
# The scorer returns negated RMSE values, so the sign is flipped to report a
# positive mean RMSE per (model, scaler) pair.
results = [
    {
        "Model": model_name,
        "Scaler": scaler_name,
        "Mean_RMSE_Housing": -cross_val_score(
            pipeline, X_train, y_train, cv=5, scoring="neg_root_mean_squared_error"
        ).mean(),
    }
    for model_name, pipelines in models.items()
    for scaler_name, pipeline in pipelines.items()
]

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Scaler,Mean_RMSE_Housing
0,Linear Regression,StandardScaler,72879.586508
1,Linear Regression,MinMaxScaler,72879.586508
2,Linear Regression,RobustScaler,72879.586508
3,KNN,StandardScaler,65242.118917
4,KNN,MinMaxScaler,64590.108348
5,KNN,RobustScaler,66078.560621
6,Decision Tree,StandardScaler,73858.751504
7,Decision Tree,MinMaxScaler,74066.475924
8,Decision Tree,RobustScaler,74080.670604


# Audit Data 

In [9]:
# Load the audit-risk table and discard rows with missing values.
audit_risk_df = pd.read_csv("audit_risk.csv").dropna()

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# identical across kernel restarts.
train_df_audit, test_df_audit = train_test_split(audit_risk_df, test_size=0.2, random_state=123)

# Print column dtypes and non-null counts of the training split.
train_df_audit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 620 entries, 119 to 510
Data columns (total 27 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sector_score    620 non-null    float64
 1   LOCATION_ID     620 non-null    object 
 2   PARA_A          620 non-null    float64
 3   Score_A         620 non-null    float64
 4   Risk_A          620 non-null    float64
 5   PARA_B          620 non-null    float64
 6   Score_B         620 non-null    float64
 7   Risk_B          620 non-null    float64
 8   TOTAL           620 non-null    float64
 9   numbers         620 non-null    float64
 10  Score_B.1       620 non-null    float64
 11  Risk_C          620 non-null    float64
 12  Money_Value     620 non-null    float64
 13  Score_MV        620 non-null    float64
 14  Risk_D          620 non-null    float64
 15  District_Loss   620 non-null    int64  
 16  PROB            620 non-null    float64
 17  RiSk_E          620 non-null    float64

In [10]:
# Remove the columns that are not used as features: LOCATION_ID (an
# object-typed column) and Risk.
# NOTE(review): Risk is presumably excluded because it is a label closely tied
# to the Audit_Risk target -- confirm against the dataset description.
audit_excluded_cols = ["LOCATION_ID", "Risk"]
train_df_audit = train_df_audit.drop(columns=audit_excluded_cols)
test_df_audit = test_df_audit.drop(columns=audit_excluded_cols)

# Audit_Risk is the regression target; every remaining column is a feature.
X_train_audit, y_train_audit = (
    train_df_audit.drop(columns=["Audit_Risk"]),
    train_df_audit["Audit_Risk"],
)
X_test_audit, y_test_audit = (
    test_df_audit.drop(columns=["Audit_Risk"]),
    test_df_audit["Audit_Risk"],
)

In [11]:

# 5-fold cross-validation of every registered pipeline on the audit training
# split. The scorer returns negated RMSE values, so the sign is flipped to
# report a positive mean RMSE per (model, scaler) pair.
results_audit = [
    {
        "Model": model_name,
        "Scaler": scaler_name,
        "Mean_RMSE_Audit": -cross_val_score(
            pipeline, X_train_audit, y_train_audit, cv=5, scoring="neg_root_mean_squared_error"
        ).mean(),
    }
    for model_name, pipelines in models.items()
    for scaler_name, pipeline in pipelines.items()
]

results_df_audit = pd.DataFrame(results_audit)
results_df_audit


Unnamed: 0,Model,Scaler,Mean_RMSE_Audit
0,Linear Regression,StandardScaler,23.17902
1,Linear Regression,MinMaxScaler,23.179255
2,Linear Regression,RobustScaler,23.179502
3,KNN,StandardScaler,26.710781
4,KNN,MinMaxScaler,27.701785
5,KNN,RobustScaler,27.857028
6,Decision Tree,StandardScaler,20.236636
7,Decision Tree,MinMaxScaler,22.060359
8,Decision Tree,RobustScaler,21.076014


# London Bike Sharing

In [12]:
# Load the London bike-sharing table and discard rows with missing values.
bike_df = pd.read_csv("london_merged.csv").dropna()

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# identical across kernel restarts.
train_df_bike, test_df_bike = train_test_split(bike_df, test_size=0.2, random_state=123)

# Print column dtypes and non-null counts of the training split.
train_df_bike.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13931 entries, 16400 to 15725
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     13931 non-null  object 
 1   cnt           13931 non-null  int64  
 2   t1            13931 non-null  float64
 3   t2            13931 non-null  float64
 4   hum           13931 non-null  float64
 5   wind_speed    13931 non-null  float64
 6   weather_code  13931 non-null  float64
 7   is_holiday    13931 non-null  float64
 8   is_weekend    13931 non-null  float64
 9   season        13931 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 1.2+ MB


In [13]:
# The object-typed timestamp column is removed from both splits, so it is not
# part of the feature matrices.
bike_excluded_cols = ["timestamp"]
train_df_bike = train_df_bike.drop(columns=bike_excluded_cols)
test_df_bike = test_df_bike.drop(columns=bike_excluded_cols)

# cnt is the regression target; every remaining column is a feature.
X_train_bike, y_train_bike = train_df_bike.drop(columns=["cnt"]), train_df_bike["cnt"]
X_test_bike, y_test_bike = test_df_bike.drop(columns=["cnt"]), test_df_bike["cnt"]

In [14]:
# 5-fold cross-validation of every registered pipeline on the bike-sharing
# training split. The scorer returns negated RMSE values, so the sign is
# flipped to report a positive mean RMSE per (model, scaler) pair.
results_bike = [
    {
        "Model": model_name,
        "Scaler": scaler_name,
        "Mean_RMSE_Bike": -cross_val_score(
            pipeline, X_train_bike, y_train_bike, cv=5, scoring="neg_root_mean_squared_error"
        ).mean(),
    }
    for model_name, pipelines in models.items()
    for scaler_name, pipeline in pipelines.items()
]

results_df_bike = pd.DataFrame(results_bike)
results_df_bike


Unnamed: 0,Model,Scaler,Mean_RMSE_Bike
0,Linear Regression,StandardScaler,926.996894
1,Linear Regression,MinMaxScaler,926.996894
2,Linear Regression,RobustScaler,926.996894
3,KNN,StandardScaler,924.748905
4,KNN,MinMaxScaler,926.313191
5,KNN,RobustScaler,925.019134
6,Decision Tree,StandardScaler,1205.702961
7,Decision Tree,MinMaxScaler,1206.162051
8,Decision Tree,RobustScaler,1202.828348


In [15]:
# Join the housing, audit and bike result tables into one frame with a single
# row per (Model, Scaler) pair and one RMSE column per dataset. The outer join
# keeps every key combination and returns the rows sorted by the join keys.
merge_keys = ["Model", "Scaler"]
combined_results_df = (
    results_df
    .merge(results_df_audit, on=merge_keys, how="outer")
    .merge(results_df_bike, on=merge_keys, how="outer")
)

# Show the combined table as the cell output.
combined_results_df

Unnamed: 0,Model,Scaler,Mean_RMSE_Housing,Mean_RMSE_Audit,Mean_RMSE_Bike
0,Decision Tree,MinMaxScaler,74066.475924,22.060359,1206.162051
1,Decision Tree,RobustScaler,74080.670604,21.076014,1202.828348
2,Decision Tree,StandardScaler,73858.751504,20.236636,1205.702961
3,KNN,MinMaxScaler,64590.108348,27.701785,926.313191
4,KNN,RobustScaler,66078.560621,27.857028,925.019134
5,KNN,StandardScaler,65242.118917,26.710781,924.748905
6,Linear Regression,MinMaxScaler,72879.586508,23.179255,926.996894
7,Linear Regression,RobustScaler,72879.586508,23.179502,926.996894
8,Linear Regression,StandardScaler,72879.586508,23.17902,926.996894
