## Model testing
I will test Linear Regression, KNN (k-nearest neighbors), and Random Forest. Logistic Regression will not be used because it is a classification model, whereas the trip price is a continuous value that calls for regression.

In [55]:
import pandas as pd
from taxipred.utils.constants import CLEANED_DATA_PATH
import matplotlib.pyplot as plt
import numpy as np


# Load the dataset stored at CLEANED_DATA_PATH (imported from taxipred.utils.constants).
df = pd.read_csv(CLEANED_DATA_PATH)
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,Trip_Distance_km,Day_of_Week,Traffic_Conditions,Weather,Trip_Duration_Minutes,Trip_Price
0,19.35,Weekday,Low,Clear,53.82,36.2624
1,36.87,Weekend,High,Clear,37.27,52.9032
2,8.64,Weekend,Medium,Clear,89.33,60.2028
3,3.85,Weekday,High,Rain,5.05,11.2645
4,41.79,Weekend,High,Clear,86.95,88.1328
...,...,...,...,...,...,...
713,5.49,Weekend,Medium,Clear,58.39,34.4049
714,45.95,Weekday,Medium,Clear,61.96,62.1295
715,7.70,Weekday,Low,Rain,54.18,33.1236
716,47.56,Weekday,Low,Clear,114.94,61.2090


## Linear regression testing

In [56]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [57]:
# Predictor columns: two numeric trip measurements plus three categorical
# trip attributes (the categoricals are still strings at this point).
feature_columns = [
    "Trip_Distance_km",
    "Day_of_Week",
    "Traffic_Conditions",
    "Weather",
    "Trip_Duration_Minutes",
]
# Continuous target the regressors are trained to predict.
target_column = "Trip_Price"

X = df[feature_columns]
y = df[target_column]

In [58]:
# Inspect column dtypes and non-null counts to see which features are non-numeric.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 718 entries, 0 to 717
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       718 non-null    float64
 1   Day_of_Week            718 non-null    object 
 2   Traffic_Conditions     718 non-null    object 
 3   Weather                718 non-null    object 
 4   Trip_Duration_Minutes  718 non-null    float64
dtypes: float64(2), object(3)
memory usage: 28.2+ KB


### Convert the object (categorical) columns to numbers

In [59]:
# One-hot encode the string-valued columns. drop_first=True removes one level
# per category so the dummy columns are not perfectly collinear.
# NOTE: this cell reassigns X from X, so it can only be run once per kernel
# session; re-run the cell that defines X before running it again.
categorical_columns = ["Day_of_Week", "Traffic_Conditions", "Weather"]
X = pd.get_dummies(
    data=X,
    columns=categorical_columns,
    drop_first=True,
)
X.head()


Unnamed: 0,Trip_Distance_km,Trip_Duration_Minutes,Day_of_Week_Weekend,Traffic_Conditions_Low,Traffic_Conditions_Medium,Weather_Rain,Weather_Snow
0,19.35,53.82,False,True,False,False,False
1,36.87,37.27,True,False,False,False,False
2,8.64,89.33,True,False,True,False,False
3,3.85,5.05,False,False,False,True,False
4,41.79,86.95,True,False,False,False,False


### 1. Train|test split

In [60]:
# Hold out 20% of the rows for testing (the ratio was chosen on an LLM's
# suggestion); the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [61]:
# Sanity-check the split: shapes of the train/test features and train/test targets.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((574, 7), (144, 7), (574,), (144,))

### 2. Feature scaling

In [62]:
# Learn each feature's min/max from the training split only, then apply that
# same mapping to the test split so no test information leaks into the scaling.
scaler = MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Range check over all columns at once. The training data spans exactly [0, 1];
# test values can fall outside that range because the scaler never saw them.
print(f"{scaled_X_train.min():.2f} ≤ scaled_X_train ≤ {scaled_X_train.max():.2f}")
print(f"{scaled_X_test.min():.2f} ≤ scaled_X_test ≤ {scaled_X_test.max():.2f}")



0.00 ≤ scaled_X_train ≤ 1.00
0.00 ≤ scaled_X_test ≤ 1.00


### 3. Linear regression

In [63]:
from sklearn.linear_model import LinearRegression 

# Fit a linear regression on the scaled training features
# (fit() returns the fitted estimator itself).
model = LinearRegression().fit(scaled_X_train, y_train)

# One coefficient per column of the scaled feature matrix, followed by the intercept.
print(f"Parameters: {model.coef_}")
print(f"Intercept parameter: {model.intercept_}")



Parameters: [ 5.75110161e+01  3.31648403e+01 -2.46969726e+00  3.83357502e-01
  2.95429127e-02  4.02975774e-01  4.15088284e+00]
Intercept parameter: 7.189793006845953


### 4. Predict on test data

In [64]:
# Predict trip prices for the scaled test features with the fitted model.
y_pred = model.predict(scaled_X_test)
# Prints the full prediction array.
print(y_pred)

[51.99981095 79.66327081 35.93891092 13.37318542 70.01241391 51.82508121
 55.48873692 43.61039281 21.32598381 79.68236312 44.78357093 56.78374199
 42.84826257 19.06737119 36.82874605 25.4548422  48.55034415 58.1786047
 76.41145338 66.95488011 66.19250342 71.61093562 37.93446546 60.26472494
 62.17792781 67.74378568 67.15426827 51.38795466 14.26837619 19.35654445
 52.26701861 13.14822059 70.52056463 49.85554954 84.61484836 43.69396437
 85.62157667 34.5028826  66.7242672  73.01727279 73.8326659  66.85446395
 41.72115663 46.52942478 50.7936961  49.27277519 78.28278091 62.15025649
 35.51103515 32.97531339 25.82884741 10.98272077 50.38942591 82.35126609
 79.2426266  21.59748858 64.2910738  75.28165019 32.56905722 50.92575352
 42.6190482  58.24135159 32.10767452 68.53627769 75.8871844  59.76645617
 25.32148846 69.35769104 85.28666599 46.93982136 42.21081133 87.1298011
 27.86669873 44.69676225 65.98296653 50.03317953 38.78731998 21.02804543
 54.28543448 36.80905088 44.11771272 29.65712534 21.1

### 5. Evaluate performance

In [65]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Score the test-set predictions with both error metrics in one pass:
# mean absolute error and mean squared error.
mae, mse = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error)
)
# RMSE is the square root of MSE, which puts the error back in the target's units.
rmse = np.sqrt(mse)

print(f"Mae: {mae} Mse: {mse} Rmse: {rmse}")

Mae: 11.53818207900855 Mse: 216.5299837515505 Rmse: 14.714957823641578


## KNN testing