### TTS står för train | test split, mer ML

In [None]:
import pandas as pd
import numpy as np

# Read the cleaned taxi dataset (row 0 is the header) and strip stray
# whitespace from the column labels in the same step.
df = pd.read_csv("../src/taxipred/data/cleaned_data.csv", header=0).rename(columns=str.strip)
print(list(df.columns))  # Confirm column names

# One column (Trip_Price) is the label/target, hence the "-1" for the feature count.
print(f"{df.shape[0]}samples")
print(f"{df.shape[1]-1}features")
df.head()


['Trip_Distance_km', 'Time_of_Day', 'Day_of_Week', 'Traffic_Conditions', 'Weather', 'Base_Fare', 'Per_Km_Rate', 'Per_Minute_Rate', 'Trip_Duration_Minutes', 'Trip_Price']
602samples
9features


Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,36.87,Evening,Weekend,High,Clear,2.7,1.21,0.15,37.27,52.9032
2,8.64,Afternoon,Weekend,Medium,Clear,2.55,1.71,0.48,89.33,60.2028
3,35.7,Afternoon,Weekday,Low,Rain,3.39,1.52,0.47,60.67,75.5657
4,41.79,Night,Weekend,High,Clear,4.6,1.77,0.11,86.95,88.1328


In [5]:
# Separate the target column (Trip_Price) from the remaining feature columns.
y = df["Trip_Price"]
X = df.drop(columns="Trip_Price")
X.head(), y.head()

(   Trip_Distance_km Time_of_Day Day_of_Week Traffic_Conditions Weather  \
 0             19.35     Morning     Weekday                Low   Clear   
 1             36.87     Evening     Weekend               High   Clear   
 2              8.64   Afternoon     Weekend             Medium   Clear   
 3             35.70   Afternoon     Weekday                Low    Rain   
 4             41.79       Night     Weekend               High   Clear   
 
    Base_Fare  Per_Km_Rate  Per_Minute_Rate  Trip_Duration_Minutes  
 0       3.56         0.80             0.32                  53.82  
 1       2.70         1.21             0.15                  37.27  
 2       2.55         1.71             0.48                  89.33  
 3       3.39         1.52             0.47                  60.67  
 4       4.60         1.77             0.11                  86.95  ,
 0    36.2624
 1    52.9032
 2    60.2028
 3    75.5657
 4    88.1328
 Name: Trip_Price, dtype: float64)

In [6]:
# Show the container types of the feature matrix and the target.
print(*map(type, (X, y)))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


### Scikit-learn

## Train | test split

In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((421, 9), (181, 9), (421,), (181,))

In [8]:
# Preview the first five training rows (original row indices are kept by the split).
X_train.head(n=5)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes
272,49.66,Afternoon,Weekend,High,Clear,4.86,1.15,0.16,68.75
601,47.56,Morning,Weekday,Low,Clear,2.67,0.82,0.17,114.94
480,46.35,Morning,Weekday,Medium,Clear,3.1,0.95,0.24,89.15
436,40.91,Afternoon,Weekday,Medium,Clear,4.73,1.16,0.28,19.1
383,25.42,Evening,Weekday,Medium,Clear,4.84,1.43,0.2,67.58


In [9]:
# Preview the first five training targets.
y_train.head(n=5)

272    72.9690
601    61.2090
480    68.5285
436    57.5336
383    54.7066
Name: Trip_Price, dtype: float64

### Feature scaling

In [10]:
from sklearn.preprocessing import MinMaxScaler 

# Create a min-max scaler (not fitted yet) and show its class.
scaler = MinMaxScaler()
scaler.__class__

sklearn.preprocessing._data.MinMaxScaler

In [11]:
# Display the scaler's representation (its configured parameters).
scaler

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [12]:
# List the feature column names available in the training set.
print(list(X_train.columns))

['Trip_Distance_km', 'Time_of_Day', 'Day_of_Week', 'Traffic_Conditions', 'Weather', 'Base_Fare', 'Per_Km_Rate', 'Per_Minute_Rate', 'Trip_Duration_Minutes']


### Uppdelning till numeriska features

In [13]:
# Numeric feature columns; the categorical columns are not used in this step.
numeric_cols = [
    'Trip_Distance_km',
    'Base_Fare',
    'Per_Km_Rate',
    'Per_Minute_Rate',
    'Trip_Duration_Minutes',
]
X_train_numeric = X_train.loc[:, numeric_cols]
X_test_numeric = X_test.loc[:, numeric_cols]

# Learn the per-column min/max from the training rows only, then apply the
# same transformation to the test rows.
scaler = MinMaxScaler()
scaled_X_train_numeric = scaler.fit_transform(X_train_numeric)
scaled_X_test_numeric = scaler.transform(X_test_numeric)

# Training values span exactly [0, 1]; test values can fall slightly outside
# because the scaler never saw them during fitting.
print(f"{scaled_X_train_numeric.min() = }")
print(f"{scaled_X_train_numeric.max() = }")
print(f"{scaled_X_test_numeric.min() = }")
print(f"{scaled_X_test_numeric.max() = }")

scaled_X_train_numeric.min() = np.float64(0.0)
scaled_X_train_numeric.max() = np.float64(1.0)
scaled_X_test_numeric.min() = np.float64(-0.0033670033670034627)
scaled_X_test_numeric.max() = np.float64(1.0256410256410255)


In [14]:
# (rows, columns) of the scaled training array.
np.shape(scaled_X_train_numeric)

(421, 5)

In [15]:
# First five scaled training rows; columns follow the order of numeric_cols.
scaled_X_train_numeric[0:5]

array([[0.9932266 , 0.95622896, 0.43333333, 0.15384615, 0.55508142],
       [0.95012315, 0.21885522, 0.21333333, 0.17948718, 0.95732822],
       [0.92528736, 0.36363636, 0.3       , 0.35897436, 0.73273535],
       [0.8136289 , 0.91245791, 0.44      , 0.46153846, 0.12270313],
       [0.49568966, 0.94949495, 0.62      , 0.25641026, 0.54489245]])

In [16]:
# The scaler returned a plain NumPy array, not a DataFrame.
scaled_X_train_numeric.__class__

numpy.ndarray

In [17]:
import joblib
from sklearn.utils.validation import check_is_fitted

# Persist the scaler that produced scaled_X_train_numeric instead of building
# and fitting a second instance: the saved object is then guaranteed to be the
# exact transformation the training features went through.
# check_is_fitted raises NotFittedError if `scaler` was re-created without
# being fitted (e.g. after out-of-order cell execution), so an unusable scaler
# is never written to disk.
check_is_fitted(scaler)
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']

## Linear regression

In [123]:
from sklearn.linear_model import LinearRegression

# Linear regression model; fit_intercept=True is the default, spelled out here.
model = LinearRegression(fit_intercept=True)

# Display the (still unfitted) estimator and its parameters.
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [124]:
# Train on the min-max-scaled numeric training features.
model.fit(X=scaled_X_train_numeric, y=y_train)

# One weight per numeric feature, plus the intercept term.
print(f"Parameters or weights: {model.coef_}")
print(f"Intercept: {model.intercept_}")

Parameters or weights: [58.95885346  1.50816553 36.81590125 22.97896565 31.20501436]
Intercept: -22.115558282857045


In [125]:
# Pair every numeric feature name with its learned coefficient
# (coefficients are in the same order as numeric_cols).
weights_by_feature = zip(numeric_cols, model.coef_)
for feature, weight in weights_by_feature:
    print(f"{feature}: {weight:.4f}")

Trip_Distance_km: 58.9589
Base_Fare: 1.5082
Per_Km_Rate: 36.8159
Per_Minute_Rate: 22.9790
Trip_Duration_Minutes: 31.2050


## Prediction

In [126]:
# Unscaled numeric features of the first test sample.
X_test_numeric.iloc[0, :]

Trip_Distance_km          8.92
Base_Fare                 3.69
Per_Km_Rate               1.07
Per_Minute_Rate           0.36
Trip_Duration_Minutes    42.14
Name: 110, dtype: float64

In [127]:
# Take the first scaled test row and add a leading axis so it is 2-D
# (one sample x n features) before it is passed to model.predict.
sample_features = scaled_X_test_numeric[0][np.newaxis, :]
sample_features

array([[0.1570197 , 0.56228956, 0.38      , 0.66666667, 0.32334756]])

In [128]:
# predict returns one value per input row; the single sample's price is element 0.
sample_prediction = model.predict(sample_features)
predicted_price = sample_prediction[0]
print(f"Predicted taxi price: {predicted_price:.2f} kr")

Predicted taxi price: 27.39 kr


In [129]:
# Actual prices of the first five test samples, for comparison with the prediction.
y_test.head(n=5)

110    28.4048
419    58.5929
567    73.3691
77     22.8766
181    74.5998
Name: Trip_Price, dtype: float64

In [132]:
# Predict prices for the whole scaled test set and preview the first five.
y_pred = model.predict(X=scaled_X_test_numeric)
y_pred[0:5]

array([27.38958727, 60.64154094, 66.56024628, 22.45887442, 67.01213084])

In [133]:
# Actual prices of the same five test samples, to compare against y_pred above.
y_test.head(n=5)

110    28.4048
419    58.5929
567    73.3691
77     22.8766
181    74.5998
Name: Trip_Price, dtype: float64

## Evaluate

In [134]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Error metrics on the held-out test set.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root of the MSE

print(f"{mae = }")
print(f"{mse = }")
print(f"{rmse = }")


mae = 5.62459016770344
mse = 54.92599143563925
rmse = np.float64(7.411207151040864)
