In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [11]:
# Load the toll dataset from a CSV file in the notebook's working directory.
# NOTE(review): the preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns —
# presumably row indexes written out by earlier to_csv calls; TODO confirm.
df = pd.read_csv('toll_price.csv')

# Display the first few rows of the data to understand its structure
df.head()

Unnamed: 0.1,Unnamed: 0,datetime,direction,traffic_volume,toll_price,start_point,end_point
0,0,2018-01-01 00:00:00,NB,348,1.9025,2222,Parmer
1,1,2018-01-01 00:00:00,NB,210,1.64375,CVZ,183
2,2,2018-01-01 00:00:00,NB,263,1.743125,CVZ,Parmer
3,3,2018-01-01 00:00:00,SB,316,1.8425,2222,5th/CVZ
4,4,2018-01-01 00:00:00,SB,311,1.833125,Parmer,2222


In [12]:
# Check how many rows and columns were loaded
df.shape

(78899, 7)

In [13]:
# Parse the raw timestamp strings once and reuse the parsed series below
timestamps = pd.to_datetime(df['datetime'])
df['datetime'] = timestamps

# Derive time-based features from the parsed timestamps
df['hour'] = timestamps.dt.hour              # hour of day
df['day_of_week'] = timestamps.dt.dayofweek  # Monday=0 ... Sunday=6


In [14]:
# Display the frame to confirm the new hour and day_of_week columns
df

Unnamed: 0.1,Unnamed: 0,datetime,direction,traffic_volume,toll_price,start_point,end_point,hour,day_of_week
0,0,2018-01-01 00:00:00,NB,348,1.902500,2222,Parmer,0,0
1,1,2018-01-01 00:00:00,NB,210,1.643750,CVZ,183,0,0
2,2,2018-01-01 00:00:00,NB,263,1.743125,CVZ,Parmer,0,0
3,3,2018-01-01 00:00:00,SB,316,1.842500,2222,5th/CVZ,0,0
4,4,2018-01-01 00:00:00,SB,311,1.833125,Parmer,2222,0,0
...,...,...,...,...,...,...,...,...,...
78894,78894,2018-09-30 23:30:00,NB,349,1.904375,CVZ,183,23,6
78895,78895,2018-09-30 23:30:00,NB,232,1.685000,CVZ,Parmer,23,6
78896,78896,2018-09-30 23:30:00,SB,250,1.718750,2222,5th/CVZ,23,6
78897,78897,2018-09-30 23:30:00,SB,278,1.771250,Parmer,2222,23,6


In [15]:
categorical_columns = ['direction', 'start_point', 'end_point']

# Keep an untouched copy of the raw labels before they are overwritten below
original_values = df[categorical_columns].copy()

# Fit one LabelEncoder per categorical column (kept so the same mapping can be
# reused at prediction time), then replace each column with its integer codes
encoders = {name: LabelEncoder().fit(df[name]) for name in categorical_columns}
for name, fitted in encoders.items():
    df[name] = fitted.transform(df[name])

In [16]:
# Preview the frame after label-encoding the categorical columns
df.head()

Unnamed: 0.1,Unnamed: 0,datetime,direction,traffic_volume,toll_price,start_point,end_point,hour,day_of_week
0,0,2018-01-01,0,348,1.9025,0,3,0,0
1,1,2018-01-01,0,210,1.64375,1,0,0,0
2,2,2018-01-01,0,263,1.743125,1,3,0,0
3,3,2018-01-01,1,316,1.8425,0,2,0,0
4,4,2018-01-01,1,311,1.833125,2,1,0,0


In [18]:
# Column to predict, and the model inputs: the three label-encoded
# categoricals plus traffic volume and the two time-based features
target = 'toll_price'
features = [
    'direction',
    'start_point',
    'end_point',
    'traffic_volume',
    'hour',
    'day_of_week',
]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [19]:
# Train a RandomForest model.
# No feature scaling is applied: the forest is fit directly on the raw feature values.
model = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=10)
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, so the explicit square
# root is the form that runs on both old and new releases.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'MAE: {mae}, RMSE: {rmse}')

# Function to make predictions with original values
# NOTE: a function with this same name is defined again in the next cell; when
# the cells run in order, that later definition is the one in effect.
def predict_with_original_values(direction, start_point, end_point, traffic_volume, hour, day_of_week):
    """Predict the toll price for one trip described with raw (un-encoded) labels.

    Parameters
    ----------
    direction, start_point, end_point :
        Raw category labels (e.g. 'NB', '2222', 'Parmer'). Each is mapped to its
        integer code with the matching fitted encoder in the module-level
        ``encoders`` dict.
    traffic_volume, hour, day_of_week :
        Numeric feature values, passed to the model unchanged.

    Returns
    -------
    The model's prediction for the single input row.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when a label was not seen while the
        encoder was fitted.
    """
    # Encode the input values with the encoders fitted on the training data
    direction_encoded = encoders['direction'].transform([direction])[0]
    start_point_encoded = encoders['start_point'].transform([start_point])[0]
    end_point_encoded = encoders['end_point'].transform([end_point])[0]

    # Build a one-row DataFrame with the training column names. The model was
    # fitted on a DataFrame, so a bare nested list would make scikit-learn warn
    # that X has no valid feature names and would rely on positional order alone.
    input_frame = pd.DataFrame(
        [[direction_encoded, start_point_encoded, end_point_encoded, traffic_volume, hour, day_of_week]],
        columns=features,
    )

    # Make prediction
    prediction = model.predict(input_frame)
    return prediction[0]

MAE: 0.014909233910822686, RMSE: 0.017713073039464792


In [20]:
def predict_with_original_values(direction, start_point, end_point, traffic_volume, hour, day_of_week):
    """Predict the toll price for one trip described with raw (un-encoded) labels.

    This replaces the definition of the same name from the previous cell.

    Parameters
    ----------
    direction, start_point, end_point :
        Raw category labels (e.g. 'NB', '2222', 'Parmer'). Each is mapped to its
        integer code with the matching fitted encoder in the module-level
        ``encoders`` dict.
    traffic_volume, hour, day_of_week :
        Numeric feature values, passed to the model unchanged.

    Returns
    -------
    The model's prediction for the single input row.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when a label was not seen while the
        encoder was fitted.
    """
    # Encode the input values with the encoders fitted on the training data
    direction_encoded = encoders['direction'].transform([direction])[0]
    start_point_encoded = encoders['start_point'].transform([start_point])[0]
    end_point_encoded = encoders['end_point'].transform([end_point])[0]

    # Build a one-row DataFrame with the training column names. The model was
    # fitted on a DataFrame, so a bare nested list would make scikit-learn warn
    # that X has no valid feature names and would rely on positional order alone.
    input_frame = pd.DataFrame(
        [[direction_encoded, start_point_encoded, end_point_encoded, traffic_volume, hour, day_of_week]],
        columns=features,
    )

    # Make prediction
    prediction = model.predict(input_frame)
    return prediction[0]

# Example input, using raw labels as they appear in the CSV
direction = 'NB'
start_point = '2222'
end_point = 'Parmer'
traffic_volume = 500
hour = 15
day_of_week = 2

# Pass the variables defined above (instead of repeating the literals) so that
# editing the example values actually changes the prediction
predicted_toll_price = predict_with_original_values(
    direction, start_point, end_point, traffic_volume, hour, day_of_week
)
print(f'Predicted Toll Price: {predicted_toll_price}')

Predicted Toll Price: 2.501950106711705




In [21]:
import pickle

# Persist the fitted model and the label encoders, each to its own pickle file,
# so the same category-to-code mapping is available wherever the model is loaded
for filename, artifact in (('model.pkl', model), ('encoders.pkl', encoders)):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)