In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [2]:
# Location of the raw toll-price CSV (relative path, resolved against the working directory)
DATA_PATH = 'mopac-express-lane-prices.csv'
df = pd.read_csv(DATA_PATH)

# Preview the leading rows to get a feel for the columns and their values
df.head()

Unnamed: 0,datetime,direction,toll_point,rate,pbm_rate
0,2018-01-01 00:00:00,NB,2222 to Parmer,0.25,0.33
1,2018-01-01 00:00:00,NB,CVZ to 183,0.25,0.33
2,2018-01-01 00:00:00,NB,CVZ to Parmer,0.5,0.66
3,2018-01-01 00:00:00,SB,2222 to 5th/CVZ,0.25,0.33
4,2018-01-01 00:00:00,SB,Parmer to 2222,0.25,0.33


In [3]:
# Report the (rows, columns) dimensions of the freshly loaded frame
df.shape

(78899, 5)

In [4]:
# Break 'toll_point' (formatted as "<start> to <end>") into two columns.
# n=1 caps the split at the first ' to ' so the expansion can never produce
# more than two columns, which would make the two-column assignment fail.
toll_point_parts = df['toll_point'].str.split(' to ', n=1, expand=True)
df[['start_point', 'end_point']] = toll_point_parts

# Drop the original 'toll_point' column as it's now split.
# Rebinding (rather than inplace=True) keeps the statement chain-friendly.
df = df.drop(columns=['toll_point'])

# Display the first few rows to verify the changes
df.head()


Unnamed: 0,datetime,direction,rate,pbm_rate,start_point,end_point
0,2018-01-01 00:00:00,NB,0.25,0.33,2222,Parmer
1,2018-01-01 00:00:00,NB,0.25,0.33,CVZ,183
2,2018-01-01 00:00:00,NB,0.5,0.66,CVZ,Parmer
3,2018-01-01 00:00:00,SB,0.25,0.33,2222,5th/CVZ
4,2018-01-01 00:00:00,SB,0.25,0.33,Parmer,2222


In [5]:
# 'traffic_volume' is SYNTHETIC: seeded uniform random integers in [100, 500)
# (the upper bound of randint is exclusive), one per row. It is not measured data.
# NOTE(review): being random, this column carries no information about `rate`.
np.random.seed(42)
df['traffic_volume'] = np.random.randint(low=100, high=500, size=df.shape[0])

# Parse the timestamp strings so the .dt accessor can be used
df['datetime'] = pd.to_datetime(df['datetime'])

# Derive calendar features from the parsed timestamp
df = df.assign(
    hour=df['datetime'].dt.hour,
    day_of_week=df['datetime'].dt.dayofweek,
)


In [6]:
categorical_columns = ['direction', 'start_point', 'end_point']

# Keep an untouched copy of the string labels before they are overwritten below
original_values = df[categorical_columns].copy()

# Fit one LabelEncoder per categorical column and replace the strings with
# integer codes; the fitted encoders are retained so new inputs can be encoded
# the same way at prediction time.
encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder().fit(df[column])
    df[column] = encoders[column].transform(df[column])

In [7]:
# Preview the frame after label encoding and the added traffic/time columns
df.head()

Unnamed: 0,datetime,direction,rate,pbm_rate,start_point,end_point,traffic_volume,hour,day_of_week
0,2018-01-01,0,0.25,0.33,0,3,202,0,0
1,2018-01-01,0,0.25,0.33,1,0,448,0,0
2,2018-01-01,0,0.5,0.66,1,3,370,0,0
3,2018-01-01,1,0.25,0.33,0,2,206,0,0
4,2018-01-01,1,0.25,0.33,2,1,171,0,0


In [8]:
# Column to predict and the model inputs (order here is the order the model is trained on)
target = 'rate'
features = [
    'direction',
    'start_point',
    'end_point',
    'traffic_volume',
    'hour',
    'day_of_week',
]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [15]:
# Train a RandomForest model.
# Features are fed in unscaled: tree-based models split on thresholds, so
# standardising the inputs is not needed here.
model = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=10)
model.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
# RMSE is computed as sqrt(MSE) instead of `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, so it fails on current versions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'MAE: {mae}, RMSE: {rmse}')

# Function to make predictions with original values
def predict_with_original_values(direction, start_point, end_point, traffic_volume, hour, day_of_week):
    """Predict the toll rate for one observation given human-readable inputs.

    Parameters
    ----------
    direction, start_point, end_point : str
        Original (un-encoded) category labels; each is converted to its integer
        code with the matching fitted encoder in the module-level `encoders`.
    traffic_volume, hour, day_of_week : numeric
        Passed to the model unchanged.

    Returns
    -------
    The single predicted rate produced by the module-level `model`.

    Raises
    ------
    ValueError
        Raised by `LabelEncoder.transform` when a label was not seen during fitting.
    """
    # Encode the categorical inputs exactly as the training data was encoded
    encoded_row = {
        'direction': encoders['direction'].transform([direction])[0],
        'start_point': encoders['start_point'].transform([start_point])[0],
        'end_point': encoders['end_point'].transform([end_point])[0],
        'traffic_volume': traffic_volume,
        'hour': hour,
        'day_of_week': day_of_week,
    }

    # The model was fitted on a DataFrame, so predict on a DataFrame with the
    # same column names and order (`features`). A bare nested list triggers
    # sklearn's "X does not have valid feature names" warning and relies on
    # positional order alone.
    input_frame = pd.DataFrame([encoded_row], columns=features)

    # Make prediction (one row in, one value out)
    prediction = model.predict(input_frame)
    return prediction[0]

MAE: 0.3879431837494143, RMSE: 0.8945745492963216


In [18]:
def adjust_rate_based_on_traffic(predicted_rate, traffic_volume, threshold=400):
    """Scale a predicted rate linearly by how far traffic is from `threshold`.

    The rate is multiplied by ``1 + (traffic_volume - threshold) / threshold``:
    traffic above the threshold raises the rate, traffic below lowers it, and
    traffic equal to the threshold leaves it unchanged.

    Parameters
    ----------
    predicted_rate : float
        Rate to adjust.
    traffic_volume : float
        Observed traffic volume.
    threshold : float, default 400
        Traffic volume at which no adjustment is applied. Must be non-zero,
        otherwise the division raises ZeroDivisionError.

    Returns
    -------
    float
        The adjusted rate.
    """
    # The former if/else arms were the same formula written two ways
    # (1 - (t - v)/t == 1 + (v - t)/t), so a single expression covers both
    # cases with identical results.
    relative_excess = (traffic_volume - threshold) / threshold
    return predicted_rate * (1 + relative_excess)

# Example prediction with adjustment.
# The same traffic volume feeds both the model and the adjustment, so it is bound once.
# NOTE(review): 1000 lies outside the 100-499 range of the synthetic training values.
example_traffic_volume = 1000
predicted_rate = predict_with_original_values('SB', 'CVZ', 'Parmer', example_traffic_volume, 1, 2)
adjusted_rate = adjust_rate_based_on_traffic(predicted_rate, example_traffic_volume)

# Round to two decimals for display only; the unrounded values stay available
predicted_rate_rounded, adjusted_rate_rounded = round(predicted_rate, 2), round(adjusted_rate, 2)

print(f'Predicted rate: {predicted_rate_rounded}, Adjusted rate: {adjusted_rate_rounded}')

Predicted rate: 0.29, Adjusted rate: 0.72




In [20]:
import pickle

# Persist the trained model and the fitted label encoders side by side, so a
# consumer can encode raw inputs the same way before calling the model.
for filename, artifact in (('model.pkl', model), ('encoders.pkl', encoders)):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)