In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [3]:
# Read the traffic observations from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="traffic_data.csv")

# Preview the first 14 rows; as the last expression it becomes the cell output.
# (Class balance can be inspected with: df['congestion_level'].value_counts())
df.head(n=14)


Unnamed: 0,time_of_day,day_type,weather,road_type,vehicle_count,road_lanes,vehicle_mix_ratio,congestion_level
0,Morning,Weekend,Sunny,Local,154,6,0.2,Medium
1,Morning,Weekend,Sunny,Highway,160,6,0.55,Medium
2,Night,Weekend,Foggy,Mountain,40,3,0.37,Low
3,Afternoon,Weekday,Sunny,Local,82,3,0.58,Low
4,Night,Weekday,Foggy,Highway,296,3,0.62,High
5,Evening,Weekday,Foggy,Highway,37,4,0.25,Low
6,Afternoon,Weekend,Sunny,One-way,46,2,0.86,Low
7,Evening,Weekend,Snowy,Local,245,3,0.39,High
8,Evening,Weekday,Rainy,One-way,245,3,0.61,High
9,Morning,Weekday,Foggy,Local,178,3,0.34,Medium


In [4]:
# Work on a copy so the raw frame `df` stays untouched and this cell can be
# re-run safely.
data = df.copy()

# Integer codes for each categorical column.
time_map = {
    'Morning': 0,
    'Afternoon': 1,
    'Evening': 2,
    'Night': 3
}

day_map = {
    'Weekday': 0,
    'Weekend': 1
}

weather_map = {
    'Sunny': 0,
    'Rainy': 1,
    'Foggy': 2,
    'Snowy': 3
}

road_map = {
    'Highway': 0,
    'Local': 1,
    'One-way': 2,
    'Mountain': 3
}

# Target encoding: Low < Medium < High.
congestion_map = {
    'Low': 0,
    'Medium': 1,
    'High': 2
}

# Column name -> mapping used to encode it.
category_maps = {
    'time_of_day': time_map,
    'day_type': day_map,
    'weather': weather_map,
    'road_type': road_map,
    'congestion_level': congestion_map,
}

for column, mapping in category_maps.items():
    # Series.map() silently turns any label missing from the dict into NaN,
    # so reject unexpected labels up front instead of corrupting the data.
    # Values that are already missing are left alone (they stay NaN).
    unknown = set(data[column].dropna().unique()) - set(mapping)
    if unknown:
        raise ValueError(
            f"Unmapped values in column '{column}': {sorted(map(str, unknown))}"
        )
    data[column] = data[column].map(mapping)


data.head(10)

Unnamed: 0,time_of_day,day_type,weather,road_type,vehicle_count,road_lanes,vehicle_mix_ratio,congestion_level
0,0,1,0,1,154,6,0.2,1
1,0,1,0,0,160,6,0.55,1
2,3,1,2,3,40,3,0.37,0
3,1,0,0,1,82,3,0.58,0
4,3,0,2,0,296,3,0.62,2
5,2,0,2,0,37,4,0.25,0
6,1,1,0,2,46,2,0.86,0
7,2,1,3,1,245,3,0.39,2
8,2,0,1,2,245,3,0.61,2
9,0,0,2,1,178,3,0.34,1


In [30]:
# Features (X): every column except the target.
# Label (y): the encoded congestion level we want to predict.
X = data.drop(columns='congestion_level')
y = data['congestion_level']

# 80% of the rows are used for training and 20% for testing; the fixed seed
# keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate classifiers keyed by display name. The forest is seeded as well:
# without random_state its bootstrap samples and feature choices differ on
# every run, so the printed accuracy would not be reproducible.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
}

# NOTE(review): the recorded run scored 1.00 on the held-out rows. A perfect
# score can mean the label is a deterministic function of a feature (possibly
# vehicle_count) - confirm before relying on this number.
for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict on the held-out rows
    accuracy = accuracy_score(y_test, y_pred)  # Fraction of correct predictions
    print(f"{name}: Accuracy = {accuracy:.2f}")

Random Forest: Accuracy = 1.00


In [25]:
def _fit_and_score(clf):
    """Fit `clf` on the training split and evaluate it on the test split.

    Reads X_train, y_train, X_test and y_test from the notebook namespace.

    Returns:
        (predictions, accuracy): the predictions for X_test and their
        accuracy against y_test.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    return predictions, accuracy_score(y_test, predictions)


# Every variant uses the same seed so that differences in accuracy come from
# the hyperparameters rather than from run-to-run randomness.

# Model 1 - Default settings
model1 = RandomForestClassifier(random_state=42)
pred_1, accuracy_1 = _fit_and_score(model1)
print(f"Random Forest Model 1 Accuracy: {accuracy_1:.2f}")

# Model 2 - More trees (200) - better performance in many cases but slower
model2 = RandomForestClassifier(n_estimators=200, random_state=42)
pred_2, accuracy_2 = _fit_and_score(model2)
print(f"Random Forest Model 2 Accuracy: {accuracy_2:.2f}")

# Model 3 - Max depth of trees (5) - limits tree size to reduce overfitting
model3 = RandomForestClassifier(max_depth=5, random_state=42)
pred3, acc3 = _fit_and_score(model3)
print("Model 3 (max_depth=5):", acc3)

# Model 4 - Default number of trees (100) with a tighter depth limit (3)
model4 = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)
pred4, acc4 = _fit_and_score(model4)
print("Model 4 (n_estimators=100, max_depth=3):", acc4)


Random Forest Model 1 Accuracy: 1.00
Random Forest Model 2 Accuracy: 1.00
Model 3 (max_depth=5): 0.9833333333333333
Model 4 (n_estimators=100, max_depth=3): 0.9333333333333333


In [None]:
import joblib

# File the trained classifier is written to and read back from.
model_path = "traffic_model.pkl"

# Persist the chosen classifier (swap model2 for whichever model scored best).
joblib.dump(model2, model_path)

# Read the saved classifier back from disk.
model = joblib.load(model_path)
