In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [100]:
import pandas as pd
import numpy as np
import random

# Categorical vocabularies that the synthetic rows are sampled from.
# One list per categorical feature; the order here is the listing order only.
time_of_day_options = [
    'Morning',
    'Afternoon',
    'Evening',
    'Night',
]
day_type_options = [
    'Weekday',
    'Weekend',
]
weather_options = [
    'Sunny',
    'Rainy',
    'Foggy',
    'Snowy',
]
road_type_options = [
    'Highway',
    'Local',
    'One-way',
    'Mountain',
]

def assign_congestion(row):
    """Label a traffic observation as "Low", "Medium" or "High" congestion.

    A numeric score is built from the row and then bucketed:

    * base score: ``vehicle_count / 100 + vehicle_mix_ratio * 2``
    * minus 2 when the road is a 'Highway' with at least 4 lanes
    * minus 1.5 on a 'Weekend' 'Morning'
    * minus 0.5 when the weather is 'Sunny'

    A score below 2 is "Low", below 4.5 is "Medium", anything else is "High".

    Parameters
    ----------
    row : mapping
        Must provide the keys 'vehicle_count', 'vehicle_mix_ratio',
        'road_lanes', 'road_type', 'time_of_day', 'day_type' and 'weather'.

    Returns
    -------
    str
        One of "Low", "Medium", "High".
    """
    # Load contribution: traffic volume plus the vehicle-mix term.
    score = row['vehicle_count'] / 100 + row['vehicle_mix_ratio'] * 2

    # Relief factors, applied one after another in a fixed order.
    wide_highway = row['road_lanes'] >= 4 and row['road_type'] == 'Highway'
    if wide_highway:
        score -= 2

    weekend_morning = row['time_of_day'] == 'Morning' and row['day_type'] == 'Weekend'
    if weekend_morning:
        score -= 1.5

    sunny = row['weather'] == 'Sunny'
    if sunny:
        score -= 0.5

    # Walk the exclusive upper bounds from lowest to highest; the first bucket
    # the score fits under wins, and everything else falls through to "High".
    for upper_bound, level in ((2, "Low"), (4.5, "Medium")):
        if score < upper_bound:
            return level
    return "High"

# Build a class-balanced synthetic dataset by rejection sampling: keep drawing
# random rows and accept a row only while its class still has room.
samples_per_class = 300
output_path = "traffic_data.csv"

# Fixed seed so the generated dataset (and everything trained on it later)
# is the same after a kernel restart.
random.seed(42)

data = []
class_counts = {}  # label -> number of rows accepted so far

# The factor 3 is the number of labels assign_congestion can return
# ("Low", "Medium", "High").
while len(data) < samples_per_class * 3:
    row = {
        'time_of_day': random.choice(time_of_day_options),
        'day_type': random.choice(day_type_options),
        'weather': random.choice(weather_options),
        'road_type': random.choice(road_type_options),
        'vehicle_count': random.randint(20, 400),
        'road_lanes': random.randint(1, 6),
        'vehicle_mix_ratio': round(random.uniform(0.0, 1.0), 2)
    }

    label = assign_congestion(row)

    # A running counter replaces a rescan of every accepted row per draw,
    # which made the loop quadratic in the dataset size.
    accepted_so_far = class_counts.get(label, 0)
    if accepted_so_far < samples_per_class:
        row['congestion_level'] = label
        data.append(row)
        class_counts[label] = accepted_so_far + 1

# Create DataFrame
df = pd.DataFrame(data)

# Save as CSV
df.to_csv(output_path, index=False)

# Report the path that was actually written; the message previously named a
# different file than the one saved.
print(f"✅ Dataset generated and saved as {output_path}")


✅ Dataset generated and saved as traffic_data_balanced.csv


In [109]:
# Reload the dataset from the CSV on disk so the cells below work from the
# saved file rather than from whatever frame is left in kernel memory.
df = pd.read_csv("traffic_data.csv")

# Class balance check: number of rows per congestion label.
df["congestion_level"].value_counts()


congestion_level
Low       300
Medium    300
High      300
Name: count, dtype: int64

In [110]:
# Encode every categorical column (and the label) as integers on a copy of the
# loaded frame, so `df` itself keeps the original string values.
data = df.copy()

time_map = {
    'Morning': 0,
    'Afternoon': 1,
    'Evening': 2,
    'Night': 3
}

day_map = {
    'Weekday': 0,
    'Weekend': 1
}

weather_map = {
    'Sunny': 0,
    'Rainy': 1,
    'Foggy': 2,
    'Snowy': 3
}

road_map = {
    'Highway': 0,
    'Local': 1,
    'One-way': 2,
    'Mountain': 3
}

congestion_map = {
    'Low': 0,
    'Medium': 1,
    'High': 2
}

# Column name -> mapping used to encode it.
column_maps = {
    'time_of_day': time_map,
    'day_type': day_map,
    'weather': weather_map,
    'road_type': road_map,
    'congestion_level': congestion_map,
}

for column, mapping in column_maps.items():
    encoded = data[column].map(mapping)

    # Series.map yields NaN for any value missing from the dict. Fail here,
    # naming the offending values, instead of passing NaNs on to training.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown_values = sorted(set(data.loc[unmapped, column].astype(str)))
        raise ValueError(
            f"Column {column!r} contains values with no mapping: {unknown_values}"
        )

    data[column] = encoded


data.head(10)

Unnamed: 0,time_of_day,day_type,weather,road_type,vehicle_count,road_lanes,vehicle_mix_ratio,congestion_level
0,1,1,0,0,93,4,0.8,0
1,3,1,1,2,74,2,0.49,0
2,1,0,2,3,302,6,0.31,1
3,3,1,0,1,180,3,0.88,1
4,1,0,0,2,292,5,0.67,1
5,3,0,1,3,144,4,0.32,1
6,1,1,0,0,290,4,0.99,1
7,0,1,1,0,168,5,0.57,0
8,3,1,0,1,217,3,0.28,1
9,3,0,3,2,26,1,0.83,0


In [111]:
# Features (X): every column except the target. Label (y): the target column.
X = data.drop('congestion_level', axis=1)
y = data['congestion_level']

# 80% of the data is used for training and 20% for testing; the fixed
# random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The forest is seeded as well: without a random_state the same configuration
# reports a different accuracy on each run.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
}

for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Make predictions
    accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
    print(f"{name}: Accuracy = {accuracy:.2f}")

Random Forest: Accuracy = 0.84


In [114]:
def _fit_and_score(candidate):
    """Fit ``candidate`` on the training split; return (test predictions, test accuracy)."""
    candidate.fit(X_train, y_train)
    predictions = candidate.predict(X_test)
    return predictions, accuracy_score(y_test, predictions)


# Every forest gets the same random_state so that accuracy differences come
# from the hyperparameters being compared, not from run-to-run randomness.

# Model 1 - Default
model1 = RandomForestClassifier(random_state=42)
pred_1, accuracy_1 = _fit_and_score(model1)
print(f"Random Forest Model 1 Accuracy: {accuracy_1:.2f}")

# Model 2 - More trees (200) - better performance in many cases but slower
model2 = RandomForestClassifier(n_estimators=200, random_state=42)
pred_2, accuracy_2 = _fit_and_score(model2)
print(f"Random Forest Model 2 Accuracy: {accuracy_2:.2f}")

# Model 3 - Max depth of trees (5) - shallower trees can reduce overfitting
model3 = RandomForestClassifier(max_depth=5, random_state=42)
pred3, acc3 = _fit_and_score(model3)
print("Model 3 (max_depth=5):", acc3)

# Model 4 - 100 trees (the default in current scikit-learn, not fewer than
# Model 1) + a tighter depth limit (3)
model4 = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)
pred4, acc4 = _fit_and_score(model4)
print("Model 4 (n_estimators=100, max_depth=3):", acc4)


Random Forest Model 1 Accuracy: 0.87
Random Forest Model 2 Accuracy: 0.87
Model 3 (max_depth=5): 0.8444444444444444
Model 4 (n_estimators=100, max_depth=3): 0.7444444444444445


In [115]:
import joblib

# Persist the chosen model to disk, then read it back into `model`.
# Swap `model2` for whichever model performed best.
# Note: joblib files are pickle-based, so only load files from a trusted source.
model_path = "traffic_model.pkl"
joblib.dump(model2, model_path)
model = joblib.load(model_path)
