# In [None]:

"""
Milestone 4 — Advanced model (safe + faster tuning for small data)
- Uses LightGBM (if available) + RandomForest baseline
- Hyperparameter tuning with RandomizedSearchCV (reduced scope)
- TimeSeriesSplit for CV (smaller splits)
- Saves models, plots, and evaluation CSV
"""
import os
import sys
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, classification_report, confusion_matrix
from sklearn.inspection import permutation_importance

# Silence library warnings globally so the console output stays readable.
warnings.filterwarnings("ignore")

# Input data file and output location.
DATA_PATH = "traffic.csv"
OUT_DIR = "milestone4_outputs_advanced"

# Create the output root plus its "models" and "plots" subfolders up front
# (a no-op for folders that already exist).
for _out_path in (
    OUT_DIR,
    os.path.join(OUT_DIR, "models"),
    os.path.join(OUT_DIR, "plots"),
):
    os.makedirs(_out_path, exist_ok=True)

# Seed passed to the estimators, searches and permutation importance below.
RANDOM_STATE = 42

# Switches: whether to try LightGBM at all, and whether to skip the
# randomized hyperparameter search.
USE_LIGHTGBM = True
SKIP_TUNING = False

# Optional dependency: LightGBM. If the import fails for any reason the
# script reports it and continues with the RandomForest models only.
try:
    import lightgbm as lgb
    from lightgbm import LGBMRegressor, LGBMClassifier
except Exception:
    print("‚ö†Ô∏è LightGBM not available ‚Äî will use RandomForest only.")
    # Turn the LightGBM switch off so the tuning section is skipped.
    USE_LIGHTGBM = False


def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return mse ** 0.5

def safe_load_csv(path):
    """Load a CSV file into a DataFrame, failing early if it is missing.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv(path)``.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    if not os.path.exists(path):
        # Report the path that was actually looked up rather than a
        # hard-coded file name; the hint tells the user to place the file in
        # the same folder or to adjust DATA_PATH.
        raise FileNotFoundError(
            f"File not found: {path} — ضع الملف في نفس المجلد أو عدّل DATA_PATH"
        )
    return pd.read_csv(path)


# Load the raw records and order them per street, oldest first, so the
# per-street shift/rolling features computed afterwards follow time.
df = safe_load_csv(DATA_PATH)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(by=['street_name', 'timestamp'])
df = df.reset_index(drop=True)

print(f"‚úÖ Loaded {len(df)} rows ‚Äî columns: {df.columns.tolist()}")

# Calendar features derived from the timestamp.
_ts = df['timestamp'].dt
df['hour'] = _ts.hour
df['weekday'] = _ts.weekday
# Weekday values 5 and 6 are flagged as weekend (1), everything else as 0.
df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)


# Per-street history features: the counts 1, 2 and 3 rows back, plus a
# 3-row rolling mean whose window includes the current row.
LAGS = [1, 2, 3]
_street_counts = df.groupby('street_name')['vehicle_count']
for lag in LAGS:
    df[f'veh_count_lag_{lag}'] = _street_counts.shift(lag)

_rolling_mean = _street_counts.rolling(window=3, min_periods=1).mean()
# Drop the street level of the result's index so values align with df's rows.
df['veh_roll_3'] = _rolling_mean.reset_index(level=0, drop=True)

# Regression target: the following row's count on the same street.
df['vehicle_count_next'] = _street_counts.shift(-1)

# Classification target: lighting_demand converted to integer codes.
le_light = LabelEncoder()
df['lighting_demand_enc'] = le_light.fit_transform(df['lighting_demand'])


# Integer-encode each categorical column into "<name>_enc" and keep the
# fitted encoder per column so it can be saved with the models.
cat_cols = ['street_name', 'light_level', 'weather', 'traffic_light']
label_encoders = {c: LabelEncoder() for c in cat_cols}
for c, le in label_encoders.items():
    df[f'{c}_enc'] = le.fit_transform(df[c])

# Keep only the rows that have both the target and the deepest lag.
required = ['vehicle_count_next', f'veh_count_lag_{max(LAGS)}']
df_model = df.dropna(subset=required).copy()
print(f"Rows available for modeling after dropna: {len(df_model)}")


# Model inputs: raw measurements, calendar features, the rolling mean, the
# lagged counts and the integer-encoded categorical columns.
FEATURES = [
    'vehicle_count', 'vehicle_speed', 'solar_energy_level',
    'hour', 'weekday', 'is_weekend', 'veh_roll_3'
] + [f'veh_count_lag_{lag_n}' for lag_n in LAGS] + [c + '_enc' for c in cat_cols]

# df was ordered by street first. TimeSeriesSplit (used for tuning) treats row
# order as time order, so put the modelling rows in chronological order; the
# stable sort keeps the existing order among rows sharing a timestamp.
df_model = df_model.sort_values('timestamp', kind='mergesort')

X = df_model[FEATURES]
y_reg = df_model['vehicle_count_next']
y_clf = df_model['lighting_demand_enc']

# Time-based hold-out: rows up to and including the timestamp found at the
# 80% position of the sorted unique timestamps are used for training, later
# rows for testing.
unique_times = df_model['timestamp'].sort_values().unique()
if len(unique_times) == 0:
    raise ValueError("No rows available for modeling; cannot build a train/test split.")
split_idx = int(len(unique_times) * 0.8) if len(unique_times) > 1 else 0
time_cutoff = unique_times[split_idx]
train_mask = df_model['timestamp'] <= time_cutoff

X_train, X_test = X[train_mask], X[~train_mask]
y_train_reg, y_test_reg = y_reg[train_mask], y_reg[~train_mask]
y_train_clf, y_test_clf = y_clf[train_mask], y_clf[~train_mask]

print(f"Train rows: {len(X_train)}, Test rows: {len(X_test)}")

# Persistence baseline: predict that the next count equals the current one.
# (The target is the count one step ahead, so the current 'vehicle_count' is
# the last observed value; a lagged column would be a stale two-step guess.)
y_pred_baseline = X_test['vehicle_count']
baseline_mae = mean_absolute_error(y_test_reg, y_pred_baseline)
print(f"Baseline persistence MAE: {baseline_mae:.3f}")

# RandomForest reference models (regressor and classifier) sharing the same
# settings.
_rf_settings = dict(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1)
rf_reg = RandomForestRegressor(**_rf_settings)
rf_clf = RandomForestClassifier(**_rf_settings)

# Regression: fit on the training window, score on the hold-out rows.
y_pred_rf_reg = rf_reg.fit(X_train, y_train_reg).predict(X_test)
rf_reg_mae = mean_absolute_error(y_test_reg, y_pred_rf_reg)
rf_reg_rmse = rmse(y_test_reg, y_pred_rf_reg)
print(f"RF Regressor ‚Äî MAE: {rf_reg_mae:.3f}, RMSE: {rf_reg_rmse:.3f}")

# Classification: accuracy is only computed when the hold-out is non-empty.
y_pred_rf_clf = rf_clf.fit(X_train, y_train_clf).predict(X_test)
if len(y_test_clf) > 0:
    rf_clf_acc = accuracy_score(y_test_clf, y_pred_rf_clf)
else:
    rf_clf_acc = None
print(f"RF Classifier ‚Äî Accuracy: {rf_clf_acc}")

# Start from the RandomForest models; the LightGBM section may replace them
# and records its scores in results_summary.
results_summary = {}
best_reg, best_clf = rf_reg, rf_clf


# Time-ordered CV for the searches: 2 folds when there are fewer than 30
# training rows, otherwise 3.
n_splits_cv = min(3, max(2, int(len(X_train) / 10)))
tscv = TimeSeriesSplit(n_splits=n_splits_cv)

if USE_LIGHTGBM and not SKIP_TUNING:
    print("üî∑ Running LightGBM + RandomizedSearchCV (regressor + classifier) ‚Äî reduced scope for speed...")

    # Search space for the regressor; only n_iter=5 combinations are sampled.
    reg_param_dist = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [15, 31, 63],
        'min_child_samples': [5, 10, 20],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
    }
    lgb_reg = LGBMRegressor(random_state=RANDOM_STATE, n_jobs=1)

    reg_search = RandomizedSearchCV(
        estimator=lgb_reg,
        param_distributions=reg_param_dist,
        n_iter=5,
        cv=tscv,
        scoring='neg_mean_absolute_error',
        random_state=RANDOM_STATE,
        n_jobs=1,
        verbose=1
    )

    # Any failure (or a user interrupt) leaves the RandomForest regressor in
    # place and records no LightGBM score.
    try:
        reg_search.fit(X_train, y_train_reg)
        best_reg = reg_search.best_estimator_
        print("Best regressor params:", reg_search.best_params_)
        y_pred_lgb_reg = best_reg.predict(X_test)
        lgb_reg_mae = mean_absolute_error(y_test_reg, y_pred_lgb_reg)
        lgb_reg_rmse = rmse(y_test_reg, y_pred_lgb_reg)
        print(f"LGB Regressor ‚Äî MAE: {lgb_reg_mae:.3f}, RMSE: {lgb_reg_rmse:.3f}")
        results_summary['lgb_reg'] = {'mae': lgb_reg_mae, 'rmse': lgb_reg_rmse}
    except KeyboardInterrupt:
        print("‚ö†Ô∏è Tuning interrupted by user ‚Äî falling back to RandomForest regressor.")
        best_reg = rf_reg
    except Exception as e:
        print("‚ö†Ô∏è Reg search failed, fallback to RandomForest regressor. Error:", e)
        best_reg = rf_reg

    # Classifier param grid (smaller)
    clf_param_dist = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05],
        'num_leaves': [15, 31],
        'min_child_samples': [5, 10],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
    }
    lgb_clf = LGBMClassifier(random_state=RANDOM_STATE, n_jobs=1)
    clf_search = RandomizedSearchCV(
        estimator=lgb_clf,
        param_distributions=clf_param_dist,
        n_iter=5,
        cv=tscv,
        scoring='accuracy',
        random_state=RANDOM_STATE,
        n_jobs=1,
        verbose=1
    )

    try:
        clf_search.fit(X_train, y_train_clf)
        best_clf = clf_search.best_estimator_
        print("Best classifier params:", clf_search.best_params_)
        y_pred_lgb_clf = best_clf.predict(X_test)
        lgb_clf_acc = accuracy_score(y_test_clf, y_pred_lgb_clf) if len(y_test_clf) > 0 else None
        print(f"LGB Classifier ‚Äî Accuracy: {lgb_clf_acc}")
        results_summary['lgb_clf'] = {'acc': lgb_clf_acc}
    except KeyboardInterrupt:
        print("‚ö†Ô∏è Tuning interrupted by user ‚Äî falling back to RandomForest classifier.")
        best_clf = rf_clf
    except Exception as e:
        print("‚ö†Ô∏è Clf search failed, fallback to RandomForest classifier. Error:", e)
        best_clf = rf_clf

else:
    if not USE_LIGHTGBM:
        print("Skipping LightGBM tuning (not installed).")
    else:
        # Reached only when LightGBM is available and SKIP_TUNING is set.
        print("SKIP_TUNING is True ‚Äî skipping LightGBM RandomizedSearchCV (using default LightGBM or RandomForest).")

        # Fit untuned LightGBM models, score them on the hold-out rows and
        # record the scores in results_summary: the model selection that
        # follows only considers LightGBM candidates listed there. Each model
        # is handled on its own so one failure does not discard the other.
        try:
            quick_reg = LGBMRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=1)
            quick_reg.fit(X_train, y_train_reg)
            y_pred_lgb_reg = quick_reg.predict(X_test)
            lgb_reg_mae = mean_absolute_error(y_test_reg, y_pred_lgb_reg)
            lgb_reg_rmse = rmse(y_test_reg, y_pred_lgb_reg)
            print(f"LGB Regressor ‚Äî MAE: {lgb_reg_mae:.3f}, RMSE: {lgb_reg_rmse:.3f}")
            best_reg = quick_reg
            results_summary['lgb_reg'] = {'mae': lgb_reg_mae, 'rmse': lgb_reg_rmse}
        except Exception as e:
            print("Default LightGBM regressor failed, fallback to RandomForest regressor. Error:", e)
            best_reg = rf_reg

        try:
            quick_clf = LGBMClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=1)
            quick_clf.fit(X_train, y_train_clf)
            y_pred_lgb_clf = quick_clf.predict(X_test)
            lgb_clf_acc = accuracy_score(y_test_clf, y_pred_lgb_clf) if len(y_test_clf) > 0 else None
            print(f"LGB Classifier ‚Äî Accuracy: {lgb_clf_acc}")
            best_clf = quick_clf
            results_summary['lgb_clf'] = {'acc': lgb_clf_acc}
        except Exception as e:
            print("Default LightGBM classifier failed, fallback to RandomForest classifier. Error:", e)
            best_clf = rf_clf



# Regressor: take LightGBM only when it was scored and its hold-out MAE is
# strictly lower than RandomForest's.
_lgb_reg_scores = results_summary.get('lgb_reg')
if _lgb_reg_scores is not None and _lgb_reg_scores['mae'] < rf_reg_mae:
    chosen_reg, chosen_reg_name = best_reg, "LightGBM Regressor"
    chosen_reg_scores = _lgb_reg_scores
else:
    chosen_reg, chosen_reg_name = rf_reg, "RandomForest Regressor"
    chosen_reg_scores = {'mae': rf_reg_mae, 'rmse': rf_reg_rmse}

# Classifier: take LightGBM when it has an accuracy that is at least as high
# as RandomForest's (ties go to LightGBM).
_lgb_clf_acc = results_summary.get('lgb_clf', {}).get('acc')
if _lgb_clf_acc is not None and _lgb_clf_acc >= rf_clf_acc:
    chosen_clf, chosen_clf_name = best_clf, "LightGBM Classifier"
else:
    chosen_clf, chosen_clf_name = rf_clf, "RandomForest Classifier"

# Hold-out predictions of the selected models, reused for the reports here
# and for the evaluation file written later.
y_pred_chosen_reg = chosen_reg.predict(X_test)
y_pred_chosen_clf = chosen_clf.predict(X_test)

print(f"üîî Final chosen regressor: {chosen_reg_name}")
print(f"Final regressor MAE: {mean_absolute_error(y_test_reg, y_pred_chosen_reg):.3f}, RMSE: {rmse(y_test_reg, y_pred_chosen_reg):.3f}")
print(f"üîî Final chosen classifier: {chosen_clf_name}")
print("Classification report (chosen classifier):")

# Pass every encoded label explicitly so classes that are missing from the
# hold-out rows still get a row in the report.
_all_label_codes = np.arange(len(le_light.classes_))
_clf_report = classification_report(
    y_test_clf,
    y_pred_chosen_clf,
    labels=_all_label_codes,
    target_names=le_light.classes_,
)
print(_clf_report)

# Permutation importance of both chosen models, measured on the hold-out rows.
perm_reg = permutation_importance(chosen_reg, X_test, y_test_reg, n_repeats=20, random_state=RANDOM_STATE, n_jobs=1)
perm_clf = permutation_importance(chosen_clf, X_test, y_test_clf, n_repeats=20, random_state=RANDOM_STATE, n_jobs=1)

imp_reg = pd.Series(perm_reg.importances_mean, index=FEATURES).sort_values(ascending=False)
imp_clf = pd.Series(perm_clf.importances_mean, index=FEATURES).sort_values(ascending=False)

# Fall back to the model's own importances only when every permutation
# importance is exactly zero. Checking the sum instead would also trigger
# when positive and negative importances happen to cancel out.
if (imp_clf == 0).all():
    print("‚ö†Ô∏è Classifier permutation importances all zero ‚Äî using built-in feature_importances_.")
    builtin_importances = getattr(chosen_clf, "feature_importances_", None)
    if builtin_importances is None:
        # Say so instead of silently keeping the all-zero series.
        print("Chosen classifier has no feature_importances_; keeping permutation importances.")
    else:
        imp_clf = pd.Series(builtin_importances, index=FEATURES).sort_values(ascending=False)

# One horizontal bar chart per model showing its 12 highest-ranked features,
# written as PNG files under OUT_DIR/plots.
for _importances, _title, _filename in (
    (imp_reg, "Top Features ‚Äî Regressor (permutation)", "feature_importance_reg.png"),
    (imp_clf, "Top Features ‚Äî Classifier (permutation or builtin)", "feature_importance_clf.png"),
):
    plt.figure(figsize=(8, 6))
    _importances.head(12).sort_values().plot.barh()
    plt.title(_title)
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "plots", _filename))
    plt.close()


# Persist the chosen models, the lighting-demand encoder and the encoder of
# every categorical column.
joblib.dump(chosen_reg, os.path.join(OUT_DIR, "models", "chosen_regressor.pkl"))
joblib.dump(chosen_clf, os.path.join(OUT_DIR, "models", "chosen_classifier.pkl"))
joblib.dump(le_light, os.path.join(OUT_DIR, "models", "label_encoder_lighting.pkl"))
for k, v in label_encoders.items():
    joblib.dump(v, os.path.join(OUT_DIR, "models", f"label_encoder_{k}.pkl"))

# Evaluation file: the hold-out features next to the true and predicted
# targets, with lighting demand decoded to its original labels.
eval_df = X_test.copy()
eval_df['true_vehicle_next'] = y_test_reg.values
eval_df['pred_vehicle_next'] = y_pred_chosen_reg
eval_df['true_lighting'] = le_light.inverse_transform(y_test_clf.values)
eval_df['pred_lighting'] = le_light.inverse_transform(y_pred_chosen_clf)
eval_df.to_csv(os.path.join(OUT_DIR, "evaluation_predictions.csv"), index=False)

print("‚úÖ All outputs saved to:", OUT_DIR)
print("- Models:", os.listdir(os.path.join(OUT_DIR, "models")))
print("- Plots:", os.listdir(os.path.join(OUT_DIR, "plots")))
print("- Eval file:", os.path.join(OUT_DIR, "evaluation_predictions.csv"))


def predict_new_simple(input_dict):
    """Predict the next vehicle count and the lighting demand for one record.

    ``input_dict`` must provide a value for every name in ``FEATURES``.
    Returns a dict with ``pred_vehicle_next`` (a float) and ``pred_lighting``
    (the class label decoded through ``le_light``).
    """
    row = pd.DataFrame([input_dict])[FEATURES]
    vehicle_next = chosen_reg.predict(row)[0]

    # The classifier outputs an encoded class; map it back to its label.
    lighting_code = int(chosen_clf.predict(row)[0])
    lighting_label = le_light.inverse_transform([lighting_code])[0]
    return {
        'pred_vehicle_next': float(vehicle_next),
        'pred_lighting': lighting_label,
    }


# Smoke test: run the helper on the final hold-out row, if there is one.
if len(X_test):
    example = X_test.iloc[-1].to_dict()
    _example_prediction = predict_new_simple(example)
    print("\nExample prediction (last test row):", _example_prediction)



# In [None]:
# milestone4_streaming_predictor_v1.py
"""
Milestone 4 — Real-Time Streaming Predictor (Original Strong Model)
Reads live data from the Kafka input topic (INPUT_TOPIC, 'traffic_topic'),
predicts vehicle count next & lighting demand,
and sends predictions to 'traffic-predictions' topic.
"""

import json
import joblib
import pandas as pd
import os 
from kafka import KafkaConsumer, KafkaProducer
from datetime import datetime


# Kafka connection settings: broker address, topic to read, topic to write.
KAFKA_BROKER = "kafka:9092"
INPUT_TOPIC = "traffic_topic"
OUTPUT_TOPIC = "traffic-predictions"

# Folder holding the pickled models, and the three artifacts loaded from it.
MODEL_DIR = "milestone4_outputs_advanced/models"
REGRESSOR_PATH, CLASSIFIER_PATH, ENCODER_LIGHT_PATH = (
    os.path.join(MODEL_DIR, artifact_name)
    for artifact_name in (
        "chosen_regressor.pkl",
        "chosen_classifier.pkl",
        "label_encoder_lighting.pkl",
    )
)



# Load the regressor, the classifier and the lighting-demand label encoder.
# NOTE(review): joblib.load unpickles the files, which can execute arbitrary
# code; only point MODEL_DIR at artifacts from a trusted source.
print("üì¶ Loading trained models...")
try:
    regressor = joblib.load(REGRESSOR_PATH)
    classifier = joblib.load(CLASSIFIER_PATH)
    le_light = joblib.load(ENCODER_LIGHT_PATH)
    print("‚úÖ Models loaded successfully.")
except FileNotFoundError as e:
    print(f"‚ùå FileNotFoundError: Failed to load models. Check the path: {e}")
    # Stop with a non-zero status so the failure is visible to whatever
    # launched the script. SystemExit is raised directly because the
    # interactive exit() helper is not guaranteed to exist and reports
    # success (status 0).
    raise SystemExit(1)


def _decode_json(raw):
    """Parse the UTF-8 encoded JSON bytes of a message value."""
    return json.loads(raw.decode("utf-8"))


def _encode_json(obj):
    """Serialize ``obj`` to UTF-8 encoded JSON bytes."""
    return json.dumps(obj).encode("utf-8")


print(f"üîå Connecting to Kafka broker at {KAFKA_BROKER}...")

# Consumer for the input topic; message values are decoded from JSON.
consumer = KafkaConsumer(
    INPUT_TOPIC,
    bootstrap_servers=KAFKA_BROKER,
    value_deserializer=_decode_json,
)

# Producer for the predictions; values are encoded to JSON.
producer = KafkaProducer(
    bootstrap_servers=KAFKA_BROKER,
    value_serializer=_encode_json,
)

print(f"‚úÖ Connected to Kafka ‚Äî Listening for messages on topic '{INPUT_TOPIC}'...")
print("-----------------------------------------------------------")


# Feature columns, in the order they are handed to the models: raw
# measurements, calendar features, rolling mean, lagged counts and the
# integer-encoded categorical columns.
FEATURES = (
    ['vehicle_count', 'vehicle_speed', 'solar_energy_level']
    + ['hour', 'weekday', 'is_weekend', 'veh_roll_3']
    + [f'veh_count_lag_{step}' for step in (1, 2, 3)]
    + [f'{col}_enc' for col in ('street_name', 'light_level', 'weather', 'traffic_light')]
)


def prepare_input(data, features=None):
    """Build a one-row DataFrame holding the model features of one record.

    Args:
        data: Mapping of field name to value for a single record. It is only
            read, never modified.
        features: Column names to extract, in order. Defaults to the
            module-level ``FEATURES`` list.

    Returns:
        A ``pandas.DataFrame`` with exactly one row and one column per
        feature name. A feature that is absent from ``data`` or set to
        ``None`` is filled with ``0``.
    """
    columns = FEATURES if features is None else list(features)
    # Collect the values in a fresh dict so the caller's record is left as is.
    row = {
        name: 0 if data.get(name) is None else data[name]
        for name in columns
    }
    return pd.DataFrame([row], columns=columns)

# Main loop: publish one prediction message per consumed record. A failure
# while handling a record is reported and the loop moves on to the next one.
for record in consumer:
    try:
        data = record.value
        print(f"\nüì• New data received: {data}")

        # Records that arrive without a timestamp get the current local time.
        stamp = data.get("timestamp", datetime.now().isoformat())
        data["timestamp"] = stamp

        features_row = prepare_input(data)

        # Regression output: predicted next vehicle count.
        pred_vehicle_next = float(regressor.predict(features_row)[0])

        # Classification output: encoded class mapped back to its label.
        encoded_label = classifier.predict(features_row).astype(int)
        pred_lighting = le_light.inverse_transform(encoded_label)[0]

        result = {
            "timestamp": data["timestamp"],
            "street_name": data.get("street_name", "unknown"),
            "predicted_vehicle_count_next": round(pred_vehicle_next, 2),
            "predicted_lighting_demand": pred_lighting,
        }

        # Send and flush right away so each prediction leaves immediately.
        producer.send(OUTPUT_TOPIC, value=result)
        producer.flush()

        print(f"üì° Sent prediction ‚Üí Topic '{OUTPUT_TOPIC}': {result}")

    except Exception as e:
        print(f"‚ùå Error processing message: {e}")