In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
from src.config import SPECIFIC_INCIDENT_NUMS
import matplotlib.pyplot as plt
import seaborn as sns

# Display tweaks: never truncate the column list and never wrap wide frames
# across several printed blocks.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

In [None]:
# Set output dir + helper
from pathlib import Path
# Folder that receives every exported EDA figure; it (and any missing parent)
# is created on first run and silently reused afterwards.
EDA_PLOTS_DIR = Path('visualizations') / 'eda'
EDA_PLOTS_DIR.mkdir(exist_ok=True, parents=True)

def save_and_close(fig, filename):
    """Write ``fig`` to ``EDA_PLOTS_DIR`` as a PDF, then close it.

    Parameters
    ----------
    fig : matplotlib figure
        The figure to export. It is closed afterwards so figures created in
        a loop do not accumulate.
    filename : str or path-like
        Only the final path component is used, and its suffix (if any) is
        replaced with ``.pdf``.
    """
    target = EDA_PLOTS_DIR / Path(filename).with_suffix('.pdf').name
    fig.tight_layout()
    fig.savefig(target, format='pdf', bbox_inches='tight')
    plt.close(fig)
    print(f"Saved: {target}")

In [None]:
# Load the full traffic table plus, for every selected incident, its raw and
# smoothed CSV export. The two lists are index-aligned with
# SPECIFIC_INCIDENT_NUMS (same iteration order).
#
# Paths are relative instead of pointing into one user's home directory, so
# the notebook runs on any checkout.
# NOTE(review): assumes the working directory is the repository root, which
# the relative parquet path below already requires — confirm.
INCIDENT_FILES_DIR = Path('data/incident_files')

raw_specific_df = []
smoothed_specific_df = []
traffic_parquet = pd.read_parquet(Path('data/traffic_data.parquet'))

for num in SPECIFIC_INCIDENT_NUMS:
    incident_file = f'incident_{num}.csv'
    raw_specific_df.append(pd.read_csv(INCIDENT_FILES_DIR / 'raw' / incident_file))
    smoothed_specific_df.append(pd.read_csv(INCIDENT_FILES_DIR / 'smoothed' / incident_file))

print(f"len(raw_specific_df): {len(raw_specific_df)}")
print(f"len(smoothed_specific_df): {len(smoothed_specific_df)}")
print(f"len(traffic_parquet): {len(traffic_parquet)}")


In [None]:
# Peek at the first five rows of the full traffic table.
traffic_parquet.head(5)

Check if there are any NULL values in the CSV files.

In [None]:
# Sanity-check every loaded incident frame: report frames containing NULLs and
# frames whose row count differs from the expected length.
#
# The frames are paired with SPECIFIC_INCIDENT_NUMS so each message names the
# incident it is actually about; the lists were filled in that same order when
# the files were loaded.
EXPECTED_ROWS = 3025  # row count every incident file is expected to have

null_count = 0  # counts (file set, incident) pairs that contain at least one NULL
for label, frames in (("Raw", raw_specific_df), ("Smoothed", smoothed_specific_df)):
    for num, df in zip(SPECIFIC_INCIDENT_NUMS, frames):
        if df.isnull().values.any():
            print(f"Incident {num} contains NULL values.")
            null_count += 1
        if len(df) != EXPECTED_ROWS:
            print(f"Incident {num} has {len(df)} rows instead of {EXPECTED_ROWS}.")
    print(f"{label} check complete.")

if null_count == 0:
    print("No NULL values found in any of the selected incident files.")
else:
    print(f"NULL values found in {null_count} incident files.")

Summary statistics before and after smoothing

In [None]:
# Mean / median / std, printed for three views of the data:
#   1. the full traffic parquet,
#   2. the unsmoothed columns of the selected incident files,
#   3. the smoothed columns of the same files.
stats = ['mean', 'median', 'std']
raw_features = ['occ', 'speed', 'flow']
smoothed_features = ['occ_smoothed', 'speed_smoothed']
separator = '------------------------------------------'

# Later cells reuse this concatenated frame, so the name must stay as is.
smoothed_specific_df_concat = pd.concat(smoothed_specific_df, ignore_index=True)
# Report the number of files actually loaded rather than a hard-coded count.
n_files = len(smoothed_specific_df)

print("Summary for the parquet file:")
print(traffic_parquet[raw_features].agg(stats))
print(separator)
# NOTE(review): the "before" figures are taken from the unsmoothed columns of
# the smoothed files, not from raw_specific_df — presumably identical; confirm.
print(f"Summary for the {n_files} selected files (BEFORE SMOOTHING):")
print(smoothed_specific_df_concat[raw_features].agg(stats))
print(separator)
print(f"Summary for the {n_files} selected files (AFTER SMOOTHING):")
print(smoothed_specific_df_concat[smoothed_features].agg(stats))

In [None]:
# One density histogram (with KDE overlay) per traffic variable, each saved
# as its own PDF.
vars_to_plot = ['occ', 'speed', 'flow']
labels = {'speed': 'Speed', 'occ': 'Occupancy', 'flow': 'Flow'}

for col in vars_to_plot:
    pretty = labels[col]
    # Non-numeric entries become NaN and are dropped before plotting.
    values = pd.to_numeric(traffic_parquet[col], errors='coerce').dropna()

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(
        values,
        kde=True,
        stat='density',
        bins=40,
        edgecolor='none',
        alpha=0.6,
        ax=ax,
    )
    # Second, thin black KDE drawn on top of the histogram's own curve.
    sns.kdeplot(values, color='black', lw=1.2, ax=ax)
    ax.set(title=f"Distribution of {pretty}", xlabel=pretty, ylabel="Density")
    save_and_close(fig, f"distribution_{col}")

In [None]:
# Average speed / occupancy / flow per hour of day, drawn as separate
# weekday and weekend curves (one saved figure per variable).
metric_cols = ['speed', 'occ', 'flow']
labels = {'speed': 'Speed', 'occ': 'Occupancy', 'flow': 'Flow'}

df_hourly = traffic_parquet.copy()
# Unparseable timestamps / readings are coerced to NaT / NaN instead of raising.
df_hourly['timestamp'] = pd.to_datetime(df_hourly['timestamp'], errors='coerce')
df_hourly[metric_cols] = df_hourly[metric_cols].apply(pd.to_numeric, errors='coerce')

stamps = df_hourly['timestamp'].dt
df_hourly['hour'] = stamps.hour
# pandas numbers days Monday=0 .. Sunday=6, so >= 5 selects Saturday and Sunday.
df_hourly['is_weekend'] = np.where(stamps.dayofweek >= 5, 'Weekend', 'Weekday')

agg_hour = (
    df_hourly
    .groupby(['hour', 'is_weekend'])[metric_cols]
    .mean()
    .reset_index()
)

for metric in metric_cols:
    pretty = labels[metric]
    unit = ' (mph)' if metric == 'speed' else ''

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.lineplot(data=agg_hour, x='hour', y=metric, hue='is_weekend', marker='o', ax=ax)
    ax.set_xticks(range(0, 24, 2))
    ax.set_xlabel('Hour of Day')
    ax.set_ylabel(f'Average {pretty}{unit}')
    ax.set_title(f'Average {pretty} by Hour (Weekday vs Weekend)')
    ax.grid(True, linestyle='--', alpha=0.3)
    ax.legend(title='')
    save_and_close(fig, f"hourly_weekday_weekend_{metric}")

In [None]:
# Average speed / occupancy / flow for each day of the week
# (one saved figure per variable).
metric_cols = ['speed', 'occ', 'flow']
labels = {'speed': 'Speed', 'occ': 'Occupancy', 'flow': 'Flow'}
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

df_daily = traffic_parquet.copy()
# Unparseable timestamps / readings are coerced to NaT / NaN instead of raising.
df_daily['timestamp'] = pd.to_datetime(df_daily['timestamp'], errors='coerce')
df_daily[metric_cols] = df_daily[metric_cols].apply(pd.to_numeric, errors='coerce')

# An ordered categorical keeps the x axis in calendar order rather than
# alphabetical order.
day_names = df_daily['timestamp'].dt.day_name()
df_daily['weekday'] = pd.Categorical(day_names, categories=weekday_order, ordered=True)

agg_day = (
    df_daily
    .groupby('weekday', observed=True)[metric_cols]
    .mean()
    .reset_index()
)

for metric in metric_cols:
    pretty = labels[metric]
    unit = ' (mph)' if metric == 'speed' else ''

    fig, ax = plt.subplots(figsize=(7, 4))
    sns.lineplot(data=agg_day, x='weekday', y=metric, marker='o', ax=ax, color='#4C72B0')
    ax.set_xlabel('Day of Week')
    ax.set_ylabel(f'Average {pretty}{unit}')
    ax.set_title(f'Average {pretty} by Day of Week')
    ax.tick_params(axis='x', rotation=30)
    ax.grid(axis='y', linestyle='--', alpha=0.3)
    save_and_close(fig, f"weekday_average_{metric}")

In [None]:
# Pearson correlation between speed, occupancy and flow over the full
# traffic table, rendered as an annotated heatmap and saved as a PDF.
numeric_cols = ['speed', 'occ', 'flow']
df_corr = traffic_parquet[numeric_cols].apply(pd.to_numeric, errors='coerce')
corr = df_corr.corr(method='pearson')

heatmap_kwargs = dict(
    annot=True,
    fmt='.2f',
    cmap='vlag',
    vmin=-1,  # fix the colour scale to the full correlation range
    vmax=1,
    square=True,
    cbar_kws={'shrink': 0.8},
)

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(corr, ax=ax, **heatmap_kwargs)
ax.set_title('Correlation Matrix')
fig.tight_layout()
save_and_close(fig, "correlation_matrix")

Check if there are any streams with multiple station IDs

In [None]:
# Report every smoothed incident frame whose station_id column does not hold
# exactly one distinct value.
for stream_df in smoothed_specific_df:
    station_ids = stream_df["station_id"].unique()
    if len(station_ids) == 1:
        continue
    print(f"Stream {stream_df['stream_id'].iloc[0]} has multiple station IDs: {station_ids}")

___

In [None]:
# Mean speed / occupancy / flow for incident vs non-incident rows of the
# selected files, one saved bar chart per variable.

# Rebuild the concatenated frame only if an earlier cell has not created it.
if 'smoothed_specific_df_concat' not in globals():
    smoothed_specific_df_concat = pd.concat(smoothed_specific_df, ignore_index=True)

metric_cols = ['speed', 'occ', 'flow']
needed_cols = metric_cols + ['is_incident']
labels = {'speed': 'Speed', 'occ': 'Occupancy', 'flow': 'Flow'}

df_cmp = smoothed_specific_df_concat[needed_cols].copy()
df_cmp[metric_cols] = df_cmp[metric_cols].apply(pd.to_numeric, errors='coerce')
# Rows missing any metric or the label are excluded from the averages.
df_cmp = df_cmp.dropna(subset=needed_cols)
df_cmp['is_incident'] = df_cmp['is_incident'].astype(int)

agg = df_cmp.groupby('is_incident')[metric_cols].mean().reset_index()

for metric in metric_cols:
    pretty = labels[metric]
    unit = ' (mph)' if metric == 'speed' else ''

    fig, ax = plt.subplots(figsize=(5, 4))
    # NOTE(review): the palette keys are strings while is_incident is cast to
    # int above; whether seaborn matches them depends on its version — confirm.
    sns.barplot(data=agg, x='is_incident', y=metric, ax=ax, palette={'0': '#4C72B0', '1': '#DD8452'})
    ax.set_xlabel('is_incident')
    ax.set_ylabel(f'Average {pretty}{unit}')
    ax.set_title(f'Average {pretty} by Incident Status')
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['0', '1'])
    ax.grid(axis='y', linestyle='--', alpha=0.3)
    save_and_close(fig, f"incident_status_average_{metric}")

Demonstrate the class imbalance

In [None]:
# Share of each is_incident value across all selected files, as fractions of 1.
class_counts = smoothed_specific_df_concat['is_incident'].value_counts(normalize=True)
print("Class distribution (percentage):")
# Scale the fractions to percentages for display only.
print(class_counts.mul(100))

In [None]:
# KDE comparison of speed and occupancy before vs after smoothing.
# Both figures share one helper instead of two copy-pasted sections.

# Rebuild the concatenated frames only if an earlier cell has not created them.
if 'raw_specific_df_concat' not in globals():
    raw_specific_df_concat = pd.concat(raw_specific_df, ignore_index=True)
if 'smoothed_specific_df_concat' not in globals():
    smoothed_specific_df_concat = pd.concat(smoothed_specific_df, ignore_index=True)


def _plot_kde_before_after(before, after, label, filename):
    """Overlay the KDE of ``before`` and ``after`` and save the figure.

    Parameters
    ----------
    before, after : pandas.Series
        NaN-free numeric samples from the raw and the smoothed data.
    label : str
        Human-readable variable name used in the title and the x-axis label.
    filename : str
        Output name handed to ``save_and_close``.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    # Clip the x axis to the central 99% of the pooled samples so extreme
    # values do not stretch the plot.
    lower, upper = np.nanquantile(pd.concat([before, after]), [0.005, 0.995])
    sns.kdeplot(before, ax=ax, lw=2, linestyle='--', color='#4C72B0', label='Before smoothing')
    sns.kdeplot(after, ax=ax, lw=2, linestyle='-', color='#DD8452', label='After smoothing')
    ax.set_xlim(lower, upper)
    ax.set_title(f'KDE: {label} (Before vs After)')
    ax.set_xlabel(label)
    ax.set_ylabel('Density')
    ax.legend()
    save_and_close(fig, filename)


# Rows with a missing value in either column are dropped here, so the
# individual series need no further NaN handling.
raw = raw_specific_df_concat[['speed', 'occ']].apply(pd.to_numeric, errors='coerce').dropna()
smt = smoothed_specific_df_concat[['speed_smoothed', 'occ_smoothed']].apply(pd.to_numeric, errors='coerce').dropna()

_plot_kde_before_after(raw['speed'], smt['speed_smoothed'], 'Speed', "kde_before_after_speed")
_plot_kde_before_after(raw['occ'], smt['occ_smoothed'], 'Occupancy', "kde_before_after_occupancy")

In [None]:
# Side-by-side Pearson correlation heatmaps of (speed, occ): raw data on the
# left, smoothed data on the right. The figure is shown inline, not saved.

# Rebuild the concatenated frames only if an earlier cell has not created them.
if 'raw_specific_df_concat' not in globals():
    raw_specific_df_concat = pd.concat(raw_specific_df, ignore_index=True)
if 'smoothed_specific_df_concat' not in globals():
    smoothed_specific_df_concat = pd.concat(smoothed_specific_df, ignore_index=True)

before = raw_specific_df_concat[['speed', 'occ']].apply(pd.to_numeric, errors='coerce').dropna()
after = smoothed_specific_df_concat[['speed_smoothed', 'occ_smoothed']].apply(pd.to_numeric, errors='coerce').dropna()

corr_before = before.corr(method='pearson')
# Strip the "_smoothed" suffix so both panels carry identical tick labels.
corr_after = after.rename(columns={'speed_smoothed': 'speed', 'occ_smoothed': 'occ'}).corr(method='pearson')

shared_kwargs = dict(annot=True, vmin=-1, vmax=1, cmap='vlag', square=True, fmt='.2f')
# (matrix, panel title, draw colour bar?) — only the right panel gets the bar.
panels = [
    (corr_before, 'Before smoothing', False),
    (corr_after, 'After smoothing', True),
]

fig, axes = plt.subplots(1, 2, figsize=(8, 3.5))
for panel_ax, (matrix, title, show_cbar) in zip(axes, panels):
    sns.heatmap(matrix, ax=panel_ax, cbar=show_cbar, **shared_kwargs)
    panel_ax.set_title(title)
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Hard-coded numeric values, one per line, kept as a single string with the
# surrounding blank lines stripped. The code below parses them into floats and
# plots them as a histogram of speeds.
# NOTE(review): where these numbers come from is not recorded here; the plot
# title only refers to "cell 23" — confirm the source and consider loading
# them from data instead of pasting them.
values_str = """
64.0428497501546
66.4845064566845
60.5140244022717
67.7287537670514
65.6851810433206
63.9293956532458
64.8880368312117
64.4344827183552
63.160430650206
64.3010014916196
63.5298288661391
65.4878787954852
63.3840790634106
62.7551380905519
60.4307449168042
63.2248760164877
64.5592665882068
65.9933374512893
65.4536705879877
67.5287936650416
68.1797520222228
64.9101406605332
66.8771616193769
62.7757056789689
66.5119104851205
69.6110817845193
69.0628293790384
67.0869408579196
64.5468199685855
62.5584791504994
63.9110417171196
67.9880163971468
64.6771249285006
66.879693670052
65.7661157462282
65.2483861811579
67.5642717927854
68.1061828511886
60.6714687196863
68.8630242464429
63.987935787281
65.4693940806477
68.4600535180971
66.2991038692718
66.931539176185
64.8136565488189
69.8843671416875
65.8306200037577
64.2641783529849
66.9613930923003
63.6472694770959
68.4100516324664
63.1370529539883
65.6117714013936
64.3430972060437
67.9986044168803
66
67.4426360974599
66.6957622903781
66.6181003974595
66.7156932352367
69.3355991880659
64.7059376430073
64.1427634264825
64.0374094340654
63.2821192980197
62.0792710835178
63.2694228089777
61.5802045953994
66.9575346221104
63.7788666253082
61.5095972088597
62.66741199212
58.2865464895981
63.5986122533404
63.0044750002337
64.1749180837489
62.9036882875965
61.5027500535088
61.317269693735
63.5855002685223
65.3863814390491
67.4930223136056
72.1703576357762
61.8797252827615
61.2228122550815
67.7387872807456
63.6196004594271
65.7014160885027
64.7312818557198
64.3007042342333
66.7198485640535
63.5755310922839
64.4732986335181
66.6691529936942
65.7624580333693
66.0524459337248
65.2569537532023
68.0372311302682
""".strip()

# Parse one float per non-blank line of the pasted block.
vals = list(map(float, filter(str.strip, values_str.splitlines())))

hist_style = dict(
    bins=30,
    stat='count',
    color='#4C72B0',
    edgecolor='white',
    alpha=0.85,
    kde=True,
)

# Count histogram with a KDE overlay; the figure is shown inline, not saved.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(vals, ax=ax, **hist_style)
ax.set_title('Histogram of speeds (cell 23)')
ax.set_xlabel('Speed (mph)')
ax.set_ylabel('Count')

In [None]:
# Report the mean and median of the flow column in data/traffic_data.parquet.
traffic_data = pd.read_parquet(Path('data/traffic_data.parquet'))
flow = traffic_data['flow']
mean_flow, median_flow = flow.mean(), flow.median()
print(f"Mean flow: {mean_flow}\nMedian flow: {median_flow}")