# CS 163 Final Project EDA Summary

In [None]:
# ================== IMPORT LIBRARIES ==================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from fpdf import FPDF
import plotly.express as px
import plotly.graph_objects as go
import folium
from folium.plugins import HeatMap
import pandas as pd

In [None]:
# ================== SECTION 1: Load Dataset ==================
import os

# Location of the raw accidents CSV. When the US_ACCIDENTS_CSV environment
# variable is set it takes precedence, so the notebook can run on another
# machine without editing this cell; otherwise the original path is used.
file_path = os.environ.get(
    "US_ACCIDENTS_CSV",
    "C:/Users/sherv/Desktop/SP25/CS163/US_Accidents_March23.csv",
)

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
df_original = pd.read_csv(file_path, low_memory=False)

# Working copy for the cleaning steps below; df_original stays as loaded here.
df = df_original.copy()

In [None]:
# ================== SECTION 2: Data Cleaning & Preprocessing ==================
# Columns used for the weather / severity analysis
columns_to_keep = ["Severity", "State", "Temperature(F)", "Humidity(%)",
                   "Visibility(mi)", "Precipitation(in)", "Weather_Condition",
                   "Pressure(in)", "Wind_Speed(mph)"]

# Select from df_original (not from whatever df currently holds) so that
# re-running this cell always starts from the same input.
df_selected = df_original[columns_to_keep]

# Per-column NaN counts, taken before any row is removed
missing_values = df_selected.isnull().sum()
print("\nMissing Values Before Cleaning:\n", missing_values)

# Keep only rows that are complete in every selected column. dropna() returns
# a new frame, which avoids calling dropna(inplace=True) on a column slice.
df = df_selected.dropna()

# Check the dataset size after dropping missing values
print(f"\nRemaining Rows After Dropping NaN: {df.shape[0]:,}")

# ================== SECTION 3: Descriptive Statistics ==================
# Ensure pandas doesn't truncate output
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.float_format", "{:,.2f}".format)  # Avoid scientific notation

# describe() output with every value rendered as text: thousands separators
# and two decimals
summary_stats = df.describe().apply(lambda x: x.map(lambda y: f"{y:,.2f}"))
print("\nSummary Statistics:\n", summary_stats.to_string())  # Force full output

# Number of rows per weather condition, most frequent first
weather_condition_counts = df["Weather_Condition"].value_counts()

# to_string() prints every condition instead of a truncated preview
print("\nWeather Condition Counts:\n", weather_condition_counts.to_string())

In [None]:
# ================== SECTION 3: Descriptive Statistics ==================
def _two_decimals(value):
    """Render a number with thousands separators and two decimal places."""
    return f"{value:,.2f}"


# describe() table with every cell converted to formatted text
summary_stats = df.describe().apply(lambda column: column.map(_two_decimals))
print("Summary Statistics:\n", summary_stats)

# Make pandas display floats with separators and two decimals from here on
pd.set_option("display.float_format", "{:,.2f}".format)

The average temperature during accidents is 61.4°F, with humidity averaging 65.9% and low precipitation (mean = 0.0085 inches). The correlation matrix shows only weak correlations between severity and environmental factors (e.g., precipitation, temperature, humidity), indicating that these variables alone are not strong predictors of accident severity. However, humidity and visibility are negatively correlated (-0.41): higher humidity tends to coincide with lower visibility, which could indirectly affect accident rates.



In [None]:
# ================== SECTION 4: Data Visualizations ==================
# 1️⃣ Accident Severity Distribution Bar Chart
# Number of accidents per severity level, ordered by level
severity_counts = df["Severity"].value_counts().sort_index()

# Create a bar chart
plt.figure(figsize=(8, 5))
sns.barplot(x=severity_counts.index, y=severity_counts.values, palette="Blues")

# Annotate bars with exact counts, placed slightly above each bar
for i, count in enumerate(severity_counts.values):
    plt.text(i, count + 10000, f"{count:,}", ha="center", fontsize=12)

# Labels and title
plt.title("Accident Severity Distribution", fontsize=14)
plt.xlabel("Severity Level", fontsize=12)
plt.ylabel("Number of Accidents", fontsize=12)

# Build the tick labels from the levels actually present in the data, so the
# labels stay aligned with the bars even if a level is missing after cleaning.
severity_names = {1: "1 (Low)", 2: "2 (Moderate)", 3: "3 (High)", 4: "4 (Severe)"}
tick_labels = [severity_names.get(int(level), str(level)) for level in severity_counts.index]
plt.xticks(ticks=list(range(len(tick_labels))), labels=tick_labels)

# Display the figure (nothing is written to disk)
plt.show()


The Accident Severity Distribution bar chart reveals that Severity Level 2 (Moderate) accounts for the majority of accidents (~3.9 million), followed by Severity Level 3 (High) and a much smaller proportion of Severity Level 1 (Low) and Level 4 (Severe). This suggests that most accidents cause moderate disruptions to traffic flow rather than extreme consequences.

## Key Heatmap Insights:
### Severity vs. Other Factors:

Severity has weak correlations with all other features (-0.03 to 0.04), meaning accident severity isn't strongly dependent on temperature, humidity, visibility, or precipitation.

### Temperature(F) vs. Humidity(%) (-0.34)

Moderate negative correlation: as temperature increases, relative humidity tends to decrease (warmer air can hold more moisture, so the same amount of water vapor gives a lower relative humidity).

### Humidity(%) vs. Visibility(mi) (-0.41)

Moderate negative correlation: higher humidity is associated with lower visibility (likely due to fog, mist, or rain).

### Visibility(mi) vs. Precipitation(in) (-0.12)

Slight negative correlation: increased precipitation (rain/snow) slightly reduces visibility, but it’s not a strong effect.

### Precipitation(in) vs. Severity (0.02)

No significant correlation: higher precipitation doesn't strongly impact accident severity.

### What This Means for our EDA
- There is no strong predictor of accident severity among these weather variables.
- Humidity and visibility are more closely related to each other, which makes sense since fog and precipitation affect visibility.
- We may need to explore other factors (e.g., road conditions, time of day, traffic volume) to better predict accident severity.


In [None]:
# 3️⃣ Scatter plot: Precipitation vs. Severity

fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x="Precipitation(in)",
    y="Severity",
    alpha=0.5,
    color="blue",
    ax=ax,
)

# Logarithmic x axis with hand-picked tick positions and labels.
# NOTE(review): rows with zero precipitation have no finite position on a log
# axis — confirm how they are rendered.
ax.set_xscale("log")
precip_ticks = [0.1, 1, 5, 10, 20, 50]
ax.set_xticks(precip_ticks)
ax.set_xticklabels(["0.1", "1", "5", "10", "20", "50"])

# Titles and axis labels
ax.set_title("Precipitation vs Severity (Log Scale)", fontsize=14)
ax.set_xlabel("Precipitation (inches, log scale)", fontsize=12)
ax.set_ylabel("Severity", fontsize=12)

# Dashed grid on both major and minor ticks
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
plt.show()

The Precipitation vs. Severity (Log Scale) scatter plot shows that most accidents occur at low precipitation levels (<1 inch), with a small number of cases at higher precipitation levels. Since Severity is an ordinal variable with only four levels (1-4), the points fall on four horizontal bands, and the log scale spreads out the low precipitation values so that each band can be compared across precipitation levels. The lack of a strong pattern suggests that precipitation alone does not strongly determine accident severity.

In [None]:
# 4️⃣ Bar Chart: Accidents by State
import matplotlib.ticker as mticker


def _with_thousands_separator(value, _position):
    """Tick formatter: integer part of the tick value with comma separators."""
    return f'{int(value):,}'


# Accident totals per state, largest first
state_counts = df["State"].value_counts().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=state_counts.index, y=state_counts.values, palette="viridis", ax=ax)

# Y ticks every 100,000, extending one step past the tallest bar
tick_step = 100000
ax.set_yticks(range(0, max(state_counts.values) + tick_step, tick_step))

# Plain comma-separated tick labels instead of scientific notation
ax.yaxis.set_major_formatter(mticker.FuncFormatter(_with_thousands_separator))

# Vertical state codes so all of them fit on the axis
plt.xticks(rotation=90)

# Write each state's count vertically just above its bar
for position, total in enumerate(state_counts.values):
    ax.text(position, total + 10000, f"{total:,}",
            ha="center", va="bottom", fontsize=9, rotation=90)

# Title and axis labels
ax.set_title("Accident Distribution by State", fontsize=14)
ax.set_xlabel("State", fontsize=12)
ax.set_ylabel("Number of Accidents", fontsize=12)

# Display the figure
plt.show()

The Accident Distribution by State bar chart highlights that California (CA), Florida (FL), and Texas (TX) have the highest number of recorded accidents, with California exceeding 1 million accidents. This trend suggests that states with high population density and urban traffic congestion tend to experience more accidents. Conversely, states like Vermont (VT), South Dakota (SD), and Wyoming (WY) have significantly lower accident counts, likely due to lower population density and fewer urban roadways.

In [None]:

# ================== Data Cleaning ==================
# Start coordinates of every accident that has both latitude and longitude
df_map = df_original[["Start_Lat", "Start_Lng"]].dropna()

# HeatMap takes a list of [lat, lon] pairs
heat_data = df_map.values.tolist()

# ================== Create Folium Heatmap ==================
# Map centered on the U.S. The attribution now credits the tile source that is
# actually requested (CartoDB Voyager); it previously named Stamen Terrain.
m = folium.Map(
    location=[37.8, -96],
    zoom_start=5,
    tiles="CartoDB Voyager",
    attr="&copy; OpenStreetMap contributors &copy; CARTO",
)

# Add heatmap layer
HeatMap(
    heat_data,
    radius=8,    # Size of each heat point
    blur=4,      # Amount of blur applied to each point
    max_zoom=10
).add_to(m)

# ================== Save and Display Map ==================
# Write the interactive map to a standalone HTML file
m.save("US_Accident_Heatmap.html")

# Last expression of the cell: renders the map inline in Jupyter
m

## General Conclusion from initial EDA

Overall, these findings suggest that urban density, traffic congestion, and other external factors likely have a more significant impact on accident frequency and severity than weather conditions alone. Further analysis could explore time-of-day trends, road conditions, and traffic volume to refine predictive insights. 🚗📊









# EDA Visualizations #


In [None]:
# Number of rows in the full dataset as loaded
df_original.shape[0]

In [None]:
import folium
from folium.plugins import HeatMap
import pandas as pd

# === Sample the dataset to reduce clutter ===
# Start coordinates of every accident that has both latitude and longitude
df_map = df_original[["Start_Lat", "Start_Lng"]].dropna()

# Upper bound on the number of plotted points. The request is capped at the
# number of rows available because DataFrame.sample raises ValueError when n
# exceeds the frame length (sampling is without replacement).
max_points = 3000000
df_sampled = df_map.sample(n=min(max_points, len(df_map)), random_state=42)

# HeatMap takes a list of [lat, lon] pairs
heat_data = df_sampled.values.tolist()

# === Create Heatmap ===
# The attribution credits the tile source that is actually requested
# (CartoDB Voyager); it previously named Stamen Terrain.
m = folium.Map(
    location=[37.8, -96],
    zoom_start=5,
    tiles="CartoDB Voyager",
    attr="&copy; OpenStreetMap contributors &copy; CARTO",
)

HeatMap(
    heat_data,
    radius=2,    # smaller radius = tighter points
    blur=2,      # lower blur = sharper edges
    max_zoom=10
).add_to(m)

# Write the interactive map to a standalone HTML file
m.save("US_Accident_Heatmap_Sampled.html")

# Last expression of the cell: renders the map inline in Jupyter
m


## Pre-Visualizations ##

In [None]:

# Weather measurements that must all be present for a row to be kept
weather_columns = [
    "Temperature(F)",
    "Humidity(%)",
    "Wind_Speed(mph)",
    "Pressure(in)",
    "Precipitation(in)",
    "Visibility(mi)"
]

# Output columns: identifier and label first, then the weather measurements
columns_to_keep = ["ID", "Severity"] + weather_columns

# Rows of the original frame with no missing weather value, restricted to the
# output columns
has_all_weather = df_original[weather_columns].notna().all(axis=1)
df_weather_subset = df_original.loc[has_all_weather, columns_to_keep]

# Write the subset to disk without the index column
df_weather_subset.to_csv("US_Accidents_Weather_Only_With_ID.csv", index=False)

# Report row counts before/after filtering and the exported columns
print(f"Original dataset size: {df_original.shape[0]} rows")
print(f"Filtered dataset size (weather + severity + ID only): {df_weather_subset.shape[0]} rows")
print(f"Columns in new dataset: {df_weather_subset.columns.tolist()}")

In [None]:
import re

# Pattern for highway mentions such as I-5, I 5, US-101, US 101, Hwy, highway.
# The group is non-capturing: str.contains only needs a yes/no answer, and a
# capturing group makes pandas emit a "pattern has match groups" warning.
highway_pattern = r'\b(?:I[-\s]?\d+|US[-\s]?\d+|Hwy|HWY|highway)\b'

# Boolean 'highway' column on df_original: True when the Description matches
# (case-insensitive); rows with a missing Description become False.
df_original['highway'] = df_original['Description'].str.contains(highway_pattern, flags=re.IGNORECASE, na=False)

# The flag column is boolean, so it can be used directly as a row mask
df_highway_only = df_original[df_original['highway']]

# Save the filtered data to CSV (optional)
df_highway_only.to_csv("Filtered_Highway_Accidents.csv", index=False)

# Preview the result
print(f"✅ Rows with highway mentions: {df_highway_only.shape[0]}")
print(df_highway_only[['ID', 'Description', 'highway']].head())


In [None]:
import re

# Highway markers looked for in the free-text description. The group is
# non-capturing so that pandas does not warn about match groups.
highway_pattern = r'\b(?:I[-\s]?\d+|US[-\s]?\d+|Hwy|HWY|highway)\b'

# Case-insensitive search; rows with a missing Description become False
description_text = df_original['Description']
mentions_highway = description_text.str.contains(highway_pattern, flags=re.IGNORECASE, na=False)

# Store the result as a True/False column on the original frame
df_original['highway'] = mentions_highway

# NOTE(review): later cells read "US_Accidents_With_Highway_Flag.csv", but no
# cell visible in this notebook writes that file — confirm where it is produced.


In [None]:
import pandas as pd
from scipy.stats import ttest_ind, f_oneway, pearsonr

# Load dataset
df = pd.read_csv("US_Accidents_With_Highway_Flag.csv")

# Every float64/int64 column except the label is treated as a feature
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns.tolist()
if "Severity" in numeric_cols:
    numeric_cols.remove("Severity")

# Keep only rows that are complete in the label and in every feature
df = df[["Severity"] + numeric_cols].dropna()

# Binary grouping for the t-test: Low (severity <= 2) vs High (everything else).
# Vectorized comparison instead of a Python-level apply over every row.
df["Severity_Group"] = df["Severity"].le(2).map({True: "Low", False: "High"})

# Row masks are identical for every feature, so build them once here instead
# of re-filtering the whole frame inside the loop.
severity = df["Severity"]
is_low = df["Severity_Group"] == "Low"
is_high = df["Severity_Group"] == "High"
level_masks = [severity == level for level in sorted(severity.unique())]

# One result row per feature
results = []

for col in numeric_cols:
    values = df[col]

    # Welch's t-test (equal_var=False) between Low and High severity
    t_stat, t_pval = ttest_ind(values[is_low], values[is_high], equal_var=False)

    # One-way ANOVA across the individual severity levels present in the data
    a_stat, a_pval = f_oneway(*(values[mask] for mask in level_masks))

    # Pearson correlation of the feature with the raw severity level
    corr, corr_pval = pearsonr(values, severity)

    # p-values are rounded to 5 decimals, so very small ones print as 0.0
    results.append({
        "Feature": col,
        "T-Test p-value": round(t_pval, 5),
        "ANOVA p-value": round(a_pval, 5),
        "Correlation (r)": round(corr, 3),
        "Correlation p-value": round(corr_pval, 5)
    })

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Display results
print("\nStatistical Test Results for All Numeric Features:\n")
print(results_df.to_string(index=False))


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# === Step 1: Load your dataset ===
df = pd.read_csv("US_Accidents_With_Highway_Flag.csv")

# === Step 2: Define features and target ===
features = [
    "Temperature(F)", "Humidity(%)", "Wind_Speed(mph)",
    "Pressure(in)", "Precipitation(in)", "Visibility(mi)", "highway"
]

# Drop rows with missing values in selected columns
df = df[["Severity"] + features].dropna()
df["highway"] = df["highway"].astype(int)

# Binary classification target: 0 = Low (1–2), 1 = High (3–4)
df["Severity_Binary"] = df["Severity"].apply(lambda x: 0 if x <= 2 else 1)

X = df[features]
y = df["Severity_Binary"]

# === Step 3: Train-test split (stratified on the binary target) ===
# Split BEFORE scaling so that no statistic of the held-out rows can leak into
# the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# === Step 4: Scale numeric features ===
# Every feature except the last one ("highway", a 0/1 flag) is standardized.
# The scaler is fitted on the training split only and then applied unchanged
# to the test split.
numeric_features = features[:-1]
scaler = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# === Step 5: Train Random Forest with class balancing ===
# class_weight='balanced' reweights the classes instead of resampling the rows
rf_model = RandomForestClassifier(
    n_estimators=100, class_weight='balanced', random_state=42
)
rf_model.fit(X_train, y_train)

# === Step 6: Predict and evaluate ===
y_pred = rf_model.predict(X_test)

# Print classification report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=["Low Severity", "High Severity"]))

# Show confusion matrix
ConfusionMatrixDisplay.from_estimator(rf_model, X_test, y_test, cmap="Blues")
plt.title("Random Forest - Confusion Matrix")
plt.show()


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt


def _undersample_majority(frame, label_col="Severity_Binary", seed=42):
    """Return `frame` with label 0 randomly cut down to the row count of label 1.

    Label-0 rows are drawn without replacement; all label-1 rows are kept and
    appended after them.
    """
    label0_rows = frame[frame[label_col] == 0]
    label1_rows = frame[frame[label_col] == 1]
    label0_subset = resample(
        label0_rows, replace=False, n_samples=len(label1_rows), random_state=seed
    )
    return pd.concat([label0_subset, label1_rows])


# === Load dataset ===
df = pd.read_csv("US_Accidents_With_Highway_Flag.csv")

# === Define features and target ===
features = [
    "Temperature(F)", "Humidity(%)", "Wind_Speed(mph)",
    "Pressure(in)", "Precipitation(in)", "Visibility(mi)", "highway"
]

# Complete rows only; highway flag as 0/1; label 0 = severity <= 2, 1 otherwise
df = df[["Severity"] + features].dropna()
df["highway"] = df["highway"].astype(int)
df["Severity_Binary"] = (df["Severity"] > 2).astype("int64")

# === Split first, then balance ===
X = df[features]
y = df["Severity_Binary"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# === Balance the training split (undersampling) ===
train_df = pd.concat([X_train, y_train], axis=1)
train_balanced = _undersample_majority(train_df)

# === Balance the test split the same way ===
# NOTE(review): the test split is also undersampled to a 50/50 class mix, so
# the metrics below describe a balanced test set rather than the original
# class distribution.
test_df = pd.concat([X_test, y_test], axis=1)
test_balanced = _undersample_majority(test_df)

# === Scale and split X/y ===
# The scaler is fitted on the balanced training rows and reused for the test rows
scaler = StandardScaler()
X_train_bal = scaler.fit_transform(train_balanced[features])
y_train_bal = train_balanced["Severity_Binary"]
X_test_bal = scaler.transform(test_balanced[features])
y_test_bal = test_balanced["Severity_Binary"]

# === Train Random Forest ===
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_bal, y_train_bal)

# === Evaluate ===
y_pred = rf.predict(X_test_bal)

print("Classification Report (Balanced Train & Test):\n")
print(classification_report(y_test_bal, y_pred, target_names=["Low", "High"]))

ConfusionMatrixDisplay.from_estimator(rf, X_test_bal, y_test_bal, cmap="Blues")
plt.title("Random Forest - Balanced Train/Test")
plt.show()

# === Feature Importance Bar Plot ===
importances = rf.feature_importances_
feature_names = features

# One row per feature, least important first so the largest bar ends up on top
feat_df = (
    pd.DataFrame({"Feature": feature_names, "Importance": importances})
    .sort_values(by="Importance", ascending=True)
)

feat_df.plot.barh(x="Feature", y="Importance", legend=False, color="skyblue")
plt.title("Feature Importance (Random Forest)")
plt.xlabel("Importance Score")
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# === Feature importance data ===
# Relies on `rf` (a fitted model) and `features` already existing in the kernel
importances = rf.feature_importances_
feature_names = features

# One row per feature, sorted from least to most important
feat_df = (
    pd.DataFrame({"Feature": feature_names, "Importance": importances})
    .sort_values(by="Importance", ascending=True)
)

# === One "coolwarm" colour per bar ===
colors = sns.color_palette("coolwarm", len(feat_df))

# === Horizontal bar chart of the importances ===
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(feat_df["Feature"], feat_df["Importance"], color=colors)
ax.set_title("Feature Importance (Random Forest)")
ax.set_xlabel("Importance Score")
fig.tight_layout()
plt.show()



In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# === Load dataset ===
df = pd.read_csv("US_Accidents_With_Highway_Flag.csv")

# === Define features and target ===
features = [
    "Temperature(F)", "Humidity(%)", "Wind_Speed(mph)",
    "Pressure(in)", "Precipitation(in)", "Visibility(mi)", "highway"
]

# Complete rows only, restricted to the label and the model features
model_columns = ["Severity"] + features
df = df[model_columns].dropna()
df["highway"] = df["highway"].astype(int)
# Binary label: 0 for severity <= 2, 1 otherwise
df["Severity_Binary"] = df["Severity"].apply(lambda level: 0 if level <= 2 else 1)

# === Split first, then balance ===
X, y = df[features], df["Severity_Binary"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# === Balance the training split (undersampling) ===
train_df = pd.concat([X_train, y_train], axis=1)

# Rows of each label
majority = train_df.query("Severity_Binary == 0")
minority = train_df.query("Severity_Binary == 1")

# Draw as many label-0 rows (without replacement) as there are label-1 rows
majority_downsampled = resample(
    majority, replace=False, n_samples=len(minority), random_state=42
)
train_balanced = pd.concat([majority_downsampled, minority])

# === Balance the test split the same way ===
test_df = pd.concat([X_test, y_test], axis=1)
majority_test = test_df.query("Severity_Binary == 0")
minority_test = test_df.query("Severity_Binary == 1")
majority_test_down = resample(
    majority_test, replace=False, n_samples=len(minority_test), random_state=42
)
test_balanced = pd.concat([majority_test_down, minority_test])

# === Scale and split X/y ===
# The scaler is fitted on the balanced training rows and reused for the test rows
scaler = StandardScaler()

X_train_bal = scaler.fit_transform(train_balanced[features])
y_train_bal = train_balanced["Severity_Binary"]

X_test_bal = scaler.transform(test_balanced[features])
y_test_bal = test_balanced["Severity_Binary"]

# === Train Random Forest ===
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_bal, y_train_bal)

# === Evaluate ===
y_pred = rf.predict(X_test_bal)

print("Classification Report (Balanced Train & Test):\n")
print(classification_report(y_test_bal, y_pred, target_names=["Low", "High"]))

ConfusionMatrixDisplay.from_estimator(rf, X_test_bal, y_test_bal, cmap="Blues")
plt.title("Random Forest - Balanced Train/Test")
plt.show()


In [None]:
import pandas as pd
from sklearn.utils import resample
from scipy.stats import ttest_ind, f_oneway, pearsonr


def _tests_for_feature(frame, col):
    """Run the t-test, ANOVA and Pearson correlation for one feature column.

    Returns a dict holding the feature name, both p-values rounded to 5
    decimals, and the correlation (3 decimals) with its p-value (5 decimals).
    """
    low_values = frame.loc[frame["Severity_Group"] == "Low", col]
    high_values = frame.loc[frame["Severity_Group"] == "High", col]
    _, t_pval = ttest_ind(low_values, high_values, equal_var=False)

    per_level = [
        frame.loc[frame["Severity"] == level, col]
        for level in sorted(frame["Severity"].unique())
    ]
    _, a_pval = f_oneway(*per_level)

    corr, corr_pval = pearsonr(frame[col], frame["Severity"])

    return {
        "Feature": col,
        "T-Test p-value": round(t_pval, 5),
        "ANOVA p-value": round(a_pval, 5),
        "Correlation (r)": round(corr, 3),
        "Correlation p-value": round(corr_pval, 5)
    }


# === Load your dataset ===
df = pd.read_csv("US_Accidents_With_Highway_Flag.csv")

# === Numeric feature columns: every float64/int64 column except the label ===
numeric_cols = [
    col
    for col in df.select_dtypes(include=["float64", "int64"]).columns.tolist()
    if col != "Severity"
]

# === Keep rows that are complete in the label and every numeric feature ===
df = df[["Severity"] + numeric_cols].dropna()

# === Binary group: 0 for severity <= 2, 1 otherwise ===
df["Severity_Binary"] = df["Severity"].apply(lambda level: 0 if level <= 2 else 1)

# === Balanced subset: the same number of rows from each binary group ===
low = df[df["Severity_Binary"] == 0]
high = df[df["Severity_Binary"] == 1]
# Size of each group's sample: the smaller group, capped at 10000 rows
sample_size = min(len(low), len(high), 10000)

low_sample = resample(low, replace=False, n_samples=sample_size, random_state=42)
high_sample = resample(high, replace=False, n_samples=sample_size, random_state=42)

df_balanced = pd.concat([low_sample, high_sample])

# === Text label of the binary group, used by the t-test ===
df_balanced["Severity_Group"] = df_balanced["Severity"].map(
    lambda level: "Low" if level <= 2 else "High"
)

# === Run statistical tests, one result row per feature ===
results = [_tests_for_feature(df_balanced, col) for col in numeric_cols]

# === Display results ===
results_df = pd.DataFrame(results)
print("\nStatistical Test Results (Balanced Subset):\n")
print(results_df.to_string(index=False))


In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Categorical/boolean columns to test against severity (you can expand this list)
categorical_cols = [
    "Amenity", "Bump", "Crossing", "Junction",
    "No_Exit", "Railway", "Roundabout", "Station", "Stop",
    "Traffic_Calming", "Traffic_Signal", "Turning_Loop"
]

# Load only the columns this cell uses instead of the whole file
df = pd.read_csv(
    "US_Accidents_With_Highway_Flag.csv",
    usecols=["Severity"] + categorical_cols,
)

# Keep complete rows, with Severity as the first column
df = df[["Severity"] + categorical_cols].dropna()

# Binary label: 0 for severity <= 2, 1 otherwise (vectorized comparison)
df["Severity_Binary"] = (df["Severity"] > 2).astype("int64")

# Run one chi-square test of independence per feature
results = []

for col in categorical_cols:
    # Contingency table: feature value (rows) x binary severity (columns)
    contingency = pd.crosstab(df[col], df["Severity_Binary"])
    chi2, p, dof, _ = chi2_contingency(contingency)
    # The p-value is rounded to 5 decimals, so very small ones print as 0.0
    results.append({
        "Feature": col,
        "Chi2 Stat": round(chi2, 2),
        "P-Value": round(p, 5)
    })

# Show results, smallest p-value first
chi2_df = pd.DataFrame(results).sort_values(by="P-Value")
print("\nChi-Square Test Results (Categorical vs Severity):\n")
print(chi2_df.to_string(index=False))


In [None]:
import pandas as pd

# Relies on `rf` (a fitted model) and `X` (the feature frame) already existing
# in the kernel
importances = rf.feature_importances_
feature_names = X.columns

# One row per feature, sorted from least to most important
feat_df = (
    pd.DataFrame({"Feature": feature_names, "Importance": importances})
    .sort_values(by="Importance", ascending=True)
)

# Horizontal bar chart of the importances
ax = feat_df.plot.barh(x="Feature", y="Importance", legend=False, color="skyblue")
ax.set_title("Feature Importance (Random Forest)")
ax.set_xlabel("Importance Score")
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Example chi-square results, written out by hand as
# (feature, chi-square statistic, p-value) triples
chi2_rows = [
    ("Amenity", 10790.17, 0.00),
    ("Bump", 219.18, 0.00),
    ("Crossing", 96950.15, 0.00),
    ("Junction", 21154.17, 0.00),
    ("No_Exit", 836.95, 0.00),
    ("Railway", 808.37, 0.00),
    ("Roundabout", 35.00, 0.00),
    ("Station", 18902.32, 0.00),
    ("Stop", 24656.23, 0.00),
    ("Traffic_Calming", 233.59, 0.00),
    ("Traffic_Signal", 84836.52, 0.00),
    ("Turning_Loop", 0.00, 1.00),
]

# Column-oriented view of the same values
data = {
    "Feature": [row[0] for row in chi2_rows],
    "Chi2 Stat": [row[1] for row in chi2_rows],
    "P-Value": [row[2] for row in chi2_rows],
}

# Sorted ascending so the largest statistic is drawn at the top of the chart
chi_df = pd.DataFrame(data).sort_values(by="Chi2 Stat", ascending=True)

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(chi_df["Feature"], chi_df["Chi2 Stat"], color="coral")
ax.set_title("Chi-Square Test Statistic by Feature")
ax.set_xlabel("Chi2 Statistic (Strength of Association with Severity)")
fig.tight_layout()
plt.show()


## Visualization 1 ##