In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Load dataset
df = pd.read_csv('C:\\Users\\LOQ\\traffic-accident\\data\\accident_prediction_india.csv')

print(f"Original Dataset: {df.shape}")
print("\nColumns:")
print(df.columns.tolist())


Original Dataset: (3000, 22)

Columns:
['State Name', 'City Name', 'Year', 'Month', 'Day of Week', 'Time of Day', 'Accident Severity', 'Number of Vehicles Involved', 'Vehicle Type Involved', 'Number of Casualties', 'Number of Fatalities', 'Weather Conditions', 'Road Type', 'Road Condition', 'Lighting Conditions', 'Traffic Control Presence', 'Speed Limit (km/h)', 'Driver Age', 'Driver Gender', 'Driver License Status', 'Alcohol Involvement', 'Accident Location Details']


In [4]:
# Auto-detect important columns
columns_map = {}

# Find target column
for col in df.columns:
    col_lower = col.lower()
    if 'severity' in col_lower:
        columns_map['target'] = col
        break

# Find time columns
for col in df.columns:
    col_lower = col.lower()
    if 'date' in col_lower or 'time' in col_lower:
        columns_map['datetime'] = col
        break
    if 'year' in col_lower:
        columns_map['year'] = col
    if 'month' in col_lower:
        columns_map['month'] = col
    if 'day' in col_lower or 'hour' in col_lower:
        columns_map['time'] = col

# Find weather column
for col in df.columns:
    if 'weather' in col.lower():
        columns_map['weather'] = col
        break

# Find location/road columns
for col in df.columns:
    col_lower = col.lower()
    if 'road' in col_lower or 'location' in col_lower or 'area' in col_lower:
        columns_map['location'] = col
        break

print("Detected Important Columns:")
print("="*60)
for key, val in columns_map.items():
    print(f"{key:15s}: {val}")


Detected Important Columns:
target         : Accident Severity
year           : Year
month          : Month
time           : Day of Week
datetime       : Time of Day
weather        : Weather Conditions
location       : Road Type


In [5]:
# Create a working copy
df_work = df.copy()

print("Handling Missing Values...")
print("="*60)

# Fill missing values based on column type
for col in df_work.columns:
    if df_work[col].isnull().sum() > 0:
        if df_work[col].dtype in ['int64', 'float64']:
            # Numeric: fill with median
            df_work[col].fillna(df_work[col].median(), inplace=True)
            print(f"✓ {col}: Filled with median")
        else:
            # Categorical: fill with mode or 'Unknown'
            mode_val = df_work[col].mode()
            if len(mode_val) > 0:
                df_work[col].fillna(mode_val[0], inplace=True)
                print(f"✓ {col}: Filled with mode")
            else:
                df_work[col].fillna('Unknown', inplace=True)
                print(f"✓ {col}: Filled with 'Unknown'")

print(f"\n✓ Missing values handled. New shape: {df_work.shape}")


Handling Missing Values...
✓ Traffic Control Presence: Filled with mode
✓ Driver License Status: Filled with mode

✓ Missing values handled. New shape: (3000, 22)


In [6]:
print("Creating Time Features...")
print("="*60)

# If datetime column exists
if 'datetime' in columns_map:
    try:
        df_work['datetime_parsed'] = pd.to_datetime(df_work[columns_map['datetime']], errors='coerce')
        
        df_work['hour'] = df_work['datetime_parsed'].dt.hour
        df_work['day_of_week'] = df_work['datetime_parsed'].dt.dayofweek
        df_work['month'] = df_work['datetime_parsed'].dt.month
        df_work['year'] = df_work['datetime_parsed'].dt.year
        
        print("✓ Extracted: hour, day_of_week, month, year")
    except:
        print("✗ Could not parse datetime column")

# If separate columns exist
else:
    if 'year' in columns_map:
        df_work['year'] = df_work[columns_map['year']]
        print(f"✓ Using existing year column")
    
    if 'month' in columns_map:
        df_work['month'] = df_work[columns_map['month']]
        print(f"✓ Using existing month column")

# Create derived features
if 'day_of_week' in df_work.columns:
    df_work['is_weekend'] = (df_work['day_of_week'] >= 5).astype(int)
    print("✓ Created: is_weekend")

if 'hour' in df_work.columns:
    df_work['is_rush_hour'] = df_work['hour'].isin([7, 8, 9, 17, 18, 19]).astype(int)
    df_work['is_night'] = df_work['hour'].isin(range(22, 24)) | df_work['hour'].isin(range(0, 6))
    df_work['is_night'] = df_work['is_night'].astype(int)
    print("✓ Created: is_rush_hour, is_night")

if 'month' in df_work.columns:
    # Monsoon season in India (June-September)
    df_work['is_monsoon'] = df_work['month'].isin([6, 7, 8, 9]).astype(int)
    print("✓ Created: is_monsoon")

print(f"\n✓ Time features created. Shape: {df_work.shape}")


Creating Time Features...
✓ Extracted: hour, day_of_week, month, year
✓ Created: is_weekend
✓ Created: is_rush_hour, is_night
✓ Created: is_monsoon

✓ Time features created. Shape: (3000, 31)


In [7]:
print("Processing Weather Features...")
print("="*60)

if 'weather' in columns_map:
    weather_col = columns_map['weather']
    
    # Map to broad categories
    def map_weather(condition):
        if pd.isna(condition):
            return 'Clear'
        
        c = str(condition).lower()
        
        if any(x in c for x in ['clear', 'fair', 'sunny', 'fine']):
            return 'Clear'
        elif any(x in c for x in ['rain', 'drizzle', 'shower', 'wet']):
            return 'Rain'
        elif any(x in c for x in ['fog', 'mist', 'haze', 'smoke']):
            return 'Fog'
        elif any(x in c for x in ['cloud', 'overcast']):
            return 'Cloudy'
        elif any(x in c for x in ['snow', 'ice', 'sleet', 'hail']):
            return 'Snow'
        elif any(x in c for x in ['storm', 'thunder', 'wind']):
            return 'Storm'
        else:
            return 'Other'
    
    df_work['weather_main'] = df_work[weather_col].apply(map_weather)
    
    print("Weather Categories:")
    print(df_work['weather_main'].value_counts())
    print("\n✓ Weather mapped to: Clear, Rain, Fog, Cloudy, Snow, Storm, Other")
else:
    # Create default weather column
    df_work['weather_main'] = 'Clear'
    print("⚠ No weather column found. Created default 'Clear' weather")

print(f"\n✓ Weather features processed. Shape: {df_work.shape}")


Processing Weather Features...
Weather Categories:
weather_main
Rain     631
Storm    611
Other    608
Fog      576
Clear    574
Name: count, dtype: int64

✓ Weather mapped to: Clear, Rain, Fog, Cloudy, Snow, Storm, Other

✓ Weather features processed. Shape: (3000, 32)


In [8]:
print("Processing Road/Location Features...")
print("="*60)

if 'location' in columns_map:
    location_col = columns_map['location']
    
    # Map to road types
    def classify_road(location):
        if pd.isna(location):
            return 'unknown'
        
        loc = str(location).lower()
        
        if any(x in loc for x in ['highway', 'expressway', 'nh-', 'national']):
            return 'highway'
        elif any(x in loc for x in ['main', 'arterial', 'state', 'sh-']):
            return 'arterial'
        elif any(x in loc for x in ['urban', 'city', 'town', 'metro']):
            return 'urban'
        elif any(x in loc for x in ['rural', 'village', 'gram']):
            return 'rural'
        elif any(x in loc for x in ['residential', 'colony', 'sector']):
            return 'residential'
        else:
            return 'unknown'
    
    df_work['road_type'] = df_work[location_col].apply(classify_road)
    
    print("Road Type Distribution:")
    print(df_work['road_type'].value_counts())
    print("\n✓ Road types: highway, arterial, urban, rural, residential, unknown")
else:
    df_work['road_type'] = 'unknown'
    print("⚠ No location column found. Created default 'unknown' road_type")

# Additional binary features (if columns exist)
for feature_name, keywords in [
    ('is_junction', ['junction', 'intersection', 'crossing', 'chowk']),
    ('is_urban', ['urban', 'city', 'town', 'metro']),
    ('is_highway', ['highway', 'expressway', 'nh-', 'national'])
]:
    found = False
    for col in df_work.columns:
        if any(kw in col.lower() for kw in keywords):
            if df_work[col].dtype == 'object':
                df_work[feature_name] = df_work[col].notna().astype(int)
            else:
                df_work[feature_name] = (df_work[col] > 0).astype(int)
            found = True
            print(f"✓ Created: {feature_name}")
            break
    
    if not found:
        df_work[feature_name] = 0
        print(f"⚠ Created default: {feature_name} = 0")

print(f"\n✓ Road features processed. Shape: {df_work.shape}")


Processing Road/Location Features...
Road Type Distribution:
road_type
highway    1520
rural       768
urban       712
Name: count, dtype: int64

✓ Road types: highway, arterial, urban, rural, residential, unknown
⚠ Created default: is_junction = 0
✓ Created: is_urban
⚠ Created default: is_highway = 0

✓ Road features processed. Shape: (3000, 36)


In [9]:
print("Creating Target Variable...")
print("="*60)

if 'target' in columns_map:
    target_col = columns_map['target']
    
    print(f"Original {target_col} distribution:")
    print(df_work[target_col].value_counts())
    
    # Convert to binary: High Risk (1) vs Low Risk (0)
    # Adjust threshold based on your data
    unique_vals = sorted(df_work[target_col].unique())
    
    if len(unique_vals) <= 3:
        # If 1,2,3 or similar: 3 = high, 1-2 = low
        threshold = unique_vals[int(len(unique_vals) * 0.6)]  # Top 40% as high risk
        df_work['target_binary'] = (df_work[target_col] >= threshold).astype(int)
    else:
        # For string categories, map manually
        severity_map = {
            'fatal': 1, 'serious': 1, 'severe': 1, 'major': 1, 'high': 1,
            'slight': 0, 'minor': 0, 'low': 0, 'moderate': 0
        }
        
        def map_severity(val):
            val_lower = str(val).lower()
            for key, result in severity_map.items():
                if key in val_lower:
                    return result
            return 0  # Default to low risk
        
        df_work['target_binary'] = df_work[target_col].apply(map_severity)
    
    print(f"\nBinary Target Distribution:")
    print(df_work['target_binary'].value_counts())
    print(f"\nClass Balance: {df_work['target_binary'].value_counts(normalize=True).to_dict()}")
    
else:
    print("⚠ No target column found! Using first categorical as target")
    cat_cols = df_work.select_dtypes(include=['object']).columns
    if len(cat_cols) > 0:
        df_work['target_binary'] = (df_work[cat_cols[0]] == df_work[cat_cols[0]].value_counts().index[0]).astype(int)
    else:
        df_work['target_binary'] = 0

print(f"\n✓ Target variable created. Shape: {df_work.shape}")


Creating Target Variable...
Original Accident Severity distribution:
Accident Severity
Minor      1034
Fatal       985
Serious     981
Name: count, dtype: int64

Binary Target Distribution:
target_binary
1    2015
0     985
Name: count, dtype: int64

Class Balance: {1: 0.6716666666666666, 0: 0.3283333333333333}

✓ Target variable created. Shape: (3000, 37)


In [10]:
print("Selecting Final Features...")
print("="*60)

# Define feature columns
feature_columns = []

# Time features
time_features = ['hour', 'day_of_week', 'month', 'is_weekend', 'is_rush_hour', 'is_night', 'is_monsoon']
for feat in time_features:
    if feat in df_work.columns:
        feature_columns.append(feat)

# Weather features
if 'weather_main' in df_work.columns:
    feature_columns.append('weather_main')

# Road/location features
road_features = ['road_type', 'is_junction', 'is_urban', 'is_highway']
for feat in road_features:
    if feat in df_work.columns:
        feature_columns.append(feat)

# Add any remaining numeric columns (but exclude target and datetime)
exclude_cols = ['target_binary'] + [columns_map.get('target', ''), columns_map.get('datetime', ''), 
                'datetime_parsed', columns_map.get('weather', ''), columns_map.get('location', '')]

numeric_cols = df_work.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols:
    if col not in feature_columns and col not in exclude_cols and 'unnamed' not in col.lower():
        feature_columns.append(col)
        if len(feature_columns) > 20:  # Limit total features
            break

# Clean and prepare final dataset
df_final = df_work[feature_columns + ['target_binary']].copy()

# Drop any remaining NaN
df_final = df_final.dropna()

print(f"Selected Features ({len(feature_columns)}):")
print("="*60)
for i, feat in enumerate(feature_columns, 1):
    dtype = df_final[feat].dtype
    unique = df_final[feat].nunique()
    print(f"{i:2d}. {feat:20s} | Type: {str(dtype):10s} | Unique: {unique}")

print(f"\n✓ Final Dataset Shape: {df_final.shape}")
print(f"✓ Target Balance: {df_final['target_binary'].value_counts().to_dict()}")


Selecting Final Features...
Selected Features (19):
 1. hour                 | Type: float64    | Unique: 24
 2. day_of_week          | Type: float64    | Unique: 1
 3. month                | Type: float64    | Unique: 1
 4. is_weekend           | Type: int32      | Unique: 1
 5. is_rush_hour         | Type: int32      | Unique: 2
 6. is_night             | Type: int32      | Unique: 2
 7. is_monsoon           | Type: int32      | Unique: 1
 8. weather_main         | Type: object     | Unique: 5
 9. road_type            | Type: object     | Unique: 3
10. is_junction          | Type: int64      | Unique: 1
11. is_urban             | Type: int32      | Unique: 1
12. is_highway           | Type: int64      | Unique: 1
13. Year                 | Type: int64      | Unique: 6
14. Number of Vehicles Involved | Type: int64      | Unique: 5
15. Number of Casualties | Type: int64      | Unique: 11
16. Number of Fatalities | Type: int64      | Unique: 6
17. Speed Limit (km/h)   | Type: int64     

In [11]:
from sklearn.model_selection import train_test_split

print("Splitting Dataset...")
print("="*60)

# Separate features and target
X = df_final.drop('target_binary', axis=1)
y = df_final['target_binary']

# Split: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

# Create train and test dataframes
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Save to CSV
train_df.to_csv('../data/train.csv', index=False)
test_df.to_csv('../data/test.csv', index=False)

print(f"✓ Train Set: {train_df.shape} | Target: {y_train.value_counts().to_dict()}")
print(f"✓ Test Set:  {test_df.shape} | Target: {y_test.value_counts().to_dict()}")

print("\n" + "="*60)
print("✓ Files Saved:")
print("  - ../data/train.csv")
print("  - ../data/test.csv")
print("="*60)


Splitting Dataset...
✓ Train Set: (2004, 20) | Target: {1: 1338, 0: 666}
✓ Test Set:  (502, 20) | Target: {1: 335, 0: 167}

✓ Files Saved:
  - ../data/train.csv
  - ../data/test.csv


In [12]:
import json

# Create feature metadata
feature_metadata = {
    'total_features': len(feature_columns),
    'feature_list': feature_columns,
    'categorical_features': [col for col in feature_columns if df_final[col].dtype == 'object'],
    'numerical_features': [col for col in feature_columns if df_final[col].dtype in ['int64', 'float64']],
    'target_column': 'target_binary',
    'train_size': train_df.shape[0],
    'test_size': test_df.shape[0],
    'class_distribution': y.value_counts().to_dict()
}

# Save metadata
with open('../models/feature_metadata.json', 'w') as f:
    json.dump(feature_metadata, f, indent=2)

print("Feature Metadata:")
print("="*60)
print(json.dumps(feature_metadata, indent=2))

print("\n✓ Metadata saved: models/feature_metadata.json")
print("\n" + "="*70)
print("✓✓✓ FEATURE ENGINEERING COMPLETE! ✓✓✓".center(70))
print("="*70)


Feature Metadata:
{
  "total_features": 19,
  "feature_list": [
    "hour",
    "day_of_week",
    "month",
    "is_weekend",
    "is_rush_hour",
    "is_night",
    "is_monsoon",
    "weather_main",
    "road_type",
    "is_junction",
    "is_urban",
    "is_highway",
    "Year",
    "Number of Vehicles Involved",
    "Number of Casualties",
    "Number of Fatalities",
    "Speed Limit (km/h)",
    "Driver Age",
    "year"
  ],
  "categorical_features": [
    "weather_main",
    "road_type"
  ],
  "numerical_features": [
    "hour",
    "day_of_week",
    "month",
    "is_junction",
    "is_highway",
    "Year",
    "Number of Vehicles Involved",
    "Number of Casualties",
    "Number of Fatalities",
    "Speed Limit (km/h)",
    "Driver Age",
    "year"
  ],
  "target_column": "target_binary",
  "train_size": 2004,
  "test_size": 502,
  "class_distribution": {
    "1": 1673,
    "0": 833
  }
}

✓ Metadata saved: models/feature_metadata.json

                ✓✓✓ FEATURE ENGINEERING C