In [2]:
import pandas as pd
import warnings
import os
import sys
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

import missingno as msno

# Absolute path of the parent of the current working directory.
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Prepend it (index 0) so it is searched before the other sys.path entries.
sys.path.insert(0, root_dir)

# NOTE(review): `config` is presumably a project module located in root_dir,
# which is why the path is extended before this import — confirm.
from config import DATA_PATH

# Show all columns when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

In [3]:
# Load the raw CSV from the directory given by DATA_PATH.
df = pd.read_csv(os.path.join(DATA_PATH, 'RTA Dataset.csv'))

# Print (rows, columns) as a quick sanity check of the load.
print(df.shape)

(12316, 32)


In [4]:
# Parse 'Time' with the HH:MM:SS format and keep only the time-of-day part;
# the column is overwritten in place on `df`.
# NOTE(review): re-running this cell on the already-converted column will
# probably fail because it would no longer hold the original strings — confirm.
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S').dt.time

In [5]:
# Preview the first rows of the frame after the 'Time' conversion.
df.head()

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,Defect_of_vehicle,Area_accident_occured,Lanes_or_Medians,Road_allignment,Types_of_Junction,Road_surface_type,Road_surface_conditions,Light_conditions,Weather_conditions,Type_of_collision,Number_of_vehicles_involved,Number_of_casualties,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,No defect,Residential areas,,Tangent road with flat terrain,No junction,Asphalt roads,Dry,Daylight,Normal,Collision with roadside-parked vehicles,2,2,Going straight,na,na,na,na,,,Not a Pedestrian,Moving Backward,Slight Injury
1,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,No defect,Office areas,Undivided Two way,Tangent road with flat terrain,No junction,Asphalt roads,Dry,Daylight,Normal,Vehicle with vehicle collision,2,2,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury
2,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,,No defect,Recreational areas,other,,No junction,Asphalt roads,Dry,Daylight,Normal,Collision with roadside objects,2,2,Going straight,Driver or rider,Male,31-50,3,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury
3,01:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,,No defect,Office areas,other,Tangent road with mild grade and flat terrain,Y Shape,Earth roads,Dry,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,01:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,5-10yrs,No defect,Industrial areas,other,Tangent road with flat terrain,Y Shape,Asphalt roads,Dry,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury


In [7]:
# Working copy of the data without the 'Time' column; `df` itself is left untouched.
dump_df = df.drop(columns=['Time'])

In [8]:
def lowercase_column_and_values(df):
    """Lower-case the column labels and all string values of ``df`` in place.

    Only genuine ``str`` objects are changed. Non-string values stored in an
    object column (numbers, ``None``, NaN, ...) are kept as they are; calling
    ``.str.lower()`` on such a column would silently replace them with NaN.
    Columns using a pandas string dtype are handled as well, so the function
    does not depend on text being stored as ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient re-assignment.
    """
    def _lower(value):
        # Leave anything that is not a real string untouched.
        return value.lower() if isinstance(value, str) else value

    # Index.map keeps non-string labels intact instead of raising.
    df.columns = df.columns.map(_lower)

    for col in df.columns:
        series = df[col]
        if isinstance(series.dtype, pd.StringDtype):
            # Dedicated string dtype: every entry is a string or missing.
            df[col] = series.str.lower()
        elif series.dtype == object:
            # Element-wise so mixed-type columns do not lose their
            # non-string entries; dtype=object prevents re-inference.
            df[col] = pd.Series(
                [_lower(value) for value in series],
                index=series.index,
                dtype=object,
            )
    return df

# Normalise the case of column names and text values; the function returns the
# frame it was given, so this re-binds `dump_df` to the updated frame.
dump_df = lowercase_column_and_values(dump_df)

In [9]:
# Preview the first rows to check the lower-casing of names and values.
dump_df.head()

Unnamed: 0,day_of_week,age_band_of_driver,sex_of_driver,educational_level,vehicle_driver_relation,driving_experience,type_of_vehicle,owner_of_vehicle,service_year_of_vehicle,defect_of_vehicle,area_accident_occured,lanes_or_medians,road_allignment,types_of_junction,road_surface_type,road_surface_conditions,light_conditions,weather_conditions,type_of_collision,number_of_vehicles_involved,number_of_casualties,vehicle_movement,casualty_class,sex_of_casualty,age_band_of_casualty,casualty_severity,work_of_casuality,fitness_of_casuality,pedestrian_movement,cause_of_accident,accident_severity
0,monday,18-30,male,above high school,employee,1-2yr,automobile,owner,above 10yr,no defect,residential areas,,tangent road with flat terrain,no junction,asphalt roads,dry,daylight,normal,collision with roadside-parked vehicles,2,2,going straight,na,na,na,na,,,not a pedestrian,moving backward,slight injury
1,monday,31-50,male,junior high school,employee,above 10yr,public (> 45 seats),owner,5-10yrs,no defect,office areas,undivided two way,tangent road with flat terrain,no junction,asphalt roads,dry,daylight,normal,vehicle with vehicle collision,2,2,going straight,na,na,na,na,,,not a pedestrian,overtaking,slight injury
2,monday,18-30,male,junior high school,employee,1-2yr,lorry (41?100q),owner,,no defect,recreational areas,other,,no junction,asphalt roads,dry,daylight,normal,collision with roadside objects,2,2,going straight,driver or rider,male,31-50,3,driver,,not a pedestrian,changing lane to the left,serious injury
3,sunday,18-30,male,junior high school,employee,5-10yr,public (> 45 seats),governmental,,no defect,office areas,other,tangent road with mild grade and flat terrain,y shape,earth roads,dry,darkness - lights lit,normal,vehicle with vehicle collision,2,2,going straight,pedestrian,female,18-30,3,driver,normal,not a pedestrian,changing lane to the right,slight injury
4,sunday,18-30,male,junior high school,employee,2-5yr,,owner,5-10yrs,no defect,industrial areas,other,tangent road with flat terrain,y shape,asphalt roads,dry,darkness - lights lit,normal,vehicle with vehicle collision,2,2,going straight,na,na,na,na,,,not a pedestrian,overtaking,slight injury


In [10]:
def remove_trailing_spaces(df):
    """Strip surrounding whitespace from all string values of ``df`` in place.

    Despite the name, both leading and trailing whitespace are removed.
    Only genuine ``str`` objects are changed. Non-string values stored in an
    object column (numbers, ``None``, NaN, ...) are kept as they are; calling
    ``.str.strip()`` on such a column would silently replace them with NaN.
    Columns using a pandas string dtype are handled as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient re-assignment.
    """
    for col in df.columns:
        series = df[col]
        if isinstance(series.dtype, pd.StringDtype):
            # Dedicated string dtype: every entry is a string or missing.
            df[col] = series.str.strip()
        elif series.dtype == object:
            # Element-wise so mixed-type columns do not lose their
            # non-string entries; dtype=object prevents re-inference.
            df[col] = pd.Series(
                [value.strip() if isinstance(value, str) else value
                 for value in series],
                index=series.index,
                dtype=object,
            )
    return df

# Strip surrounding whitespace from every text value.
dump_df = remove_trailing_spaces(dump_df)
# Inspect the categories of one column. The counts expose data-quality issues
# that need handling later: an 'unknown' label and the fused category
# 'rural village areasoffice areas'.
dump_df['area_accident_occured'].value_counts()

area_accident_occured
other                              3819
office areas                       3451
residential areas                  2060
church areas                       1060
industrial areas                    456
school areas                        415
recreational areas                  328
outside rural areas                 218
hospital areas                      121
market areas                         63
rural village areas                  44
unknown                              22
rural village areasoffice areas      20
Name: count, dtype: int64

In [18]:
# Dump the value counts of every column to a text file for offline inspection.
# The file is opened once, in write mode, so re-running this cell replaces the
# previous dump instead of appending a duplicate copy of it.
with open('column_values.txt', 'w') as f:
    for col in dump_df.columns:
        f.write(f"Column Name: {col} \n")
        f.write(f"{col} Values: {dump_df[col].value_counts()} \n")
        f.write("-------------------------------------------- \n")

In [None]:
# Columns whose categories are treated as ordered.
# Names must match the lower-cased column names of `dump_df` exactly
# ('casualty_severity', not 'casulty_severity').
ordinal_list = ['day_of_week', 'age_band_of_driver', 'educational_level',
                'driving_experience', 'service_year_of_vehicle', 'defect_of_vehicle',
                'age_band_of_casualty', 'casualty_severity', 'accident_severity']
# Columns whose categories are treated as unordered.
# ('cause_of_accident', not 'case_of_accident'.)
# NOTE(review): 'vehicle_driver_relation', 'number_of_vehicles_involved' and
# 'number_of_casualties' exist in the frame but are in neither list — confirm
# this is intentional.
nominal_list = ['sex_of_driver', 'type_of_vehicle', 'owner_of_vehicle',
                'area_accident_occured', 'lanes_or_medians', 'road_allignment',
                'types_of_junction', 'road_surface_type', 'road_surface_conditions',
                'light_conditions', 'weather_conditions', 'type_of_collision',
                'vehicle_movement', 'casualty_class', 'sex_of_casualty',
                'work_of_casuality', 'fitness_of_casuality', 'pedestrian_movement', 'cause_of_accident']
# Placeholder labels; presumably to be replaced by NaN in a later step.
to_nan = ['unknown']