In [2]:
import pandas as pd
import numpy as np
import os
import warnings
import logging
import sys
warnings.filterwarnings('ignore')
from src.preprocessing import preprocess
from src.paths import get_data_path

# Show all columns
pd.set_option('display.max_columns', None)


# Configure the root logger to emit INFO records and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Create a handler to print logs in the notebook.
# Re-running this cell must not stack a second handler on the root logger
# (every extra handler prints each log line one more time), so the handler
# is tagged with a name and reused when it is already attached.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler = next(
    (h for h in logger.handlers if h.get_name() == 'notebook-stream'),
    None,
)
if handler is None:
    handler = logging.StreamHandler()
    handler.set_name('notebook-stream')
    logger.addHandler(handler)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)

# Seed NumPy's global RNG so anything drawing from it is repeatable.
np.random.seed(42)

In [3]:
# Show the signature and docstring of get_data_path using IPython's `?` help.
# NOTE(review): development-time introspection; consider dropping this cell
# from the finished notebook.
get_data_path?

Signature: get_data_path() -> str
Docstring:
Get the path to the data directory.

Returns
-------
str
    Path to the data directory.
File:      ~/Desktop/my-ml-process-1/src/paths.py
Type:      function

In [4]:
# Resolve the data directory once, keep it in `data_path` for later cells,
# and echo it so the location being read from is visible.
print(data_path := get_data_path())

/Users/pats/Desktop/my-ml-process-1/data


In [5]:
# Load the raw CSV from the data directory and preview the first rows.
raw_csv_path = os.path.join(data_path, 'raw_data.csv')
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,Defect_of_vehicle,Area_accident_occured,Lanes_or_Medians,Road_allignment,Types_of_Junction,Road_surface_type,Road_surface_conditions,Light_conditions,Weather_conditions,Type_of_collision,Number_of_vehicles_involved,Number_of_casualties,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,No defect,Residential areas,,Tangent road with flat terrain,No junction,Asphalt roads,Dry,Daylight,Normal,Collision with roadside-parked vehicles,2,2,Going straight,na,na,na,na,,,Not a Pedestrian,Moving Backward,Slight Injury
1,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,No defect,Office areas,Undivided Two way,Tangent road with flat terrain,No junction,Asphalt roads,Dry,Daylight,Normal,Vehicle with vehicle collision,2,2,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury
2,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,,No defect,Recreational areas,other,,No junction,Asphalt roads,Dry,Daylight,Normal,Collision with roadside objects,2,2,Going straight,Driver or rider,Male,31-50,3,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury
3,1:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,,No defect,Office areas,other,Tangent road with mild grade and flat terrain,Y Shape,Earth roads,Dry,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,1:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,5-10yrs,No defect,Industrial areas,other,Tangent road with flat terrain,Y Shape,Asphalt roads,Dry,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury


In [6]:
# Mapping from the raw CSV headers to the shorter snake_case column names
# that are handed to the preprocessing step. Order matches the raw file.
col_map = dict(
    # When the accident happened
    Time='time',
    Day_of_week='day_of_week',
    # Driver
    Age_band_of_driver='driver_age',
    Sex_of_driver='driver_sex',
    Educational_level='educational_level',
    Vehicle_driver_relation='vehicle_driver_relation',
    Driving_experience='driving_experience',
    # Vehicle
    Type_of_vehicle='vehicle_type',
    Owner_of_vehicle='vehicle_owner',
    Service_year_of_vehicle='service_year',
    Defect_of_vehicle='vehicle_defect',
    # Location and road
    Area_accident_occured='accident_area',
    Lanes_or_Medians='lanes',
    Road_allignment='road_allignment',
    Types_of_Junction='junction_type',
    Road_surface_type='surface_type',
    Road_surface_conditions='road_surface_conditions',
    # Environment
    Light_conditions='light_condition',
    Weather_conditions='weather_condition',
    # Collision
    Type_of_collision='collision_type',
    Number_of_vehicles_involved='vehicles_involved',
    Number_of_casualties='casualties',
    Vehicle_movement='vehicle_movement',
    # Casualty
    Casualty_class='casualty_class',
    Sex_of_casualty='casualty_sex',
    Age_band_of_casualty='casualty_age',
    Casualty_severity='casualty_severity',
    Work_of_casuality='casualty_work',
    Fitness_of_casuality='casualty_fitness',
    Pedestrian_movement='pedestrian_movement',
    # Cause and outcome
    Cause_of_accident='accident_cause',
    Accident_severity='accident_severity',
)

In [7]:
# Show the signature and docstring of preprocess using IPython's `?` help.
# NOTE(review): development-time introspection; consider dropping this cell
# from the finished notebook.
preprocess?

Signature:
preprocess(
    df: pandas.core.frame.DataFrame,
    threshold: int,
    time_column: str,
    time_type: List[str],
    target: str,
    n_neighbors: int,
    sampling_strategy: Optional[Dict[str, int]] = None,
    col_map: Optional[Dict[str, str

In [8]:
# Run the project's preprocessing step on the raw frame and preview the result.
# The arguments are gathered in one place so they are easy to scan and tweak.
preprocess_kwargs = {
    'df': df,
    'col_map': col_map,               # raw header -> snake_case renames
    'target': 'accident_severity',    # target column, named after renaming
    'time_column': 'time',            # renamed 'Time' column
    'time_type': ['hour', 'minute'],
    'threshold': 2000,                # NOTE(review): meaning not visible here; see preprocess docs
    'n_neighbors': 5,                 # NOTE(review): presumably for a neighbour-based step; confirm
}
preprocessed_df = preprocess(**preprocess_kwargs)

preprocessed_df.head()

2023-04-20 19:43:53,289 - INFO - Performing preprocessing
2023-04-20 19:43:53,290 - INFO - Renaming columns using column map: {'Time': 'time', 'Day_of_week': 'day_of_week', 'Age_band_of_driver': 'driver_age', 'Sex_of_driver': 'driver_sex', 'Educational_level': 'educational_level', 'Vehicle_driver_relation': 'vehicle_driver_relation', 'Driving_experience': 'driving_experience', 'Type_of_vehicle': 'vehicle_type', 'Owner_of_vehicle': 'vehicle_owner', 'Service_year_of_vehicle': 'service_year', 'Defect_of_vehicle': 'vehicle_defect', 'Area_accident_occured': 'accident_area', 'Lanes_or_Medians': 'lanes', 'Road_allignment': 'road_allignment', 'Types_of_Junction': 'junction_type', 'Road_surface_type': 'surface_type', 'Road_surface_conditions': 'road_surface_conditions', 'Light_conditions': 'light_condition', 'Weather_conditions': 'weather_condition', 'Type_of_collision': 'collision_type', 'Number_of_vehicles_involved': 'vehicles_involved', 'Number_of_casualties': 'casualties', 'Vehicle_movement

Unnamed: 0,day_of_week,driver_age,driver_sex,educational_level,vehicle_driver_relation,driving_experience,vehicle_type,vehicle_owner,accident_area,lanes,road_allignment,junction_type,surface_type,road_surface_conditions,light_condition,weather_condition,collision_type,vehicles_involved,casualties,vehicle_movement,casualty_class,casualty_sex,casualty_age,casualty_severity,pedestrian_movement,accident_cause,accident_severity,hour,minute
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,9.0,2.8,5.0,1.0,0.0,0.0,3.0,2.0,3.0,1.0,1.0,2.0,3.0,2.0,5.0,3.0,5.0,9.0,2.0,17.0,2.0
1,1.0,1.0,1.0,4.0,0.0,3.0,11.0,3.0,6.0,4.0,5.0,1.0,0.0,0.0,3.0,2.0,8.0,1.0,1.0,2.0,3.0,2.0,5.0,3.0,5.0,16.0,2.0,17.0,2.0
2,1.0,0.0,1.0,4.0,0.0,0.0,5.0,3.0,1.0,6.0,5.4,1.0,0.0,0.0,3.0,2.0,2.0,1.0,1.0,2.0,0.0,1.0,1.0,2.0,5.0,0.0,1.0,17.0,2.0
3,3.0,0.0,1.0,4.0,0.0,2.0,11.0,0.0,6.0,6.0,6.0,7.0,2.0,0.0,0.0,2.0,8.0,1.0,1.0,2.0,2.0,0.0,0.0,2.0,5.0,1.0,2.0,1.0,6.0
4,3.0,0.0,1.0,4.0,0.0,1.0,6.0,3.0,4.0,6.0,5.0,7.0,0.0,0.0,0.0,2.0,8.0,1.0,1.0,2.0,3.0,2.0,5.0,3.0,5.0,16.0,2.0,1.0,6.0
