In [332]:
import pandas as pd
import numpy as np

## ACCIDENT

First we will clean the table "ACCIDENT"

In [333]:
# Load the raw ACCIDENT table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=r'../data/ACCIDENT.csv')
print(df.head(n=5))

    ACCIDENT_NO ACCIDENTDATE                    ACCIDENTTIME  ACCIDENT_TYPE  \
0  T20060000010   13/01/2006  12:42:00                                    1   
1  T20060000018   13/01/2006  19:10:00                                    1   
2  T20060000022   14/01/2006  12:10:00                                    7   
3  T20060000023   14/01/2006  11:49:00                                    1   
4  T20060000026   14/01/2006  10:45:00                                    1   

               Accident Type Desc  DAY_OF_WEEK Day Week Description  DCA_CODE  \
0          Collision with vehicle            6               Friday       113   
1          Collision with vehicle            6               Friday       113   
2  Fall from or in moving vehicle            7             Saturday       190   
3          Collision with vehicle            7             Saturday       130   
4          Collision with vehicle            7             Saturday       121   

                                  DCA 

  df = pd.read_csv(r'../data/ACCIDENT.csv')


In [334]:
# Sanity checks on the raw ACCIDENT table. Each check prints a label followed
# by a boolean; the frame itself is not modified in this cell.

# Key column: report missing and repeated accident numbers.
accident_no = df['ACCIDENT_NO']
print("Accident null: ", accident_no.isnull().any())

print("Accident no is duplicated: ", accident_no.duplicated().any())

# Dates and times: values that cannot be parsed are coerced to NaT.
parsed_dates = pd.to_datetime(df['ACCIDENTDATE'], dayfirst=True, errors='coerce')
print("All date are valid: ", parsed_dates.notnull().all())

parsed_times = pd.to_datetime(df['ACCIDENTTIME'], dayfirst=True, errors='coerce')
print("All time are valid: ", parsed_times.notnull().all())

# Code/description pairs: the code must not be null, and the number of
# distinct codes is compared with the number of distinct descriptions.
for label, code_column, desc_column in [
    ("Accident type", 'ACCIDENT_TYPE', 'Accident Type Desc'),
    ("Day of week", 'DAY_OF_WEEK', 'Day Week Description'),
    ("DCA", 'DCA_CODE', 'DCA Description'),
    ("Light condition", 'LIGHT_CONDITION', 'Light Condition Desc'),
]:
    has_null = df[code_column].isnull().any()
    same_cardinality = len(df[code_column].unique()) == len(df[desc_column].unique())
    print(label + " null: ", has_null, ", equal desc length: ", same_cardinality)

# Node identifiers: must be numeric; also flag values that are <= 0.
node_not_numeric = pd.to_numeric(df['NODE_ID'], errors='coerce').isnull().any()
print("Nodes not a number: ", node_not_numeric, ", smaller than 0: ", (df['NODE_ID'] <= 0).any())

# Count-like columns: every value must be parseable as a number.
for label, column in [
    ("No of vehicles", 'NO_OF_VEHICLES'),
    ("No of persons", 'NO_PERSONS'),
    ("No of persons injured 2", 'NO_PERSONS_INJ_2'),
    ("No of persons injured 3", 'NO_PERSONS_INJ_3'),
    ("No of persons killed", 'NO_PERSONS_KILLED'),
    ("No of persons not injured", 'NO_PERSONS_NOT_INJ'),
    ("No of polices attended", 'POLICE_ATTEND'),
    ("Severity", 'SEVERITY'),
]:
    print(label + " not a number: ", pd.to_numeric(df[column], errors='coerce').isnull().any())

geometry_has_null = df['ROAD_GEOMETRY'].isnull().any()
geometry_same_cardinality = len(df['ROAD_GEOMETRY'].unique()) == len(df['Road Geometry Desc'].unique())
print("Road geometry null: ", geometry_has_null, ", equal desc length: ", geometry_same_cardinality)

print("Speed zone not a number: ", pd.to_numeric(df['SPEED_ZONE'], errors='coerce').isnull().any())

Accident null:  False
Accident no is duplicated:  False
All date are valid:  True
All time are valid:  True
Accident type null:  False , equal desc length:  True
Day of week null:  False , equal desc length:  False
DCA null:  False , equal desc length:  True
Light condition null:  False , equal desc length:  True
Nodes not a number:  False , smaller than 0:  True
No of vehicles not a number:  False
No of persons not a number:  False
No of persons injured 2 not a number:  False
No of persons injured 3 not a number:  False
No of persons killed not a number:  False
No of persons not injured not a number:  False
No of polices attended not a number:  False
Severity not a number:  False
Road geometry null:  False , equal desc length:  True
Speed zone not a number:  False


We observe some problems with the given dataset:
- Day of week: codes 0 and 1 are both used for Sunday
- Node ID: contains values that are zero or negative

We fix these as follows:

In [335]:
# Day code 0 is folded into code 1 so that a single code remains for that day.
is_day_zero = df['DAY_OF_WEEK'].eq(0)
df.loc[is_day_zero, 'DAY_OF_WEEK'] = 1

# Keep only the rows whose node identifier is strictly positive.
has_valid_node = df['NODE_ID'].gt(0)
df = df[has_valid_node]

In [336]:
# Rebuild the weekday description from the numeric code (1 = Sunday ... 7 = Saturday).
day_names = {
    1: 'Sunday',
    2: 'Monday',
    3: 'Tuesday',
    4: 'Wednesday',
    5: 'Thursday',
    6: 'Friday',
    7: 'Saturday',
}

conditions = [df['DAY_OF_WEEK'].eq(code) for code in day_names]
choices = list(day_names.values())

# Codes outside the table are labelled "Unknown".
df['Day Week Description'] = np.select(conditions, choices, default="Unknown")

In [337]:
# Rebuild the road-geometry description from the numeric code.
road_geometry_names = {
    1: 'Cross intersection',
    2: 'T intersection',
    3: 'Y intersection',
    4: 'Multiple intersection',
    5: 'Not at intersection',
    6: 'Dead end',
    7: 'Road closure',
    8: 'Private property',
    9: 'Unknown',
}

conditions = [df['ROAD_GEOMETRY'].eq(code) for code in road_geometry_names]
choices = list(road_geometry_names.values())

# An explicit default is required: np.select otherwise falls back to the
# integer 0, which does not combine cleanly with string choices. "Unknown"
# matches the fallback used by the other description cells in this notebook.
df['Road Geometry Desc'] = np.select(conditions, choices, default="Unknown")

In [338]:
# Rebuild the light-condition description from the numeric code.
light_condition_names = {
    1: 'Day',
    2: 'Dusk/Dawn',
    3: 'Dark Street lights on',
    4: 'Dark Street lights off',
    5: 'Dark No street lights',
    6: 'Dark Street lights unknown',
    9: 'Unknown',
}

conditions = [df['LIGHT_CONDITION'].eq(code) for code in light_condition_names]
choices = list(light_condition_names.values())

# An explicit default is required: np.select otherwise falls back to the
# integer 0, which does not combine cleanly with string choices. "Unknown"
# matches the fallback used by the other description cells in this notebook.
df['Light Condition Desc'] = np.select(conditions, choices, default="Unknown")

In [339]:
# Write the cleaned ACCIDENT table, without the index column.
df.to_csv(path_or_buf="../data/clean/ACCIDENT.csv", index=False)

## PERSON

Now we'll move to table "PERSON"

In [340]:
# Load the raw PERSON table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=r'../data/PERSON.csv')
print(df.head(n=5))

  df = pd.read_csv(r'../data/PERSON.csv')


    ACCIDENT_NO PERSON_ID VEHICLE_ID SEX   AGE Age Group INJ_LEVEL  \
0  T20060000010        01          A   F   NaN   unknown         4   
1  T20060000010        02          C   M  43.0     40-49         4   
2  T20060000010        03          C   M  22.0     22-25         4   
3  T20060000010        A           A   M  72.0       70+         4   
4  T20060000010        B           B   F  62.0     60-64         3   

  Inj Level Desc SEATING_POSITION HELMET_BELT_WORN ROAD_USER_TYPE  \
0    Not injured               LF                1              3   
1    Not injured               LF                1              3   
2    Not injured               LR                1              3   
3    Not injured               D                 1              2   
4   Other injury               D                 1              2   

  Road User Type Desc LICENCE_STATE PEDEST_MOVEMENT  POSTCODE TAKEN_HOSPITAL  \
0          Passengers                             0    3130.0                  
1   

In [341]:
# Sanity checks on the raw PERSON table; each line prints a label and a boolean.

# Identifier columns must not contain nulls.
for label, column in [
    ("Accident", 'ACCIDENT_NO'),
    ("Person ID", 'PERSON_ID'),
    ("Vehicle ID", 'VEHICLE_ID'),
]:
    print(label + " null: ", df[column].isnull().any())

# SEX is expected to be one of M / F / U; anything else (including null) is flagged.
sex_is_unexpected = ~df['SEX'].isin(["M", "F", "U"])
print("Sex null/ not F/M/U: ", sex_is_unexpected.any())

age_not_numeric = pd.to_numeric(df['AGE'], errors='coerce').isnull()
print("Age is not a number: ", age_not_numeric.any())

inj_not_numeric = pd.to_numeric(df['INJ_LEVEL'], errors='coerce').isnull()
inj_same_cardinality = len(df['INJ_LEVEL'].unique()) == len(df['Inj Level Desc'].unique())
print("Injured level is not a number: ", inj_not_numeric.any(), ", equal desc length: ", inj_same_cardinality)

# A seating position counts as missing when it is null or whitespace only.
seat_is_blank = df['SEATING_POSITION'].isnull() | df['SEATING_POSITION'].str.isspace()
print("Seating position null or empty string: ", seat_is_blank.any())

road_user_not_numeric = pd.to_numeric(df['ROAD_USER_TYPE'], errors='coerce').isnull()
print("Road user type not a number: ", road_user_not_numeric.any())

Accident null:  False
Person ID null:  False
Vehicle ID null:  False
Sex null/ not F/M/U:  True
Age is not a number:  True
Injured level is not a number:  True , equal desc length:  False
Seating position null or empty string:  True
Road user type not a number:  True


We fix the problems found in `SEX`, `AGE`, `INJ_LEVEL`, `SEATING_POSITION` and `ROAD_USER_TYPE` as follows:

In [342]:
# SEX: any value other than M / F / U (including missing values) becomes "U".
df.loc[~df['SEX'].isin(["M", "F", "U"]), 'SEX'] = "U"

# Coerce the numeric columns up front so that later comparisons operate on
# real numbers; values that cannot be parsed become NaN.
for numeric_column in ['AGE', 'INJ_LEVEL', 'HELMET_BELT_WORN']:
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')

# SEATING_POSITION: null or whitespace-only entries are stored as missing.
seat_is_blank = df['SEATING_POSITION'].isnull() | df['SEATING_POSITION'].str.isspace()
df.loc[seat_is_blank, "SEATING_POSITION"] = None

# ROAD_USER_TYPE: convert to numbers *before* comparing with 9, otherwise a
# 9 stored as text would not match. Unparseable values become 9; the cast
# restores the integer dtype after NaN filling.
road_user_type = pd.to_numeric(df['ROAD_USER_TYPE'], errors='coerce').fillna(9)
df['ROAD_USER_TYPE'] = road_user_type.astype('int64')
df.loc[df['ROAD_USER_TYPE'] == 9, "Road User Type Desc"] = 'Unknown'

Now we will standardise `Age Group`:

In [343]:
# Distinct age-group labels present in the raw data, in order of first appearance.
pd.unique(df['Age Group'])

array(['unknown', '40-49', '22-25', '70+', '60-64', '30-39', '17-21',
       '50-59', '26-29', '16-17', '13-15', '5-12', '0-4', '64-69'],
      dtype=object)

In [344]:
# Recompute the age group from AGE using closed (both ends inclusive) bands;
# the last band is open-ended.
age_bands = [
    (0, 4), (5, 12), (13, 15), (16, 17), (18, 21), (22, 25),
    (26, 29), (30, 39), (40, 49), (50, 59), (60, 64), (65, 69),
]

conditions = [df['AGE'].between(low, high, inclusive='both') for low, high in age_bands]
conditions.append(df['AGE'].ge(70))

choices = [f"{low}-{high}" for low, high in age_bands] + ["70+"]

# Rows whose AGE matches no band (e.g. missing ages) are labelled "Unknown".
df['Age Group'] = np.select(conditions, choices, default="Unknown")

Now we standardise `Road User type description`:

In [345]:
# Distinct road-user codes and descriptions currently in the table.
for column in ('ROAD_USER_TYPE', 'Road User Type Desc'):
    print(df[column].unique())

# Show which description accompanies codes 7 and 8.
for code in (7, 8):
    print(df.loc[df['ROAD_USER_TYPE'] == code, 'Road User Type Desc'].head())

[3 2 5 4 9 1 6 7 8]
['Passengers' 'Drivers' 'Pillion Passengers' 'Motorcyclists' 'Unknown'
 'Pedestrians' 'Bicyclists']
2128    Drivers
2623    Drivers
2637    Drivers
3876    Drivers
3974    Drivers
Name: Road User Type Desc, dtype: object
5765     Passengers
6339     Passengers
6340     Passengers
6537     Passengers
11222    Passengers
Name: Road User Type Desc, dtype: object


In [346]:
# Merge code 7 into 2 and code 8 into 3, applied in that order.
for old_code, new_code in ((7, 2), (8, 3)):
    df.loc[df['ROAD_USER_TYPE'].eq(old_code), 'ROAD_USER_TYPE'] = new_code

# Rebuild the description from the (now merged) code.
road_user_names = {
    3: 'Passengers',
    2: 'Drivers',
    5: 'Pillion Passengers',
    4: 'Motorcyclists',
    9: 'Unknown',
    1: 'Pedestrians',
    6: 'Bicyclists',
}

conditions = [df['ROAD_USER_TYPE'].eq(code) for code in road_user_names]
choices = list(road_user_names.values())

df['Road User Type Desc'] = np.select(conditions, choices, default="Unknown")

In [347]:
# Distinct injury codes and descriptions before standardising.
for column in ('INJ_LEVEL', 'Inj Level Desc'):
    print(df[column].unique())

# Rebuild the description from the numeric code.
injury_names = {
    4: 'Not injured',
    3: 'Other injury',
    2: 'Serious injury',
    1: 'Fatality',
    9: 'Unknown',
}

conditions = [df['INJ_LEVEL'].eq(code) for code in injury_names]
choices = list(injury_names.values())

# Codes outside the table (including missing values) are labelled "Unknown".
df['Inj Level Desc'] = np.select(conditions, choices, default="Unknown")

[ 4.  3.  2.  1. nan]
['Not injured' 'Other injury' 'Serious injury' 'Fatality' 'Unknown']


In [348]:
# Write the cleaned PERSON table, without the index column.
df.to_csv(path_or_buf="../data/clean/PERSON.csv", index=False)

## ACCIDENT_LOCATION

Now we move to table "ACCIDENT_LOCATION"

In [349]:
# Load the raw ACCIDENT_LOCATION table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=r'../data/ACCIDENT_LOCATION.csv')
print(df.head(n=5))

    ACCIDENT_NO  NODE_ID  ROAD_ROUTE_1   ROAD_NAME ROAD_TYPE    ROAD_NAME_INT  \
0  T20060000010    43078        2090.0      FOSTER    STREET           MCCRAE   
1  T20060000018    29720        5057.0      HALLAM      ROAD  BELGRAVE-HALLAM   
2  T20060000022   203074        9999.0      BROWNS      ROAD         TRUEMANS   
3  T20060000023    55462        2400.0  SPRINGVALE      ROAD      KEYSBOROUGH   
4  T20060000026   202988        9999.0   ELIZABETH    AVENUE        GREENHOOD   

  ROAD_TYPE_INT  DISTANCE_LOCATION DIRECTION_LOCATION  NEAREST_KM_POST  \
0        STREET                0.0                 SW              NaN   
1          ROAD               70.0                  S              NaN   
2          ROAD              210.0                  W              NaN   
3        AVENUE                0.0                  N              NaN   
4      CRESCENT               20.0                  N              NaN   

  OFF_ROAD_LOCATION  
0               NaN  
1               NaN  
2 

In [350]:
# Sanity checks on ACCIDENT_LOCATION: key presence and node identifier validity.
accident_has_null = df['ACCIDENT_NO'].isnull().any()
print("Accident null: ", accident_has_null)

node_not_numeric = pd.to_numeric(df['NODE_ID'], errors='coerce').isnull().any()
node_not_positive = (df['NODE_ID'] <= 0).any()
print("Nodes not a number: ", node_not_numeric, ", smaller than 0: ", node_not_positive)

Accident null:  False
Nodes not a number:  False , smaller than 0:  True


In [351]:
# Keep only the rows whose node identifier is strictly positive.
has_valid_node = df['NODE_ID'].gt(0)
df = df[has_valid_node]

In [352]:
# Write the cleaned ACCIDENT_LOCATION table, without the index column.
df.to_csv(path_or_buf="../data/clean/ACCIDENT_LOCATION.csv", index=False)

## NODE

Now we will clean table "NODE"

In [353]:
# Load the raw NODE table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=r'../data/NODE.csv')
print(df.head(n=5))

    ACCIDENT_NO  NODE_ID NODE_TYPE  VICGRID94_X  VICGRID94_Y   LGA_NAME  \
0  T20060002689       22         I  2495701.925  2411599.135  MELBOURNE   
1  T20060010827       22         I  2495701.925  2411599.135  MELBOURNE   
2  T20060017279       22         I  2495701.925  2411599.135  MELBOURNE   
3  T20060041762       22         I  2495701.925  2411599.135  MELBOURNE   
4  T20060047478       22         I  2495701.925  2411599.135  MELBOURNE   

  LGA_NAME_ALL                     REGION_NAME DEG_URBAN_NAME        Lat  \
0    MELBOURNE  METROPOLITAN NORTH WEST REGION     MELB_URBAN -37.796596   
1    MELBOURNE  METROPOLITAN NORTH WEST REGION     MELB_URBAN -37.796596   
2    MELBOURNE  METROPOLITAN NORTH WEST REGION     MELB_URBAN -37.796596   
3    MELBOURNE  METROPOLITAN NORTH WEST REGION     MELB_URBAN -37.796596   
4    MELBOURNE  METROPOLITAN NORTH WEST REGION     MELB_URBAN -37.796596   

         Long  POSTCODE_NO  
0  144.951197         3051  
1  144.951197         3051  
2  14

In [354]:
# Sanity checks on NODE: key presence, node identifier validity, numeric coordinates.
accident_has_null = df['ACCIDENT_NO'].isnull().any()
print("Accident null: ", accident_has_null)

node_not_numeric = pd.to_numeric(df['NODE_ID'], errors='coerce').isnull().any()
node_not_positive = (df['NODE_ID'] <= 0).any()
print("Nodes not a number: ", node_not_numeric, ", smaller than 0: ", node_not_positive)

for coordinate in ('Lat', 'Long'):
    print(coordinate + " not a number: ", pd.to_numeric(df[coordinate], errors='coerce').isnull().any())

Accident null:  False
Nodes not a number:  False , smaller than 0:  False
Lat not a number:  False
Long not a number:  False


In [355]:
# Write the NODE table to the clean folder, without the index column.
df.to_csv(path_or_buf="../data/clean/NODE.csv", index=False)

## ROAD SURFACE CONDITION

Now we will clean table "ROAD_SURFACE_COND"

In [356]:
# Load the raw ROAD_SURFACE_COND table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=r'../data/ROAD_SURFACE_COND.csv')
print(df.head(n=5))

    ACCIDENT_NO  SURFACE_COND Surface Cond Desc  SURFACE_COND_SEQ
0  T20060000010             1               Dry                 1
1  T20060000018             1               Dry                 1
2  T20060000022             1               Dry                 1
3  T20060000023             1               Dry                 1
4  T20060000026             1               Dry                 1


In [357]:
# Sanity checks on ROAD_SURFACE_COND, followed by the distinct codes and labels.
accident_has_null = df['ACCIDENT_NO'].isnull().any()
print("Accident null: ", accident_has_null)

surface_has_null = df['SURFACE_COND'].isnull().any()
surface_not_numeric = pd.to_numeric(df['SURFACE_COND'], errors='coerce').isnull().any()
print("Surface condition null: ", surface_has_null, surface_not_numeric)

for column in ('SURFACE_COND', 'Surface Cond Desc'):
    print(df[column].unique())

Accident null:  False
Surface condition null:  False False
[1 2 9 3 5 4]
['Dry' 'Wet' 'Unknown' 'Muddy' 'Icy' 'Snowy']


We standardise `Surface Condition Description` as below:

In [358]:
# Rebuild the surface-condition description from the numeric code.
surface_names = {
    1: 'Dry',
    2: 'Wet',
    9: 'Unknown',
    3: 'Muddy',
    5: 'Icy',
    4: 'Snowy',
}

conditions = [df['SURFACE_COND'].eq(code) for code in surface_names]
choices = list(surface_names.values())

# Codes outside the table are labelled "Unknown".
df['Surface Cond Desc'] = np.select(conditions, choices, default="Unknown")

In [359]:
# Write the cleaned ROAD_SURFACE_COND table, without the index column.
df.to_csv(path_or_buf="../data/clean/ROAD_SURFACE_COND.csv", index=False)

## ATMOSPHERIC CONDITION

Now we will clean table "ATMOSPHERIC_COND"

In [360]:
# Load the raw ATMOSPHERIC_COND table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=r'../data/ATMOSPHERIC_COND.csv')
print(df.head(n=5))

    ACCIDENT_NO  ATMOSPH_COND  ATMOSPH_COND_SEQ Atmosph Cond Desc
0  T20060000010             1                 1             Clear
1  T20060000018             1                 1             Clear
2  T20060000022             1                 1             Clear
3  T20060000023             1                 1             Clear
4  T20060000026             1                 1             Clear


In [361]:
# Sanity checks on ATMOSPHERIC_COND, followed by the distinct codes and labels.
accident_has_null = df['ACCIDENT_NO'].isnull().any()
print("Accident null: ", accident_has_null)

atmosph_not_numeric = pd.to_numeric(df['ATMOSPH_COND'], errors='coerce').isnull().any()
print("Atmospheric condition not a number: ", atmosph_not_numeric)

for column in ('ATMOSPH_COND', 'Atmosph Cond Desc'):
    print(df[column].unique())

Accident null:  False
Atmospheric condition not a number:  False
[1 9 5 7 2 6 4 3]
['Clear' 'Not known' 'Smoke' 'Strong winds' 'Raining' 'Dust' 'Fog'
 'Snowing']


In [362]:
# Rebuild the atmospheric-condition description from the numeric code.
# Code 9 is written as 'Unknown' here.
atmosph_names = {
    1: 'Clear',
    9: 'Unknown',
    5: 'Smoke',
    7: 'Strong winds',
    2: 'Raining',
    6: 'Dust',
    4: 'Fog',
    3: 'Snowing',
}

conditions = [df['ATMOSPH_COND'].eq(code) for code in atmosph_names]
choices = list(atmosph_names.values())

# Codes outside the table are labelled "Unknown".
df['Atmosph Cond Desc'] = np.select(conditions, choices, default="Unknown")

In [363]:
# Write the cleaned ATMOSPHERIC_COND table, without the index column.
df.to_csv(path_or_buf="../data/clean/ATMOSPHERIC_COND.csv", index=False)

## VEHICLE

Lastly, we will clean the "VEHICLE" table

In [364]:
# Load the raw VEHICLE table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=r'../data/VEHICLE.csv')
print(df.head(n=5))

  df = pd.read_csv(r'../data/VEHICLE.csv')


    ACCIDENT_NO VEHICLE_ID  VEHICLE_YEAR_MANUF VEHICLE_DCA_CODE  \
0  T20060000010          A              1996.0                2   
1  T20060000010          B              2003.0                1   
2  T20060000010          C              2001.0                8   
3  T20060000018          A              1998.0                2   
4  T20060000018          B              1991.0                1   

  INITIAL_DIRECTION  ROAD_SURFACE_TYPE Road Surface Type Desc REG_STATE  \
0                SW                  1                  Paved         V   
1                NW                  1                  Paved         V   
2                NW                  1                  Paved         V   
3                S                   1                  Paved         V   
4                N                   1                  Paved         V   

  VEHICLE_BODY_STYLE VEHICLE_MAKE  ... VEHICLE_COLOUR_1  VEHICLE_COLOUR_2  \
0             SEDAN        MITSUB  ...              MRN              

In [365]:
# Sanity checks on the raw VEHICLE table; each line prints a label and a boolean.

# Identifier columns must not contain nulls.
for label, column in [("Accident", 'ACCIDENT_NO'), ("Vehicle", 'VEHICLE_ID')]:
    print(label + " null: ", df[column].isnull().any())

# The (accident, vehicle) pair is checked for repeats; keep=False marks every
# member of a duplicated group.
key_columns = ['ACCIDENT_NO', 'VEHICLE_ID']
print("Accident No and Vehicel ID is duplicated: ", df.duplicated(subset=key_columns, keep=False).any())

# Numeric columns: every value must be parseable as a number.
for label, column in [
    ("Vehicle manifacture year", 'VEHICLE_YEAR_MANUF'),
    ("Road Surface type", 'ROAD_SURFACE_TYPE'),
    ("Vehicle type", 'VEHICLE_TYPE'),
]:
    print(label + " not a number: ", pd.to_numeric(df[column], errors='coerce').isnull().any())

Accident null:  False
Vehicle null:  False
Accident No and Vehicel ID is duplicated:  False
Vehicle manifacture year not a number:  True
Road Surface type not a number:  False
Vehicle type not a number:  False


In [366]:
# Manufacture years that are missing or not numeric are replaced by the sentinel -1.
year_is_invalid = pd.to_numeric(df['VEHICLE_YEAR_MANUF'], errors='coerce').isna()
df.loc[year_is_invalid, 'VEHICLE_YEAR_MANUF'] = -1

In [367]:
# Distinct road-surface codes and their descriptions.
for column in ('ROAD_SURFACE_TYPE', 'Road Surface Type Desc'):
    print(df[column].unique())

[1 2 3 9]
['Paved' 'Unpaved' 'Gravel' 'Unknown']


In [368]:
# Distinct vehicle-type codes and their descriptions.
for column in ('VEHICLE_TYPE', 'Vehicle Type Desc'):
    print(df[column].unique())

[ 1  3  4 10 71  5  2  6 72 13 17  7 12  8 99  9 11 15 60 16 18 61 14 62
 63 20 27 19]
['Car' 'Taxi' 'Utility' 'Motor Cycle'
 'Light Commercial Vehicle (Rigid) <= 4.5 Tonnes GVM' 'Panel Van'
 'Station Wagon' 'Prime Mover (No of Trailers Unknown)'
 'Heavy Vehicle (Rigid) > 4.5 Tonnes' 'Bicycle' 'Other Vehicle'
 'Rigid Truck(Weight Unknown)' 'Motor Scooter' 'Bus/Coach' 'Unknown'
 'Mini Bus(9-13 seats)' 'Moped' 'Tram' 'Prime Mover Only' 'Train'
 'Not Applicable' 'Prime Mover - Single Trailer' 'Horse (ridden or drawn)'
 'Prime Mover B-Double' 'Prime Mover B-Triple' 'Quad Bike'
 'Plant machinery and Agricultural equipment' 'Parked trailers']


In [369]:
# Rebuild the vehicle-type description from the numeric code.
vehicle_type_names = {
    1: 'Car',
    3: 'Taxi',
    4: 'Utility',
    10: 'Motor Cycle',
    71: 'Light Commercial Vehicle (Rigid) <= 4.5 Tonnes GVM',
    5: 'Panel Van',
    2: 'Station Wagon',
    6: 'Prime Mover (No of Trailers Unknown)',
    72: 'Heavy Vehicle (Rigid) > 4.5 Tonnes',
    13: 'Bicycle',
    17: 'Other Vehicle',
    7: 'Rigid Truck(Weight Unknown)',
    12: 'Motor Scooter',
    8: 'Bus/Coach',
    99: 'Unknown',
    9: 'Mini Bus(9-13 seats)',
    11: 'Moped',
    15: 'Tram',
    60: 'Prime Mover Only',
    16: 'Train',
    18: 'Not Applicable',
    61: 'Prime Mover - Single Trailer',
    14: 'Horse (ridden or drawn)',
    62: 'Prime Mover B-Double',
    63: 'Prime Mover B-Triple',
    20: 'Quad Bike',
    27: 'Plant machinery and Agricultural equipment',
    19: 'Parked trailers',
}

conditions = [df['VEHICLE_TYPE'].eq(code) for code in vehicle_type_names]
choices = list(vehicle_type_names.values())

# Codes outside the table are labelled "Unknown".
df['Vehicle Type Desc'] = np.select(conditions, choices, default="Unknown")

In [370]:
# Rebuild the road-surface description from the numeric code.
road_surface_names = {
    1: 'Paved',
    2: 'Unpaved',
    3: 'Gravel',
    9: 'Unknown',
}

conditions = [df['ROAD_SURFACE_TYPE'].eq(code) for code in road_surface_names]
choices = list(road_surface_names.values())

# Codes outside the table are labelled "Unknown".
df['Road Surface Type Desc'] = np.select(conditions, choices, default="Unknown")

In [371]:
# Trim surrounding whitespace from the body-style codes, then list the
# distinct values that remain.
stripped_body_style = df['VEHICLE_BODY_STYLE'].str.strip()
df['VEHICLE_BODY_STYLE'] = stripped_body_style
stripped_body_style.unique()

array(['SEDAN', 'COUPE', 'DC UTE', 'SOLO', 'T TRK', 'VAN', 'S WAG',
       'WAGON', 'P MVR', 'UTIL', '', 'MULTI', 'TRAY', 'SCOOTR', 'CYCLE',
       'BUS', 'CONVRT', 'TIPPER', 'MULTIX', 'RDSTR', 'P VAN', 'TRACT',
       'G UNIT', 'SP VEH', 'UNK', 'SED', 'S TRL', 'C MIX', 'TANKER',
       'TOURER', 'EXCVTR', 'T SDS', 'MOPED', 'PBCYC', 'M PLAT', 'SWEEP',
       'TRAIN', 'C CHAS', 'HOE', 'FRAME', 'P CARR', 'FLUSH', 'F LIFT',
       'CRANE', 'BD/PMR', 'SPREAD', 'MISC', 'CARVN', 'CHAIR', 'BOX',
       'SERV', 'HOR FL', 'TILTER', 'S AMRV', 'AMB', 'S TIP', 'LOADER',
       'M TRLY', 'EDUCTR', 'WINCH', 'JEEP', 'B HOE', 'R AMUS', 'S CAR',
       'DUMPER', 'ST TRK', 'AG IMP', 'SPRAY', 'FLOAT', 'TRAM', 'F UNIT',
       'C CARR', 'CONT-C', 'PBVEH', 'ARM V', 'WRKP U', 'ROLLER', 'GRADER',
       'DOZER', 'AFRAME', 'SKIP C', 'B BIN', 'CARAVN', 'BUGGY', 'TRICAR',
       'MACH', 'WHCBUG', 'COMPAC', 'S AMUS', 'T JINK', 'MOWER', 'TOWER',
       'AMUS', 'MACHNE', 'S TRAY', 'SKIP/C', 'SNOW M', 'P', 'C PUMP'

In [372]:
# Label placeholder body styles as "Unknown". Besides the '-' marker, the
# stripped column also contains empty strings (visible in the unique values
# printed by the previous cell), which would otherwise stay blank.
body_style_is_placeholder = df['VEHICLE_BODY_STYLE'].isin(['', '-'])
df.loc[body_style_is_placeholder, 'VEHICLE_BODY_STYLE'] = "Unknown"

In [373]:
# Write the cleaned VEHICLE table, without the index column.
df.to_csv(path_or_buf="../data/clean/VEHICLE.csv", index=False)