In [1]:
import csv
import math
import sys
import os
import datetime
import gc
import psutil

import numpy as np
import pandas as pd

import random
from random import seed
from random import randrange

import sklearn
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from catboost import CatBoostClassifier

In [2]:
##################################################################
##Define all configs and params
##################################################################
class config_desc():
    DESCRIPTTION = [{"PATH": "Path to the input data files"}, 
                   {"SEED": "For reproducability"}]

class config():
    SEED = 13
    PATH = "/kaggle/input/Foml-2021/"
    TRAIN = PATH + "train.csv"
    TEST = PATH + "test.csv"


##################################################################
##Seed to ensure reproducability of results
##################################################################
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(config.SEED)
process = psutil.Process(os.getpid())

In [3]:
print('This code was executed succesfully on Python 3.7.10. Current version:', sys.version, "\n")
print('Version of Scikit used: 0.23.2. Current version:', sklearn.__version__)
print('Version of Numpy used: 1.19.5. Current version:', np.__version__, "\n")
print("Other Pre-requisites to run the program - Update the config.PATH var to reflect actual file path\n")
print("CONFIG DESCRIPTION:")
[print(config_desc.DESCRIPTTION[i]) for i in range(len(config_desc.DESCRIPTTION))]    
print("\nCURRENT CONFIG:\n", "\n".join(str(str(vars(config)).split(",")[1:-3]).replace('"', '').split(",")))

This code was executed succesfully on Python 3.7.10. Current version: 3.7.10 | packaged by conda-forge | (default, Sep 13 2021, 19:43:44) 
[GCC 9.4.0] 

Version of Scikit used: 0.23.2. Current version: 0.23.2
Version of Numpy used: 1.19.5. Current version: 1.19.5 

Other Pre-requisites to run the program - Update the config.PATH var to reflect actual file path

CONFIG DESCRIPTION:
{'PATH': 'Path to the input data files'}
{'SEED': 'For reproducability'}

CURRENT CONFIG:
 [ 'SEED': 13
  'PATH': '/kaggle/input/Foml-2021/'
  'TRAIN': '/kaggle/input/Foml-2021/train.csv'
  'TEST': '/kaggle/input/Foml-2021/test.csv']


In [4]:
train, test = [pd.read_csv(file) for file in [config.TRAIN, config.TEST]]

  """Entry point for launching an IPython kernel.


We see from the warning that the columns have mixed data types in them. Hopefully nothing has been corrupted during loading. We will take care of this during NP conversion

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51490 entries, 0 to 51489
Data columns (total 42 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Report Number                   51490 non-null  object 
 1   Local Case Number               45037 non-null  object 
 2   Agency Name                     51490 non-null  object 
 3   ACRS Report Type                51490 non-null  object 
 4   Crash Date/Time                 51490 non-null  object 
 5   Route Type                      46522 non-null  object 
 6   Road Name                       46988 non-null  object 
 7   Cross-Street Type               46471 non-null  object 
 8   Cross-Street Name               46968 non-null  object 
 9   Off-Road Description            6950 non-null   object 
 10  Municipality                    7567 non-null   object 
 11  Related Non-Motorist            3648 non-null   object 
 12  Collision Type                  

In [6]:
train.head(3)

Unnamed: 0,Report Number,Local Case Number,Agency Name,ACRS Report Type,Crash Date/Time,Route Type,Road Name,Cross-Street Type,Cross-Street Name,Off-Road Description,...,Driverless Vehicle,Parked Vehicle,Vehicle Year,Vehicle Make,Vehicle Model,Equipment Problems,Latitude,Longitude,Location,Fault
0,MP060D,10196,Montgomery County Police,Injury Crash,06/05/2017 04:27:00 PM,Maryland (State),OLD HUNDRED RD,County,THURSTON RD,12305 SHOREFIELD RD,...,No,No,2005,GMC,TRUCK,NO MISUSE,39.263378,-77.344203,3.6383 7.4233,1
1,MP000X,20080,Montgomery County Police,Injury Crash,02/22/2020 10:00:00 AM,County,JANET RD,County,FLACK ST,,...,No,Yes,2018,FORD,FIESTA,NO MISUSE,39.068913,-77.063227,3.0862 7.324,1
2,MP070N,10103,Montgomery County Police,Property Damage Crash,06/11/2017 08:21:00 AM,Maryland (State),COLESVILLE RD,Maryland (State),STRUC #15082,,...,No,No,2016,NEW FLYER,TBU,NO MISUSE,38.993578,-77.032718,3.9583 7.3783,0


We see some NaNs in the first couple of rows itself

In [7]:
train.columns

Index(['Report Number', 'Local Case Number', 'Agency Name', 'ACRS Report Type',
       'Crash Date/Time', 'Route Type', 'Road Name', 'Cross-Street Type',
       'Cross-Street Name', 'Off-Road Description', 'Municipality',
       'Related Non-Motorist', 'Collision Type', 'Weather',
       'Surface Condition', 'Light', 'Traffic Control',
       'Driver Substance Abuse', 'Non-Motorist Substance Abuse', 'Person ID',
       'Injury Severity', 'Circumstance', 'Drivers License State',
       'Vehicle ID', 'Vehicle Damage Extent', 'Vehicle First Impact Location',
       'Vehicle Second Impact Location', 'Vehicle Body Type',
       'Vehicle Movement', 'Vehicle Continuing Dir', 'Vehicle Going Dir',
       'Speed Limit', 'Driverless Vehicle', 'Parked Vehicle', 'Vehicle Year',
       'Vehicle Make', 'Vehicle Model', 'Equipment Problems', 'Latitude',
       'Longitude', 'Location', 'Fault'],
      dtype='object')

The "fault" col seems to be the label. 

At first glance, there seems to be lots of features which are very relevant to determining the label. E.g - Equipment Problems, Vehicle Year, Weather, Surface Condition, Light, Traffic Control, Driver Substance Abuse etc. There seem to be some like report number and case number which should ideally not have any connection with the label

In [8]:
print ("Label distribution in train", round(train.Fault.sum()/len(train),2))

Label distribution in train 0.55


Labels slightly skewed towards faulty drivers. This is true for Public LB also as my first entry was a dummy all 0 CSV file which gave around 45% on the public LB

In [9]:
missing = train.columns[train.isnull().any()]
print("Columns having null\n", missing)

Columns having null
 Index(['Local Case Number', 'Route Type', 'Road Name', 'Cross-Street Type',
       'Cross-Street Name', 'Off-Road Description', 'Municipality',
       'Related Non-Motorist', 'Collision Type', 'Weather',
       'Surface Condition', 'Light', 'Traffic Control',
       'Driver Substance Abuse', 'Non-Motorist Substance Abuse',
       'Circumstance', 'Drivers License State', 'Vehicle Damage Extent',
       'Vehicle First Impact Location', 'Vehicle Second Impact Location',
       'Vehicle Body Type', 'Vehicle Movement', 'Vehicle Continuing Dir',
       'Vehicle Going Dir', 'Vehicle Make', 'Vehicle Model',
       'Equipment Problems'],
      dtype='object')


We will have to impute these values else the models will not process them or use their in-built logic which might not necessarily the best. Rather than letting the model handle everything, let us try to analyze and decide the course of action for missing cols ourselves.

This is going to be sizable work. This will also be our EDA and we may get to know more about the cols

In [10]:
for col in missing:
    non_miss = train[train[col].notnull()][col]
    print("\nCOL NAME:", col, "-"*4, "TYPE OF 1st NON NULL:", type(non_miss.values[0]),"-"*4,"MISSING CNT:", train[col].isnull().sum())
    print("MODE:", train[col].mode().values[0]) if isinstance(non_miss.values[0], str) \
                                                else print("MODE, MEAN:", train[col].mode().values[0], non_miss.mean())
    print("NUM OF UNIQUE VALUES", len(train[col].unique()), "SAMPLE:", train[col].unique()[:4])
    ##print("FIRST FOUR VALUES:", [train[col][i] for i in range(4)])


COL NAME: Local Case Number ---- TYPE OF 1st NON NULL: <class 'str'> ---- MISSING CNT: 6453
MODE: 1005
NUM OF UNIQUE VALUES 1430 SAMPLE: ['10196' '20080' '10103' '10208']

COL NAME: Route Type ---- TYPE OF 1st NON NULL: <class 'str'> ---- MISSING CNT: 4968
MODE: Maryland (State)
NUM OF UNIQUE VALUES 11 SAMPLE: ['Maryland (State)' 'County' nan 'US (State)']

COL NAME: Road Name ---- TYPE OF 1st NON NULL: <class 'str'> ---- MISSING CNT: 4502
MODE: GEORGIA AVE
NUM OF UNIQUE VALUES 2227 SAMPLE: ['OLD HUNDRED RD' 'JANET RD' 'COLESVILLE RD' 'PARKLAND DR']

COL NAME: Cross-Street Type ---- TYPE OF 1st NON NULL: <class 'str'> ---- MISSING CNT: 5019
MODE: County
NUM OF UNIQUE VALUES 11 SAMPLE: ['County' 'Maryland (State)' 'Municipality' 'Other Public Roadway']

COL NAME: Cross-Street Name ---- TYPE OF 1st NON NULL: <class 'str'> ---- MISSING CNT: 4522
MODE: GEORGIA AVE
NUM OF UNIQUE VALUES 4526 SAMPLE: ['THURSTON RD' 'FLACK ST' 'STRUC #15082' 'FRANKFORT DR']

COL NAME: Off-Road Description ---

We create a dict which will tell us the action needed for each col. This will be useful later when processing test. Note that we will NOT blindly replace all missing cols by mode or mean but take specific action on a per col basis

In [11]:
col_imputed_val = []

##Missing Equip Prob col will be tagged as UNKNOWN
##Note that we have used UNKNOWN here instead of the mode which is "NO MISUSE"
col_imputed_val.append(("Equipment Problems", "UNKNOWN"))

col_imputed_val.append(("Vehicle Model", "MODE"))
col_imputed_val.append(("Vehicle Make", "MODE"))

##Missing Vehicle Going Dir will be tagged to Unknown
col_imputed_val.append(("Vehicle Going Dir", "Unknown"))
col_imputed_val.append(("Vehicle Continuing Dir", "Unknown"))
##Note that we will later combine the above 2 as they seem related

col_imputed_val.append(("Vehicle Movement", "MODE"))
col_imputed_val.append(("Vehicle Body Type", "MODE"))

col_imputed_val.append(("Vehicle Second Impact Location", "UNKNOWN"))
col_imputed_val.append(("Vehicle First Impact Location", "UNKNOWN"))

col_imputed_val.append(("Vehicle Damage Extent", "MODE"))
col_imputed_val.append(("Drivers License State", "MODE"))
##Above 2 dont seem to be very good features and may just create noise??

##Circumstance. This is a VERY critical feature. Unfortunatelt 75% values missing
##We cant use MODE here.  We use a new value "UNKNOWN"
##We also need to simplify the catogeries in this feature somehow
col_imputed_val.append(("Circumstance", "UNKNOWN"))

##Non-Motorist Substance Abuse. More than 90% values missing but VERY critical
col_imputed_val.append(("Non-Motorist Substance Abuse", "UNKNOWN"))

col_imputed_val.append(("Driver Substance Abuse", "UNKNOWN"))
col_imputed_val.append(("Surface Condition", "UNKNOWN"))
col_imputed_val.append(("Light", "UNKNOWN"))
col_imputed_val.append(("Traffic Control", "UNKNOWN"))

##Municipality has 43923 missin values and is not critical. We will drop
col_imputed_val.append(("Municipality", "DROP"))

##Related Non-Motorist has 48K missin values but could be critical
##usually if pedestrian or cyclist is hit the driver is at fault
##We wont drop it. We cant assign mode also. We will imput with OTHER
col_imputed_val.append(("Related Non-Motorist", "OTHER"))

col_imputed_val.append(("Collision Type", "MODE"))
col_imputed_val.append(("Weather", "UNKNOWN"))

col_imputed_val.append(("Off-Road Description", "MODE"))
col_imputed_val.append(("Cross-Street Name", "MODE"))
col_imputed_val.append(("Cross-Street Type", "MODE"))
col_imputed_val.append(("Road Name", "MODE"))
col_imputed_val.append(("Route Type", "MODE"))
##Note we can combine road name and street name

##Local Case Number. Ideally this should be unique but looks like there
##are only 1430 values for 50K data. We create a new value for missing
col_imputed_val.append(("Local Case Number", "123123123"))

In [12]:
##Let us inspect remaining columns
for col in (set(train.columns)-(set(missing))) - set(["Fault"]):
    print("\nCOL NAME:", col, "-"*4, "TYPE OF 1st VALUE:", type(non_miss.values[0]),"-"*4)
    print("NUM OF UNIQUE VALUES", len(train[col].unique()), "SAMPLE:", train[col].unique()[:4])
    ##print("FIRST FOUR VALUES:", [train[col][i] for i in range(4)])


COL NAME: Vehicle ID ---- TYPE OF 1st VALUE: <class 'str'> ----
NUM OF UNIQUE VALUES 49160 SAMPLE: ['7C2FEC-727F4' '67644F-64EF8' '30F7A9-92C98' '0F0C58-8E98F']

COL NAME: Crash Date/Time ---- TYPE OF 1st VALUE: <class 'str'> ----
NUM OF UNIQUE VALUES 39573 SAMPLE: ['06/05/2017 04:27:00 PM' '02/22/2020 10:00:00 AM'
 '06/11/2017 08:21:00 AM' '05/10/2019 07:30:00 AM']

COL NAME: ACRS Report Type ---- TYPE OF 1st VALUE: <class 'str'> ----
NUM OF UNIQUE VALUES 3 SAMPLE: ['Injury Crash' 'Property Damage Crash' 'Fatal Crash']

COL NAME: Speed Limit ---- TYPE OF 1st VALUE: <class 'str'> ----
NUM OF UNIQUE VALUES 15 SAMPLE: [40 25 35  0]

COL NAME: Agency Name ---- TYPE OF 1st VALUE: <class 'str'> ----
NUM OF UNIQUE VALUES 10 SAMPLE: ['Montgomery County Police' 'Rockville Police Departme' 'MONTGOMERY'
 'GAITHERSBURG']

COL NAME: Report Number ---- TYPE OF 1st VALUE: <class 'str'> ----
NUM OF UNIQUE VALUES 3256 SAMPLE: ['MP060D' 'MP000X' 'MP070N' 'MP130Y']

COL NAME: Parked Vehicle ---- TYPE O

- We will round and combine latitude and longitude. 
- Crash Date/Time can be split into 2 features. We will remove the year and keep only the date and month. We will separate the hour and convert it into 24 hour format as a separate feature. 
- Vehicle Year - We will just convert it into age of the vehicle assuming 2021 is the current year
- Vehicle ID and Person ID: We will just convert each of these features into number of past "fault" and number of "non-fault". Not sure what they are? I would have expected Vehicle ID to represent the same vehicle, but we see diff vehicles against same ID. Let us check it

In [13]:
print("Person ID intersection count:", len(set(train["Person ID"].values).intersection(test["Person ID"])))
print("Vehicle ID intersection count:", len(set(train["Vehicle ID"].values).intersection(test["Vehicle ID"])))

Person ID intersection count: 0
Vehicle ID intersection count: 0


We should definitely drop these 2 cols

In [14]:
col_imputed_val.append(("Person ID", "DROP"))
col_imputed_val.append(("Vehicle ID", "DROP"))

In [15]:
train["Latitude"], train["Longitude"],  = np.round(train["Latitude"], 5), np.round(train["Longitude"], 5)
test["Latitude"], test["Longitude"],  = np.round(test["Latitude"], 5), np.round(test["Longitude"], 5)
train["Geos"] = [str(train["Latitude"][i]) + "_" + str(train["Longitude"][i]) for i in range(len(train))]
test["Geos"] = [str(test["Latitude"][i]) + "_" + str(test["Longitude"][i]) for i in range(len(test))]

col_imputed_val.append(("Latitude", "DROP"))
col_imputed_val.append(("Longitude", "DROP"))

Note: This comment is not uptodate as I am going on experimenting with the col priority

Based on EDA, I have grouped the features based on priority into p0-p5:

p0: ["Circumstance", "Driver Substance Abuse", "Non-Motorist Substance Abuse", "Surface Condition", "Related Non-Motorist", "Person_ID_Fault", "Person_ID_NoFault"]
p1: ["Direction_change", "Light", "Traffic Control", "Collision Type", "Vehicle_ID_Fault", "Vehicle_ID_NoFault"]
p2: ["Vehicle Movement",  "Impact", "Equipment Problems", "Weather"]
p3: ["Model_Make", "Vehicle Body Type", "Crash_date", "Crash_time", "Vehicle_year"]
p4: ["Vehicle Damage Extent", "Drivers License State", "Cross-Street Type", "Local Case Number", "Report Number", "Route Type"]
p5: TBD

Add, Drop & Modify Features:
- Direction_change: It will show if Vehicle Going Dir is different from Vehicle Continuing Dir which will help with predictions in my opinion
- We already combined Longitude and Latitude into Geos. Maybe certain terrains are difficult to drive leading to accidents which is not driver fault. Some other areas (e.g. pub parking lot) could be hot-bed of faulty incidents
- Crash Date/Time can be split into 2 features. We will remove the year and keep only the date and month. We will separate the hour and convert it into 24 hour format as a separate feature. We will also generate a new feature called day of the week based on calendar as Fri and Sat night time driving is a good feature to have
- We will also retain MM/YY and will treat these 366 values as categorical. There are days (e.g. new year time, long weekends etc) which could impact labels
- Vehicle Year - We will just convert it into age of the vehicle assuming 2021 is the current year

In [16]:
def process_date(date):
    month, day, year = (int(x) for x in date.split('/'))    
    ans = datetime.date(year, month, day)
    return ans.strftime("%A")

train["Crash_WeekDay"] = [process_date(train["Crash Date/Time"][i][:10]) for i in range(len(train))]
test["Crash_WeekDay"] = [process_date(test["Crash Date/Time"][i][:10]) for i in range(len(test))]

def addnew_cols(df):
    df["Direction_change"] = df["Vehicle Going Dir"] + "-" + df["Vehicle Continuing Dir"]
    df["Vehicle Year"] = 2021 - pd.to_numeric(df["Vehicle Year"], errors='coerce').fillna(0).astype(np.int64)
    ##Fix some impossible values in the data
    df["Vehicle Year"] = [df["Vehicle Year"][i] if 0<=df["Vehicle Year"][i]<=30 else 4 for i in range(len(df))]
    df["Crash_Month"] = [int(df["Crash Date/Time"][i][:2]) for i in range(len(df))]
    df["Crash_Month_Day"] = [df["Crash Date/Time"][i][:5] for i in range(len(df))]
    df["Crash_Hour"] = [df["Crash Date/Time"][i][11:13] + df["Crash Date/Time"][i][20:22] for i in range(len(df))]
    return df
    
train, test = [addnew_cols(df) for df in [train, test]]

In [17]:
col_imputed_val.append(("Direction_change", "MODE"))
col_imputed_val.append(("Vehicle Going Dir", "DROP"))
col_imputed_val.append(("Vehicle Continuing Dir", "DROP"))
col_imputed_val.append(("Crash Date/Time", "DROP"))

I added this piece of logic after seveal iterations. First issue is - Typically PM = 12+hour and AM = hour but 12:20 AM would be midnight and 12:20 PM would be afternoon. This needs to be corrected. Secondly we will something unconventional here. I would view 1 AM, 2 AM and even 3 AM as closer to 24. We will change these to 25, 26 and 27 respectively. So the hour col will vary from 4,5,6,7....23, 24, 25, 26, 27. This mitigates the ordinal effect

In [18]:
train["Crash_Hour"] = [int(train["Crash_Hour"][i][0:2]) \
                       if (train["Crash_Hour"][i][2:4] == "AM" and int(train["Crash_Hour"][i][0:2])!=12) or \
                           (train["Crash_Hour"][i][2:4] == "PM" and int(train["Crash_Hour"][i][0:2])==12) \
                       else int(train["Crash_Hour"][i][0:2])+12 \
                    for i in range(len(train))]

test["Crash_Hour"] = [int(test["Crash_Hour"][i][0:2]) \
                      if (test["Crash_Hour"][i][2:4] == "AM" and int(test["Crash_Hour"][i][0:2])!=12) or \
                          (test["Crash_Hour"][i][2:4] == "PM" and int(test["Crash_Hour"][i][0:2])==12) \
                      else int(test["Crash_Hour"][i][0:2])+12 \
                    for i in range(len(test))]

train["Crash_Hour"] =  [train["Crash_Hour"][i]+24 if (1<=train["Crash_Hour"][i]<=3) else train["Crash_Hour"][i] for i in range(len(train))]
test["Crash_Hour"] =  [test["Crash_Hour"][i]+24 if (1<=test["Crash_Hour"][i]<=3) else test["Crash_Hour"][i] for i in range(len(test))]

The Local case number if float and it is better to treat this as categorical as they may not be ordinal (I feel)

In [19]:
train["Local Case Number"] = train["Local Case Number"].astype('str')
test["Local Case Number"] = test["Local Case Number"].astype('str')

Lastly I wanted to add these three columns for Model, Geos and Circumstance. I am not sure if I will get time to test this

In [20]:
dictModel = {}
models = train["Vehicle Model"].unique()
for model in models:
    total_count = len(train[train["Vehicle Model"]==model])
    if total_count > 10:
        atfault_count = len(train[(train.Fault==1) & (train["Vehicle Model"]==model)])
        atfault = round(atfault_count/total_count, 1)
        dictModel[model] = atfault
    else:
        dictModel[model] = 0.5
        
train["Model_prob"] = train["Vehicle Model"].map(dictModel)
##Use train data for test. No LEAK
test["Model_prob"] = test["Vehicle Model"].map(dictModel)

test['Model_prob'] = test['Model_prob'].replace(np.nan, 0.5)
train['Model_prob'] = train['Model_prob'].replace(np.nan, 0.5)

We now process col_imputed_val for train and test. We ensire there is no leakage. MODEs if taken are taken from TRAIN data

In [21]:
for tup in col_imputed_val:
    col, action = tup[0], tup[1]
    if action == "DROP":
        train, test = train.drop(col, 1), test.drop(col, 1)
    elif action in ["UNKNOWN", "Unknown", "OTHER"]:
        train[col], test[col] = train[col].fillna(action), test[col].fillna(action)
    elif action == "123123123":
        train[col], test[col] = train[col].fillna("123123123"), test[col].fillna("123123123")
    elif action == "MODE":
        mode_fault = train[train.Fault == 1][col].mode().values[0]
        mode_nofault = train[train.Fault == 0][col].mode().values[0]
        mode = train[col].mode().values[0]
        ##Impute appropriate mode to train based on label
        train.loc[((train[col].isnull()) & (train.Fault == 1)), col] = mode_fault
        train.loc[((train[col].isnull()) & (train.Fault == 0)), col] = mode_nofault
        ##Impute overall TRAIN mode to test (no leak)
        test[col] = test[col].fillna(mode)

  after removing the cwd from sys.path.


In [22]:
print("Columns having null\n", train.columns[train.isnull().any()])
print("Columns having null\n", test.columns[test.isnull().any()])

Columns having null
 Index([], dtype='object')
Columns having null
 Index([], dtype='object')


Thankfully, there are no test cols having null which are non-null cols in train. Our data does not contain missing values now. 

In [23]:
for col in set(train.columns)-set(["Fault"]):
    print("\nCOL NAME:", col, "NUM OF UNIQUE VALUES:", len(train[col].unique()), "SAMPLE:", train[col].unique()[:3])


COL NAME: Circumstance NUM OF UNIQUE VALUES: 140 SAMPLE: ['N/A, WET'
 'BACKUP DUE TO REGULAR CONGESTION, SLEET, HAIL, FREEZ. RAIN, WET'
 'UNKNOWN']

COL NAME: Surface Condition NUM OF UNIQUE VALUES: 11 SAMPLE: ['WET' 'DRY' 'UNKNOWN']

COL NAME: Cross-Street Name NUM OF UNIQUE VALUES: 4525 SAMPLE: ['THURSTON RD' 'FLACK ST' 'STRUC #15082']

COL NAME: Crash_WeekDay NUM OF UNIQUE VALUES: 7 SAMPLE: ['Monday' 'Saturday' 'Sunday']

COL NAME: Vehicle Movement NUM OF UNIQUE VALUES: 22 SAMPLE: ['MOVING CONSTANT SPEED' 'ACCELERATING' 'PARKING']

COL NAME: Route Type NUM OF UNIQUE VALUES: 10 SAMPLE: ['Maryland (State)' 'County' 'US (State)']

COL NAME: Agency Name NUM OF UNIQUE VALUES: 10 SAMPLE: ['Montgomery County Police' 'Rockville Police Departme' 'MONTGOMERY']

COL NAME: Crash_Hour NUM OF UNIQUE VALUES: 24 SAMPLE: [16 10  8]

COL NAME: Driver Substance Abuse NUM OF UNIQUE VALUES: 11 SAMPLE: ['NONE DETECTED' 'UNKNOWN' 'ALCOHOL CONTRIBUTED']

COL NAME: Vehicle Year NUM OF UNIQUE VALUES: 31 SAM

The bulk of the work is done. Hopefully rest should be fast. We will segrgate the cols into p0-p5. We will also create 2 sets. Because boosters like LightGBM, catboost etc can directly work with categorical vars, we can feed more cols to these type of algos without running into memory problems

In [24]:
p0 = ["Circumstance", "Driver Substance Abuse", "Non-Motorist Substance Abuse", "Surface Condition", "Related Non-Motorist", "Model_prob"]
p1 = ["Direction_change", "Light", "Traffic Control", "Collision Type", "Crash_WeekDay"]
p2 = ["Vehicle Movement", "Vehicle First Impact Location", "Vehicle Second Impact Location", "Equipment Problems", "Weather", "Speed Limit",\
     "Crash_Month", "Crash_Hour"]
p3 = ["Vehicle Body Type", "Vehicle Year", "Vehicle Damage Extent", "Injury Severity", "Driverless Vehicle", "Parked Vehicle", "Location", \
     "Crash_Month_Day"]
p4 = ["Drivers License State", "Cross-Street Type", "Local Case Number", "Report Number", "Route Type", \
      "Agency Name", "ACRS Report Type", "Road Name","Cross-Street Name", "Cross-Street Name", "Off-Road Description", "Geos"]


p0_boost = ["Circumstance", "Driver Substance Abuse", "Non-Motorist Substance Abuse", "Surface Condition", "Related Non-Motorist", "Geos"]
p1_boost = ["Direction_change", "Light", "Traffic Control", "Collision Type", "Crash_WeekDay", "Model_prob"]
p2_boost = ["Vehicle Movement", "Vehicle First Impact Location", "Vehicle Second Impact Location", "Equipment Problems", "Weather", "Speed Limit",\
     "Crash_Month", "Crash_Hour"]
p3_boost = ["Vehicle Body Type", "Vehicle Year", "Vehicle Damage Extent", "Injury Severity", "Driverless Vehicle", "Parked Vehicle"]
p4_boost = ["Drivers License State", "Cross-Street Type", "Route Type", "Crash_Month_Day", \
      "Agency Name", "ACRS Report Type", "Road Name", "Vehicle Model", "Report Number", "Local Case Number", "Location"]
p5_boost = ["Cross-Street Name", "Off-Road Description", "Vehicle Make"]

In [25]:
col_rf = p0+p1+p2
col_boost = p0_boost + p1_boost + p2_boost + p3_boost + p4_boost

Some sanity checks if we missed anything or got name incorrect. Note that we ignore p5_boost for now

In [26]:
for col in col_rf:
    print("Recheck col names", col) if col not in train.columns else None
print("Done checking RF cols")

for col in col_boost:
    print("Recheck col names", col) if col not in train.columns else None
print("Done checking Boost cols")

print("Anything left?",len(set(train.columns) - set(col_boost) - set(["Fault"]) - set(p5_boost)))

Done checking RF cols
Done checking Boost cols
Anything left? 0


In [27]:
##We take a copy of train and test. For RF, we have to OHE
train_rf, test_rf = train.copy(), test.copy()
train_rf_y = train_rf["Fault"]

##Drop the unwanted cols for rf
dropcols = set(train.columns) - set(col_rf) - set(["Fault"])
for col in dropcols:
    train_rf, test_rf = train_rf.drop(col, 1) , test_rf.drop(col, 1)
train_rf = train_rf.drop("Fault", 1) 
test_rf = test_rf.drop("Id", 1) 

##Do likewise for boost algos.
train_boost_X, test_boost_X = train.copy(), test.copy()
train_boost_y = train_boost_X["Fault"]
dropcols = set(train.columns) - set(col_boost) - set(["Fault"])
for col in dropcols:
    train_boost_X, test_boost_X = train_boost_X.drop(col, 1) , test_boost_X.drop(col, 1)
train_boost_X = train_boost_X.drop("Fault", 1) 
test_boost_X = test_boost_X.drop("Id", 1) 

  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [28]:
print('start', process.memory_info().rss)

start 625713152


In [29]:
##OHE the categorical for RF only
categorical=[]
for col in train_rf.columns:
    if type(train[col][0]) == str:
        categorical.append(col)        
        
OHE_train_rf = pd.get_dummies(train_rf, columns = categorical)
OHE_test_rf = pd.get_dummies(test_rf, columns = categorical)
train_rf_X, test_rf_X = OHE_train_rf.align(OHE_test_rf,join='left', fill_value=0, axis=1)
del OHE_train_rf, OHE_test_rf
gc.collect()

0

So now we have train_rf_X, test_rf_X, train_rf_y AND train_boost_X, test_boost_X, train_boost_y as well as the original train, test copies just for reference

We also define a neural net in case we want to use it (depending on time)

In [30]:
class Net(nn.Module):
    def __init__(self, col_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(col_size, 1000)
        self.fc2 = nn.Linear(1000, 2)
        
    def forward(self, x):
        x = self.fc1(x)
        x = F.dropout(x, p=0.1)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.sigmoid(x)
        return x
    
batch_size = 50
num_epochs = 15
learning_rate = 0.01
criterion = nn.CrossEntropyLoss()

We will now split the train data into train, validation and have a stratified 5 CV split.

In [31]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=config.SEED)
cv_score_rf,cv_score_boost, fold = [],[], 1
preds_ensemble_rf, preds_ensemble_boost = np.zeros(len(test_rf_X)), np.zeros(len(test_boost_X))

for train_index, valid_index in kfold.split(train_rf_X, train_rf_y):
    ##We will use the above split for RF as well as boost. Index is same
    seed_everything(config.SEED)
    print("TRAIN:", len(train_index), "VALID:", len(valid_index))
    print('{} of KFold {}'.format(fold,kfold.n_splits))
    X_train_rf,X_val_rf = train_rf_X.loc[train_index],train_rf_X.loc[valid_index]
    y_train_rf,y_val_rf = train_rf_y.loc[train_index],train_rf_y.loc[valid_index]
    X_train_boost,X_val_boost = train_boost_X.loc[train_index],train_boost_X.loc[valid_index]
    y_train_boost,y_val_boost = train_boost_y.loc[train_index],train_boost_y.loc[valid_index]
    
    cat_features = np.where(X_train_boost.dtypes != np.int64)[0]
    cat_features = list(set(cat_features) - set([36]))
    X_train_rf,X_val_rf,y_train_rf,y_val_rf = np.array(X_train_rf), np.array(X_val_rf), np.array(y_train_rf), np.array(y_val_rf)
    X_train_boost,X_val_boost,y_train_boost,y_val_boost = np.array(X_train_boost), np.array(X_val_boost), np.array(y_train_boost), np.array(y_val_boost)
    
    print("Fitting...")
    """
    n_estimators = [int(x) for x in np.linspace(start=200, stop=1000, num=5)]
    max_depth = [int(x) for x in np.linspace(20, 100, num=10)]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4] 
    bootstrap = [True, False]
    max_features = ['auto', 'sqrt']
    random_grid = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split,
                  'max_features': max_features, 'bootstrap': bootstrap,'min_samples_leaf':min_samples_leaf }
    rf_cv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=10, cv=3,random_state=config.SEED, n_jobs=-1)
    rf_cv.fit(X_train, y_train)
    rf_best_params = rf_cv.best_params_
    print(f"Best paramters: {rf_best_params})")
    rf = RandomForestClassifier(**rf_best_params)
    """
    rf = RandomForestClassifier(random_state=config.SEED, n_estimators = 100, min_samples_split = 4)
    boost = CatBoostClassifier(random_seed=config.SEED)
    rf.fit(X_train_rf, y_train_rf)
    boost.fit(X_train_boost, y_train_boost, cat_features=cat_features, eval_set=(X_val_boost, y_val_boost), verbose=False)

    
    print("Predicting...")
    """
    net = Net(X_train)
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        print('Epoch {}'.format(epoch+1)) if (epoch+1)%5 == 0 else None
        batch_no = len(X_train) // batch_size

        for i in range(batch_no):
            start = i * batch_size
            end = start + batch_size
            x_batch = Variable(torch.FloatTensor(X_train[start:end]))
            y_batch = Variable(torch.LongTensor(y_train[start:end]))

            optimizer.zero_grad()
            pred_batch = net(x_batch)
            loss = criterion(pred_batch, y_batch)
            loss.backward()
            optimizer.step()
        
    val_ = Variable(torch.FloatTensor(X_val), requires_grad=True)
    with torch.no_grad():
        preds_ = net(val_)
    values, labels = torch.max(preds_, 1)
    num_right = np.sum(labels.data.numpy() == y_val)
    print('Accuracy {:.4f}'.format(num_right / len(y_val)))
    """
    preds_rf = rf.predict(X_val_rf)
    preds_test_rf = rf.predict(test_rf_X)
    preds_boost = np.round(boost.predict(X_val_boost))
    preds_test_boost = np.round(boost.predict(test_boost_X))
    ##preds_test_boost = boost.predict_proba(test_boost_X)[:,-1]
    
    print("Scoring...")
    acc_rf = (preds_rf == y_val_rf).sum()/len(y_val_rf)
    print("Accuracy RF:", round(acc_rf, 4))
    cv_score_rf.append(acc_rf)    
    
    acc_boost = (preds_boost == y_val_boost).sum()/len(y_val_boost)
    print("Accuracy Boost:", round(acc_boost, 4), "\n")
    cv_score_boost.append(acc_boost)    
    
    preds_ensemble_rf += preds_test_rf
    preds_ensemble_boost += preds_test_boost
    fold+=1 
    
print("\nFinal score RF:", round(np.mean(cv_score_rf), 4))
print("\nFinal score Boost:", round(np.mean(cv_score_boost), 4))

TRAIN: 46341 VALID: 5149
1 of KFold 10
Fitting...
Predicting...
Scoring...
Accuracy RF: 0.8479
Accuracy Boost: 0.8842 

TRAIN: 46341 VALID: 5149
2 of KFold 10
Fitting...
Predicting...
Scoring...
Accuracy RF: 0.8582
Accuracy Boost: 0.8977 

TRAIN: 46341 VALID: 5149
3 of KFold 10
Fitting...
Predicting...
Scoring...
Accuracy RF: 0.8435
Accuracy Boost: 0.8835 

TRAIN: 46341 VALID: 5149
4 of KFold 10
Fitting...
Predicting...
Scoring...
Accuracy RF: 0.8491
Accuracy Boost: 0.8881 

TRAIN: 46341 VALID: 5149
5 of KFold 10
Fitting...
Predicting...
Scoring...
Accuracy RF: 0.8501
Accuracy Boost: 0.8936 

TRAIN: 46341 VALID: 5149
6 of KFold 10
Fitting...
Predicting...
Scoring...
Accuracy RF: 0.8507
Accuracy Boost: 0.8934 

TRAIN: 46341 VALID: 5149
7 of KFold 10
Fitting...
Predicting...
Scoring...
Accuracy RF: 0.8421
Accuracy Boost: 0.8813 

TRAIN: 46341 VALID: 5149
8 of KFold 10
Fitting...
Predicting...
Scoring...
Accuracy RF: 0.8549
Accuracy Boost: 0.8912 

TRAIN: 46341 VALID: 5149
9 of KFold 10
F

In [32]:
final_preds_rf = preds_ensemble_rf/10
final_preds_boost = preds_ensemble_boost/10

final_preds_rf[final_preds_rf>=0.5] = 1
final_preds_boost[final_preds_boost>=0.5] = 1

final_preds_rf[final_preds_rf<0.5] = 0
final_preds_boost[final_preds_boost<0.5] = 0

final_preds_rf = final_preds_rf.astype(int)
final_preds_boost = final_preds_boost.astype(int)

In [33]:
##Unfortunately it looks like ensembling with RF lowers score
##final_preds = np.round((final_preds_rf + final_preds_boost)/2).astype(int)
final_preds = final_preds_boost

In [34]:
out=pd.DataFrame(dict(Id = test["Id"].values, Fault = final_preds))
out.to_csv("out3.csv", index=False)