# AXA coding challenge
Data:
1. Citibike: https://s3.amazonaws.com/tripdata/index.html
2. NYPD:  https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95/about_data

In [55]:
# Install packages (only once)
#!pip install selenium webdriver-manager

# Import modules
import os # basic
import zipfile
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

from selenium import webdriver # for downloading files automatically
from selenium.webdriver.chrome.service import Service

current_dir = os.getcwd() # current dir
print('Current directory: ' + current_dir)

pd.options.display.float_format = '{:.2f}'.format # set pd output to 2 decimals

Current directory: C:\Users\Hanna\sciebo\AXA_coding-challenge


In [None]:
# Functions

# to download files from an url
def download_files(url, save_path):
    response = requests.get(url, stream=True)
    with open(save_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
    print(f"Downloaded {save_path}")

## Download Citibike data automatically from url

In [7]:
url = "https://s3.amazonaws.com/tripdata/index.html" # url to data files
driver_path = 'C:/Drivers/chromedriver-win64_128/chromedriver.exe' # Chrome driver for web interaction, needed by selenium - must match Chrome version

# Download files
service = Service(driver_path) # initialize the Chrome driver
driver = webdriver.Chrome(service=service)
driver.get(url) # navigate to website
time.sleep(5)  # give the page time to load the dynamic content
html = driver.page_source # get the page source after JavaScript has executed
soup = BeautifulSoup(html, 'html.parser') # parse the HTML

# find all .zip links
file_links = []
for link in soup.find_all('a', href=True):
    if link['href'].endswith('.zip'): # on this website, files are .zip format
        file_links.append(link['href'])
print(file_links[:2]) # check if the file paths are retrieved correctly by printing a few

driver.quit() # close the browser

if not os.path.exists(current_dir+'/downloads'): # directory to save the downloaded files
    os.makedirs(current_dir+'/downloads')

for file_link in file_links: # loop through all the zip links and download them
    filename = os.path.join(current_dir+'/downloads', os.path.basename(file_link))
    
    if not file_link.startswith('http'): # if the link is relative, make it an absolute URL by appending the base URL
        file_link = url + file_link

    download_files(file_link, filename) # download the file

<Response [200]>
[]


## Unzip & reorganize files

In [None]:
# - alternatively

# Unzip files  
# zip_dir = current_dir+'/downloads' # directory containing the zip files
# extract_dir = current_dir+'/data' # directory where extracted files will be saved

# for filename in os.listdir(zip_dir): # loop through all files in the directory
#     if filename.endswith('.zip') :
#         zip_file_path = os.path.join(zip_dir, filename)
#         new_file_path = extract_dir + '/' + filename[:-4] + '.csv' # remove '.zip' and subfolders from the target path name
#         os.makedirs(new_file_path, exist_ok=True)  # create the directory if it doesn't exist

#         with zipfile.ZipFile(zip_file_path, 'r') as zip_ref: # extract the zip file
#             for member in zip_ref.namelist():
#                 if '_MACOSX' not in member: # skip any file or folder inside "_MACOSX" (for MAC computers, not needed)
#                     zip_ref.extract(member, new_file_path) # extract to the specified directory

#             print(f'Extracted: {member} to {new_file_path}')


# # Move  files from subfolders in subfolders to 1 folder

# import shutil

# source_dir = current_dir + '/data'
# destination_dir = current_dir + '/data_test'
# os.makedirs(destination_dir, exist_ok=True)

# for root, dirs, files in os.walk(source_dir):
#     for file in files:
#         if file.endswith('.csv') and not file.startswith('.'): # select .csv files, skip files starting with '.' 
#             if '_MACOSX' in root:
#                 continue  # skip this directory and its contents, for MAC

#             source_file = os.path.join(root, file)
#             destination_file = os.path.join(destination_dir, file)
            
#             shutil.move(source_file, destination_file) # or shutil.copy
#             print(f"Moved: {source_file} -> {destination_file}")


In [95]:
# Unzip files & reorganize
zip_dir = current_dir + '/downloads'  # directory containing the zip files
extract_dir = current_dir + '/data/bike-tripdata'  # directory where extracted files will be saved

os.makedirs(extract_dir, exist_ok=True)  # create the directory if it doesn't exist

for filename in os.listdir(zip_dir):  # loop through all files in the directory
    if filename.endswith('.zip'):
        zip_file_path = os.path.join(zip_dir, filename)

        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:  # extract the zip file
            for member in zip_ref.namelist():
                # skip any file or folder inside "_MACOSX" (for MAC computers, not needed), and files that do not end with .csv
                if '_MACOSX' not in member and member.endswith('.csv'):  
                    # get only the base name of the file (ignore folder structure in zip)
                    base_member = os.path.basename(member)
                    target_path = os.path.join(extract_dir, base_member)
                    
                    with zip_ref.open(member) as source, open(target_path, "wb") as target:
                        target.write(source.read())  # write the extracted content to the single folder

                    print(f'Extracted {base_member}')
    print(f'... from {filename} to {extract_dir}')

Extracted 201309-citibike-tripdata.csv
Extracted 201311-citibike-tripdata.csv
Extracted 201307-citibike-tripdata.csv
Extracted 201308-citibike-tripdata.csv
Extracted 201306-citibike-tripdata.csv
Extracted 201310-citibike-tripdata.csv
Extracted 201312-citibike-tripdata.csv
Extracted 201312-citibike-tripdata_1.csv
Extracted 201311-citibike-tripdata_1.csv
Extracted 201307-citibike-tripdata_1.csv
Extracted 201310-citibike-tripdata_2.csv
Extracted 201310-citibike-tripdata_1.csv
Extracted 201309-citibike-tripdata_2.csv
Extracted 201309-citibike-tripdata_1.csv
Extracted 201308-citibike-tripdata_1.csv
Extracted 201308-citibike-tripdata_2.csv
Extracted 201306-citibike-tripdata_1.csv
from 2013-citibike-tripdata.zip to C:\Users\Hanna\sciebo\AXA_coding-challenge/data/bike-tripdata
Extracted 201404-citibike-tripdata_1.csv
Extracted 201412-citibike-tripdata_1.csv
Extracted 201411-citibike-tripdata_1.csv
Extracted 201407-citibike-tripdata_1.csv
Extracted 201410-citibike-tripdata_1.csv
Extracted 20140

Extracted 202407-citibike-tripdata_1.csv
Extracted 202407-citibike-tripdata_2.csv
Extracted 202407-citibike-tripdata_3.csv
Extracted 202407-citibike-tripdata_4.csv
Extracted 202407-citibike-tripdata_5.csv
from 202407-citibike-tripdata.zip to C:\Users\Hanna\sciebo\AXA_coding-challenge/data/bike-tripdata
Extracted 202408-citibike-tripdata_3.csv
Extracted 202408-citibike-tripdata_2.csv
Extracted 202408-citibike-tripdata_1.csv
Extracted 202408-citibike-tripdata_5.csv
Extracted 202408-citibike-tripdata_4.csv
from 202408-citibike-tripdata.zip to C:\Users\Hanna\sciebo\AXA_coding-challenge/data/bike-tripdata
Extracted JC-201509-citibike-tripdata.csv
from JC-201509-citibike-tripdata.csv.zip to C:\Users\Hanna\sciebo\AXA_coding-challenge/data/bike-tripdata
Extracted JC-201510-citibike-tripdata.csv
from JC-201510-citibike-tripdata.csv.zip to C:\Users\Hanna\sciebo\AXA_coding-challenge/data/bike-tripdata
Extracted JC-201511-citibike-tripdata.csv
from JC-201511-citibike-tripdata.csv.zip to C:\Users\H

Extracted JC-202007-citibike-tripdata.csv
from JC-202007-citibike-tripdata.csv.zip to C:\Users\Hanna\sciebo\AXA_coding-challenge/data/bike-tripdata
Extracted JC-202008-citibike-tripdata.csv
from JC-202008-citibike-tripdata.csv.zip to C:\Users\Hanna\sciebo\AXA_coding-challenge/data/bike-tripdata
Extracted JC-202009-citibike-tripdata.csv
from JC-202009-citibike-tripdata.csv.zip to C:\Users\Hanna\sciebo\AXA_coding-challenge/data/bike-tripdata
Extracted JC-202010-citibike-tripdata.csv
from JC-202010-citibike-tripdata.csv.zip to C:\Users\Hanna\sciebo\AXA_coding-challenge/data/bike-tripdata
Extracted JC-202011-citibike-tripdata.csv
from JC-202011-citibike-tripdata.csv.zip to C:\Users\Hanna\sciebo\AXA_coding-challenge/data/bike-tripdata
Extracted JC-202012-citibike-tripdata.csv
from JC-202012-citibike-tripdata.csv.zip to C:\Users\Hanna\sciebo\AXA_coding-challenge/data/bike-tripdata
Extracted JC-202101-citibike-tripdata.csv
from JC-202101-citibike-tripdata.csv.zip to C:\Users\Hanna\sciebo\AXA_

## Load files & concatenate (if possible)

In [None]:
# concatenate all files or choose period

## Load collision data

In [63]:
file_path = current_dir + '/data/Motor_Vehicle_Collisions_-_Crashes_20240922.csv'
df = pd.read_csv(file_path)

print(df.shape)
df.head(5)

  df = pd.read_csv(file_path)


(2120518, 29)


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,...,Unspecified,,,,4455765,Sedan,Sedan,,,
1,03/26/2022,11:45,,,,,,QUEENSBORO BRIDGE UPPER,,,...,,,,,4513547,Sedan,,,,
2,06/29/2022,6:55,,,,,,THROGS NECK BRIDGE,,,...,Unspecified,,,,4541903,Sedan,Pick-up Truck,,,
3,09/11/2021,9:35,BROOKLYN,11208.0,40.67,-73.87,"(40.667202, -73.8665)",,,1211 LORING AVENUE,...,,,,,4456314,Sedan,,,,
4,12/14/2021,8:13,BROOKLYN,11233.0,40.68,-73.92,"(40.683304, -73.917274)",SARATOGA AVENUE,DECATUR STREET,,...,,,,,4486609,,,,,


In [71]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_') # change column names

print(df.dtypes)

crash_date                        object
crash_time                        object
borough                           object
zip_code                          object
latitude                         float64
longitude                        float64
location                          object
on_street_name                    object
cross_street_name                 object
off_street_name                   object
number_of_persons_injured        float64
number_of_persons_killed         float64
number_of_pedestrians_injured      int64
number_of_pedestrians_killed       int64
number_of_cyclist_injured          int64
number_of_cyclist_killed           int64
number_of_motorist_injured         int64
number_of_motorist_killed          int64
contributing_factor_vehicle_1     object
contributing_factor_vehicle_2     object
contributing_factor_vehicle_3     object
contributing_factor_vehicle_4     object
contributing_factor_vehicle_5     object
collision_id                       int64
vehicle_type_cod

## Data cleaning
### Which columns contain nans?
### Change data format

In [73]:
# Check which column has missing values (nans) and how many
summary_table = pd.DataFrame({
    'Nan_count': df.isna().sum(),
    'Total': df.shape[0]
})

print(summary_table)

                               Nan_count    Total
crash_date                             0  2120518
crash_time                             0  2120518
borough                           659498  2120518
zip_code                          659758  2120518
latitude                          247820  2120518
longitude                         247820  2120518
location                          247820  2120518
on_street_name                    453598  2120518
cross_street_name                 807416  2120518
off_street_name                  1759293  2120518
number_of_persons_injured             18  2120518
number_of_persons_killed              31  2120518
number_of_pedestrians_injured          0  2120518
number_of_pedestrians_killed           0  2120518
number_of_cyclist_injured              0  2120518
number_of_cyclist_killed               0  2120518
number_of_motorist_injured             0  2120518
number_of_motorist_killed              0  2120518
contributing_factor_vehicle_1       7107  2120518


In [96]:
# Check types of data in columns
#df['zip_code'].unique()
#df['number_of_persons_injured'].unique()
df['number_of_persons_killed'].unique()

array([ 0.,  1.,  2.,  3.,  4., nan,  8.,  5.])

In [None]:
convert_dict = {
    'borough': str,
}
df = df.astype(convert_dict)
    
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time']) # create datetime column
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['datetime'].dt.year # create year column for easy data selection

#df[col] = pd.to_numeric(df[col], errors='coerce')  # errors='coerce' will convert invalid parsing to NaN

In [None]:
df.describe()