<a href="https://colab.research.google.com/github/ipeirotis/dealing_with_data/blob/master/01-Pandas/A3-NYPD_Vehicle_Collisions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to Pandas

## Setup and preliminaries

In [None]:
!pip3 install -U -q PyMySQL sqlalchemy sql_magic xlrd

# Render our plots inline
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Make the graphs a bit bigger
matplotlib.style.use(['seaborn-talk', 'seaborn-ticks', 'seaborn-whitegrid'])

In [None]:
# We install the geospatial libraries to be used for Task 10 (if desired)

# System-level packages first (via apt), then the Python packages (via pip)
!apt-get -qq install -y  libgeos-dev libproj-dev proj-data proj-bin libgdal-dev libspatialindex-dev
!pip install -q -U shapely rtree pygeos
!pip install -q geopandas descartes

import geopandas as gpd

# Dataset from NYC Open Data: https://data.cityofnewyork.us/City-Government/Neighborhood-Tabulation-Areas/cpf4-rkhq
# df_nyc is drawn as the base layer of the last Task 10 plot
df_nyc = gpd.GeoDataFrame.from_file('https://data.cityofnewyork.us/api/geospatial/cpf4-rkhq?method=export&format=Shapefile')


## Exercise: NYPD Vehicle Collisions

* We interacted with the NYC Restaurant Inspection Data. Now, let's download another dataset, and do some analysis. We will focus on the [NYPD Vehicle Collisions](https://data.cityofnewyork.us/Public-Safety/NYPD-Motor-Vehicle-Collisions/h9gi-nx95/data) data set.


### Task 1

Load the dataset for all the collisions after Jan 1st, 2020. We will need to load two tables, with the appropriate date restrictions:

* `collisions`
* `vehicles_involved`


In [None]:
import os
from sqlalchemy import create_engine
from sqlalchemy import text

from urllib.parse import quote_plus

# Connection settings for the MySQL server that hosts the collisions data.
# The defaults are the values this notebook has always used; each one can be
# overridden with an environment variable, so that different credentials
# never need to be written into the notebook itself.
db_settings = {
    'host': os.environ.get('COLLISIONS_DB_HOST', 'db.ipeirotis.org'),
    'user': os.environ.get('COLLISIONS_DB_USER', 'student'),
    'password': os.environ.get('COLLISIONS_DB_PASSWORD', 'dwdstudent2015'),
    'db': os.environ.get('COLLISIONS_DB_NAME', 'collisions'),
}

# User and password are URL-quoted so that special characters in an
# overridden value cannot break the connection URL. The character set is
# selected by the "charset" query parameter of the URL.
conn_string = 'mysql+pymysql://{user}:{password}@{host}/{db}?charset=utf8mb4'.format(
    host = db_settings['host'],
    user = quote_plus(db_settings['user']),
    password = quote_plus(db_settings['password']),
    db = db_settings['db'])

# The engine is shared by every query cell below; each of those cells opens
# (and closes) its own connection with "with engine.connect() as connection"
engine  = create_engine(conn_string)

In [None]:
# This query returns back the collisions table
# sql = '''
    #YOUR CODE HERE
# '''
# with engine.connect() as connection:
#	c = pd.read_sql(text(sql), con=connection)

In [None]:
# This query returns back the vehicles_involved table
# sql = '''
    #YOUR CODE HERE
# '''
# with engine.connect() as connection:
#	v = pd.read_sql(text(sql), con=connection)

#### Solution

In [None]:
# This query returns back the collisions table, restricted to the rows
# whose DATE_TIME passes the 2020-01-01 filter. The result is stored in
# the dataframe `c`, which the following cells optimize and analyze.
sql = '''
	SELECT *
  FROM collisions
  WHERE DATE_TIME > '2020-01-01'
'''
# The "with" block closes the connection as soon as the dataframe is loaded
with engine.connect() as connection:
	c = pd.read_sql(text(sql), con=connection)

In [None]:
# This query returns back the vehicles_involved table. The table is filtered
# through the UNIQUE_KEY values of the collisions that pass the same
# 2020-01-01 date filter used for the collisions dataframe. The result is
# stored in the dataframe `v`.
sql = '''
	SELECT *
  FROM vehicles_involved
  WHERE UNIQUE_KEY IN (
  	SELECT UNIQUE_KEY
    FROM collisions
    WHERE DATE_TIME > '2020-01-01'
  )
'''
# The "with" block closes the connection as soon as the dataframe is loaded
with engine.connect() as connection:
	v = pd.read_sql(text(sql), con=connection)

In [None]:
# Per-column memory footprint of the collisions dataframe *before* the
# optimizations of the next cell (deep=True also inspects object columns)
c.memory_usage(deep=True)

In [None]:
# Memory optimization, should not have any effect except for speed.
# Every step below can be re-run safely on an already-optimized dataframe.

# We do not need the highest level of precision for the coordinates
for _col in ['LATITUDE', 'LONGITUDE']:
    c[_col] = pd.to_numeric(c[_col], downcast='float')

# Convert from strings to categorical variables, saves significant amount
# of data for columns with just a few values
for _col in ['BOROUGH', 'NEIGHBORHOOD', 'ZIPCODE']:
    c[_col] = pd.Categorical(c[_col])

# Ask pandas for the smallest unsigned integer type that can hold each of
# the injured/killed counters.
# NOTE(review): the downcast presumably only takes effect when a column has
# no missing or negative values -- confirm with c.dtypes after running.
_count_columns = [
    'PERSONS_INJURED', 'PERSONS_KILLED',
    'PEDESTRIANS_INJURED', 'PEDESTRIANS_KILLED',
    'CYCLISTS_INJURED', 'CYCLISTS_KILLED',
    'MOTORISTS_INJURED', 'MOTORISTS_KILLED',
]
for _col in _count_columns:
    c[_col] = pd.to_numeric(c[_col], downcast='unsigned')

# Remove columns that we do not need. errors='ignore' keeps this cell
# re-runnable: on a second execution the columns are already gone, and
# without it the drop would raise a KeyError.
c = c.drop(
    columns = ['ON_STREET_NAME', 'CROSS_STREET_NAME', 'OFF_STREET_NAME', 'REPORTED_ZIPCODE', 'REPORTED_BOROUGH'],
    errors = 'ignore'
)

In [None]:
# Per-column memory footprint of the collisions dataframe *after* the
# optimizations; compare with the numbers reported before them
c.memory_usage(deep=True)

In [None]:
# Memory optimization for the vehicles dataframe; it should not have any
# effect except for speed.

# These string columns are turned into categorical variables, which saves a
# significant amount of memory when a column has just a few distinct values
for _col in ('VEHICLE', 'CAUSE', 'VEHICLE_TYPE'):
    v[_col] = pd.Categorical(v[_col])


In [None]:
# Check the column types of the collisions dataframe after the conversions
c.dtypes

In [None]:
# Check the column types of the vehicles dataframe after the conversions
v.dtypes

### Task 2

Find out the most common contributing factors to the collisions, for all accidents after Jan-1-2020. You can either use the dataframe that we loaded above (`vehicles_involved`)  or issue an SQL query and fetch a new dataframe.

#### Solution

In [None]:
# Task 2: Find out the most common contributing factors to the collisions.
# Every row of `v` contributes one count, so this counts vehicles, not accidents.
v['CAUSE'].value_counts() #.plot(kind='barh')

In [None]:
# Task 2: If we want to remove the "Unspecified", we select the elements starting
# from position 1 (i.e., the second element in the list, the first one is 0).
# .iloc makes it explicit that we slice by position and not by index label.
# NOTE: this assumes that "Unspecified" is the most frequent value -- check
# the output of value_counts() before relying on it.
v['CAUSE'].value_counts().iloc[1:10].plot(kind='barh')

In [None]:
# Notice the difference if we use "COUNT(DISTINCT UNIQUE_KEY)"
# instead of COUNT(*). The former counts accidents, the latter vehicles
# (vehicles_involved can have several rows for the same UNIQUE_KEY).
factors_sql = '''
	SELECT CAUSE, COUNT(*) AS cnt
  FROM vehicles_involved
  WHERE UNIQUE_KEY IN (
  	SELECT UNIQUE_KEY
    FROM collisions
    WHERE DATE_TIME > '2020-01-01'
  )
  GROUP BY CAUSE
  ORDER BY cnt DESC
'''

with engine.connect() as connection:
	factors_df = pd.read_sql(text(factors_sql), con=connection)

# The query already sorts by cnt, so the first 10 rows are the top-10 causes
factors_df.head(10)

In [None]:
# Keep the 10 most frequent causes and use CAUSE as the index, so that
# the cause names become the bar labels of the plot
top_factors = (
    factors_df
    .set_index('CAUSE')
    .head(10)
    #.tail(9) # uncomment if you want to eliminate "UNSPECIFIED" (the top-1)
)

# Horizontal bars, drawn in ascending order of cnt
top_factors.sort_values('cnt').plot(kind='barh', figsize=(10,4))

### Task 3

Break down the number of collisions by borough.





#### Solution

In [None]:
# Task 3: Break down the number of collisions by borough.
# Uses the collisions dataframe `c` that is already loaded in memory.
c['BOROUGH'].value_counts().plot(kind='barh', figsize=(10,5))

In [None]:
# Notice that you can remove the date time restriction and you
# will get the results back equally fast. If you try to load the
# whole collisions table in a dataframe, and then do the value_counts
# or a pivot table, it may take quite a while.

# The counting happens in the database; only one row per borough comes back
boro_sql = '''
	SELECT BOROUGH, COUNT(*) AS cnt
  FROM collisions
  WHERE DATE_TIME > '2020-01-01'
  GROUP BY BOROUGH
  ORDER BY cnt DESC
'''

with engine.connect() as connection:
	boro_df = pd.read_sql(text(boro_sql), con=connection)

# BOROUGH becomes the index, so that the borough names label the bars
(
    boro_df
    .set_index('BOROUGH')
    .plot(kind='barh', figsize=(10,5))
)


### Task 4

Find out how many collisions had 0 persons injured, 1 person injured, 2 persons injured, and so on. Use the `value_counts()` approach. You may also find the `.plot(logy=True)` option useful when you create the plot, to make the y-axis logarithmic.


#### Solution

In [None]:
# Step-by-step style: first compute the frequency of each injury count,
# then sort by the injury count itself (the index) instead of by the
# frequency, which is the default order of value_counts
injury_counts = c['PERSONS_INJURED'].value_counts().sort_index()

# A line plot because the x-axis is numeric, with a marker on every data
# point and a logarithmic y-axis
plot = injury_counts.plot(kind='line', marker='o', logy=True)
plot.set(
    xlabel="Number of injuries",
    ylabel="Number of collisions",
    title="Analysis of number of injuries per collision",
)
pass

In [None]:
# Same analysis, with the counting done by the database. The date filter is
# commented out inside the query, so the counts cover the whole table.
injuries_sql = '''
	SELECT PERSONS_INJURED, COUNT(*) AS cnt
  FROM collisions
  -- WHERE DATE_TIME > '2020-01-01'
  GROUP BY PERSONS_INJURED
  ORDER BY cnt DESC
'''

with engine.connect() as connection:
	injuries_df = pd.read_sql(text(injuries_sql), con=connection)

# "Chain" style of writing data manipulation operations
plot = (
    injuries_df # one row per injury count, with its frequency in "cnt"
    .set_index('PERSONS_INJURED') # make the injury count the x-axis of the plot
    .sort_index() # sort by the injury count instead of the frequency,
                  # which is the order returned by the query
    .plot( # and plot the results
        kind='line', # we use a line plot because the x-axis is numeric/continuous
        marker='o',  # we use a marker to mark where we have data points
        logy=True # make the y-axis logarithmic
    )
)
plot.set_xlabel("Number of injuries")
plot.set_ylabel("Number of collisions")
plot.set_title("Analysis of number of injuries per collision")
pass

### Task 5

(a) Compute the average number of injuries and deaths per accident, broken down by borough. Use the `pivot_table` functionality, putting `BOROUGH` as the index. You can answer this query by generating two separate tables, or you can create a single table by using the fact that you can pass a list of attributes/columns to the `values` parameter of the pivot table.

(b) Repeat the exercise above, but break down the average number of deaths and injuries using the cause for the accident. (Do not worry that each accident may have multiple causes.) You will need to **join** the tables `collisions` and `vehicles_involved`; you can do the join either in SQL or in pandas, using the `pd.merge` command. Use the `sort_values` command to sort the results, putting on top the contributing factors that generate the highest number of deaths. Limit to the 10-deadliest causes.

#### Solution

In [None]:
# Task 5(a): average number of injured and killed persons per collision,
# one row per borough, computed from the collisions dataframe in memory
pd.pivot_table(
    data = c,
    index = 'BOROUGH',
    aggfunc = 'mean',
    values = ['PERSONS_INJURED', 'PERSONS_KILLED']
)

In [None]:
# By keeping only the minimum attributes that we need, we speed up
# the execution, as we do not bring back data that we will discard anyway.
# The averages are computed by the database, one row per borough.
sql = '''
  SELECT C.BOROUGH
        , AVG(C.PERSONS_INJURED) AS PERSONS_INJURED
        , AVG(C.PERSONS_KILLED) AS PERSONS_KILLED
  FROM collisions C
  WHERE DATE_TIME > '2020-01-01'
  GROUP BY C.BOROUGH
'''
with engine.connect() as connection:
	result = pd.read_sql(text(sql), con=connection)
# Display with BOROUGH as the index, mirroring the pivot_table layout
result.set_index('BOROUGH')

In [None]:
# By keeping only the minimum attributes that we need, we speed up
# the execution, as we do not bring back data that we will discard anyway.
# Task 5(b): the join produces one row per vehicle, so a collision is counted
# once for every vehicle row (and therefore every cause) attached to it.
sql = '''
  SELECT V.CAUSE
        , AVG(C.PERSONS_INJURED) AS PERSONS_INJURED
        , AVG(C.PERSONS_KILLED) AS PERSONS_KILLED
  FROM collisions C JOIN vehicles_involved V ON C.UNIQUE_KEY = V.UNIQUE_KEY
  WHERE DATE_TIME > '2020-01-01'
  GROUP BY V.CAUSE
'''
with engine.connect() as connection:
	result = pd.read_sql(text(sql), con=connection)

In [None]:
# Task 5(b): the 10 causes with the highest average number of deaths.
# The task asks for the 10 deadliest causes, so we keep 10 rows.
(
    result
    .set_index('CAUSE')
    .sort_values('PERSONS_KILLED', ascending=False)
    .head(10)
)

### Task 6

Break down the number of accidents by borough and cause. Use the `pivot_table` function of Pandas, making the values of "borough" to be  columns and cause to be rows.


#### Solution

In [None]:
# By keeping only the minimum attributes that we need, we speed up
# the execution, as we do not bring back data that we will discard anyway.
# COUNT(DISTINCT C.UNIQUE_KEY) counts each collision once per
# (borough, cause) pair, even when several of its vehicles share the cause.
sql = '''
  SELECT C.BOROUGH,  V.CAUSE, COUNT(DISTINCT C.UNIQUE_KEY) AS cnt
  FROM collisions C JOIN vehicles_involved V ON C.UNIQUE_KEY = V.UNIQUE_KEY
  WHERE DATE_TIME > '2020-01-01'
  GROUP BY C.BOROUGH,  V.CAUSE
'''
with engine.connect() as connection:
	result = pd.read_sql(text(sql), con=connection)

In [None]:
pivot = pd.pivot_table(
    data = result, # the (borough, cause, cnt) rows returned by the query
    index = 'CAUSE',
    columns = 'BOROUGH',
    values = 'cnt',
    aggfunc = 'sum'
)

# Create an extra column showing the total count across boroughs (=columns)
pivot["Total"] = pivot.sum(axis="columns")

# Sort the dataframe by descending order of the values in the column "Total"
pivot = pivot.sort_values("Total", ascending=False)

# Show the 10 causes with the highest totals
pivot.head(10)

### Task 7

Find the dates with the most accidents. Can you figure out what happened on these days?


#### Solution

In [None]:
# Number of collisions per calendar day, most frequent days first.
# There is no date filter here, so the whole collisions table is used.
sql = '''
  SELECT DATE(DATE_TIME) AS accident_date, COUNT(*) AS cnt
  FROM collisions
  GROUP BY DATE(DATE_TIME)
  ORDER BY cnt DESC
'''

with engine.connect() as connection:
	date_df = pd.read_sql(text(sql), con=connection)

In [None]:
# The query sorts by cnt in descending order, so the first rows are the
# dates with the most accidents; showing the top 10 answers the task
# without dumping the whole dataframe
date_df.head(10)

In [None]:
# One row per date, with the number of accidents of that date in "cnt"
daily_totals = pd.pivot_table(data=date_df, index='accident_date', values='cnt')

# (a .resample('1D').sum() step could be inserted before the sorting)
# Dates with the most accidents first
daily_totals.sort_values('cnt', ascending=False)

### Task 8

Plot the number of accidents per day. Try to eliminate the effects of seasonality by resampling and calculating values on a weekly or monthly basis (Hint: Ensure that your date column is in the right datatype and that it is properly sorted, before attempting a `resample`)


#### Solution

In [None]:
# Number of accidents per day, plotted as-is. The resampling steps are left
# commented out here; the next cell first converts the date column to a
# datetime type and then resamples.
(
  pd.pivot_table(
      data = date_df,
      index = 'accident_date',
      values = 'cnt',
  )
  # .resample('1W') # take periods of 1 week
  # .sum() # sum the number of accidents per period
  .plot(figsize=(15,3)) # plot the result
)



In [None]:
# resample needs a datetime index, so we first convert 'accident_date'
# from 'object' to datetime and store it in a new 'date' column
date_df['date'] = pd.to_datetime(date_df['accident_date'])

# One row per date, indexed by the datetime version of the date
daily_counts = pd.pivot_table(data=date_df, index='date', values='cnt')

# Sum the number of accidents over periods of 1 week
weekly_counts = daily_counts.resample('1W').sum()

# Plot the result
weekly_counts.plot(figsize=(15,3))

### Task 9

We want to analyze the timing patterns of accidents that lead to death or injury.

We will do the analysis by creating histograms showing the frequency of deadly vs non-deadly accidents throughout the day. By comparing the two histograms we will be able to understand if time of day is correlated with deadly accidents or not.

Steps to follow:
* Create an `HOUR` column that captures the hour of the day that the accident happened.
* Create a boolean column `DEATH` that is true when someone was killed in the accident (i.e., `PERSONS_KILLED > 0`).
* Create a boolean column `INJURY` that is true when someone was injured in the accident (i.e., `PERSONS_INJURED > 0`).
* Query the dataframe to get back the deadly accidents and create a histogram of deadly accidents over time. Do the same for non-deadly accidents.
* To allow a more direct visual comparison of the two histograms, we want to merge them in one plot. Since the number of accidents without deaths is *much* higher, we want the histograms to be normalized (i.e., `density=True`).
* It is also a good idea to make the histograms partially transparent, to allow for easier comparison of the two histograms.


#### Solution

In [None]:
# One row per collision with the hour of the day and two flags computed by
# the database: INJURY (PERSONS_INJURED>0) and DEATH (PERSONS_KILLED>0).
# There is no date filter here, so the whole collisions table is loaded.
sql = '''
  SELECT UNIQUE_KEY, DATE_TIME
        , HOUR(DATE_TIME) AS HOUR
        , PERSONS_INJURED>0 AS INJURY
        , PERSONS_KILLED>0 AS DEATH
  FROM collisions
'''

with engine.connect() as connection:
	df = pd.read_sql(text(sql), con=connection)


In [None]:
# Define the two subsets: collisions with at least one death (deadly)
# and collisions without any death (noharm)
deadly = df.query(" DEATH == True ")
noharm = df.query(" DEATH == False ")

In [None]:
# Histogram of the hour of the day for the deadly collisions
deadly['HOUR'].hist(bins=24)

In [None]:
# Histogram of the hour of the day for the collisions without deaths
noharm['HOUR'].hist(bins=24)

In [None]:
# Settings shared by both histograms: one bar per hour, a bigger figure,
# normalized counts (so the two groups are comparable despite their very
# different sizes), and semi-transparent bars
hist_kwargs = dict(bins=24, figsize=(15,5), density=True, alpha=0.5)

# Deadly accidents in red, accidents without deaths in green
deadly['HOUR'].hist(color='red', **hist_kwargs)
noharm['HOUR'].hist(color='green', **hist_kwargs)

In [None]:
# Alternatively: the mean of the DEATH flag per hour, i.e., the fraction
# of the collisions of each hour in which someone was killed

pd.pivot_table(
    data = df,
    index = 'HOUR',
    values = 'DEATH',
    aggfunc = 'mean',
).plot()

In [None]:
# The two subsets for the injury analysis: collisions with at least one
# injury and collisions without any injury
injuries = df.query(" INJURY == True ")
no_injuries = df.query(" INJURY == False ")

In [None]:
# Both histograms share the same settings: one bar per hour, normalized
# counts and semi-transparent bars, so that they can be compared directly
hist_kwargs = dict(bins=24, figsize=(15,5), density=True, alpha=0.5)

# Accidents with injuries in red, accidents without injuries in green
injuries['HOUR'].hist(color='red', **hist_kwargs)
no_injuries['HOUR'].hist(color='green', **hist_kwargs)

In [None]:
# Alternatively: the mean of the INJURY flag per hour, i.e., the fraction
# of the collisions of each hour in which someone was injured

pd.pivot_table(
    data = df,
    index = 'HOUR',
    values = 'INJURY',
    aggfunc = 'mean',
).plot()

##### And let's do the same analysis over time

In [None]:
# Same comparison as before, but over calendar time instead of the hour
# of the day: 48 bins, normalized counts and semi-transparent bars
hist_kwargs = dict(bins=48, figsize=(20,10), density=True, alpha=0.5)

# Accidents with injuries in red, accidents without injuries in green
injuries['DATE_TIME'].hist(color='red', **hist_kwargs)
no_injuries['DATE_TIME'].hist(color='green', **hist_kwargs)

In [None]:
# Weekly fraction of collisions with at least one injury.
# We resample the individual collisions directly, so that every collision
# has the same weight in its week. Averaging per timestamp first and then
# averaging those averages would give a timestamp with a single collision
# the same weight as a timestamp with many collisions.
(
    df
    .dropna(subset=['DATE_TIME']) # rows without a timestamp cannot be assigned to a week
    .set_index('DATE_TIME')[['INJURY']]
    .sort_index() # resample works on a time-ordered index
    .resample('1W') # take periods of 1 week
    .mean() # fraction of the collisions of each week with an injury
    .plot()
)

In [None]:
# Distribution of deadly vs. non-deadly accidents over calendar time:
# 48 bins, normalized counts and semi-transparent bars
hist_kwargs = dict(bins=48, figsize=(20,10), density=True, alpha=0.5)

# Deadly accidents in red, accidents without deaths in green
deadly['DATE_TIME'].hist(color='red', **hist_kwargs)
noharm['DATE_TIME'].hist(color='green', **hist_kwargs)

In [None]:
# Monthly fraction of collisions with at least one death.
# We resample the individual collisions directly, so that every collision
# has the same weight in its month. Averaging per timestamp first and then
# averaging those averages would give a timestamp with a single collision
# the same weight as a timestamp with many collisions.
(
    df
    .dropna(subset=['DATE_TIME']) # rows without a timestamp cannot be assigned to a month
    .set_index('DATE_TIME')[['DEATH']]
    .sort_index() # resample works on a time-ordered index
    .resample('1M') # periods of 1 month (pandas >= 2.2 prefers the alias 'ME')
    .mean() # fraction of the collisions of each month with a death
    .plot()
)

In [None]:
import seaborn as sns

In [None]:
# Smoothed density of the hour of the day, one curve per value of DEATH.
# common_norm=False normalizes each curve on its own, so the two groups
# can be compared despite their different sizes.
sns.kdeplot(data = df, x ='HOUR', hue='DEATH', common_norm=False, bw_adjust=2, cut=0)

In [None]:
# Smoothed density of the hour of the day, one curve per value of INJURY.
# common_norm=False normalizes each curve on its own, so the two groups
# can be compared despite their different sizes.
sns.kdeplot(data = df, x ='HOUR', hue='INJURY', common_norm=False, bw_adjust=2, cut=0)

### Task 10

Create a plot that shows the locations of the cyclist deaths. Filter first for accidents where there was a cyclist fatality, and then use a scatterplot on longitude and latitude. In the next step, create a 2-d kernel density plot to show the same information.

#### Solution

In [None]:
# Coordinates of the collisions with at least one cyclist killed.
# There is no date filter here, so the whole collisions table is searched.
sql = "SELECT LONGITUDE, LATITUDE FROM collisions WHERE CYCLISTS_KILLED > 0"

with engine.connect() as connection:
	cyclist_dead = pd.read_sql(text(sql), con=connection)

In [None]:
# Scatterplot of the locations: longitude on the x-axis, latitude on the y-axis
(
    cyclist_dead
    .plot(
        kind='scatter',
        x='LONGITUDE',
        y='LATITUDE',
        # s=1,
        figsize=(10,10)
    )
)

In [None]:
# First layer: the scatterplot of the locations
scatter = (
    cyclist_dead
    .plot(
        kind='scatter',
        x='LONGITUDE',
        y='LATITUDE',
        figsize=(10,10)
    )
)

# Second layer: the 2-d kernel density estimate, drawn on the same axes.
# We use `fill` and `levels`, the current names of the options that
# seaborn used to call `shade` and `n_levels` (both now deprecated).
sns.kdeplot(
    data = cyclist_dead,
    x='LONGITUDE',
    y='LATITUDE',
    fill=True, # color between the levels instead of drawing only the contours
    gridsize=100, # the resolution of the 2d plot
    cmap='rainbow', # color scheme
    alpha=0.75, # make the 2d density plot a bit transparent
    levels=50, # calculate 50 levels for the 2d density plot
    ax=scatter # plot it on top of the scatter plot
)

In [None]:
# Base layer: the shapes stored in df_nyc, drawn as white areas with
# black borders
base = df_nyc.plot(
    linewidth=0.5,
    color='White',
    edgecolor='Black',
    figsize=(10, 10),
    alpha=0.75
)

# Second layer: the locations of the cyclist deaths
scatter = (
    cyclist_dead
    .plot(
        kind='scatter',
        x='LONGITUDE',
        y='LATITUDE',
        ax = base # plot it on top of the NYC boundaries
    )
)

# Third layer: the 2-d kernel density estimate.
# We use `fill` and `levels`, the current names of the options that
# seaborn used to call `shade` and `n_levels` (both now deprecated).
sns.kdeplot(
    data = cyclist_dead,
    x='LONGITUDE',
    y='LATITUDE',
    fill=True, # Whether to color between the levels (True) or just keep the contours
    gridsize=100, # The resolution of the 2d plot
    cmap='rainbow', # Color scheme
    alpha=0.5, # make the 2d density plot a bit transparent
    levels=20, # calculate 20 levels for the 2d density plot
    ax=scatter # plot it on top of the scatter plot
)