In [1]:
import sqlalchemy
from sqlalchemy import create_engine, text
import pandas as pd
database_name = 'crashes'
connection_string = f"postgresql://postgres:postgres@localhost:5433/{database_name}"
engine = create_engine(connection_string)

In [2]:
crash_data = pd.read_csv('../data/clean/east_nash_crashes.csv')
crash_data.describe()

Unnamed: 0.1,Unnamed: 0,accident_number,number_of_motor_vehicles,number_of_injuries,number_of_fatalities,zip,rpa,lat,long
count,20815.0,20815.0,20815.0,20815.0,20815.0,20815.0,20815.0,20730.0,20730.0
mean,89497.296181,20195480000.0,1.765554,0.428729,0.0,37155.958924,3261.732501,36.231816,-86.730998
std,51697.216564,19663060.0,0.775387,0.797325,0.0,46.708315,4837.613279,0.042433,0.024833
min,28.0,20170000000.0,0.0,0.0,0.0,37115.0,1009.0,36.1044,-86.8861
25%,45224.5,20180400000.0,2.0,0.0,0.0,37115.0,1425.0,36.1884,-86.7504
50%,88552.0,20190650000.0,2.0,0.0,0.0,37115.0,1713.0,36.2462,-86.7354
75%,134224.5,20210400000.0,2.0,1.0,0.0,37206.0,1851.0,36.2638,-86.7124
max,180099.0,20240100000.0,8.0,9.0,0.0,37216.0,20143.0,36.3096,-86.6476


In [3]:
crash_data.isna().sum(axis = 0)

Unnamed: 0                     0
accident_number                0
date_and_time                  0
number_of_motor_vehicles       0
number_of_injuries             0
number_of_fatalities           0
hit_and_run                    0
collision_type_description     0
weather_description            0
illumination_description       0
harmfuldescriptions            0
street_address                 0
city                           0
state                          0
zip                            0
rpa                            0
precinct                       0
lat                           85
long                          85
mapped_location               85
property_damage                0
dtype: int64

Location data is important for the first step of identifying "hot spots", so any nulls in these columns will need to be addressed. The records with nulls for location fields (`lat`, `long`, & `mapped_location`) do seem to be from actual events and not errors, but given that at the time of this analysis they represent 0.04% of the data, I don't believe removeing them will negatively impact the overall analysis.

In [4]:
crash_data = crash_data.dropna()

In [5]:
crash_data.isna().sum(axis = 0)

Unnamed: 0                    0
accident_number               0
date_and_time                 0
number_of_motor_vehicles      0
number_of_injuries            0
number_of_fatalities          0
hit_and_run                   0
collision_type_description    0
weather_description           0
illumination_description      0
harmfuldescriptions           0
street_address                0
city                          0
state                         0
zip                           0
rpa                           0
precinct                      0
lat                           0
long                          0
mapped_location               0
property_damage               0
dtype: int64

In [6]:
crash_data.describe()

Unnamed: 0.1,Unnamed: 0,accident_number,number_of_motor_vehicles,number_of_injuries,number_of_fatalities,zip,rpa,lat,long
count,20730.0,20730.0,20730.0,20730.0,20730.0,20730.0,20730.0,20730.0,20730.0
mean,89767.714858,20195370000.0,1.768017,0.429329,0.0,37155.832369,3269.298263,36.231816,-86.730998
std,51620.641608,19612790.0,0.773911,0.798322,0.0,46.691722,4846.04756,0.042433,0.024833
min,28.0,20170000000.0,0.0,0.0,0.0,37115.0,1009.0,36.1044,-86.8861
25%,45666.0,20180390000.0,2.0,0.0,0.0,37115.0,1431.0,36.1884,-86.7504
50%,88982.5,20190640000.0,2.0,0.0,0.0,37115.0,1713.0,36.2462,-86.7354
75%,134408.75,20210390000.0,2.0,1.0,0.0,37206.0,1851.0,36.2638,-86.7124
max,180099.0,20240100000.0,8.0,9.0,0.0,37216.0,20143.0,36.3096,-86.6476


No fatalities are reported in the entire dataset, which is surprising. However, these reports are only as accurate as the officers recording them, and they may be busy attending to those involved and quickly filling these out as soon as they arrive or or after they've left. That said, I will ignore the column for this analysis but leave it in the dataset, so it can be used in the future if numbers start showing up.

As this analysis is only looking at one particular roadway corridor, I can't figure out how to specify a radius from the street but I can at least filter out any crashes that occured on interstates.

In [7]:
crash_data_no_hwys = crash_data.drop(crash_data[(crash_data['street_address'].str.contains('I*24')) | (crash_data['street_address'].str.contains('I*40')) | (crash_data['street_address'].str.contains('I*40')) == True].index)
crash_data_no_hwys

Unnamed: 0.1,Unnamed: 0,accident_number,date_and_time,number_of_motor_vehicles,number_of_injuries,number_of_fatalities,hit_and_run,collision_type_description,weather_description,illumination_description,...,street_address,city,state,zip,rpa,precinct,lat,long,mapped_location,property_damage
0,28,20240102658,2024-02-14 07:04:00,3.0,0.0,0.0,False,FRONT TO REAR,CLEAR,DAYLIGHT,...,ELLINGTON PKWYS & ELLINGTON PKWYN,NASHVILLE,TN,37206,1995,EAST,36.1758,-86.7666,"{'type': 'Point', 'coordinates': [-86.7666, 36...",True
1,39,20240101201,2024-02-13 18:38:00,2.0,2.0,0.0,False,FRONT TO REAR,CLEAR,DARK - LIGHTED,...,I65 S EXT RAMP & I 65,MADISON,TN,37115,20044,MADISO,36.2481,-86.7430,"{'type': 'Point', 'coordinates': [-86.743, 36....",True
2,46,20240100905,2024-02-13 16:00:00,2.0,0.0,0.0,False,FRONT TO REAR,CLEAR,DAYLIGHT,...,GALLATIN PKES & E PALESTINE AV,MADISON,TN,37115,1507,MADISO,36.2491,-86.7196,"{'type': 'Point', 'coordinates': [-86.7196, 36...",True
3,69,20240099545,2024-02-13 06:20:00,2.0,0.0,0.0,False,ANGLE,CLEAR,DAYLIGHT,...,GALLATIN PKES & MOVING CENTER CT,MADISON,TN,37115,1507,MADISO,36.2509,-86.7184,"{'type': 'Point', 'coordinates': [-86.7184, 36...",True
4,71,20240099483,2024-02-13 05:55:00,2.0,1.0,0.0,False,ANGLE,CLEAR,DUSK,...,E TRINITY LN & KEELING AV,NASHVILLE,TN,37216,1851,EAST,36.2044,-86.7463,"{'type': 'Point', 'coordinates': [-86.7463, 36...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20810,180034,20170001270,2017-01-01 14:59:00,2.0,0.0,0.0,False,FRONT TO REAR,RAIN,DAYLIGHT,...,CLEVELAND ST & N 9TH ST,NASHVILLE,TN,37206,1925,EAST,36.1847,-86.7583,"{'type': 'Point', 'coordinates': [-86.7583, 36...",True
20811,180043,20170001226,2017-01-01 14:33:00,2.0,2.0,0.0,False,FRONT TO REAR,RAIN,DAYLIGHT,...,ANDERSON LN & MYATT DR,MADISON,TN,37115,1713,MADISO,36.2721,-86.6890,"{'type': 'Point', 'coordinates': [-86.689, 36....",True
20812,180068,20170000705,2017-01-01 07:59:00,1.0,2.0,0.0,False,NOT COLLISION W/MOTOR VEHICLE-TRANSPORT,CLEAR,DAYLIGHT,...,I65 S EXT RAMP & I 65,MADISON,TN,37115,20044,MADISO,36.2481,-86.7430,"{'type': 'Point', 'coordinates': [-86.743, 36....",True
20813,180082,20170000450,2017-01-01 03:47:00,1.0,0.0,0.0,False,NOT COLLISION W/MOTOR VEHICLE-TRANSPORT,CLEAR,DARK - LIGHTED,...,RIVERWOOD DR & COOPER LN,NASHVILLE,TN,37216,1449,EAST,36.2095,-86.7135,"{'type': 'Point', 'coordinates': [-86.7135, 36...",True


In [8]:
crash_data_no_hwys.to_csv('../data/clean/all_crashes_no_hwys.csv')

Now is a good time to pause and look at the crashes on a map.<br><br>
(This is better done in a separate notebook, so this will serve as a stopping point for this one. The current table will be exported to a .csv file and used in the mapping notebook. Refer to `mapping.ipynb` for the overall map(s) and next steps will follow below)

In [9]:
nash_311 = pd.read_csv('../data/clean/nash_311.csv')
nash_311.describe()

Unnamed: 0.1,Unnamed: 0,case_number,incident_council_district,incident_zip_code,latitude,longitude,parent_case,preferred_language
count,143985.0,143985.0,143917.0,143985.0,143365.0,143365.0,2027.0,0.0
mean,768348.1,697192.6,6.591501,37178.930423,36.215272,-86.726553,678849.0,
std,424241.2,449070.6,1.759451,44.471905,0.121284,0.256722,415882.7,
min,0.0,143.0,2.0,37115.0,0.0,-104.979529,143.0,
25%,380323.0,273480.0,6.0,37115.0,36.18328,-86.743581,287071.0,
50%,816851.0,703021.0,6.0,37206.0,36.20242,-86.729594,678424.0,
75%,1129872.0,1088637.0,7.0,37206.0,36.25202,-86.713715,1030204.0,
max,1433298.0,1486956.0,33.0,37216.0,44.763165,0.0,1485830.0,


In [10]:
nash_311.isna().sum(axis = 0)

Unnamed: 0                        0
case_number                       0
status                            0
case_request                      0
case_subrequest                  45
additional_subrequest          7315
date_time_opened                  0
date_time_closed               2179
case_origin                       0
state_issue                       0
closed_when_created               0
incident_address                266
incident_city                   540
incident_council_district        68
incident_zip_code                 0
latitude                        620
longitude                       620
mapped_location                 620
contact_type                 139429
parent_case                  141958
preferred_language           143985
dtype: int64

**For Reference**

Syntax for SQL queries in Python:
```
query_name = '''
(
SQL syntax
)
'''

with engine.connect() as connection:
    counts = pd.read_sql(text(query_name), con = connection)
```