In [1]:
import pandas as pd
import numpy as np
import json

import seaborn as sn

import requests

from census import Census

from matplotlib import pyplot as plt

import datetime as dt

from config import (gkey,api_key)

from scipy.stats import linregress,pearsonr

from us import states

import gmaps

# Access maps with unique API key
gmaps.configure(api_key=gkey)

# Census API client configured with year=2018 (the keys are read from the local config module)
c = Census(api_key, year=2018)


### Loading data into dataframe

In [2]:
# Load the raw accident records from the source CSV into a DataFrame
accidents_csv_path = "Accident data/US_Accidents_Dec19.csv"
accidents_source = pd.read_csv(accidents_csv_path)


MemoryError: Unable to allocate 477. MiB for an array with shape (21, 2974335) and data type object

### Data cleaning

In [None]:
#Checking size of data (DataFrame.size is the total number of cells: rows x columns)

accidents_source.size

In [None]:
# Number of rows and columns in the raw data
accidents_source.shape

In [None]:
#Printing the list of columns in the dataset
accidents_source.columns

In [None]:
#Running to see top 5 records in the dataset

accidents_source.head()

In [None]:
# Finding total count of values in each column

accidents_source.agg(["count"])

In [None]:
# Finding number of null values within each column

accidents_source.isna().sum()

In [None]:
# Number of records contributed by each value of the "Source" column
accidents_source["Source"].value_counts()


<b>The columns below have most of their values as null:</b>

1. End_Lat: 2246264 out of 2974335
2. End_Lng: 2246264 out of 2974335
3. Number: 1917605 out of 2974335
4. Wind_Chill(F): 1852623 out of 2974335
5. Precipitation(in): 1998358 out of 2974335

*The columns <b>End_Lat, End_Lng, Number</b> do not have much significance for our analysis and most of the data is null, the columns will be removed

*Since <b>Wind_Chill(F) and Precipitation(in)</b> are weather factors and might be used in the analysis of the relationship between weather and accidents, they are kept despite the large share of null values.

*Column <b>"ID"</b> is an identity column for each row and is of no significance in the analysis hence will be removed.

*Column <b>"Source"</b> has three values: MapQuest (2204098), Bing (728071), MapQuest-Bing(42166) with MapQuest being the source for most of the data. Since our focus is not on the source of the data the column will be removed.

*Column <b>"TMC"</b> has been removed as it is not needed for our analysis.
*Column <b>"Description"</b> is a brief description of each accident. Since the other columns already provide the factors around accidents and are sufficient for our analysis, the column is being removed.

*Columns related to the location of accidents:

1. Side: Not relevant for our study
2. City: Needed for analysis
3. County: Needed for analysis
4. State: Needed for analysis
5. Zipcode: The analysis is based on city, state and county, so it is not used in the analysis (it is still kept in the exported clean data set)
6. Country: The data is only for one country (US), hence removed
7. Timezone: The time of each accident is provided and analyzed in the local timezone, hence removed
8. Airport_Code: Not required in analysis

<b>Amenity, Bump, Crossing, Give_Way, Junction, No_Exit, Railway, Roundabout, Station, Stop, Traffic_Calming, Traffic_Signal, Turning_Loop factors</b> are not being analyzed as part of our study and hence removed from data set

<b>Civil_Twilight, Nautical_Twilight and Astronomical_Twilight </b> are being removed as analysis on time point would be done using Start_Time of accidents

The data set covers February 2016 to December 2019, but data before December 2016 will be removed, as the focus is to analyze data from Winter 2016 to Fall 2019.

In [None]:
#accidents_source[["End Date","End Time"]]=accidents_source["End_Time"].str.split(expand=True)

In [None]:
# Columns carried over from the source data into the further analysis
target_columns = [
    # severity and timing
    'Severity', 'Start_Time', 'End_Time',
    # location
    'Start_Lat', 'Start_Lng', 'City', 'County', 'State', 'Zipcode',
    # weather
    'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
    'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)',
    'Weather_Condition',
    # day / night flag
    'Sunrise_Sunset',
]


In [None]:
# Keep only accidents starting on or after 1 December 2016 and only the target
# columns, so the working frame is smaller and focused on 2017, 2018 and 2019
from_dec_2016 = accidents_source['Start_Time'] >= '2016-12-01'
accident_target = accidents_source.loc[from_dec_2016, target_columns].reset_index(drop=True)


In [None]:
accident_target.head()

In [None]:
#Checking datatype for columns in the resultant data-set
accident_target.dtypes

In [None]:
# Start_Time drives the month / year / season / hour analysis below,
# so parse it into a datetime column first
start_time_format = '%Y-%m-%d %H:%M:%S.%f'
accident_target["Start_Time"] = pd.to_datetime(
    accident_target["Start_Time"],
    format=start_time_format,
)

In [None]:
# Derive the calendar year of each accident for the per-year analysis
accident_start = accident_target["Start_Time"]
accident_target["Start_Year"] = accident_start.dt.year

In [None]:
# Derive a "Season" column from the month of Start_Time:
#   Dec, Jan, Feb -> Winter
#   Mar, Apr, May -> Spring
#   Jun, Jul, Aug -> Summer
#   Sep, Oct, Nov -> Fall
# Rows matching none of the month groups are labelled 'Unknown'.
start_month = accident_target["Start_Time"].dt.month

season_conditions = [
    start_month.isin([12, 1, 2]),
    start_month.isin([3, 4, 5]),
    start_month.isin([6, 7, 8]),
    start_month.isin([9, 10, 11]),
]
season_names = ['Winter', 'Spring', 'Summer', 'Fall']

accident_target["Season"] = np.select(season_conditions, season_names, default='Unknown')


In [None]:
accident_target.head()

In [None]:
# Persist the reduced data set so individual contributors can work from a
# smaller, cleaner file instead of the raw source
clean_csv_path = "TargetData/us_accident_clean.csv"
accident_target.to_csv(clean_csv_path, index=False)

# Analysis of Location and Population on Number of Accidents

## Analysis of states based on Number of Accidents

In [None]:
# Accident counts per state and year (2017 onwards), largest counts first
accidents_from_2017 = accident_target[accident_target["Start_Time"] >= '2017-01-01']

accident_state_year = (
    accidents_from_2017
    .groupby(["State", "Start_Year"])["Start_Time"]
    .count()
    .to_frame()
    .sort_values("Start_Time", ascending=False)
    .reset_index()
)
accident_state_year.columns = ["State", "Year", "Accident Count"]

accident_state_year[["State", "Year", "Accident Count"]].head()


In [None]:
#Plotting of Distribution of Accidents in different states

# accident_state_year holds one row per (State, Year). Passing it straight to
# plt.bar draws several bars on the same x position, so only the tallest year
# is visible. Sum the yearly rows first so each bar is the state's total.
state_totals = (
    accident_state_year.groupby("State")["Accident Count"]
    .sum()
    .sort_values(ascending=False)
)

plt.figure(figsize=(20,6))
plt.bar(state_totals.index, state_totals.values)

plt.xlabel("State",fontsize=16)
plt.ylabel("Number of Accidents",fontsize=16)
plt.title("Distribution of Accidents in different states in 2017-19",fontsize=20)

# Apply the layout after all labels exist, then save
plt.tight_layout()
plt.savefig("Images/Fig1. AccidentsByState.png",bbox_inches='tight')

In [None]:
# Combined "<year>-<month>" label for the start of each accident
accident_start = accident_target["Start_Time"]
start_year_text = accident_start.dt.year.astype(str)
start_month_text = accident_start.dt.month.astype(str)
accident_target["month_year"] = start_year_text + '-' + start_month_text

In [None]:
#accident_target.groupby("month_year")["Start_Time"].count().nlargest(3).index

In [None]:
# The three month_year labels with the most accidents; used to limit the map below
accidents_per_month_year = accident_target.groupby("month_year")["Start_Time"].count()
max_accident_3mon = list(accidents_per_month_year.nlargest(3).index)

In [None]:
# Accidents that happened in the three busiest months, for plotting on gmaps
in_top_3_months = accident_target["month_year"].isin(max_accident_3mon)
accident_target_max3mon = accident_target.loc[in_top_3_months]

In [None]:
# Latitude / longitude pairs of the accidents to show on the heatmap
coordinate_columns = ["Start_Lat", "Start_Lng"]
locations = accident_target_max3mon[coordinate_columns]


In [None]:
# Heatmap of accident locations on Google Maps (gmaps)
heatmap_options = {
    "dissipating": False,
    "max_intensity": 5,
    "point_radius": 0.1,
}
heatmap_layer = gmaps.heatmap_layer(locations, **heatmap_options)

fig = gmaps.figure()
fig.add_layer(heatmap_layer)

# Display figure
fig



##### The distribution of accidents by state shows that California is the state with the most accidents, with almost double the number of accidents of Texas, followed by Florida, with New York in sixth position.

### 10 Cities with largest number of accidents in 2017-2019

In [None]:
# Create a dataframe of the top 10 cities with the maximum number of accidents.
# Group by City AND State: city names are not unique across states, so grouping
# by City alone would add up unrelated cities that happen to share a name.
top_city_counts = (
    accident_target.groupby(["City", "State"])["Start_Time"]
    .count()
    .nlargest(10)
    .reset_index()
)

# Keep the two-column layout (City, Accident Count) the plotting cell expects;
# the state abbreviation is folded into the City label, e.g. "Los Angeles, CA".
cities_accidents = pd.DataFrame({
    "City": top_city_counts["City"] + ", " + top_city_counts["State"],
    "Accident Count": top_city_counts["Start_Time"],
})

In [None]:
# Bar chart: the ten cities with the most accidents
fig, ax = plt.subplots(figsize=(20,6))
ax.bar(cities_accidents["City"], cities_accidents["Accident Count"])
fig.tight_layout()

ax.set_xlabel("City", fontsize=16)
ax.set_ylabel("Number of Accidents", fontsize=16)
ax.set_title("Top 10 cities in terms of Accidents in 2017-19", fontsize=20)

fig.savefig("Images/Fig2.TopCities_Accident.png", bbox_inches='tight')

##### Though California is the state with the most accidents (almost triple the next state with the most accidents), only one city in California (Los Angeles) is among the 10 cities with the most accidents.

Let's see the distribution of accidents in different cities of California and Texas

In [None]:
# Accident counts per city for California and Texas
in_ca_or_tx = accident_target["State"].isin(["CA", "TX"])
accidents_cities_CA_TX = (
    accident_target[in_ca_or_tx]
    .groupby(["State", "City"])["Start_Time"]
    .count()
    .reset_index()
    .rename(columns={"Start_Time": "Accident Count"})
)


In [None]:
# Cities of California and Texas ordered from most to fewest accidents
accidents_cities_CA_TX.sort_values(["Accident Count"],ascending=False)

In [None]:
#plt.bar(accidents_cities_CA_TX["City"],accidents_cities_CA_TX["Accident Count"])

### Analysis of accidents with Years and Severity

In [None]:
# Restrict to 2017-2019 and count the accidents in each year
start_year = accident_target["Start_Year"]
target_years = accident_target[(start_year >= 2017) & (start_year < 2020)]

yearly_accidents = target_years.groupby("Start_Year")
yearly_counts = yearly_accidents["Start_Year"].count()
yearly_counts

In [None]:
# Plot the number of accidents by year
accident_count = yearly_counts.plot(kind="bar", color="blue", figsize=(10,8))
plt.title("Number of Accidents by Year", fontweight="bold", fontsize=14)
plt.xlabel("Year", fontweight='bold', fontsize=12)
plt.ylabel("Number of Accidents", fontweight='bold', fontsize=12)

# Apply the layout BEFORE saving so the saved image matches the displayed one
# (previously tight_layout ran after savefig and never reached the file)
plt.tight_layout()
plt.savefig("Images/Fig3. AccidentsByYear.png")
plt.show();

##### Number of accidents seems to show a steady increase each year from 2017 to 2019.

In [None]:
# Count accidents per severity level within each year
severity_counts = yearly_accidents["Severity"].value_counts().to_frame()

# The single count column is renamed by position, whatever name pandas gave it
count_column = severity_counts.columns[0]
severity_by_year = (
    severity_counts
    .rename(columns={count_column: "Severity Count"})
    .reset_index()
)
severity_by_year

In [None]:
# Create plot to compare accident severity levels by year
plt.figure(figsize=(10, 8))
barWidth = 0.2  # four bars per year -> 0.8 wide, leaving a gap between year groups

# One row per year, one column per severity level. Pivoting (instead of building
# one list per severity) keeps the bars aligned with their year even when a
# severity level has no accidents in some year; such gaps are shown as 0.
severity_pivot = (
    severity_by_year
    .pivot(index="Start_Year", columns="Severity", values="Severity Count")
    .fillna(0)
    .sort_index()
)

severity_colors = {1: "r", 2: "b", 3: "orange", 4: "g"}
year_positions = np.arange(len(severity_pivot.index))

# Make the plot: one bar series per severity level, shifted right by one bar width each
for offset, severity in enumerate(severity_pivot.columns):
    plt.bar(
        year_positions + offset * barWidth,
        severity_pivot[severity],
        color=severity_colors.get(severity),  # unknown levels fall back to matplotlib's default color
        width=barWidth,
        edgecolor="white",
        label=str(severity),
    )

# Add xticks and labels; ticks sit under the centre of each year's group and
# are labelled from the data rather than from a hard-coded list of years
group_centre = (len(severity_pivot.columns) - 1) * barWidth / 2
plt.title("Accident Severity by Year", fontweight="bold", fontsize=14)
plt.xlabel("Year", fontweight="bold", fontsize=12)  # the x-axis shows years, not severity levels
plt.ylabel("Severity Count", fontweight="bold", fontsize=12)
plt.xticks(year_positions + group_centre, severity_pivot.index.astype(str))

# Create legend, save image and show graph
plt.legend(title="Severity")
plt.savefig("Images/Fig4. SeverityByYear.png")
plt.show();


##### Severity of accidents by year. The graph shows that the level of severity of 2 is the most frequent, followed by 3 with 1 and 4 being very minimal by comparison. Severity level in this case is as follows: 1 indicates the least impact on traffic (i.e., short delay) and 4 indicates major impact on traffic (i.e. long delay)

In [None]:
# Create a pie chart to showcase the percentage of each severity
severity_totals = accident_target["Severity"].value_counts()

# Pull out only the first (largest) slice; sized from the data so the chart
# does not fail when the number of severity levels is not exactly four
explode = [0.1 if position == 0 else 0 for position in range(len(severity_totals))]

plt.figure(figsize=(10, 8))
severity_totals.plot.pie(explode=explode,autopct="%1.1f%%",shadow=True)
plt.title("Accident Severity by Percentage (Cumulative) Dec. 2016-Dec.2019", fontweight="bold", fontsize=14)
plt.ylabel("Severity Count",fontweight="bold", fontsize=12)
plt.axis("equal")

# Apply the layout BEFORE saving so the saved image matches the displayed one
plt.tight_layout()
plt.savefig("Images/Fig5. TotalSeverity.png")
plt.show()

##### Chart showing the cumulative percentage of accidents by severity level. This again shows that accidents causing medium and moderate delays in traffic account for approx. 97% of all accidents between December 2016 and December 2019.

## Relationship of accidents with population in different states

In [None]:
# Request population and related census figures for every US state via the census API
census_variables = (
    "NAME",
    "B19013_001E",  # renamed to "Household Income" below
    "B01003_001E",  # renamed to "Population" below
    "B01002_001E",  # renamed to "Median Age" below
    "B19301_001E",  # renamed to "Per Capita Income" below
    "B17001_002E",  # renamed to "Poverty Count" below
    "B23025_005E",  # renamed to "Unemployment Count" below
)
census_state = c.acs5.state(census_variables, Census.ALL)

In [None]:
# Build a dataframe from the state census response and give the coded columns readable names
state_census_names = {
    "NAME": "State",
    "B01003_001E": "Population",
    "B01002_001E": "Median Age",
    "B19013_001E": "Household Income",
    "B19301_001E": "Per Capita Income",
    "B17001_002E": "Poverty Count",
    "B23025_005E": "Unemployment Count",
}
census_state_df = pd.DataFrame(census_state).rename(columns=state_census_names)

census_state_df.head()

In [None]:
# Census data identifies states by name while the accident data uses abbreviations,
# so load a name <-> abbreviation mapping from an Excel sheet
state_abbrev_path = "state_abbrev.xlsx"
accident_state_abbrev = pd.read_excel(state_abbrev_path)

In [None]:
accident_state_abbrev.head()

In [None]:
# Accident count per state (abbreviation)
accidents_per_state = accident_target.groupby(["State"])["Start_Time"].count()
accident_states = accidents_per_state.reset_index()
accident_states.columns = ["State", "Accident Count"]

In [None]:
accident_states.head()

In [None]:
# Attach the full state name to each state's accident count via the abbreviation table;
# the abbreviation-valued "State_x" is dropped and the name-valued "State_y" becomes "State"
accident_states_df = (
    accident_states
    .merge(accident_state_abbrev, left_on="State", right_on="Abbreviation")
    .drop("State_x", axis=1)
    .rename(columns={"State_y": "State"})
)

In [None]:
# Combine accident counts and census figures per state
accident_states_population = accident_states_df.merge(census_state_df, on="State")

# Population expressed in millions for a readable plot axis
state_population = accident_states_population["Population"]
accident_states_population["Population(mil)"] = state_population / 1000000

accident_states_population.head()

In [None]:
# Scatter plot: accidents against population, one point per state
x_axis = accident_states_population["Population(mil)"]
y_axis = accident_states_population["Accident Count"]

fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(x_axis, y_axis, facecolor="red", edgecolor="grey")

ax.set_title("Distribution of Accidents vs Population in States", fontsize=16)
ax.set_xlabel("Population in millions", fontsize=16)
ax.set_ylabel("Number of Accidents", fontsize=16)

fig.savefig("Images/Fig6. Accident_Population_State.png")

##### It can be inferred from the graph that states with larger populations witness more accidents.

### Distribution of number of accidents against population in counties of California

In [None]:
# Request population and related census figures for every county of every US state
county_census_variables = (
    "NAME",
    "B19013_001E",
    "B01003_001E",
    "B01002_001E",
    "B19301_001E",
    "B17001_002E",
    "B23025_005E",
)
census_state_county = c.acs5.state_county(county_census_variables, Census.ALL, Census.ALL)


In [None]:
# Build a dataframe from the county-level census response.
# NAME holds county and state together, separated by ', ';
# split it into separate "County" and "State" columns.
census_state_county_df = pd.DataFrame(census_state_county)

name_parts = census_state_county_df["NAME"].str.split(', ', expand=True)
census_state_county_df[["County", "State"]] = name_parts

# Readable names for the coded census columns
county_census_names = {
    "B01003_001E": "Population",
    "B01002_001E": "Median Age",
    "B19013_001E": "Household Income",
    "B19301_001E": "Per Capita Income",
    "B17001_002E": "Poverty Count",
    "B23025_005E": "Unemployment Count",
}
census_state_county_df = census_state_county_df.rename(columns=county_census_names)

census_state_county_df.head()

In [None]:
# Creating dataframe with the census data for California only.
# .copy() makes this an independent frame, so the column assignment below does
# not write into a slice of census_state_county_df (SettingWithCopyWarning).
is_california = census_state_county_df["State"] == "California"
census_ca_county_df = census_state_county_df.loc[is_california].copy()

# County names carry the text ' County' in the census data but not in the
# accident data; strip it so the two can be merged on the county name.
# regex=False: ' County' is a literal string, not a pattern.
census_ca_county_df["County"] = census_ca_county_df["County"].str.replace(' County', '', regex=False)

census_ca_county_df.head()

In [None]:
# Accident count per state and county
accidents_per_county = accident_target.groupby(["State", "County"])["Start_Time"].count()
accident_states_county = accidents_per_county.reset_index()
accident_states_county.columns = ["State", "County", "Accident Count"]

In [None]:
# Merge with the state abbreviation table to add the full state name;
# the abbreviation-valued "State_x" is dropped and the name-valued "State_y" becomes "State"
accident_states_county_df = (
    accident_states_county
    .merge(accident_state_abbrev, left_on="State", right_on="Abbreviation")
    .drop("State_x", axis=1)
    .rename(columns={"State_y": "State"})
)

accident_states_county_df.head()

In [None]:
# Accident counts for California counties joined with their census figures
accident_county_population_ca = accident_states_county_df.merge(
    census_ca_county_df,
    on=["State", "County"],
)

# Population expressed in units of ten thousand for a readable plot axis
county_population = accident_county_population_ca["Population"]
accident_county_population_ca["Population(tenthou)"] = county_population / 10000

accident_county_population_ca.head()

In [None]:
#Plotting distribution of accidents vs population in counties in california

x_axis=accident_county_population_ca["Population(tenthou)"]
y_axis=accident_county_population_ca["Accident Count"]

plt.figure(figsize=(10,6))
plt.scatter(x_axis,y_axis,facecolor="red",edgecolor="grey")

plt.title("Distribution of Accidents vs Population in Counties in California",fontsize=16)
plt.xlabel("Population in ten thousands",fontsize=16)
_=plt.ylabel("Number of Accidents",fontsize=16)

# Apply the layout BEFORE saving so the saved image matches the displayed one
# (previously tight_layout ran after savefig and never reached the file)
plt.tight_layout()
plt.savefig("Images/Fig7. Accident_Population_County_CA.png")

#for i, text in enumerate(accident_county_population_ca.loc[accident_county_population_ca["Accident Count"]>25000,"County"]):
#   plt.annotate(text,(x_axis[i],y_axis[i]))

In [None]:
#Top 10 counties in terms of Accident Count

accident_county_population_ca[["County","Population","Accident Count"]].sort_values("Accident Count",ascending=False).head(10)

##### It is evident from the graph that Los Angeles is the worst affected county in California in terms of number of accidents. One of the factors responsible is the very large population of Los Angeles compared to other counties in California, which has skewed the number of accidents in California compared to other states.

# Time Analysis

In [None]:
accident_target.count()

### Analyze accidents across seasons

In [None]:
# Combine season and year into one "<year>-<season>" label.
# January and February are labelled with the previous year, so they land in
# the same winter as the December that precedes them.
start_month = accident_target["Start_Time"].dt.month
is_jan_or_feb = start_month.isin([1, 2])

season_label_year = accident_target["Start_Year"].where(
    ~is_jan_or_feb,
    accident_target["Start_Year"] - 1,
)
accident_target["Season_Year"] = season_label_year.astype(str) + '-' + accident_target["Season"]


In [None]:
accident_target["Season_Year"].unique()

In [None]:
# Accident count per season-year, leaving out 2019-Winter because there is only one month of data for it

accident_target.loc[accident_target["Season_Year"]!='2019-Winter'].groupby("Season_Year")["Start_Time"].count()

In [None]:
# Accident count per season-year (2019-Winter excluded) as a dataframe
not_winter_2019 = accident_target["Season_Year"] != '2019-Winter'
accident_season_year_us = (
    accident_target.loc[not_winter_2019]
    .groupby("Season_Year")["Start_Time"]
    .count()
    .reset_index()
)

In [None]:
# Put the rows in chronological order so the seasons plot in sequence.
# A "<year>-Winter" label starts in December of <year> (January and February
# are labelled with the previous year), so within one label year the order is
# Spring, Summer, Fall, Winter.
# Sorting on (year, season rank) replaces hard-coded row positions: it gives
# the same order, is safe to re-run, and does not break if a season is missing.
season_rank = {"Spring": 0, "Summer": 1, "Fall": 2, "Winter": 3}

accident_season_year_us = accident_season_year_us.rename(columns={"Start_Time":"Accident Count"})

label_parts = accident_season_year_us["Season_Year"].str.split("-", n=1, expand=True)
sort_keys = pd.DataFrame({
    "year": label_parts[0].astype(int),
    "season": label_parts[1].map(season_rank),  # unrecognised seasons become NaN and sort last
})
chronological_index = sort_keys.sort_values(["year", "season"]).index
accident_season_year_us = accident_season_year_us.loc[chronological_index].reset_index(drop=True)

accident_season_year_us

In [None]:
# Bar chart of the number of accidents in each season-year
plt.figure(figsize=(20,6))
plt.bar(accident_season_year_us["Season_Year"],accident_season_year_us["Accident Count"],alpha=1.0,color="red",align="center")
_=plt.xticks(rotation=30)

plt.xlabel("Seasons",fontsize=16)
plt.ylabel("Number of Accidents",fontsize=16)

_=plt.title("Distribution of Accidents in different seasons across dataset",fontsize=20)

# Apply the layout once all labels exist
plt.tight_layout()

# Forward slash, like every other figure path in this notebook: "Images\Fig8..."
# contains the invalid escape "\F" and only names a sub-directory on Windows
plt.savefig("Images/Fig8. Accidents_vs_Seasons.png")

###### Fall and Winter are the seasons in which most accidents happened compared to other seasons, with the most accidents happening in Fall in 2017-2019.

### Analysis by Month

In [None]:
# Split Start_Time into year, month and calendar-date columns for the monthly analysis
accident_start = accident_target["Start_Time"].dt
accident_target["Start_Year"] = accident_start.year
accident_target["Start_Month"] = accident_start.month
accident_target["Start_Date_Only"] = accident_start.date

In [None]:
# Hour of day (taken from Start_Time) for the time-of-day analysis
accident_start = accident_target["Start_Time"]
accident_target["Hour"] = accident_start.dt.hour


In [None]:
accident_target.head()

In [None]:
# One dataframe per calendar year, ordered by month, to evaluate the number of
# accidents for each month in each year
def _accidents_in_year(year):
    """Return the rows of accident_target whose Start_Year equals `year`, sorted by Start_Month."""
    in_year = accident_target['Start_Year'] == year
    return accident_target.loc[in_year].sort_values(by='Start_Month', ascending=True)

accidents_2016_df = _accidents_in_year(2016)
accidents_2017_df = _accidents_in_year(2017)
accidents_2018_df = _accidents_in_year(2018)
accidents_2019_df = _accidents_in_year(2019)
accidents_2016_df.head()

In [None]:
# Per-month counts for each year; every column holds the number of non-null values in that month
def _count_by_start_month(year_df):
    """Group `year_df` by its Start_Month column and count the non-null values per column."""
    return year_df.groupby(year_df['Start_Month']).count()

accidents_per_month_2016 = _count_by_start_month(accidents_2016_df)
accidents_per_month_2017 = _count_by_start_month(accidents_2017_df)
accidents_per_month_2018 = _count_by_start_month(accidents_2018_df)
accidents_per_month_2019 = _count_by_start_month(accidents_2019_df)
accidents_per_month_2019.head()

In [None]:
# Line chart: number of accidents per month, one line per year
months = list(range(1, 13))
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

yearly_lines = (
    (accidents_per_month_2017, "blue", "2017"),
    (accidents_per_month_2018, "red", "2018"),
    (accidents_per_month_2019, "orange", "2019"),
)

plt.figure(figsize=(10, 8))
for monthly_counts, line_color, year_label in yearly_lines:
    # The Start_Date_Only count is used as the number of accidents in the month
    plt.plot(months, monthly_counts['Start_Date_Only'], color=line_color, label=year_label)

# Let matplotlib pick the "best" legend location
plt.legend(loc="best")

plt.title("Accidents per month for each year")
plt.xlabel("Months")
plt.ylabel("Number of accidents")
plt.xticks(months, month_names, rotation=90)
plt.savefig("Images/Fig9. AccidentsPerMonthLine.png")

# Print our chart to the screen
plt.show();

##### Accidents per month for the years 2017, 2018 and 2019 show a slight increase in number of accidents in the later part of the year, starting August, with 2018 showing a decrease in December. 

In [None]:
# Per-year dataframes for California and New York, ordered by month, to
# evaluate the number of accidents for each month in each year
def _state_accidents_in_year(state, year):
    """Return accident_target rows for `state` and Start_Year `year`, sorted by Start_Month."""
    selected = (accident_target['State'] == state) & (accident_target['Start_Year'] == year)
    return accident_target.loc[selected].sort_values(by='Start_Month', ascending=True)

#California
accidents_CA_2016_df = _state_accidents_in_year('CA', 2016)
accidents_CA_2017_df = _state_accidents_in_year('CA', 2017)
accidents_CA_2018_df = _state_accidents_in_year('CA', 2018)
accidents_CA_2019_df = _state_accidents_in_year('CA', 2019)

#New York
accidents_NY_2016_df = _state_accidents_in_year('NY', 2016)
accidents_NY_2017_df = _state_accidents_in_year('NY', 2017)
accidents_NY_2018_df = _state_accidents_in_year('NY', 2018)
accidents_NY_2019_df = _state_accidents_in_year('NY', 2019)

accidents_NY_2019_df.head()

In [None]:
# Per-month counts for California and New York in each year
def _state_monthly_counts(state_year_df):
    """Group `state_year_df` by its Start_Month column and count the non-null values per column."""
    return state_year_df.groupby(state_year_df['Start_Month']).count()

#California
accidents_per_month_2016_CA = _state_monthly_counts(accidents_CA_2016_df)
accidents_per_month_2017_CA = _state_monthly_counts(accidents_CA_2017_df)
accidents_per_month_2018_CA = _state_monthly_counts(accidents_CA_2018_df)
accidents_per_month_2019_CA = _state_monthly_counts(accidents_CA_2019_df)

#New York
accidents_per_month_2016_NY = _state_monthly_counts(accidents_NY_2016_df)
accidents_per_month_2017_NY = _state_monthly_counts(accidents_NY_2017_df)
accidents_per_month_2018_NY = _state_monthly_counts(accidents_NY_2018_df)
accidents_per_month_2019_NY = _state_monthly_counts(accidents_NY_2019_df)
accidents_per_month_2019_NY.head()

In [None]:
# Line chart: number of accidents per month in California, one line per year
months = list(range(1, 13))

ca_yearly_lines = (
    (accidents_per_month_2017_CA, "blue", "2017"),
    (accidents_per_month_2018_CA, "red", "2018"),
    (accidents_per_month_2019_CA, "orange", "2019"),
)

plt.figure(figsize=(10, 8))
for monthly_counts, line_color, year_label in ca_yearly_lines:
    plt.plot(months, monthly_counts['Start_Date_Only'], color=line_color, label=year_label)

# Let matplotlib pick the "best" legend location
plt.legend(loc="best")

plt.title("Accidents per month for each year for California")
plt.xlabel("Months")
plt.ylabel("Number of accidents")
plt.xticks(months, month_names, rotation=90)
plt.savefig("Images/Fig10. AccidentsPerMonthLineCAL.png")

# Print our chart to the screen
plt.show();

##### Accidents per month for California in 2017, 2018 and 2019. California does not necessarily show an increase in the number of accidents towards the later part of the year for 2017 and 2018, with an obvious peak in accidents only for 2019.

In [None]:
# Line chart: number of accidents per month in New York, one line per year
months = list(range(1, 13))

ny_yearly_lines = (
    (accidents_per_month_2017_NY, "blue", "2017"),
    (accidents_per_month_2018_NY, "red", "2018"),
    (accidents_per_month_2019_NY, "orange", "2019"),
)

plt.figure(figsize=(10, 8))
for monthly_counts, line_color, year_label in ny_yearly_lines:
    plt.plot(months, monthly_counts['Start_Date_Only'], color=line_color, label=year_label)

# Let matplotlib pick the "best" legend location
plt.legend(loc="best")

plt.title("Accidents per month for each year for NY")
plt.xlabel("Months")
plt.ylabel("Number of accidents")
plt.xticks(months, month_names, rotation=90)
plt.savefig("Images/Fig11. AccidentsPerMonthLineNY.png")

# Print our chart to the screen
plt.show()

##### In New York State there doesn't seem to be any clear change in the number of accidents throughout 2019, with some months dropping and other months increasing seemingly at random. However, for 2017 and 2018 it can be seen that there is an increase in the number of accidents after August.

In [None]:
# Line chart: number of accidents per month in 2019, California against New York
months = list(range(1, 13))

state_lines_2019 = (
    (accidents_per_month_2019_CA, "blue", "CA"),
    (accidents_per_month_2019_NY, "red", "NY"),
)

plt.figure(figsize=(10, 8))
for monthly_counts, line_color, state_label in state_lines_2019:
    plt.plot(months, monthly_counts['Start_Date_Only'], color=line_color, label=state_label)

# Let matplotlib pick the "best" legend location
plt.legend(loc="best")

plt.title("Accidents per month for 2019 for CA and NY")
plt.xlabel("Months")
plt.ylabel("Number of accidents")
plt.xticks(months, month_names, rotation=90)
plt.savefig("Images/Fig12. AccidentsPerMonthLineNYca2019_.png")

# Print our chart to the screen
plt.show()

##### When we compare California and New York the previous observations remain. For 2019 there is an increase in number of accidents towards the end of the year for California but not for New York. 

### Time of Hour Analysis

In [None]:
# Count, for the whole dataset, the non-null values of every column in each hour of the day
hour_of_day = accident_target['Hour']
accidents_per_hour = accident_target.groupby(hour_of_day).count()
accidents_per_hour.head()

In [None]:
# Line chart: number of accidents by hour of day for the whole dataset
hours = accidents_per_hour.index
hourly_accident_counts = accidents_per_hour['Start_Date_Only']

plt.figure(figsize=(10, 8))
plt.plot(hours, hourly_accident_counts, color="blue", label="Number of accidents")

# Let matplotlib pick the "best" legend location
plt.legend(loc="best")

plt.title("Number of accidents per hour Dec2016-Dec2019")
plt.xlabel("Hours")
plt.ylabel("Number of accidents")
plt.xticks(hours, hours, rotation=0)
plt.savefig("Images/Fig13. AccidentsPerHour.png")

# Print our chart to the screen
plt.show()

##### This graph shows the hourly distribution of accidents. It is very obvious that a larger number of accidents occurs during rush hours: in the morning between 7am and 9am and in the afternoon between 4pm and 6pm.

In [None]:
#Creating the DataFrames with the whole data for CA and NY
accidents_CA = accident_target[accident_target['State'].eq('CA')]
accidents_NY = accident_target[accident_target['State'].eq('NY')]

In [None]:
#Count the California records for every hour of the day
accidents_per_hour_CA = accidents_CA.groupby(by=accidents_CA['Hour']).count()

#Count the New York records for every hour of the day
accidents_per_hour_NY = accidents_NY.groupby(by=accidents_NY['Hour']).count()

In [None]:
#Plot will show distribution of accidents throughout the hours of the day for California and New York
#Hours of the whole dataset, used for the tick positions and labels
hours = accidents_per_hour.index

plt.figure(figsize=(10, 8))

#Each state is plotted against the hours present in its own grouped data, so the x and y
#values always have the same length even if a state has no record for some hour
#Plotting the number of accidents per hour in California
plt.plot(accidents_per_hour_CA.index, accidents_per_hour_CA['Start_Date_Only'], color="blue", label="CA")

#Plotting the number of accidents per hour in New York
plt.plot(accidents_per_hour_NY.index, accidents_per_hour_NY['Start_Date_Only'], color="red", label="NY")

# Place a legend on the chart in what matplotlib believes to be the "best" location
plt.legend(loc="best")

plt.title("Number of accidents per hour in California and New York Dec2016-Dec2019")
plt.xlabel("Hours")
plt.ylabel("Number of accidents")
plt.xticks(hours, hours ,rotation=0)
plt.savefig("Images/Fig14. AccidentsPerHourCALny.png")

# Print our chart to the screen
plt.show()

##### The relation between the number of accidents and rush hour is also present for New York and California individually: California shows an obvious peak in both the morning and the afternoon rush hour, while New York shows one only in the morning rush hour.

In [None]:
#Grouping by hour for California and counting accidents, one frame per year
accidents_per_hour_2016_CA = accidents_CA_2016_df.groupby(accidents_CA_2016_df['Hour']).count()
accidents_per_hour_2017_CA = accidents_CA_2017_df.groupby(accidents_CA_2017_df['Hour']).count()
accidents_per_hour_2018_CA = accidents_CA_2018_df.groupby(accidents_CA_2018_df['Hour']).count()
accidents_per_hour_2019_CA = accidents_CA_2019_df.groupby(accidents_CA_2019_df['Hour']).count()

#Grouping by hour for NY and counting accidents, one frame per year
accidents_per_hour_2016_NY = accidents_NY_2016_df.groupby(accidents_NY_2016_df['Hour']).count()
accidents_per_hour_2017_NY = accidents_NY_2017_df.groupby(accidents_NY_2017_df['Hour']).count()
accidents_per_hour_2018_NY = accidents_NY_2018_df.groupby(accidents_NY_2018_df['Hour']).count()
accidents_per_hour_2019_NY = accidents_NY_2019_df.groupby(accidents_NY_2019_df['Hour']).count()

#Only the last expression of a cell is displayed, so preview the 2019 New York counts here
accidents_per_hour_2019_NY.head()

In [None]:
#Plot will show distribution of accidents throughout the hours of the day for California and New York in 2019
#Hours of the whole dataset, used for the tick positions and labels
hours = accidents_per_hour.index

plt.figure(figsize=(10, 8))

#Each state is plotted against the hours present in its own 2019 data, so the x and y
#values always have the same length even if a state has no record for some hour
#Plotting the number of accidents per hour in California
plt.plot(accidents_per_hour_2019_CA.index, accidents_per_hour_2019_CA['Start_Date_Only'], color="blue", label="CA")

#Plotting the number of accidents per hour in New York
plt.plot(accidents_per_hour_2019_NY.index, accidents_per_hour_2019_NY['Start_Date_Only'], color="red", label="NY")

# Place a legend on the chart in what matplotlib believes to be the "best" location
plt.legend(loc="best")

plt.title("Number of accidents per hour in CA and NY 2019")
plt.xlabel("Hours")
plt.ylabel("Number of accidents")
plt.xticks(hours, hours ,rotation=0)
plt.savefig("Images/Fig15. AccidentsPerHourCALny2019.png")

# Print our chart to the screen
plt.show()

###### In 2019 the relation to rush hour remains for both California and New York.

# Weather Conditions

In [None]:
#Number of records without a recorded weather condition
accident_target['Weather_Condition'].isnull().sum()

In [None]:
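##RUN this code only if we want to replace NaN values for weather conditions with 'Clear'
##(the result has to be assigned back to the column, otherwise the replacement is discarded)
#accident_target['Weather_Condition'] = accident_target['Weather_Condition'].replace(np.nan, 'Clear')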

In [None]:
#Count the records for every distinct weather condition
accidents_per_weather_condition = (
    accident_target
    .groupby(by=accident_target['Weather_Condition'])
    .count()
)
#Turn the weather condition back into a regular column
accidents_per_weather_condition_df = accidents_per_weather_condition.reset_index()
#Most frequent conditions first
accidents_per_weather_condition_df.sort_values(by='Severity', ascending=False)

In [None]:
#Keep only the weather conditions with at least 1000 records
accidents_per_weather_condition_clean = accidents_per_weather_condition_df[
    accidents_per_weather_condition_df["Severity"].ge(1000)
]
#Preview the most frequent of the remaining conditions
accidents_per_weather_condition_clean.sort_values(by="Severity", ascending=False).head()

In [None]:
#Pie chart of the number of accidents per detailed weather condition.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title("Number of accidents per detailed weather condition")

#One wedge per remaining weather condition, sized by its record count
ax.pie(
    accidents_per_weather_condition_clean["Severity"],
    labels=accidents_per_weather_condition_clean['Weather_Condition'],
)

fig.savefig("Images/Fig16. Number of accidents per detailed weather condition.png")
plt.show()

In [None]:
#Listing how often each weather condition label appears in the grouped frame
#NOTE(review): the frame was built by grouping on Weather_Condition, so every label should appear exactly once - confirm this is the intended check
accidents_per_weather_condition_df['Weather_Condition'].value_counts()

In [None]:
#Regrouping the detailed weather conditions into 6 broader categories:
#sleet, snow, rain, fog, wind and clear.
#Each list holds the detailed condition labels that belong to one category.
#NOTE(review): 'Dust Whirls' is listed in both `fog` and `wind`; whichever list is applied first decides its category.
sleet = ['Freezing Rain', 'Freezing Rain / Windy', 'Heavy Ice Pellets', 'Heavy Sleet', 'Ice Pellets', 'Light Freezing Rain', 'Light Freezing Rain / Windy', 'Light Ice Pellets', 'Light Sleet', 'Light Snow and Sleet / Windy', 'Sleet', 'Small Hail',  'Thunder and Hail / Windy', 'Hail', 'Light Hail' ]

snow = ['Blowing Snow', 'Blowing Snow / Windy', 'Heavy Blowing Snow', 'Heavy Snow', 'Heavy Snow / Windy', 'Heavy Snow with Thunder', 'Heavy Thunderstorms and Snow','Light Blowing Snow', 'Light Snow', 'Light Snow / Windy', 'Light Snow Grains','Light Snow Shower', 'Light Snow Showers', 'Light Snow and Sleet','Snow Showers', 'Light Snow with Thunder', 'Light Thunderstorms and Snow', 'Low Drifting Snow', 'Snow', 'Snow / Windy','Snow Grains','Snow and Sleet','Snow and Sleet / Windy', 'Snow and Thunder', 'Thunderstorms and Snow']

rain = ['Heavy Drizzle','Drizzle', 'Light Rain / Windy','Drizzle / Windy', 'Heavy Freezing Rain', 'Heavy Freezing Drizzle', 'Heavy Rain', 'Heavy Rain / Windy', 'Heavy Rain Showers', 'Heavy T-Storm', 'Heavy T-Storm / Windy', 'Heavy Thunderstorms and Rain', 'Heavy Thunderstorms with Small Hail', 'Light Rain Shower', 'Light Rain Shower / Windy', 'Light Rain Showers', 'Light Rain with Thunder', 'Light Thunderstorm', 'Light Thunderstorms and Rain', 'Rain', 'Rain / Windy', 'Rain Shower', 'Rain Showers', 'Squalls', 'Squalls / Windy', 'T-Storm', 'T-Storm / Windy', 'Thunderstorm', 'Thunderstorms and Rain', 'Wintry Mix', 'Wintry Mix / Windy', 'N/A Precipitation', 'Light Drizzle', 'Light Drizzle / Windy', 'Light Freezing Drizzle', 'Light Rain']

fog = ['Drizzle and Fog', 'Dust Whirls', 'Fog', 'Fog / Windy', 'Haze', 'Haze / Windy', 'Light Fog', 'Heavy Smoke', 'Light Freezing Fog', 'Light Haze', 'Mist', 'Partial Fog', 'Patches of Fog', 'Shallow Fog', 'Smoke', 'Smoke / Windy', 'Volcanic Ash', 'Widespread Dust']

wind = ['Blowing Dust','Partial Fog / Windy', 'Blowing Dust / Windy', 'Cloudy / Windy', 'Sand / Dust Whirlwinds','Sand / Dust Whirlwinds / Windy', 'Funnel Cloud', 'Thunder / Windy', 'Thunder / Wintry Mix / Windy','Tornado', 'Widespread Dust / Windy', 'Dust Whirls', 'Sand']

clear= ['Clear', 'Cloudy', 'Fair', 'Fair / Windy', 'Mostly Cloudy', 'Mostly Cloudy / Windy', 'Overcast', 'Partly Cloudy', 'Partly Cloudy / Windy', 'Scattered Clouds', 'Showers in the Vicinity', 'Thunder', 'Thunder in the Vicinity']


In [None]:
# Replacing the weather conditions to group them into 6 categories.
# Only the Weather_Condition column is rewritten: running DataFrame.replace on the whole
# frame scans every column once per category and could also rewrite a matching string
# (e.g. 'Clear', 'Snow', 'Sand') that happens to appear in an unrelated column.
weather_category_lists = [
    (sleet, 'sleet'),
    (snow, 'snow'),
    (rain, 'rain'),
    (fog, 'fog'),
    (wind, 'wind'),
    (clear, 'clear'),
]

# Detailed condition -> category lookup
weather_category_map = {}
for detailed_conditions, category in weather_category_lists:
    for detailed_condition in detailed_conditions:
        # setdefault keeps the first category when a condition is listed twice
        # (e.g. 'Dust Whirls' is in both fog and wind), matching the order above
        weather_category_map.setdefault(detailed_condition, category)

weather_condition = accident_target['Weather_Condition']

# map() yields NaN for conditions that are not in any list; fillna() restores the
# original value for those, so unmapped and missing conditions stay unchanged
accident_target_new = accident_target.assign(
    Weather_Condition=weather_condition.map(weather_category_map).fillna(weather_condition)
)

In [None]:
#Checking the new categories: number of records per regrouped weather condition value
accident_target_new['Weather_Condition'].value_counts()

In [None]:
#Confirming that all the categories have been replaced effectively:
#count the records per weather category and list the largest categories first
accident_per_weather_category = (
    accident_target_new
    .groupby(by=accident_target_new['Weather_Condition'])
    .count()
)
accident_per_weather_category.sort_values(by="Severity", ascending=False)

In [None]:
#Presenting the number of accidents per Weather Category.
plt.figure(figsize=(10, 8))
plt.title("Number of Accidents per Weather Category Dec.16-Dec.2019")
labels = accident_per_weather_category.index

#One offset per wedge, derived from the labels so that it always matches the number of
#categories: 'clear' is pulled out slightly, every other category is pulled out further
explode = tuple(0.1 if label == 'clear' else 0.5 for label in labels)
plt.pie(accident_per_weather_category["Severity"], explode=explode, labels=labels, autopct="%1.1f%%")

# Save and display the chart

plt.savefig("Images/Fig17. Number of accidents per detailed weather condition.png")
plt.show()

##### Approximately 13.4% of the accidents between December 2016 and December 2019 occurred during inclement weather conditions. Of those, rain, fog and snow account for the largest numbers of accidents.

In [None]:
#Copy of the per-category counts in which a "Weather_Condition" count column, if present, is called "Weather_Count"
accident_per_weather_category_df = accident_per_weather_category.rename(
    columns={"Weather_Condition": "Weather_Count"},
)

In [None]:
#Move the weather category from the index into a regular column
accident_per_weather_category_df = accident_per_weather_category_df.reset_index()

In [None]:
#Keep every weather category except 'clear'
weather_accidents = accident_per_weather_category_df.loc[accident_per_weather_category_df['Weather_Condition'] != 'clear']

#Presenting the number of accidents within the Weather Categories.
plt.figure(figsize=(10, 8))
plt.title("Number of Accidents within Weather Categories Dec.16-Dec.2019")
labels = weather_accidents['Weather_Condition']

#One offset per wedge, derived from the labels so that it always matches the number and
#order of the categories: 'rain' is pulled out further than the other categories
explode = tuple(0.5 if label == 'rain' else 0.1 for label in labels)
plt.pie(weather_accidents["Severity"], explode=explode, labels=labels, autopct="%1.1f%%")

# Save and display the chart
plt.savefig("Images/Fig18. Number of accidents per weather condition category.png")
plt.show()

##### Most weather-related accidents occurred in rain (wet roads), accounting for 64.1% of all weather-related accidents.

# Analyze impact of Weather Factors on Number of Accident across data

In [None]:
#Create function to calculate linear regression and correlation co-efficient for factors against accident

def factor_accident_correlation(factor,line_x,line_y,data=None):
    """Scatter-plot the number of accidents per value of ``factor`` with a fitted line.

    The records are grouped by ``factor`` and counted. The counts are drawn on the
    current matplotlib figure together with the least-squares regression line, the
    line equation is written on the plot, and the Pearson correlation coefficient
    between the factor values and the counts is printed.

    Parameters
    ----------
    factor : str
        Name of the column to group by, e.g. ``'Humidity(%)'``.
    line_x, line_y : float
        Data coordinates at which the regression equation is written on the plot.
    data : pandas.DataFrame, optional
        Records to analyse; must contain the ``factor`` and ``"Start_Time"`` columns.
        Defaults to the notebook-level ``accident_target`` frame.

    Returns
    -------
    None
    """
    if data is None:
        data = accident_target

    # One row per distinct factor value; "Start_Time" then holds the number of records
    accident_count_factor = data.groupby(factor).count().reset_index()
    x_axis = accident_count_factor[factor]
    y_axis = accident_count_factor["Start_Time"]

    plt.scatter(x_axis, y_axis)

    plt.title(f"Number of Accidents vs {factor}")
    plt.xlabel(f"{factor}")
    plt.ylabel("Number of Accidents")

    # linregress returns (slope, intercept, rvalue, pvalue, stderr) in this order
    slope, intercept, rvalue, pvalue, stderr = linregress(x_axis, y_axis)

    line_eq = f"y= {round(slope,2)}*x + {round(intercept,2)}"

    # Fitted regression line and its equation
    plt.plot(x_axis, ((slope*x_axis)+intercept), "r--")
    plt.annotate(line_eq, (line_x, line_y), color="red", fontsize=15)

    corr, _ = pearsonr(x_axis, y_axis)

    plt.tight_layout()

    print(f"The correlation-co-efficient is: {round(corr,4)}")
    

## Analyze impact of Humidity on Number of Accident across data

In [None]:
#Accident count per humidity value, with the regression equation written at (10, 50)
factor_accident_correlation(factor='Humidity(%)', line_x=10, line_y=50)

## Analyze impact of Temperature on Number of Accident across data

In [None]:
#Accident count per temperature value, with the regression equation written at (10, 50)
factor_accident_correlation(factor='Temperature(F)', line_x=10, line_y=50)

## Analyze impact of Visibility on Number of Accident across data

In [None]:
#Accident count per visibility value, with the regression equation written at (10, 100000)
factor_accident_correlation(factor='Visibility(mi)', line_x=10, line_y=100000)

In [None]:
# The data is skewed by the very large number of accidents recorded at Visibility: 10 mi,
# so that value is removed to see the correlation of Visibility with Accidents without it

factor="Visibility(mi)"
accident_count_factor=accident_target.loc[accident_target[factor]!=10.0].groupby(factor).count().reset_index()
x_axis=accident_count_factor[factor]
y_axis=accident_count_factor["Start_Time"]

_=plt.scatter(x_axis,y_axis,s=20)

plt.title(f"Number of Accidents vs {factor}")
plt.xlabel(f"{factor}")
plt.ylabel("Number of Accidents")

# linregress returns (slope, intercept, rvalue, pvalue, stderr) in this order
slope,intercept,rvalue,pvalue,stderr=linregress(x_axis,y_axis)

line_eq=f"y= {round(slope,2)}*x + {round(intercept,2)}"

# Fitted regression line and its equation
plt.plot(x_axis,((slope*x_axis)+intercept),"r--")

_=plt.annotate(line_eq,(10,10000),color="red",fontsize=15)

corr,pvalue=pearsonr(accident_count_factor[factor],accident_count_factor["Start_Time"])

plt.tight_layout()

print(f"The correlation-co-efficient is: {round(corr,4)}")

## Analyze impact of Wind Speed on Number of Accident across data

In [None]:
#Accident count per wind speed value, with the regression equation written at (10, 100000)
factor_accident_correlation(factor='Wind_Speed(mph)', line_x=10, line_y=100000)

In [None]:
# There are very few records with a wind speed above 100 mph and such values are not plausible,
# so only rows with Wind Speed up to 100 mph are kept to see the correlation of Wind Speed with Accidents

factor="Wind_Speed(mph)"
accident_count_factor=accident_target.loc[accident_target[factor]<=100].groupby(factor).count().reset_index()
x_axis=accident_count_factor[factor]
y_axis=accident_count_factor["Start_Time"]

_=plt.scatter(x_axis,y_axis,s=20)

plt.title(f"Number of Accidents vs {factor}")
plt.xlabel(f"{factor}")
plt.ylabel("Number of Accidents")

# linregress returns (slope, intercept, rvalue, pvalue, stderr) in this order
slope,intercept,rvalue,pvalue,stderr=linregress(x_axis,y_axis)

line_eq=f"y= {round(slope,2)}*x + {round(intercept,2)}"

# Fitted regression line and its equation
plt.plot(x_axis,((slope*x_axis)+intercept),"r--")

_=plt.annotate(line_eq,(10,10000),color="red",fontsize=15)

corr,pvalue=pearsonr(accident_count_factor[factor],accident_count_factor["Start_Time"])

plt.tight_layout()

print(f"The correlation-co-efficient is: {round(corr,4)}")

## Analyze impact of different weather factors on Number of Accident across data

In [None]:
#Weather factors analysed together, one plot panel per factor
multi_factor=[
    "Temperature(F)",
    "Humidity(%)",
    "Visibility(mi)",
    "Wind_Speed(mph)",
]

In [None]:
# 2x2 grid: one scatter plot with fitted line per weather factor, for the whole dataset
fig,ax =  plt.subplots(2,2,figsize=(16,8))

# ax.flat walks the grid row by row, so the panels are filled left-to-right, top-to-bottom
for panel,factor in zip(ax.flat,multi_factor):

    # Same per-factor filters as in the single-factor cells:
    # wind speeds above 100 mph and the dominant 10 mi visibility value are left out
    if factor=="Wind_Speed(mph)":
        factor_rows=accident_target.loc[accident_target[factor]<=100]
    elif factor=="Visibility(mi)":
        factor_rows=accident_target.loc[accident_target[factor]!=10.0]
    else:
        factor_rows=accident_target

    # One row per distinct factor value; "Start_Time" then holds the number of records
    accident_count_factor=factor_rows.groupby(factor).count().reset_index()
    x_axis=accident_count_factor[factor]
    y_axis=accident_count_factor["Start_Time"]

    panel.scatter(x_axis,y_axis)

    panel.set_title(f"Correlation of Accidents vs {factor}",fontsize=16)
    panel.set_xlabel(f"{factor}",fontsize=12)
    panel.set_ylabel("Number of Accidents",fontsize=12)

    # linregress returns (slope, intercept, rvalue, pvalue, stderr) in this order
    slope,intercept,rvalue,pvalue,stderr=linregress(x_axis,y_axis)

    line_eq=f"y= {round(slope,2)}*x + {round(intercept,2)}"

    corr,pvalue=pearsonr(accident_count_factor[factor],accident_count_factor["Start_Time"])

    # Fitted regression line, its equation and the correlation coefficient
    panel.plot(x_axis,((slope*x_axis)+intercept),"r--")

    panel.annotate(line_eq,(10,10000),color="red",fontsize=15)

    panel.text(0.1, 0.9,f"The correlation co-efficient is: {round(corr,4)}", transform=panel.transAxes,bbox=dict(facecolor='red', alpha=0.5))

# Lay out and save once, after every panel has been drawn
plt.tight_layout()

plt.savefig("Images/Fig19. Accidents_vs_factors.png")
    


###### The correlation co-efficient factor between Accidents and Temperature is 0.1402, which shows a weak correlation between temperature and accidents. The distribution of the number of accidents in the US follows the distribution of temperature across states and across the year.

###### The correlation co-efficient factor between Accidents and Humidity is 0.7223 which shows a very high correlation between humidity and accidents.
###### The correlation co-efficient factor between Accidents and Visibility is -0.2266, which shows a negative correlation between Visibility and accidents. As the graph shows, more accidents tend to happen when the visibility is lower.

###### The correlation co-efficient factor between Accidents and Wind Speed is -0.5445 which shows a moderately negative correlation between Wind speed and accidents. More accidents tend to happen when Wind speed is low.

## Analyze impact of various weather factors on Number of Accidents in California

In [None]:
# 2x2 grid: one scatter plot with fitted line per weather factor, for California only
fig,ax =  plt.subplots(2,2,figsize=(16,8))

# Restrict to California first, so that every factor (including Temperature and
# Humidity, which have no extra filter) is computed from California records only
state_accidents=accident_target.loc[accident_target["State"]=="CA"]

# ax.flat walks the grid row by row, so the panels are filled left-to-right, top-to-bottom
for panel,factor in zip(ax.flat,multi_factor):

    # Wind speeds above 100 mph and the dominant 10 mi visibility value are left out
    if factor=="Wind_Speed(mph)":
        factor_rows=state_accidents.loc[state_accidents[factor]<=100]
    elif factor=="Visibility(mi)":
        factor_rows=state_accidents.loc[state_accidents[factor]!=10.0]
    else:
        factor_rows=state_accidents

    # One row per distinct factor value; "Start_Time" then holds the number of records
    accident_count_factor=factor_rows.groupby(factor).count().reset_index()
    x_axis=accident_count_factor[factor]
    y_axis=accident_count_factor["Start_Time"]

    panel.scatter(x_axis,y_axis)

    panel.set_title(f"Correlation of Accidents vs {factor} in California",fontsize=16)
    panel.set_xlabel(f"{factor}",fontsize=12)
    panel.set_ylabel("Number of Accidents",fontsize=12)

    # linregress returns (slope, intercept, rvalue, pvalue, stderr) in this order
    slope,intercept,rvalue,pvalue,stderr=linregress(x_axis,y_axis)

    line_eq=f"y= {round(slope,2)}*x + {round(intercept,2)}"

    corr,pvalue=pearsonr(accident_count_factor[factor],accident_count_factor["Start_Time"])

    # Fitted regression line, its equation and the correlation coefficient
    panel.plot(x_axis,((slope*x_axis)+intercept),"r--")

    panel.annotate(line_eq,(10,10000),color="red",fontsize=15)

    panel.text(0.1, 0.9,f"The correlation co-efficient is: {round(corr,4)}", transform=panel.transAxes,bbox=dict(facecolor='red', alpha=0.5))

# Lay out first and save once, after every panel has been drawn
plt.tight_layout()

plt.savefig("Images/Fig20. Accidents_vs_factors_ca.png")

##### The relationship is in-line with US in general.

## Analyze impact of various weather factors on Number of Accident in New York

In [None]:
# 2x2 grid: one scatter plot with fitted line per weather factor, for New York only
fig,ax =  plt.subplots(2,2,figsize=(16,8))

# Restrict to New York first, so that every factor (including Temperature and
# Humidity, which have no extra filter) is computed from New York records only
state_accidents=accident_target.loc[accident_target["State"]=="NY"]

# ax.flat walks the grid row by row, so the panels are filled left-to-right, top-to-bottom
for panel,factor in zip(ax.flat,multi_factor):

    # Wind speeds above 100 mph and the dominant 10 mi visibility value are left out
    if factor=="Wind_Speed(mph)":
        factor_rows=state_accidents.loc[state_accidents[factor]<=100]
    elif factor=="Visibility(mi)":
        factor_rows=state_accidents.loc[state_accidents[factor]!=10.0]
    else:
        factor_rows=state_accidents

    # One row per distinct factor value; "Start_Time" then holds the number of records
    accident_count_factor=factor_rows.groupby(factor).count().reset_index()
    x_axis=accident_count_factor[factor]
    y_axis=accident_count_factor["Start_Time"]

    panel.scatter(x_axis,y_axis)

    panel.set_title(f"Correlation of Accidents vs {factor} in New York",fontsize=16)
    panel.set_xlabel(f"{factor}",fontsize=12)
    panel.set_ylabel("Number of Accidents",fontsize=12)

    # linregress returns (slope, intercept, rvalue, pvalue, stderr) in this order
    slope,intercept,rvalue,pvalue,stderr=linregress(x_axis,y_axis)

    line_eq=f"y= {round(slope,2)}*x + {round(intercept,2)}"

    # Fitted regression line, its equation and the correlation coefficient
    panel.plot(x_axis,((slope*x_axis)+intercept),"r--")

    corr,pvalue=pearsonr(accident_count_factor[factor],accident_count_factor["Start_Time"])

    panel.annotate(line_eq,(10,2000),color="red",fontsize=15)

    panel.text(0.1, 0.9,f"The correlation co-efficient is: {round(corr,4)}", transform=panel.transAxes,bbox=dict(facecolor='red', alpha=0.5))

# Lay out first and save once, after every panel has been drawn
plt.tight_layout()

plt.savefig("Images/Fig21. Accidents_vs_factors_ny.png")


##### The relationship is in line with the US in general, except for visibility, which shows a low relation with the number of accidents.
##### The visibility in New York goes up to 20 miles, compared to 140 miles in the US in general. The highest number of accidents falls in the same range of the first 10 miles in almost all of the US.

### Correlation of temperature and visibility on Accident Count

In [None]:
#Accident count for every (temperature, visibility) pair, leaving out the 10 mi visibility value
accident_count_temp_visibility = (
    accident_target.loc[accident_target["Visibility(mi)"] != 10]
    .groupby(["Temperature(F)", "Visibility(mi)"])["Start_Time"]
    .count()
    .reset_index(name="Accident Count")
)

#Product of the two factors, used as one combined value
accident_count_temp_visibility["Combined Value"] = (
    accident_count_temp_visibility["Temperature(F)"]
    * accident_count_temp_visibility["Visibility(mi)"]
)

accident_count_temp_visibility.head(200)


In [None]:
#Pearson correlation between the combined temperature*visibility value and the accident count
corr, pvalue = pearsonr(
    x=accident_count_temp_visibility["Combined Value"],
    y=accident_count_temp_visibility["Accident Count"],
)

print(f"The correlation co-efficient is: {corr}")

##### The correlation co-efficient is very low when combining values of temperature and visibility and does not seem to correlate together with number of accidents

### Correlation of humidity and visibility on Accident Count

In [None]:
#Accident count for every (humidity, visibility) pair, leaving out the 10 mi visibility value
accident_count_humid_visibility = (
    accident_target.loc[accident_target["Visibility(mi)"] != 10]
    .groupby(["Humidity(%)", "Visibility(mi)"])["Start_Time"]
    .count()
    .reset_index(name="Accident Count")
)

#Product of the two factors, used as one combined value
accident_count_humid_visibility["Combined Value"] = (
    accident_count_humid_visibility["Humidity(%)"]
    * accident_count_humid_visibility["Visibility(mi)"]
)

accident_count_humid_visibility.head(200)

In [None]:
#Pearson correlation between the combined humidity*visibility value and the accident count
corr, pvalue = pearsonr(
    x=accident_count_humid_visibility["Combined Value"],
    y=accident_count_humid_visibility["Accident Count"],
)

print(f"The correlation co-efficient is: {corr}")
    

##### The correlation co-efficient is very low when combining values of humidity and visibility and does not seem to correlate together with number of accidents