## Data Preparation And Cleaning

In [None]:
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Path of the accidents CSV, relative to the notebook's working directory
DATA_PATH = "US_Accidents_Dec20_updated.csv"

# Load the whole file into a DataFrame and show a preview of it
df = pd.read_csv(DATA_PATH)
df

In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# dtypes that are treated as numeric when selecting columns
numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
numeric = df.select_dtypes(include=numerics)
# number of numeric columns in the dataset
numeric.shape[1]

Percentage of missing values per column

In [None]:
# Number of missing values in each column, largest first
missing_counts = df.isna().sum().sort_values(ascending=False)
# Divide by the row count to get the share of missing values per column.
# Despite the name, these are fractions between 0 and 1, not percentages.
missing_percentages = missing_counts / len(df)
print(missing_percentages)

In [None]:
type(missing_percentages)  # a pandas Series: one value per column of df, indexed by column name

In [None]:
# Horizontal bar chart of only those columns that have at least one missing value
has_missing = missing_percentages != 0
missing_percentages[has_missing].plot.barh()

In [None]:
# Columns in which half or more of the values are null could also be dropped.

# Exploratory Analysis and Visualization

Columns we'll analyze:

City

Start Time

Start Lat, Start Lng

Temperature

Weather Condition

In [None]:
# list the column names to locate the columns analysed below
df.columns

In [None]:
# preview the City column (one entry per accident record)
df.City

In [None]:
# distinct values of the City column (a missing value, if present, counts as one entry)
cities = df.City.unique()
# number of unique cities
len(cities)

In [None]:
# number of accident records per city, most frequent city first
number_of_city = df["City"].value_counts()
number_of_city

In [None]:
# Top 10 cities by number of accidents.
# value_counts() already sorts in descending order, so the first 10 rows are the
# top 10 (the previous slice [:11] returned 11 rows).
number_of_city.head(10)

In [None]:
# There are far too many cities to chart them all, so plot only the 20 cities
# with the most accident records
number_of_city.head(20).plot.barh()

In [None]:
# New York is the most populous city in the US, yet it does not show up among the
# cities with the most accident records, so check whether it is in the dataset at all.
# NOTE: `"New York" in df.City` would test the Series *index* labels (row numbers),
# not the city names, so compare the values explicitly instead.
bool(df["City"].eq("New York").any())

In [None]:
"NY" in df.State
# by this we can say that new york is not present in the dataset

In [None]:
import seaborn as sns
# use seaborn's "darkgrid" style for the plots that follow
sns.set_style("darkgrid")

In [None]:
# Histogram (counts) of accidents per city.
# histplot replaces the deprecated distplot; with no KDE it is a plain count histogram.
sns.histplot(number_of_city)

In [None]:
# Split the cities at 1000 recorded accidents
high_accident_cities = number_of_city.loc[number_of_city.ge(1000)]
low_accident_cities = number_of_city.loc[number_of_city.lt(1000)]

In [None]:
# Distribution of accident counts among the high-accident cities.
# histplot replaces the deprecated distplot; kde=True with stat="density" and
# cut=3 mirrors distplot's default of a density histogram with a KDE curve.
sns.histplot(high_accident_cities, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Distribution of accident counts among the low-accident cities.
# histplot replaces the deprecated distplot; kde=True with stat="density" and
# cut=3 mirrors distplot's default of a density histogram with a KDE curve.
sns.histplot(low_accident_cities, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Share of cities with at least 1000 recorded accidents over the whole dataset
# (the original note put this at roughly 2% of the cities)
high_accident_cities.size / number_of_city.size

In [None]:
# Cities that appear in exactly one accident record
# (the original note counted 1167 such cities)
only_one_accident = number_of_city.loc[number_of_city.eq(1)]
only_one_accident

# Start Time

In [None]:
# list the column names again to find the time-related columns
df.columns

In [None]:
df["Start_Time"] # at this point the timestamps are still stored as strings

In [None]:
# Parse the timestamp strings into datetime values (overwrites the column in place)
df["Start_Time"] = pd.to_datetime(df["Start_Time"])
# first entry, now a Timestamp rather than a string
df.loc[0, "Start_Time"]

In [None]:
# Plotting every individual accident is impractical because they are spread over
# many different times, so look at the distribution over the hour of the day.
# histplot replaces the deprecated distplot: stat="density" normalises the bars
# (as norm_hist=True did) and discrete=True draws one unit-wide bar per hour.
sns.histplot(df["Start_Time"].dt.hour, discrete=True, stat="density");

From this we can say that most accidents occur between 8 and 9 am (probably people hurrying to the office) and between 4 and 6 pm (as people leave work).

In [None]:
# Distribution of accidents over the day of the week (0 = Monday ... 6 = Sunday).
# histplot replaces the deprecated distplot: stat="density" shows a density rather
# than a count, and discrete=True draws one unit-wide bar per day.
sns.histplot(df.Start_Time.dt.dayofweek, discrete=True, stat="density");
# Far fewer people travel on weekends, which may be the reason for the lower number of accidents

Is the distribution of accidents by hour the same on weekends as on weekdays?

In [None]:
# Start times of the accidents that happened on a Sunday (dayofweek == 6)
sundays_start = df.loc[df["Start_Time"].dt.dayofweek == 6, "Start_Time"]
# Hourly distribution for Sundays; histplot replaces the deprecated distplot,
# with stat="density" for normalised bars and discrete=True for one bar per hour.
sns.histplot(sundays_start.dt.hour, discrete=True, stat="density");

In [None]:
# Judging by the graph, on Sundays most accidents occur in the morning and in the evening.
# Let's check Monday as well to see whether the trend is the same or different.
# TODO: state the hour(s) at which the Sunday peak occurs

In [None]:
# Start times of the accidents that happened on a Monday (dayofweek == 0)
monday_start = df.loc[df["Start_Time"].dt.dayofweek == 0, "Start_Time"]
# Hourly distribution for Mondays; histplot replaces the deprecated distplot,
# with stat="density" for normalised bars and discrete=True for one bar per hour.
sns.histplot(monday_start.dt.hour, discrete=True, stat="density");

In [None]:
# The Monday graph is different: accidents are higher in the morning and in the afternoon.

In [None]:
# Distribution of accidents over the month of the year (1 = January ... 12 = December).
# histplot replaces the deprecated distplot: stat="density" normalises the bars
# and discrete=True draws one unit-wide bar per month.
sns.histplot(df.Start_Time.dt.month, discrete=True, stat="density");
# we can say that the maximum accidents are in the month of December.

In [None]:
# Look at a single year: start times of the accidents recorded in 2019
# (despite its name, df_2019 is a Series of timestamps, not a DataFrame)
df_2019 = df.loc[df["Start_Time"].dt.year == 2019, "Start_Time"]
# Monthly distribution; histplot replaces the deprecated distplot, with
# stat="density" for normalised bars and discrete=True for one bar per month.
sns.histplot(df_2019.dt.month, discrete=True, stat="density");
# in 2019 we can see there were more accidents in the months of October and December



In [None]:
# Same view for 2018: start times of the accidents recorded in that year
# (despite its name, df_2018 is a Series of timestamps, not a DataFrame)
df_2018 = df.loc[df["Start_Time"].dt.year == 2018, "Start_Time"]
# Monthly distribution; histplot replaces the deprecated distplot, with
# stat="density" for normalised bars and discrete=True for one bar per month.
sns.histplot(df_2018.dt.month, discrete=True, stat="density");
# here we can see the highest numbers of accidents were in the last 3 months

# Start Latitude & Longitude

In [None]:
# preview the latitude column
df["Start_Lat"]

In [None]:
# preview the longitude column
df["Start_Lng"]

In [None]:
# Plot every accident location: longitude on the x-axis, latitude on the y-axis.
# `s` sets the marker size; seaborn's `size` argument is a grouping variable that
# produces a size legend, not a way to set the point size.
# Longitude is the east-west position (horizontal axis here);
# latitude is the north-south position (vertical axis here).
sns.scatterplot(x=df["Start_Lng"], y=df["Start_Lat"], s=1);

In [None]:
import folium

In [None]:
# coordinates of the first accident record (row label 0)
lat = df.loc[0, "Start_Lat"]
lon = df.loc[0, "Start_Lng"]
lat, lon

In [None]:
# list of (latitude, longitude) pairs, one per accident record
lat_lon = list(zip(df["Start_Lat"], df["Start_Lng"]))

In [None]:
from folium.plugins import HeatMap

# Create an empty base map. It is named `accident_map` rather than `map` so that
# Python's built-in map() is not shadowed for the rest of the notebook.
accident_map = folium.Map()
# Overlay a heat map built from the (latitude, longitude) pairs
HeatMap(lat_lon).add_to(accident_map)
accident_map


1) Are there more accidents in warmer or colder areas?

2) Which 5 states have the highest number of accidents? How about per capita?

3) Does New York show up in the data? If yes, why is its count lower, given that it is the most populous city?

4) Among the top 100 cities by number of accidents, which states do they belong to most frequently?

5) What time of the day are accidents most frequent in? - ANSWERED

6) Which days of the week have the most accidents?

7) Which months have the most accidents?

8) What is the trend of accidents year over year (decreasing/increasing?)

9) When is the number of accidents per unit of traffic the highest?