In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import math as m
import matplotlib.pyplot as plt

# <h1>Data Preparation & Cleaning</h1>

### 1. load the file using pandas
### 2. take overview of the columns and data
### 3. fix any missing and wrong entries

In [2]:
# read the csv file
# The path is relative to the notebook's working directory; the result is
# bound to `df`, the raw frame that the later cells read from.

df = pd.read_csv('./US_Accidents.csv')

FileNotFoundError: [Errno 2] No such file or directory: './US_Accidents.csv'

In [None]:
# lets look at the head of the data

df.head()

In [None]:
# dataframe columns

df.columns

In [None]:
# look at the data types of the columns
df.dtypes

In [None]:
# detailed info of the dataset

df.info()

In [None]:
# view the statistics of the data

df.describe()

In [None]:
# identify no of numeric columns

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf = df.select_dtypes(include=numerics)
len(newdf.columns)

## Data Cleaning

In [None]:
# count missing values per column/ percentage of missing values

missing_percentages = df.isna().sum().sort_values(ascending=False)/len(df)

In [None]:
# plot bar chart of missing percentages greater than zeo

missing_percentages[missing_percentages>0].plot(kind='bar', color='orange', figsize=(10,5))

In [None]:
# drop column which has more than 50 percent data is missing

df2 = df.drop('Number',1)

<h1><i>Exploratory Analysis and Visualization</i></h1>

## Analyze these 5 columns

### 1. City
### 2. Start time
### 3. start lat, start lng
### 4. Temperature
### 5. Weather Condition

In [None]:
df2.columns

## City

In [None]:
df2[['City', 'Start_Time','Start_Lat','Start_Lng','Temperature(F)','Weather_Condition']]

In [None]:
# unique cities

df2.City.unique()

In [None]:
# top 5 cities with most number of accidents

df2['City'].value_counts()[:5]

In [None]:
df2[['City','Description']][df2.City == 'New York']

In [None]:
# bar chart of 10 ten cities which has recorded most no of accidents

df2['City'].value_counts()[:10].plot(kind='bar',color='g')

In [None]:
# distribution of accidents recordred in cities in seaborn distribution plot

sns.distplot(df2.City.value_counts()

In [None]:
cities_by_accident = df2.City.value_counts()

In [None]:
cities_by_accident

In [None]:
# separate cities by accident to Highest Accident Cities and Lower Accident Cities

highest_accident_cities = df2.City.value_counts()[df2.City.value_counts()>1000]
Lower_accident_cities = df2.City.value_counts()[df2.City.value_counts()< 1000]

In [None]:
len(Lower_accident_cities)/len(df2.City.value_counts())

In [None]:
len(highest_accident_cities)/len(df2.City.value_counts())

In [None]:
# distribution of countries with less than 1000 accidents recorded
sns.distplot(Lower_accident_cities)

In [None]:
sns.histplot(highest_accident_cities, log_scale=True)

In [None]:
# countries which recorded which only recorded 1 accident

df2.City.value_counts()[df2.City.value_counts()==1]

<i><b>There is something wrong with this: about 1110 cities recorded just 1 accident. How?</b></i>

## Summary & Insights
#### 1. There are 1110 cities which have recorded just 1 accident (why?).
#### 2. Only 5 % of cities have recorded more than 1000 accidents.

### Start Time

In [None]:
df2.Start_Time = pd.to_datetime(df2.Start_Time)

#### histogram of accidents recorded by every hour

In [None]:
sns.histplot(df.Start_Time.dt.hour, bins=24)

##### we can see most accidents happened during 6AM-9AM in the morning (people travelling to the office)
##### and 15-17, i.e. 3PM-5PM, in the evening (returning from the office)

### Histogram of accidents recorded day-wise (0-monday, 6-Sunday)

In [None]:
sns.histplot(df2['Start_Time'].dt.dayofweek, bins=7, color='b')

<h5> We can clearly see that most accidents happened from Monday to Friday <b>(working days)</b> <br> 
and the fewest accidents happened on Saturday and Sunday <b>(official holidays)</b>. </h5>


### Is the distribution of accidents by hour the same on weekends compared to working days?

In [None]:
# accidents happened by hour on sunday

accidents_on_sunday = df2.Start_Time.dt.hour[df2.Start_Time.dt.dayofweek == 6]

In [None]:
sns.histplot(accidents_on_sunday)

In [None]:
# accidents happened by hour on monday

accidents_on_monday = df2.Start_Time.dt.hour[df2.Start_Time.dt.dayofweek == 0]

In [None]:
sns.histplot(accidents_on_monday, kde=True)

In [None]:
# accidents happened by months
current_palette = sns.color_palette()
sns.histplot(df2['Start_Time'].dt.month, bins=12, color='red')

## Summary & Insights

#### fewer accidents happened on weekends than on working days
#### the weekend distribution of accidents by hour is different from working days; it is more like a normal distribution, i.e. a bell curve

### Weather Condition

In [None]:
# different weather conditions 

len(df2.Weather_Condition.unique())

In [None]:
# total accidents happened in different weather conditions

df2.Weather_Condition.value_counts()[:20]

In [None]:
sns.distplot(df2.Weather_Condition.value_counts())

### Temperature

In [None]:
df2['Temperature(F)']

In [None]:
df2['Temperature(F)'].describe().apply(lambda x: format(x, 'f'))


In [None]:
# error because Temperature column contain NaN values let's check

In [None]:
df2['Temperature(F)'].isna().sum()

In [None]:
# fill the Nan values with 0
df2['Temperature(F)']= df2['Temperature(F)'].fillna(0)

In [None]:
# parse float to int

df2['Temperature(F)']= df2['Temperature(F)'].astype('int64')

### Note

#### so the maximum Temperature ever recorded in fahrenheit is 134 F and lowest temperature ever recorded is -128.6 F
#### lets check how many temperature counts are greater than 134 F

In [None]:
df2['Temperature(F)'][df2['Temperature(F)'] < 134].count()

In [None]:
# there are just 8 entries which are greater than 134. so can drop those rows

In [None]:
df2 = df2[df2['Temperature(F)'] < 135]

In [None]:
# wrong temperatures i.e. greater than 134 has been removed

df2['Temperature(F)'].sort_values()

In [None]:
# lets draw histrogram to see the disribution

sns.histplot(df2['Temperature(F)'], kde=True)

##### it looks like a normal distribution

### Start Latitude & Start Longitude

In [None]:
df2['Start_Lng']

In [None]:
# lets view some stats of these columns

#df2[['Start_Lng', 'Start_Lat']].describe()
df2[['Start_Lng', 'Start_Lat']].describe().apply(lambda s: s.apply('{0:.5f}'.format))

In [None]:
# histogram of start_lat, start_lng

sns.histplot(df2[['Start_Lng', 'Start_Lat']])

In [None]:
sns.scatterplot(y=df2.Start_Lat, x=df2.Start_Lng)

In [None]:
# NOTE(review): this import sits mid-notebook, while the other libraries are
# imported in the first cell.
import folium
# Create a default folium map and display it.
# NOTE(review): the name `map` shadows Python's built-in map() from here on.
map = folium.Map()
map

In [None]:
# put a point on a map
# (displays the longitude/latitude of the row at position 1)

df2[['Start_Lng', 'Start_Lat']].iloc[1]

In [None]:
# Build a second, zoomed-in map centred on the hard-coded coordinates; it is
# assigned to `location` and not displayed by this cell.
# NOTE(review): recent folium releases may no longer bundle the Stamen tile
# sets — confirm this tiles name still resolves in the installed version.
location = folium.Map(location=[39.86542,-84.06280], zoom_start=18, control_scale=True, tiles='Stamen Water Color')

### HeatMap

In [None]:
from folium import plugins
from folium.plugins import HeatMap

In [None]:
# 01 % random sample of dataframe

df_sample2 = df2.sample(int(0.001*len(df2)))

In [None]:
HeatMap(list(zip(df_sample2.Start_Lat,df_sample2.Start_Lng)), ).add_to(map)
map

In [None]:
list(zip(df2.Start_Lat.iloc[:20], df2.Start_Lng.iloc[:20]))

<h1>Ask & Answer Questions</h1>

### Are there more accidents in warmer or in cold areas?

### Which city has the highest number of accidents?

In [None]:
df2.City.value_counts()

In [None]:
# Miami has recorded most no of accidents

### What time of day accidents are more frequent in?

In [None]:
accident_hours = df.Start_Time.dt.hour.value_counts()
accident_hours

#### 5PM is the time of the day when most of the accidents happened.

In [None]:
# 17 i.e. 5 PM is the time when most of the accidents heppened, also verified by this histogram

sns.histplot(df.Start_Time.dt.hour, bins=24)

### which days of week has most accidents?

In [None]:
accident_by_day = df2['Start_Time'].dt.dayofweek.value_counts()
accident_by_day

##### 4 i.e. Friday is the day when the most accidents happened
##### 6 i.e. Sunday is the day when the fewest accidents happened (maybe because of the holiday)