# X20181663- DAP PROJECT

### Installing and importing the libraries

In [None]:
!pip install requests
!pip install pymongo
!pip install pypyodbc
!pip install folium
!pip install sodapy

import numpy as np
import requests
import json
import pandas as pd
from sodapy import Socrata
import matplotlib.pyplot as plt 
import os
from pymongo import MongoClient
import psycopg2
import io
import pandas.io.sql as sqlio
from pandas import DataFrame

### Getting the data from source and storing the dataset in MongoDB cloud

In [None]:
# Identifier of the Socrata dataset to download and the maximum number of rows to request.
dataset_id = "h9gi-nx95"
rows_limit = 100_000

# MongoDB database name, plus the credentials used by the MongoDB and
# PostgreSQL connections further down.
db_name = "MONGO_DB_DAP_PROJ"
username = "dap"
password = "dap"

### Function for connecting to MongoDB and inserting the dataset

In [None]:
 def db_connect(dataset_id, data_limit, db_name):
     """Download rows from the Socrata API and store them in MongoDB.

     Rows of ``dataset_id`` whose ``crash_date`` is after 2019-12-31 are
     requested from ``data.cityofnewyork.us`` (at most ``data_limit`` rows).
     The target collection is emptied, refilled with the downloaded rows, and
     its first five documents are printed.

     Args:
         dataset_id: Socrata dataset identifier.
         data_limit: Maximum number of rows to request.
         db_name: Name of the MongoDB database to write to.

     Returns:
         The populated collection, or ``None`` when the download or the load
         failed (the error is printed). The collection's client is left open
         on success because the caller queries the collection afterwards.

     Note:
         Reads the module-level ``username`` and ``password``.
     """
     # Local import: only needed to escape the credentials below.
     from urllib.parse import quote_plus

     api_client = None
     try:
         api_client = Socrata("data.cityofnewyork.us", None)
         raw_data = api_client.get(dataset_id, where="crash_date >'2019-12-31'", limit=data_limit)
         print("Data fetched")
     except Exception as api_error:
         print("API connection Error", api_error)
         return None
     finally:
         # The API session is not needed once the rows are in memory.
         if api_client is not None:
             api_client.close()

     mongo_client = None
     try:
         # Percent-escape the credentials so reserved characters cannot break the URI.
         mongo_client = MongoClient(
             "mongodb://%s:%s@127.0.0.1" % (quote_plus(username), quote_plus(password))
         )
         mongo_db = mongo_client[db_name]
         # NOTE(review): the original `mongo_db.db_name` addresses a collection
         # literally named "db_name", not one named after the argument. The
         # name is kept so existing data stays where it is - confirm intent.
         collection = mongo_db["db_name"]
         print("Dataset and collection created")

         # Clear existing records from the collection.
         collection.delete_many({})

         # insert_many rejects an empty document list, so skip it when nothing came back.
         if raw_data:
             collection.insert_many(raw_data)
             print("JSON Data injected in the MongoDB ")
         else:
             print("No rows returned by the API; nothing inserted")

         # Show a small sample of what is now stored.
         for data in collection.find().limit(5):
             print(data)
         return collection
     except Exception as e:
         print(e)
         # Only close on failure: a returned collection needs a live client.
         if mongo_client is not None:
             mongo_client.close()
         return None

In [None]:
# Download the dataset, load it into MongoDB and keep the returned collection.
raw_collection = db_connect(dataset_id, data_limit=rows_limit, db_name=db_name)

### Extracting data from the collection into a dataframe

In [None]:
# Read every document of the collection into memory and build a DataFrame from them.
df_list = list(raw_collection.find())
raw_dataframe = pd.DataFrame(data=df_list)

# Print the column summary, then show the frame itself.
raw_dataframe.info()
raw_dataframe

### Data Cleaning (Removing Null values)

In [None]:

# Percentage of null values per column, as a one-column table.
null_percentage = raw_dataframe.isnull().sum() * 100 / len(raw_dataframe)
df_percent_missing = null_percentage.to_frame(name='Null_Percentage')
df_percent_missing

In [None]:
# Keep only the columns whose non-null count reaches 40% of the rows.
# `how` is not passed here: pandas rejects `how` and `thresh` given together,
# and `thresh` is documented as an integer.
min_non_null = int(np.ceil(len(raw_dataframe) * .40))
raw_dataframe.dropna(axis=1, thresh=min_non_null, inplace=True)

# For the columns with fewer than 10% nulls, drop the rows in which they are null.
cols_to_delete = raw_dataframe.columns[raw_dataframe.isnull().sum()/len(raw_dataframe) < .10]
raw_dataframe.dropna(axis=0, how="any", subset=cols_to_delete.values, inplace=True)

In [None]:
# Replace the remaining nulls with each column's mode, as all the columns are categorical in nature.
columns_to_replace = raw_dataframe.columns[raw_dataframe.isnull().any()]

for i in columns_to_replace:
    column_modes = raw_dataframe[i].mode()
    if column_modes.empty:
        # A column with no non-null value has no mode; leave it untouched.
        continue
    # Assign the result back: an in-place fillna on `raw_dataframe[i]` works on
    # a temporary selection and is not guaranteed to update the frame.
    raw_dataframe[i] = raw_dataframe[i].fillna(column_modes.iloc[0])

### Data Transformation (Renaming and removing columns)

In [None]:
# Remove the combined 'location' column. errors='ignore' keeps the cell
# runnable when the column has already been removed by the null-threshold step.
raw_dataframe.drop(columns=['location'], errors='ignore', inplace=True)

# No information is lost: LOCATION is rebuilt from latitude and longitude.
# The two values are joined with ", " so they stay distinguishable; they are
# still text here (they are converted to float in a later cell), so a bare
# `+` would fuse them into one string.
raw_dataframe["LOCATION"] = (
    raw_dataframe["latitude"].astype(str) + ", " + raw_dataframe["longitude"].astype(str)
)

In [None]:
# Shorter upper-case names for the source fields, for readability in the rest of the notebook.
column_aliases = {
    '_id': 'MONGODB_ID',
    'on_street_name': 'ON_STREET',
    'off_street_name': 'OFF_STREET',
    'number_of_persons_injured': 'NUM_PER_INJURED',
    'number_of_persons_killed': 'NUM_PER_KILLED',
    'number_of_pedestrians_injured': 'NUM_PED_INJURED',
    'number_of_pedestrians_killed': 'NUM_PED_KILLED',
    'number_of_cyclist_injured': 'NUM_CYCL_INJURED',
    'number_of_cyclist_killed': 'NUM_CYCL_KILLED',
    'number_of_motorist_injured': 'NUM_MOTOR_INJURED',
    'number_of_motorist_killed': 'NUM_MOTOR_KILLED',
    'contributing_factor_vehicle_1': 'VEH_FACTOR_1',
    'contributing_factor_vehicle_2': 'VEH_FACTOR_2',
    'vehicle_type_code1': 'VEH_TYPE_1',
    'vehicle_type_code2': 'VEH_TYPE_2',
}
raw_dataframe.rename(columns=column_aliases, inplace=True)

In [None]:
# Join the date and time fields into one DATE column and parse it as a datetime.
# NOTE(review): parsing relies on the joined text being understood by
# pd.to_datetime - confirm against the actual crash_date / crash_time format.
raw_dataframe['DATE'] = raw_dataframe['crash_date'].add(' ').add(raw_dataframe['crash_time'])
raw_dataframe['DATE'] = pd.to_datetime(raw_dataframe['DATE'])

# Calendar year and month of each record, taken from the parsed DATE.
date_accessor = raw_dataframe['DATE'].dt
raw_dataframe['year'] = date_accessor.year
raw_dataframe['month'] = date_accessor.month

In [None]:
# Target dtype for each group of columns, applied in this order:
# numeric counts and zip code to int, descriptive fields to str, and the
# coordinates from text to float.
# NUM_MOTOR_KILLED is included so that all eight casualty counters are treated alike.
dtype_groups = {
    int: [
        'zip_code',
        'NUM_PER_INJURED',
        'NUM_PER_KILLED',
        'NUM_PED_INJURED',
        'NUM_PED_KILLED',
        'NUM_CYCL_INJURED',
        'NUM_CYCL_KILLED',
        'NUM_MOTOR_INJURED',
        'NUM_MOTOR_KILLED',
    ],
    str: ['VEH_TYPE_1', 'VEH_TYPE_2', 'borough', 'ON_STREET', 'OFF_STREET'],
    float: ['latitude', 'longitude'],
}

for target_dtype, column_names in dtype_groups.items():
    for column_name in column_names:
        # errors='ignore' leaves the column as it is when the conversion fails.
        raw_dataframe[column_name] = raw_dataframe[column_name].astype(target_dtype, errors='ignore')



In [None]:
# Print the dtype of every column followed by a separator line, then show the frame.
print(raw_dataframe.dtypes, "\n*******************************", sep="\n")
raw_dataframe

### Extracting cleaned and transformed file

In [None]:
# Write the cleaned frame to disk; the CSV still contains the MONGODB_ID column.
raw_dataframe.to_csv(path_or_buf='C:/Users/DEV/Desktop/nyc_car_colision_data.csv')

# The MongoDB identifier is removed before the frame is used further.
raw_dataframe.drop(columns=['MONGODB_ID'], inplace=True)

### Injecting the data in PostgreSQL for visualization

In [None]:
def postgreConnection():
    """Open a psycopg2 connection to the ``collision`` database on 127.0.0.1:5432.

    Authenticates with the module-level ``username`` and ``password``.

    Returns:
        The newly opened connection; closing it is left to the caller.
    """
    connection_settings = {
        "user": username,
        "password": password,
        "host": "127.0.0.1",
        "port": "5432",
        "database": "collision",
    }
    return psycopg2.connect(**connection_settings)
    

In [None]:
# Create the "collision" database.
# NOTE(review): postgreConnection() itself connects to the "collision"
# database, so this statement can only be reached when that database already
# exists - consider connecting to a maintenance database for this step.

# Bound up front so the cleanup below (and later cells that reference these
# names) never hit an undefined name when the connection attempt fails.
dbConnection = None
dbCursor = None
try:
    dbConnection = postgreConnection()
    dbConnection.set_isolation_level(0)  # AUTOCOMMIT: CREATE DATABASE cannot run inside a transaction
    dbCursor = dbConnection.cursor()
    dbCursor.execute('CREATE DATABASE collision')
    dbCursor.close()
except Exception as dbError:  # psycopg2.Error is a subclass of Exception
    print("Error while connecting to PostgreSQL", dbError)
finally:
    # Check the object, not `dbConnection in locals()`: that expression looks
    # the connection object up among the variable *names* and is never true.
    if dbConnection is not None:
        dbConnection.close()

In [None]:
# Drop the collision table if a previous run left it behind.
# This cell opens its own connection instead of reusing a cursor created by
# an earlier cell, so it does not depend on that cursor still being usable.
dbConnection = None
dbCursor = None
try:
    dbConnection = postgreConnection()
    dbConnection.set_isolation_level(0)  # AUTOCOMMIT
    # The name is double-quoted to match the case-sensitive table name that
    # the later queries use (public."NYC_VEHICLE_COLLISION_DATA").
    sql = '''DROP TABLE IF EXISTS "NYC_VEHICLE_COLLISION_DATA"'''
    with dbConnection.cursor() as dbCursor:
        dbCursor.execute(sql)
except Exception as dbError:  # psycopg2.Error is a subclass of Exception
    print("Error while connecting to PostgreSQL", dbError)
finally:
    # `dbConnection in locals()` never matched (it compares the object with
    # variable names), so the connection was never closed.
    if dbConnection is not None:
        dbConnection.close()

In [None]:
from sqlalchemy import create_engine
# Connection URL handed to SQLAlchemy's create_engine for the local "collision" database.
connection = "postgresql://dap:dap@localhost:5432/collision"

In [None]:
#function def for injecting data to postgreSQL
def data_insert_postgreSQL(tbName, dataframe, con, sql_cursor):
    """Write ``dataframe`` to the PostgreSQL table ``tbName``, replacing it if present.

    Args:
        tbName: Name of the destination table.
        dataframe: DataFrame whose contents are written.
        con: SQLAlchemy connection URL used to create the engine.
        sql_cursor: Unused; kept so existing calls keep working.

    Returns:
        None. Errors are printed rather than raised.
    """
    engine = None
    try:
        engine = create_engine(con)
        dataframe.to_sql(tbName, con=engine, if_exists='replace')
    except Exception as dbError:  # psycopg2.Error is a subclass of Exception
        print("Error while connecting to PostgreSQL", dbError)
    finally:
        # Release the engine this function created. The previous cleanup
        # referred to the unrelated module-level `dbConnection` instead.
        if engine is not None:
            engine.dispose()

In [None]:
# Store the cleaned collision data in PostgreSQL.
data_insert_postgreSQL(
    tbName='NYC_VEHICLE_COLLISION_DATA',
    dataframe=raw_dataframe,
    con=connection,
    sql_cursor=dbCursor,
)

### Fetching the injected data from PostgreSQL database

In [None]:

def fetch_data(query):
    """Run ``query`` against PostgreSQL and return the result as a DataFrame.

    Args:
        query: SQL text to execute.

    Returns:
        The query result as a DataFrame, or ``None`` when connecting or
        querying failed (the error is printed).
    """
    # Bound up front so the cleanup is safe even when the connection attempt fails.
    dbConnection = None
    try:
        dbConnection = postgreConnection()
        return sqlio.read_sql_query(query, dbConnection)
    except Exception as dbError:  # psycopg2.Error is a subclass of Exception
        print("Error while connecting to PostgreSQL", dbError)
        return None
    finally:
        # `dbConnection in locals()` compared the object with variable names,
        # so it never closed the connection and raised when the name was unbound.
        if dbConnection is not None:
            dbConnection.close()

In [None]:
# Read the whole collision table back from PostgreSQL.
sql_query = 'SELECT * FROM public."NYC_VEHICLE_COLLISION_DATA"'
cleaned_df = fetch_data(query=sql_query)

In [None]:
# Print the column summary of the fetched frame, then preview its first five rows.
cleaned_df.info()
cleaned_df.head(n=5)

### Visualization with fetch data

In [None]:
!pip install ggplot
!pip install plotly
#import ggplot
import matplotlib.pyplot as plt
import seaborn as sns

#plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

#import folium
import datetime
import calendar
import folium

### Plot for accidents happening in different boroughs in New York City

In [None]:
#Plot 1
# Bar chart of the number of records per borough.
plt.figure(figsize=(10, 5))
plt.title('Collision in different Boroughs', fontsize=30)
plt.xlabel('Boroughs', fontsize=20)
plt.ylabel('Accident Count', fontsize=20)

borough_counts = cleaned_df.groupby('borough').size()
sns.barplot(x=borough_counts.index, y=borough_counts.values, palette='viridis')

### Plot for different reasons behind the accidents

In [None]:
#Plot 2
# Number of records per VEH_FACTOR_1 value, smallest count first.
FACTOR_1 = cleaned_df.groupby('VEH_FACTOR_1').size().sort_values()

# Horizontal bars: reasons on the y axis, counts on the x axis.
plt.figure(figsize=(10, 10))
plt.title('Reasons for Accidents', fontsize=30)
plt.xlabel('Accident Count', fontsize=20)
plt.ylabel('Reasons', fontsize=20)
sns.barplot(x=FACTOR_1.values, y=FACTOR_1.index, palette='flare')

In [None]:
#Plot 3: Bar plot displaying the count of injured persons per accident type (pedestrian, cyclist, motorist)

# Explicit column -> label mapping, so each total is labelled by what it
# measures rather than by the order in which the columns happen to appear.
injured_labels = {
    'NUM_PED_INJURED': 'Pedestrian',
    'NUM_CYCL_INJURED': 'Cyclist',
    'NUM_MOTOR_INJURED': 'Motorist',
}
df_injured = cleaned_df[list(injured_labels)].sum()
df_injured.index = list(injured_labels.values())

plt.suptitle('PLOT FOR COUNT OF PERSONS INJURED PER ACCIDENT TYPE', fontsize=25, x=0.5,y=1.02)
# x and y are passed by keyword, like every other barplot call in this notebook.
sns.barplot(x=df_injured.index, y=df_injured.values, palette='viridis')


### Plot for displaying the exact location of accidents in the city

In [None]:
#Plot 4
# Markers for the first 100 records that have both coordinates.
map_df = cleaned_df.dropna(subset=['latitude', 'longitude'])

# Named `collision_map` so the builtin `map` is not shadowed for the rest of the session.
collision_map = folium.Map(location=[40.767937, -73.982155], zoom_start=10)

first_points = map_df[['latitude', 'longitude']].head(100)
for latitude, longitude in first_points.itertuples(index=False):
    folium.Marker([latitude, longitude]).add_to(collision_map)
display(collision_map)

In [None]:
# Top 10 Vehicles in collisions
# Columns whose name starts with "VEH_TYPE_1".
v_cols = [column for column in cleaned_df.columns if column.startswith("VEH_TYPE_1")]
v_cols

# Flatten the selected columns into one series and count each value.
vehicles = cleaned_df[v_cols]
vehicles_1d = vehicles.stack()
vehicles_counts = vehicles_1d.value_counts()

# value_counts sorts by descending frequency, so the first ten are the most frequent.
top10_vehicles = vehicles_counts.iloc[:10]
print("Top 10 Vehicles in Accidents")
top10_vehicles

In [None]:
# Load the weather data from the local CSV file.
weatherData = pd.read_csv(filepath_or_buffer="C:/Users/DEV/Desktop/weather.csv")

In [None]:
# Store the weather data in PostgreSQL.
data_insert_postgreSQL(
    tbName='WEATHER_DATA',
    dataframe=weatherData,
    con=connection,
    sql_cursor=dbCursor,
)

In [None]:
# Collision rows whose "year" column is 2020.
sql_query = 'SELECT * FROM public."NYC_VEHICLE_COLLISION_DATA" WHERE "year" = 2020;'
col21Data = fetch_data(query=sql_query)

In [None]:
# Weather rows whose "year" column is 2020.
sql_query = 'SELECT * FROM public."WEATHER_DATA" WHERE "year" = 2020;'
wthr21Data = fetch_data(query=sql_query)

In [None]:
# Inner-join the weather rows with the collision rows on their shared 'month' column.
# NOTE(review): if the weather table holds more than one row per month, every
# collision row is paired with each of them, which multiplies the row count -
# confirm the weather data is one row per month.
df_cd = wthr21Data.merge(col21Data, how='inner', on='month')

In [None]:
#Plot 1: Month Wise Distribution
# Number of rows of the merged frame per month, smallest count first.
# NOTE(review): these are rows of the weather/collision join, which equal
# accident counts only if the weather data has one row per month - confirm.
FACTOR_1 = df_cd.groupby('month').size().sort_values(ascending=True)
plt.figure(figsize=(5,5))
plt.title('Accidents distribution month wise', fontsize=30)
# Months are on the x axis and counts on the y axis, so the labels follow that.
plt.xlabel('Month', fontsize=20)
plt.ylabel('Accident Count', fontsize=20)
sns.barplot(x=FACTOR_1.index, y=FACTOR_1.values, palette='cubehelix')

In [None]:
#Plot 2: Month Wise Temp
# Mean of the 'temp' column for each month of the merged frame.
df_cd_1 = df_cd.loc[:, ['month', 'temp']]
df_cd_2 = df_cd_1.groupby('month', as_index=False).mean()

plt.figure(figsize=(5, 5))
plt.title('Temperature across different months in 2020 in New York', fontsize=25)
sns.barplot(x=df_cd_2['month'], y=df_cd_2['temp'], palette='rocket')

In [None]:
#Plot 3: Month Wise Visibility
# Mean of the 'visibility' column for each month of the merged frame.
df_cd_3 = df_cd.loc[:, ['month', 'visibility']]
df_cd_4 = df_cd_3.groupby('month', as_index=False).mean()

plt.figure(figsize=(5, 5))
plt.title('Visibility across different months in 2020 in New York', fontsize=25)
sns.barplot(x=df_cd_4['month'], y=df_cd_4['visibility'], palette='crest')