In [44]:
# Mount Google Drive inside the Colab runtime (this prompts for Google account
# authorization) and derive the project folder that every data path hangs from.
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)

root_dir = "/content/gdrive/My Drive/"
base_dir = f"{root_dir}DS4A/Coursera_Capstone/"

Mounted at /content/gdrive


In [2]:
#Importing libraries

import numpy as np # library to handle data in a vectorized manner
import os
#!pip3 install geopandas
import pandas as pd # library for data analsysis
import random
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
import geopandas as gp

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
from folium import plugins
import shapely
from shapely.geometry import shape,mapping, Point, Polygon, MultiPolygon

from tqdm import tqdm

print('Libraries imported.')

Libraries imported.


# CHAPTER B. ANALYZING TRAFFIC ACCIDENTS IN MEDELLÍN

In [4]:
# Load the geocoded accidents file and keep only the accident class and its coordinates.
accidents_csv_path = base_dir + '/data/final1_coord_accidents.csv'
accident_columns = ['class', 'longitude', 'latitude']

df_accidents = pd.read_csv(accidents_csv_path, sep='|')
df_accidents = df_accidents[accident_columns]

print(df_accidents.shape)
df_accidents.head()

(226564, 3)


Unnamed: 0,class,longitude,latitude
0,crash,-75.575177,6.256572
1,crash,-75.566388,6.246808
2,other,-75.574233,6.206502
3,other,-75.619871,6.249727
4,other,-75.592195,6.282603


In [5]:
# Number of missing values in each column of the full accidents table
df_accidents.isnull().sum()

class        0
longitude    0
latitude     0
dtype: int64

In [6]:
# Read a random ~2% sample of the accidents file; the folium map below is
# drawn from this sample instead of the full table.
p = 0.02  # probability of keeping any given data row (2%)

# Dedicated, seeded generator: the sample is the same on every run and the
# state of the global `random` module is left untouched.
sample_rng = random.Random(42)

# Row 0 is the header and is always kept; every other row is skipped
# whenever the random draw from [0, 1) is greater than p.
df_accidents_10 = pd.read_csv(
    base_dir + '/data/final1_coord_accidents.csv',
    sep='|',
    skiprows=lambda i: i > 0 and sample_rng.random() > p,
)[['class', 'longitude', 'latitude']]

print(df_accidents_10.shape)
df_accidents_10.head()

(4428, 3)


Unnamed: 0,class,longitude,latitude
0,crash,-75.571757,6.265115
1,crash,-75.575958,6.231444
2,other,-75.577247,6.201682
3,overturning,-75.61782,6.272257
4,crash,-75.563479,6.20764


In [7]:
# Number of missing values in each column of the sampled accidents table
df_accidents_10.isnull().sum()

class        0
longitude    0
latitude     0
dtype: int64

## 1. Displaying the accidents in Medellín

In [8]:
# Coordinates (latitude, longitude) used as the centre of the Medellín map
med_latitude, med_longitude = 6.245768, -75.577228

In [9]:
# Base map centred on Medellín
medellin_map = folium.Map(location=[med_latitude, med_longitude], zoom_start=12)

# Marker-cluster layer that will hold one marker per sampled accident
accidents = plugins.MarkerCluster()
accidents.add_to(medellin_map)

# Add every sampled accident to the cluster, using its class as the popup text
marker_rows = df_accidents_10[['latitude', 'longitude', 'class']].itertuples(index=False, name=None)
for lat, lng, label in marker_rows:
    marker = folium.Marker(location=[lat, lng], icon=None, popup=label)
    marker.add_to(accidents)

# Last expression of the cell: render the map
medellin_map

## 2. Grouping the accidents in Medellín by Neighborhood

In [None]:
# Load every neighborhood together with its geometry (shape), keeping only the
# name and geometry columns and exposing the name as "neighborhood".
neighborhoods_path = base_dir + '/data/Barrio_Vereda.geojson'
geometry_neigh_df = gp.GeoDataFrame.from_file(neighborhoods_path)[['NOMBRE', 'geometry']]
geometry_neigh_df = geometry_neigh_df.rename(columns={"NOMBRE": "neighborhood"})

print(geometry_neigh_df.shape)
geometry_neigh_df.head()


(332, 2)


Unnamed: 0,neighborhood,geometry
0,La Aguacatala,"POLYGON ((-75.57623 6.19462, -75.57615 6.19473..."
1,El Pinal,"POLYGON ((-75.54161 6.24532, -75.54164 6.24533..."
2,Fuente Clara,"POLYGON ((-75.60107 6.27832, -75.60107 6.27823..."
3,Santo Domingo Savio No.2,"POLYGON ((-75.54062 6.30237, -75.54067 6.30238..."
4,Las Granjas,"POLYGON ((-75.54372 6.28266, -75.54371 6.28267..."


Now, let's associate each traffic accident with its respective neighborhood.

In [None]:
# Extract coordinates from Polygons / MultiPolygons of NTAs
def extract_poly_coords(geom):
    """Collect the exterior and interior ring coordinates of a geometry.

    Parameters
    ----------
    geom : object exposing ``geom_type``
        A ``'Polygon'`` (read through ``exterior`` and ``interiors``) or a
        ``'MultiPolygon'`` (read through ``geoms``).

    Returns
    -------
    dict
        ``{'exterior_coords': [...], 'interior_coords': [...]}``. For a
        MultiPolygon the coordinates of all parts are concatenated in part
        order, so ring boundaries are not preserved in the flat lists.

    Raises
    ------
    ValueError
        If ``geom.geom_type`` is neither ``'Polygon'`` nor ``'MultiPolygon'``.
    """
    # `geom_type` and `.geoms` are used instead of `.type` and direct
    # iteration: the latter two are deprecated / unsupported in Shapely 2.x.
    geom_type = geom.geom_type
    if geom_type == 'Polygon':
        exterior_coords = list(geom.exterior.coords)
        interior_coords = []
        for interior in geom.interiors:
            interior_coords += list(interior.coords)
    elif geom_type == 'MultiPolygon':
        exterior_coords = []
        interior_coords = []
        for part in geom.geoms:
            epc = extract_poly_coords(part)  # Recursive call on each member polygon
            exterior_coords += epc['exterior_coords']
            interior_coords += epc['interior_coords']
    else:
        raise ValueError('Unhandled geometry type: ' + repr(geom_type))
    return {'exterior_coords': exterior_coords,
            'interior_coords': interior_coords}


In [None]:
# Coordinate dictionaries (exterior / interior rings), one per neighborhood,
# in the same order as the rows of geometry_neigh_df
neigh_geometries = geometry_neigh_df['geometry']
list_of_polygons = [
    extract_poly_coords(neigh_geometries[idx])
    for idx in range(len(neigh_geometries))
]

In [None]:
# Function that finds the neighborhood (NTA) associated with a coordinate
def isinnta(xx, yy):
    """Return the name of the first neighborhood whose geometry contains a point.

    Parameters
    ----------
    xx, yy : float
        Coordinates passed to ``shapely.geometry.Point(xx, yy)``; they must be
        in the same coordinate order as the geometries in ``geometry_neigh_df``.

    Returns
    -------
    The value of the ``neighborhood`` column for the first row of
    ``geometry_neigh_df`` whose geometry contains the point, or ``None`` when
    no geometry contains it.
    """
    point = shapely.geometry.Point(xx, yy)
    # Test against the stored geometry itself rather than a polygon rebuilt
    # from a flattened list of exterior coordinates: holes and multi-part
    # shapes are honoured, and no polygon is reconstructed for every point.
    for name, geom in zip(geometry_neigh_df['neighborhood'], geometry_neigh_df['geometry']):
        if geom.contains(point):
            return name
    return None

In [None]:
# Look up the neighborhood of every accident from its (longitude, latitude) pair
df_accidents['neighborhood'] = [
    isinnta(lon, lat)
    for lon, lat in zip(df_accidents['longitude'], df_accidents['latitude'])
]

In [None]:
# Preview the first rows, now including the neighborhood column
df_accidents.head(n=5)

Unnamed: 0,class,longitude,latitude,neighborhood
0,crash,-75.575177,6.256572,Sin Nombre
1,crash,-75.566388,6.246808,Sin Nombre
2,other,-75.574233,6.206502,Patio Bonito
3,other,-75.619871,6.249727,Sin Nombre
4,other,-75.592195,6.282603,Aures No.1


Now, let's count, for each neighborhood, how many accidents of each class have occurred.

In [None]:
# One row per (neighborhood, class) pair with the number of accidents recorded for it
accidents_nta_num = (
    df_accidents
    .groupby(['neighborhood', 'class'])['longitude']
    .count()
    .reset_index(name='number_accidents')
)
accidents_nta_num.head()

Unnamed: 0,neighborhood,class,number_accidents
0,Aguas Frías,crash,9
1,Aguas Frías,fallen_occupant,3
2,Aguas Frías,other,6
3,Aguas Frías,overturning,1
4,Aguas Frías,run_over,2


In [None]:
# Wide table: one row per neighborhood, one integer column per accident class.
# Column labels come from the pivot itself (the values of `class`) instead of
# a hard-coded list, so the cell keeps working if the set of classes changes.
accidents_per_neigh = (
    accidents_nta_num
    .pivot(index='neighborhood', columns='class', values='number_accidents')
    .fillna(0)                   # a missing (neighborhood, class) pair means zero accidents
    .astype(int)
    .rename_axis(columns=None)   # drop the leftover "class" label on the column axis
    .reset_index()
)
accidents_per_neigh.head()

Unnamed: 0,neighborhood,crash,fallen_occupant,other,overturning,run_over
0,Aguas Frías,9,3,6,1,2
1,Aldea Pablo VI,25,12,6,2,19
2,Alejandro Echavarría,130,23,38,11,38
3,Alejandría,408,18,23,9,14
4,Alfonso López,714,198,201,55,148


In [None]:
# Persist the per-neighborhood accident counts as CSV in the project data folder
accidents_per_neigh_path = base_dir + '/data/accidents_per_neigh.csv'
accidents_per_neigh.to_csv(accidents_per_neigh_path)