In [1]:
import numpy as np
import pandas as pd
import folium
from sklearn.cluster import DBSCAN

In [2]:
# Load the cleaned accident dataset; note the file uses ':' as its field delimiter.
moskva = pd.read_csv('clean_data_MSK.csv', sep=':')
# Preview the first rows to check the columns were parsed as expected.
moskva.head()

Unnamed: 0,lighting,lat,long,region,category,date,severity,dead,injured,n_participants,...,Эксплуатация,Нарушение правил подачи/приема сигналов,Другое,Нарушения при движении,Пешеходы,Скончался до госпитализации,Не пострадал,Легкие травмы,Ранения,Скончался после госпитализации
0,Светлое время суток,55.616777,37.706516,Орехово-Борисово Северное,Наезд на пешехода,2020-03-24 13:06:00,Легкий,0,1,2,...,0,0,0,0,0,0,0,0,1,0
1,Светлое время суток,55.841157,37.489332,Головинский,Столкновение,2018-09-15 11:00:00,Легкий,0,1,2,...,0,0,0,0,0,0,0,0,0,0
2,"В темное время суток, освещение включено",55.713332,37.658944,Даниловский,Наезд на пешехода,2016-04-14 21:25:00,Легкий,0,1,2,...,0,0,0,0,0,0,0,0,1,0
3,Светлое время суток,55.655243,37.414176,Солнцево,Столкновение,2019-06-20 18:15:00,Легкий,0,1,2,...,0,0,0,0,0,0,0,0,0,0
4,Светлое время суток,55.756493,37.789315,Перово,Наезд на пешехода,2016-04-04 11:50:00,Легкий,0,1,2,...,0,0,0,0,0,0,0,0,1,0


The data contains erroneous coordinates that lie far outside Moscow, so we restrict the records to a bounding box around the city.

In [3]:
# Bounding box in degrees; both edges are exclusive.
lon_min, lon_max = 37.1813900, 37.9545100
lat_min, lat_max = 55.1339600, 55.9825000

inside_box = (
    moskva['long'].lt(lon_max)
    & moskva['long'].gt(lon_min)
    & moskva['lat'].lt(lat_max)
    & moskva['lat'].gt(lat_min)
)
# Copy so that later edits never write back into the raw frame.
moskva_new = moskva.loc[inside_box].copy()

Display the probability of death for each accident category, computed as the number of deaths divided by the number of participants.

In [4]:
# Per-category totals of deaths, injuries and participants.
categor_df = moskva_new.groupby('category')[['dead', 'injured', 'n_participants']].sum()
# Share of participants who died in each category, rounded to two decimals.
categor_df["P_death"] = (categor_df['dead'] / categor_df['n_participants']).round(2)
categor_df

Unnamed: 0_level_0,dead,injured,n_participants,P_death
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Возгорание вследствие технической неисправности движущегося или остановившегося ТС, участвующего в дорожном движении.",0,2,3,0.0
Иной вид ДТП,4,57,103,0.04
Наезд на велосипедиста,22,1498,3042,0.01
Наезд на внезапно возникшее препятствие,0,16,34,0.0
Наезд на гужевой транспорт,0,1,2,0.0
Наезд на животное,1,9,14,0.07
"Наезд на лицо, не являющееся участником дорожного движения, осуществляющее какую-либо другую деятельность",3,15,31,0.1
"Наезд на лицо, не являющееся участником дорожного движения, осуществляющее несение службы",5,29,67,0.07
"Наезд на лицо, не являющееся участником дорожного движения, осуществляющее производство работ",22,45,121,0.18
Наезд на пешехода,1347,19158,40370,0.03


A heavy map that contains every accident inside the boundaries defined above; each accident is drawn as a separate circle.

In [5]:
map_ = folium.Map([55.75215, 37.61819], zoom_start=10)
# Walk the two coordinate columns directly. Unlike `for id, row in ...iterrows()`,
# this does not rebind the builtin `id` in the notebook namespace and does not
# build a Series object for every row.
for lat, lon in zip(moskva_new['lat'], moskva_new['long']):
    folium.Circle([lat, lon], radius=10).add_to(map_)

The following functions fit a DBSCAN model to the accident coordinates and display the resulting problematic regions on a folium map.

In [6]:
def select_problematic_clusters(coordinates, distance: float, min_samples: int) -> DBSCAN:
    """
    Fit a DBSCAN model with the haversine metric on accident coordinates.

    ==========
    Parameters:
        coordinates: pd.DataFrame or np.array holding latitude and longitude;
        the values are converted with np.radians before fitting

        distance: float, neighbourhood distance for DBSCAN in km;
        accidents closer than this to each other are treated as neighbours,
        so it controls how tight a problematic region has to be

        min_samples: int, number of samples for an accident to be considered
        a core point by DBSCAN; it regulates how dense a problematic region
        has to be

    ==========
    Return:
        DBSCAN – fitted DBSCAN model
    """
    earth_radius_km = 6371.
    # The haversine metric works with angles, so the km radius is divided by
    # the Earth radius and the coordinates are converted to radians.
    eps_radians = distance / earth_radius_km
    coords_radians = np.radians(coordinates)
    clusterer = DBSCAN(eps=eps_radians, min_samples=min_samples, metric='haversine')
    # fit() returns the estimator itself, so the fitted model is handed back directly.
    return clusterer.fit(coords_radians)
    

def DBSCAN_clustered_map(data, cluster_model, dead: bool=True, hard: bool=True, light: bool=False):
    """
    Create map with problematic clusters, colour them depending on the severity.

    Only accidents that the model assigned to a cluster (label >= 0) are drawn;
    accidents with a negative label are skipped.

    ==========
    Parameters:
        data: pd.DataFrame - accident data with 'lat', 'long' and 'severity'
        columns; it must have one row per label in cluster_model.labels_,
        in the same order

        cluster_model: fitted DBSCAN model (only its labels_ attribute is read)
        dead: bool, include lethal accidents (red circles)
        hard: bool, include severe accidents (purple circles)
        light: bool, include non-severe accidents (light green circles)

    ==========
    Returns:
        folium.Map - map with coloured clusters

    ==========
    Raises:
        ValueError - if the number of labels differs from the number of rows
    """
    labels = cluster_model.labels_
    if len(labels) != len(data):
        raise ValueError(
            f"cluster_model has {len(labels)} labels but data has {len(data)} rows"
        )

    # Severity value -> (circle radius, colour), restricted to the requested severities.
    marker_styles = {}
    if dead:
        marker_styles['С погибшими'] = (40, 'red')
    if hard:
        marker_styles['Тяжёлый'] = (26, 'purple')
    if light:
        marker_styles['Легкий'] = (5, 'lightgreen')

    points = data[['lat', 'long', 'severity']]
    # Drop unclustered points and unselected severities in one vectorised step,
    # so the Python loop below only touches rows that are actually drawn.
    in_cluster = labels >= 0
    wanted_severity = points['severity'].isin(list(marker_styles)).to_numpy()
    selected = points[in_cluster & wanted_severity]

    Moscow_Map = folium.Map([55.75215, 37.61819], zoom_start=11)
    for lat, lon, severity in selected.itertuples(index=False, name=None):
        radius, colour = marker_styles[severity]
        folium.Circle([lat, lon], radius=radius, color=colour).add_to(Moscow_Map)

    return Moscow_Map

In [7]:
# Cluster the accidents with a 0.1 km neighbourhood distance and min_samples of 15,
# then draw the clustered accidents using the default severity filters.
accident_coords = moskva_new[['lat', 'long']].values
accident_clusters = select_problematic_clusters(accident_coords, 0.1, 15)
prob_regions = DBSCAN_clustered_map(moskva_new, accident_clusters)

In [8]:
from utils.save_map import save_map_html_and_png

In [9]:
# Export the clustered map through the project helper.
# NOTE(review): judging by its name it writes both an HTML and a PNG file named
# 'msc_problem_regions' — confirm the output location in utils.save_map.
save_map_html_and_png(prob_regions, 'msc_problem_regions')