### A US county wants to know which kinds of emergencies it should focus on in order to protect its citizens; the goal of this notebook is to produce such recommendations. In addition, there is a map of the 911 calls the county received over the past years.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import  silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  OneHotEncoder, StandardScaler

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "iframe_connected"



In [5]:
# Load the raw 911 call records from the CSV file next to the notebook.
dataset = pd.read_csv(filepath_or_buffer="911.csv")

In [6]:
# Work on a random subset of the loaded calls so the maps and the clustering
# stay fast. The frame loaded above is named `dataset` (the previous name
# `emergency` was never defined and raised NameError on a fresh kernel).
# A fixed seed makes the draw repeatable across "Restart & Run All".
sample = dataset.sample(n=10000, random_state=42)
sample.head()

Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,e
625945,40.106293,-75.059001,MORELAND RD & WILLIAMSBURG RD; LOWER MORELAND;...,19006.0,Fire: ELECTRICAL FIRE OUTSIDE,2020-04-06 18:40:28,LOWER MORELAND,MORELAND RD & WILLIAMSBURG RD,1
245042,40.107709,-75.213634,MILL RD & BETHLEHEM PIKE; WHITEMARSH; Station...,19031.0,EMS: SUBJECT IN PAIN,2017-09-07 17:04:05,WHITEMARSH,MILL RD & BETHLEHEM PIKE,1
460293,40.100423,-75.207064,RT309 EXPY & WILLOW GROVE AVE OVERPASS; SPRING...,,Traffic: ROAD OBSTRUCTION -,2019-02-14 13:49:06,SPRINGFIELD,RT309 EXPY & WILLOW GROVE AVE OVERPASS,1
160352,40.002564,-75.22365,CITY AVE & N 47TH ST; LOWER MERION; 2017-01-27...,19004.0,Fire: VEHICLE FIRE,2017-01-27 11:45:41,LOWER MERION,CITY AVE & N 47TH ST,1
530095,40.252203,-75.683169,SYLVAN DR & GROSSTOWN RD; WEST POTTSGROVE; St...,19464.0,EMS: STABBING,2019-08-04 20:31:25,WEST POTTSGROVE,SYLVAN DR & GROSSTOWN RD,1


In [7]:
# Quick overview of the raw dataset: each heading is printed and followed by
# the rich display of the matching summary, in this order:
# shape, column names, column dtypes, descriptive statistics, and the
# percentage of missing values per column.
overview = [
    ("The shape of the dataset is :", dataset.shape),
    ("The columns of the dataset :", dataset.columns),
    ("The Type of columns of the dataset :", dataset.dtypes),
    (" Some statistical information about the dataset :", dataset.describe(include="all")),
    (
        " The pourcentage of missing value in the columns of the dataset:",
        100 * dataset.isnull().sum() / len(dataset),
    ),
]
for heading, summary in overview:
    print(heading)
    display(summary)

The shape of the dataset is :


(663522, 9)

The columns of the dataset :


Index(['lat', 'lng', 'desc', 'zip', 'title', 'timeStamp', 'twp', 'addr', 'e'], dtype='object')

The Type of columns of the dataset :


lat          float64
lng          float64
desc          object
zip          float64
title         object
timeStamp     object
twp           object
addr          object
e              int64
dtype: object

 Some statistical information about the dataset :


Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,e
count,663522.0,663522.0,663522,583323.0,663522,663522,663229,663522,663522.0
unique,,,663282,,148,640754,68,41292,
top,,,CITY AVE & CARDINAL AVE; LOWER MERION; Statio...,,Traffic: VEHICLE ACCIDENT -,2018-10-06 19:26:38,LOWER MERION,SHANNONDELL DR & SHANNONDELL BLVD,
freq,,,5,,148372,9,55490,7285,
mean,40.158162,-75.300105,,19236.055791,,,,,1.0
std,0.220641,1.672884,,298.222637,,,,,0.0
min,0.0,-119.698206,,1104.0,,,,,1.0
25%,40.100344,-75.392735,,19038.0,,,,,1.0
50%,40.143927,-75.305143,,19401.0,,,,,1.0
75%,40.229008,-75.211865,,19446.0,,,,,1.0


 The pourcentage of missing value in the columns of the dataset:


lat           0.000000
lng           0.000000
desc          0.000000
zip          12.086864
title         0.000000
timeStamp     0.000000
twp           0.044158
addr          0.000000
e             0.000000
dtype: float64

In [9]:
# Plot every sampled call at its coordinates, coloured by the call title.
map_options = dict(
    lat="lat",
    lon="lng",
    color="title",
    mapbox_style="carto-positron",
)
fig = px.scatter_mapbox(sample, **map_options)

fig.show()

#### The dataset is quite large, so we keep only the following columns:

In [10]:
# Keep only the coordinates and the call title; the other columns are not
# used by the clustering below.
columns_of_interest = ["lat", "lng", "title"]
sample = sample.loc[:, columns_of_interest]
sample.head()

Unnamed: 0,lat,lng,title
625945,40.106293,-75.059001,Fire: ELECTRICAL FIRE OUTSIDE
245042,40.107709,-75.213634,EMS: SUBJECT IN PAIN
460293,40.100423,-75.207064,Traffic: ROAD OBSTRUCTION -
160352,40.002564,-75.22365,Fire: VEHICLE FIRE
530095,40.252203,-75.683169,EMS: STABBING


### We work on a sample of the dataset rather than the whole dataset, and we use only the following columns:
#### ["lat", "lng", "title"]
#### These columns have no missing values; "lat" and "lng" are numerical and "title" is categorical.

In [13]:
# Split the columns into numerical and categorical features by dtype.
# `select_dtypes` covers every numeric dtype (int32, float32, ...), not only
# int64/float64, so a numeric column is never sent to the one-hot encoder by
# mistake. Column order is preserved in both lists.
num_features = sample.select_dtypes(include="number").columns.tolist()
categorical_features = sample.select_dtypes(exclude="number").columns.tolist()
print('The numerical features are {}:'.format(num_features))
print('The categorical features are {}:'.format(categorical_features))

The numerical features are ['lat', 'lng']:
The categorical features are ['title']:


In [14]:
print("Partial Creation of the preprocessing pipeline:")

# Numerical columns go through a StandardScaler.
print("Step 1 : for numerical features:")
numeric_steps = [("scaler", StandardScaler())]
num_transformer = Pipeline(steps=numeric_steps)
print("Done^^")

# Categorical columns go through a OneHotEncoder.
print("Step 2 : for categorical features:")
categorical_steps = [("cat_encoder", OneHotEncoder())]
cat_transformer = Pipeline(steps=categorical_steps)
print("Done^^")

# Route each group of columns to its own sub-pipeline.
print("Step 3 : Creation of the final pipeline ")
column_routes = [
    ("cat_transformer", cat_transformer, categorical_features),
    ("num_transformer", num_transformer, num_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)
print("Done^^")

Partial Creation of the preprocessing pipeline:
Step 1 : for numerical features:
Done^^
Step 2 : for categorical features:
Done^^
Step 3 : Creation of the final pipeline 
Done^^


In [15]:
# Fit the preprocessing on the sample and build the feature matrix that is
# clustered below.
X = preprocessor.fit_transform(X=sample)

In [16]:
# DBSCAN hyper-parameters, named so they are easy to find and tune.
# Distances are measured on the preprocessed matrix X, not on raw degrees.
# NOTE: X contains one-hot encoded titles, so two calls with different titles
# are at Manhattan distance >= 2; with eps = 0.2 a cluster can therefore only
# group calls that share the same title.
DBSCAN_EPS = 0.2
DBSCAN_MIN_SAMPLES = 100
DBSCAN_METRIC = "manhattan"

# DBSCAN is already imported in the imports cell at the top of the notebook,
# so it is not imported again here.
db = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES, metric=DBSCAN_METRIC)

db.fit(X)

DBSCAN(eps=0.2, metric='manhattan', min_samples=100)

### The cluster labels found by DBSCAN (the label -1 marks noise points, not a cluster)

In [18]:
# Distinct labels assigned by DBSCAN. The label -1 marks noise points, so it
# is excluded when counting the clusters.
cluster_labels = np.unique(db.labels_)
n_clusters = int((cluster_labels != -1).sum())
print(f"DBSCAN found {n_clusters} clusters (label -1 marks noise points)")
cluster_labels

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

In [19]:
# Attach the DBSCAN label of each call. `assign` builds a new frame instead of
# mutating the one that earlier cells displayed and fitted on, so re-running
# this cell always gives the same result.
sample = sample.assign(Cluster=db.labels_)
sample.head()

Unnamed: 0,lat,lng,title,Cluster
625945,40.106293,-75.059001,Fire: ELECTRICAL FIRE OUTSIDE,-1
245042,40.107709,-75.213634,EMS: SUBJECT IN PAIN,0
460293,40.100423,-75.207064,Traffic: ROAD OBSTRUCTION -,8
160352,40.002564,-75.22365,Fire: VEHICLE FIRE,-1
530095,40.252203,-75.683169,EMS: STABBING,-1


### Visualization of the calls grouped into clusters, excluding the label -1, which DBSCAN assigns to noise points that do not belong to any cluster

In [23]:
# Keep only the calls that DBSCAN assigned to a cluster (label -1 is noise).
clustered_calls = sample.loc[sample.Cluster != -1]

# Cluster ids are names, not quantities: convert them to strings so Plotly
# draws one discrete colour per cluster instead of a continuous colour bar,
# and list the legend entries in numeric order.
cluster_order = [str(label) for label in sorted(clustered_calls["Cluster"].unique())]

fig = px.scatter_mapbox(
        clustered_calls.assign(Cluster=clustered_calls["Cluster"].astype(str)),
        lat="lat",
        lon="lng",
        color="Cluster",
        category_orders={"Cluster": cluster_order},
        mapbox_style="carto-positron"
)

fig.show()

### The map shows the main types of calls to focus on and the main areas where these events occur. These are therefore the areas that policymakers should focus on.

In [25]:
# Same clustered calls (noise label -1 removed), this time coloured by the
# call title. The figure is the cell's last expression, so it is displayed.
in_cluster = sample.Cluster != -1
px.scatter_mapbox(
    data_frame=sample.loc[in_cluster, :],
    lat="lat",
    lon="lng",
    color="title",
    mapbox_style="carto-positron",
)