## Crimes by municipality Mexico
Focus crimes: Secuestro, Extorsión, Robo a negocio


In [1]:
# Importing libraries
import pandas as pd 
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt 
import seaborn as sns
import regex as re

%matplotlib inline

In [2]:
### Select if you want to limit the type of crimes

In [3]:
limit_crimes = False

### 1.- Importing data

In [4]:
# Database with crimes information
df = pd.read_csv('data/delitos-datos-abiertos.csv')
# Database with population information
pop = pd.read_csv('data/poblaciones_2015.csv', skiprows=4, encoding='latin-1', skipfooter=6, engine='python')
# Database with cp information
cp = pd.read_excel('data/codigos-postales-mexico.xlsx')

# Formatting column names
df.columns = [x.lower().strip() for x in df.columns]
pop.columns = [x.lower().strip() for x in pop.columns]
cp.columns = [x.lower().strip() for x in cp.columns]

# Formatting date columns
df['fecha'] = pd.to_datetime(df['fecha'], infer_datetime_format=True)

# Renaming columns
df.rename(columns={'inegi_entidad': 'id_entidad',
                    'inegi_municipio': 'id_municipio'}, inplace=True)

pop.drop(columns='id_municipio', inplace=True)

pop.rename(columns={'estado': 'entidad',
                    'cve_inegi': 'id_municipio',
                    'id_estado': 'id_entidad'}, inplace=True)
                    
cp.rename(columns={'estado': 'entidad'}, inplace=True)

df.head()

Unnamed: 0,id_entidad,entidad,id_municipio,municipio,id_delito,delito,carpetas,tasa,fecha
0,1,Aguascalientes,1001,Aguascalientes,1100,Homicidio doloso,3,0.323526,2021-01-01
1,1,Aguascalientes,1002,Asientos,1100,Homicidio doloso,0,0.0,2021-01-01
2,1,Aguascalientes,1010,El Llano,1100,Homicidio doloso,0,0.0,2021-01-01
3,1,Aguascalientes,1009,Tepezala,1100,Homicidio doloso,0,0.0,2021-01-01
4,1,Aguascalientes,1007,Rincon De Romos,1100,Homicidio doloso,0,0.0,2021-01-01


In [5]:
pop.head()

Unnamed: 0,entidad,municipio,id_municipio,id_entidad,hombres,mujeres,total
0,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190
1,Aguascalientes,Asientos,1002,1,22745,23719,46464
2,Aguascalientes,Calvillo,1003,1,27298,28750,56048
3,Aguascalientes,Cosío,1004,1,7552,8025,15577
4,Aguascalientes,Jesús María,1005,1,60135,60270,120405


In [6]:
df.tail()

Unnamed: 0,id_entidad,entidad,id_municipio,municipio,id_delito,delito,carpetas,tasa,fecha
471739,32,Zacatecas,32047,Teul De Gonzalez Ortega,9000,Lesiones dolosas,1,17.430714,2021-12-01
471740,32,Zacatecas,32007,Concepcion Del Oro,9000,Lesiones dolosas,1,7.334067,2021-12-01
471741,32,Zacatecas,32008,Cuauhtemoc,9000,Lesiones dolosas,0,0.0,2021-12-01
471742,32,Zacatecas,32016,General Panfilo Natera,9000,Lesiones dolosas,1,4.203977,2021-12-01
471743,32,Zacatecas,32056,Zacatecas,9000,Lesiones dolosas,23,15.266466,2021-12-01


In [7]:
pop.tail()

Unnamed: 0,entidad,municipio,id_municipio,id_entidad,hombres,mujeres,total
2452,Zacatecas,Villa Hidalgo,32054,32,9433,9722,19155
2453,Zacatecas,Villanueva,32055,32,14793,15447,30240
2454,Zacatecas,Zacatecas,32056,32,70855,75292,146147
2455,Zacatecas,Trancoso,32057,32,9505,9908,19413
2456,Zacatecas,Santa María de la Paz *,32058,32,1305,1351,2656


In [8]:
cp.head()

Unnamed: 0,código,asentamiento,tipo,municipio,ciudad,entidad
0,1000,San Angel,Colonia,Álvaro Obregón,Ciudad de México,Distrito Federal
1,1010,Los Alpes,Colonia,Álvaro Obregón,Ciudad de México,Distrito Federal
2,1020,Guadalupe Inn,Colonia,Álvaro Obregón,Ciudad de México,Distrito Federal
3,1028,Secretaria de Contraloría y Desarrollo Adminis...,Gran usuario,Álvaro Obregón,Ciudad de México,Distrito Federal
4,1029,INFONAVIT,Gran usuario,Álvaro Obregón,Ciudad de México,Distrito Federal


### 1.1 Cleaning data

In [9]:
# Cleaning  * from names in pop df
pop['municipio'] = pop['municipio'].replace(' *', '', regex=True)
pop['municipio'] = pop['municipio'].replace('\*', '', regex=True)

# Changing Distrito Federal for CDMX
pop['entidad'] = pop['entidad'].replace('Distrito Federal', 'Ciudad de México')
df['entidad'] = df['entidad'].replace('Distrito Federal', 'Ciudad de México')
cp['entidad'] = cp['entidad'].replace('Distrito Federal', 'Ciudad de México')

pop.tail()

Unnamed: 0,entidad,municipio,id_municipio,id_entidad,hombres,mujeres,total
2452,Zacatecas,VillaHidalgo,32054,32,9433,9722,19155
2453,Zacatecas,Villanueva,32055,32,14793,15447,30240
2454,Zacatecas,Zacatecas,32056,32,70855,75292,146147
2455,Zacatecas,Trancoso,32057,32,9505,9908,19413
2456,Zacatecas,SantaMaríadelaPaz,32058,32,1305,1351,2656


In [10]:
# Separating words in pop df
string = 'VillaHidalgo'
words = re.findall('[A-Z][a-z]*', string)
string2 = ' '.join(words)
print(string2)

# Deleting spaces
string = 'Villa Hidalgo'
string3 = string.replace(' ', '')
string3

Villa Hidalgo


'VillaHidalgo'

In [11]:
# Applying functions
# pop['municipio_sep'] = pop['municipio'].apply(lambda x: ' '.join(re.findall('[A-Z][a-zÀ-ÿ]*', x)))
cp['municipio_strip'] = cp['municipio'].apply(lambda x: x.replace(' ', ''))

In [12]:
cp

Unnamed: 0,código,asentamiento,tipo,municipio,ciudad,entidad,municipio_strip
0,1000,San Angel,Colonia,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón
1,1010,Los Alpes,Colonia,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón
2,1020,Guadalupe Inn,Colonia,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón
3,1028,Secretaria de Contraloría y Desarrollo Adminis...,Gran usuario,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón
4,1029,INFONAVIT,Gran usuario,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón
...,...,...,...,...,...,...,...
143219,99993,Cuxpala,Pueblo,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada
143220,99994,Vicente Guerrero,Pueblo,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada
143221,99998,Palmarejo,Ranchería,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada
143222,99998,Jesús Maria,Ranchería,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada


### 2.- Exploring data

#### 2.1.- Exploring datatypes

In [13]:
df.dtypes

id_entidad               int64
entidad                 object
id_municipio             int64
municipio               object
id_delito                int64
delito                  object
carpetas                 int64
tasa                   float64
fecha           datetime64[ns]
dtype: object

In [14]:
pop.dtypes

entidad         object
municipio       object
id_municipio     int64
id_entidad       int64
hombres          int64
mujeres          int64
total            int64
dtype: object

In [15]:
cp.dtypes

código              int64
asentamiento       object
tipo               object
municipio          object
ciudad             object
entidad            object
municipio_strip    object
dtype: object

#### 2.2.- Cleaning data to merge

Checking id_municipio

In [16]:
# Creating list of unique ids in each dataset
unique_id_df = df['id_municipio'].unique()
unique_id_pop = pop['id_municipio'].unique()

# Len of the lists
len_id_df = len(df['id_municipio'].unique())
len_id_pop = len(pop['id_municipio'].unique())

print(len_id_df)
print(len_id_pop)

for i in unique_id_df:
    if i not in unique_id_pop:
        m = df[df['id_municipio'] == i].iloc[0]['municipio']
        print('id {} ({}) does not exist in pop data'.format(i, m))

print('\n')


2457
2457
id 7101 (Tuxtla Gutierrez) does not exist in pop data
id 7109 (Yajalon) does not exist in pop data
id 7110 (San Lucas) does not exist in pop data
id 7119 (Santiago El Pinar) does not exist in pop data
id 7114 (Benemerito De Las Americas) does not exist in pop data
id 7107 (Villa Corzo) does not exist in pop data
id 7117 (Montecristo De Guerrero) does not exist in pop data
id 7105 (Union Juarez) does not exist in pop data
id 7111 (Zinacantan) does not exist in pop data
id 7118 (San Andres Duraznal) does not exist in pop data
id 7102 (Tuxtla Chico) does not exist in pop data
id 7116 (Marques De Comillas) does not exist in pop data
id 7100 (Tumbala) does not exist in pop data
id 7108 (Villaflores) does not exist in pop data
id 7103 (Tuzantan) does not exist in pop data
id 7104 (Tzimol) does not exist in pop data
id 7113 (Aldama) does not exist in pop data
id 7112 (San Juan Cancuc) does not exist in pop data
id 7115 (Maravilla Tenejapa) does not exist in pop data
id 7106 (Venusti

In [17]:
for i in unique_id_pop:
    if i not in unique_id_df:
        m = pop[pop['id_municipio'] == i].iloc[0]['municipio']
        print('id {} ({}) does not exist in crimes data'.format(i, m))

id 70100 (Tumbalá) does not exist in crimes data
id 70101 (TuxtlaGutiérrez) does not exist in crimes data
id 70102 (TuxtlaChico) does not exist in crimes data
id 70103 (Tuzantán) does not exist in crimes data
id 70104 (Tzimol) does not exist in crimes data
id 70105 (UniónJuárez) does not exist in crimes data
id 70106 (VenustianoCarranza) does not exist in crimes data
id 70107 (VillaCorzo) does not exist in crimes data
id 70108 (Villaflores) does not exist in crimes data
id 70109 (Yajalón) does not exist in crimes data
id 70110 (SanLucas) does not exist in crimes data
id 70111 (Zinacantán) does not exist in crimes data
id 70112 (SanJuanCancuc) does not exist in crimes data
id 70113 (Aldama) does not exist in crimes data
id 70114 (BeneméritodelasAméricas) does not exist in crimes data
id 70115 (MaravillaTenejapa) does not exist in crimes data
id 70116 (MarquésdeComillas) does not exist in crimes data
id 70117 (MontecristodeGuerrero) does not exist in crimes data
id 70118 (SanAndrésDurazn

In [18]:
# Fixing misslabeled data
for i in unique_id_pop:
    if i not in unique_id_df:
        to_replace_with = i - 70000 + 7000
        print('{} replaced with {}'.format(i, to_replace_with))
        pop['id_municipio'] = pop['id_municipio'].replace(i, to_replace_with)

70100 replaced with 7100
70101 replaced with 7101
70102 replaced with 7102
70103 replaced with 7103
70104 replaced with 7104
70105 replaced with 7105
70106 replaced with 7106
70107 replaced with 7107
70108 replaced with 7108
70109 replaced with 7109
70110 replaced with 7110
70111 replaced with 7111
70112 replaced with 7112
70113 replaced with 7113
70114 replaced with 7114
70115 replaced with 7115
70116 replaced with 7116
70117 replaced with 7117
70118 replaced with 7118
70119 replaced with 7119


In [19]:
# Creating list of unique ids in each dataset
unique_id_df = df['id_municipio'].unique()
unique_id_pop = pop['id_municipio'].unique()

# Len of the lists
len_id_df = len(df['id_municipio'].unique())
len_id_pop = len(pop['id_municipio'].unique())

for i in unique_id_pop:
    if i not in unique_id_df:
        print('id {} does not exist in crimes data'.format(i))

for i in unique_id_df:
    if i not in unique_id_pop:
        print('id {} does not exist in crimes data'.format(i))

print('No missing values')

if len_id_df == len_id_pop:
    print('Equal lenghts')

print('Ready to merge crimes and pop')


No missing values
Equal lenghts
Ready to merge crimes and pop


Checking municipio column

In [20]:
# Creating list of unique ids in each dataset
unique_mun_cp = cp['municipio_strip'].unique()
unique_mun_pop = pop['municipio'].unique()

# Len of the lists
len_mun_cp = len(cp['municipio_strip'].unique())
len_mun_pop = len(pop['municipio'].unique())

print(len_mun_cp)
print(len_mun_pop)
print(len_mun_pop - len_mun_cp)

print('\n')

for i in unique_mun_cp:
    if i not in unique_mun_pop:
        print('Municipio {}  does not exist in pop data'.format(i))



2318
2317
-1


Municipio TezoatlándeSegurayLuna  does not exist in pop data
Municipio SanJuanMixtepec-Dto.08-  does not exist in pop data
Municipio SanJuanMixtepec-Dto.26-  does not exist in pop data
Municipio SanPedroMixtepec-Dto.26-  does not exist in pop data
Municipio SanPedroMixtepec-Dto.22-  does not exist in pop data
Municipio Medellín  does not exist in pop data


In [21]:
# Cleaning -Dto.xx- from name in cp database
cp['municipio_strip'] = cp['municipio_strip'].replace('\-.+\..+', '', regex=True)

# Creating list of unique ids in each dataset
unique_mun_cp = cp['municipio_strip'].unique()
unique_mun_pop = pop['municipio'].unique()

# Len of the lists
len_mun_cp = len(cp['municipio_strip'].unique())
len_mun_pop = len(pop['municipio'].unique())

print(len_mun_cp)
print(len_mun_pop)
print(len_mun_pop - len_mun_cp)

print('\n')

for i in unique_mun_cp:
    if i not in unique_mun_pop:
        print('Municipio {}  does not exist in pop data'.format(i))

2316
2317
1


Municipio TezoatlándeSegurayLuna  does not exist in pop data
Municipio Medellín  does not exist in pop data


In [22]:
for i in unique_mun_pop:
    if i not in unique_mun_cp:
        print('Municipio {}  does not exist in cp data'.format(i))

Municipio ZacualpandeAmilpas  does not exist in cp data
Municipio HeroicaVillaTezoatlándeSegurayLuna,CunadelaIndependenciadeOaxaca  does not exist in cp data
Municipio MedellíndeBravo  does not exist in cp data


In [23]:
# Replacing values manually
pop['municipio'] = pop['municipio'].replace('HeroicaVillaTezoatlándeSegurayLuna,CunadelaIndependenciadeOaxaca',
                                            'TezoatlándeSegurayLuna')

pop['municipio'] = pop['municipio'].replace('MedellíndeBravo', 'Medellín')

In [24]:
# Creating list of unique ids in each dataset
unique_mun_cp = cp['municipio_strip'].unique()
unique_mun_pop = pop['municipio'].unique()

# Len of the lists
len_mun_cp = len(cp['municipio_strip'].unique())
len_mun_pop = len(pop['municipio'].unique())

print(len_mun_cp)
print(len_mun_pop)
print(len_mun_pop - len_mun_cp)

print('\n')

for i in unique_mun_pop:
    if i not in unique_mun_cp:
        print('Municipio {} does not exist in cp data'.format(i))

pop[pop['municipio'] == 'ZacualpandeAmilpas']

2316
2317
1


Municipio ZacualpandeAmilpas does not exist in cp data


Unnamed: 0,entidad,municipio,id_municipio,id_entidad,hombres,mujeres,total
925,Morelos,ZacualpandeAmilpas,17032,17,4550,4820,9370


In [25]:
# Correcting last value
pop['municipio'] = pop['municipio'].replace('ZacualpandeAmilpas', 'Zacualpan')

In [26]:
# Creating list of unique ids in each dataset
unique_mun_cp = cp['municipio_strip'].unique()
unique_mun_pop = pop['municipio'].unique()

# Len of the lists
len_mun_cp = len(cp['municipio_strip'].unique())
len_mun_pop = len(pop['municipio'].unique())

for i in unique_mun_pop:
    if i not in unique_mun_cp:
        print('Municipio {} does not exist in cp data'.format(i))

for i in unique_mun_cp:
    if i not in unique_mun_pop:
        print('Municipio {} does not exist in cp data'.format(i))

print('No missing values')

if len_id_df == len_id_pop:
    print('Equal lenghts')

print('Ready to merge crimes and pop')


No missing values
Equal lenghts
Ready to merge crimes and pop


Checking entidad column

In [27]:
# Creating list of unique ids in each dataset
unique_mun_cp = cp['entidad'].unique()
unique_mun_pop = pop['entidad'].unique()

# Len of the lists
len_mun_cp = len(cp['entidad'].unique())
len_mun_pop = len(pop['entidad'].unique())

for i in unique_mun_pop:
    if i not in unique_mun_cp:
        print('entidad {} does not exist in cp data'.format(i))

for i in unique_mun_cp:
    if i not in unique_mun_pop:
        print('entidad {} does not exist in pop data'.format(i))

print('No missing values')

if len_id_df == len_id_pop:
    print('Equal lenghts')

print('Ready to merge crimes and pop')


No missing values
Equal lenghts
Ready to merge crimes and pop


### 3.- Merging

#### 3.1 Merging zip code and population data

In [28]:
# Generating column entidad-municipio for pop
pop['entidad_municipio'] = pop['entidad'] + '-' + pop['municipio']

# Generating column entidad-municipio for cp
cp['entidad_municipio'] = cp['entidad'] + '-' + cp['municipio_strip']
cp.head()

Unnamed: 0,código,asentamiento,tipo,municipio,ciudad,entidad,municipio_strip,entidad_municipio
0,1000,San Angel,Colonia,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón,Ciudad de México-ÁlvaroObregón
1,1010,Los Alpes,Colonia,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón,Ciudad de México-ÁlvaroObregón
2,1020,Guadalupe Inn,Colonia,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón,Ciudad de México-ÁlvaroObregón
3,1028,Secretaria de Contraloría y Desarrollo Adminis...,Gran usuario,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón,Ciudad de México-ÁlvaroObregón
4,1029,INFONAVIT,Gran usuario,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón,Ciudad de México-ÁlvaroObregón


In [29]:
df.head()

Unnamed: 0,id_entidad,entidad,id_municipio,municipio,id_delito,delito,carpetas,tasa,fecha
0,1,Aguascalientes,1001,Aguascalientes,1100,Homicidio doloso,3,0.323526,2021-01-01
1,1,Aguascalientes,1002,Asientos,1100,Homicidio doloso,0,0.0,2021-01-01
2,1,Aguascalientes,1010,El Llano,1100,Homicidio doloso,0,0.0,2021-01-01
3,1,Aguascalientes,1009,Tepezala,1100,Homicidio doloso,0,0.0,2021-01-01
4,1,Aguascalientes,1007,Rincon De Romos,1100,Homicidio doloso,0,0.0,2021-01-01


In [30]:
pop.head()

Unnamed: 0,entidad,municipio,id_municipio,id_entidad,hombres,mujeres,total,entidad_municipio
0,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190,Aguascalientes-Aguascalientes
1,Aguascalientes,Asientos,1002,1,22745,23719,46464,Aguascalientes-Asientos
2,Aguascalientes,Calvillo,1003,1,27298,28750,56048,Aguascalientes-Calvillo
3,Aguascalientes,Cosío,1004,1,7552,8025,15577,Aguascalientes-Cosío
4,Aguascalientes,JesúsMaría,1005,1,60135,60270,120405,Aguascalientes-JesúsMaría


In [31]:
# Merging pop and cp data
cols_cp = ['código', 'asentamiento', 'tipo', 'municipio', 'entidad_municipio']

cp_pop = pd.merge(cp[cols_cp], pop, how='left', left_on='entidad_municipio', right_on='entidad_municipio')
cp_pop.head()

Unnamed: 0,código,asentamiento,tipo,municipio_x,entidad_municipio,entidad,municipio_y,id_municipio,id_entidad,hombres,mujeres,total
0,1000,San Angel,Colonia,Álvaro Obregón,Ciudad de México-ÁlvaroObregón,Ciudad de México,ÁlvaroObregón,9010,9,355754,394228,749982
1,1010,Los Alpes,Colonia,Álvaro Obregón,Ciudad de México-ÁlvaroObregón,Ciudad de México,ÁlvaroObregón,9010,9,355754,394228,749982
2,1020,Guadalupe Inn,Colonia,Álvaro Obregón,Ciudad de México-ÁlvaroObregón,Ciudad de México,ÁlvaroObregón,9010,9,355754,394228,749982
3,1028,Secretaria de Contraloría y Desarrollo Adminis...,Gran usuario,Álvaro Obregón,Ciudad de México-ÁlvaroObregón,Ciudad de México,ÁlvaroObregón,9010,9,355754,394228,749982
4,1029,INFONAVIT,Gran usuario,Álvaro Obregón,Ciudad de México-ÁlvaroObregón,Ciudad de México,ÁlvaroObregón,9010,9,355754,394228,749982


In [32]:
len(cp) -len(cp_pop)

-213

In [33]:
# Difference comes from Districts different cp
cp[cp['municipio'].str.contains('\-.+\..+', regex=True)]

Unnamed: 0,código,asentamiento,tipo,municipio,ciudad,entidad,municipio_strip,entidad_municipio
91558,69770,Las Flores,Colonia,San Juan Mixtepec -Dto. 08 -,,Oaxaca,SanJuanMixtepec,Oaxaca-SanJuanMixtepec
91559,69770,San Miguel Lado,Barrio,San Juan Mixtepec -Dto. 08 -,,Oaxaca,SanJuanMixtepec,Oaxaca-SanJuanMixtepec
91560,69770,San Juan Mixtepec - Dto.08 Centro,Colonia,San Juan Mixtepec -Dto. 08 -,,Oaxaca,SanJuanMixtepec,Oaxaca-SanJuanMixtepec
91561,69770,San Pedro Calvario,Barrio,San Juan Mixtepec -Dto. 08 -,,Oaxaca,SanJuanMixtepec,Oaxaca-SanJuanMixtepec
91562,69770,De Jesús,Barrio,San Juan Mixtepec -Dto. 08 -,,Oaxaca,SanJuanMixtepec,Oaxaca-SanJuanMixtepec
...,...,...,...,...,...,...,...,...
95508,71998,El Salitre,Ranchería,San Pedro Mixtepec -Dto. 22 -,,Oaxaca,SanPedroMixtepec,Oaxaca-SanPedroMixtepec
95509,71998,Cerro Zopilote,Ranchería,San Pedro Mixtepec -Dto. 22 -,,Oaxaca,SanPedroMixtepec,Oaxaca-SanPedroMixtepec
95510,71998,El Toledo,Ranchería,San Pedro Mixtepec -Dto. 22 -,,Oaxaca,SanPedroMixtepec,Oaxaca-SanPedroMixtepec
95511,71998,Regadío,Ranchería,San Pedro Mixtepec -Dto. 22 -,,Oaxaca,SanPedroMixtepec,Oaxaca-SanPedroMixtepec


In [34]:
pop[pop['municipio'].str.contains('Mixtepec', regex=True)]

Unnamed: 0,entidad,municipio,id_municipio,id_entidad,hombres,mujeres,total,entidad_municipio
1045,Oaxaca,MagdalenaMixtepec,20048,20,662,713,1375,Oaxaca-MagdalenaMixtepec
1120,Oaxaca,SanBernardoMixtepec,20123,20,1313,1428,2741,Oaxaca-SanBernardoMixtepec
1150,Oaxaca,SanGabrielMixtepec,20153,20,2355,2481,4836,Oaxaca-SanGabrielMixtepec
1205,Oaxaca,SanJuanMixtepec,20208,20,3113,3567,6680,Oaxaca-SanJuanMixtepec
1206,Oaxaca,SanJuanMixtepec,20209,20,285,375,660,Oaxaca-SanJuanMixtepec
1268,Oaxaca,SanMiguelMixtepec,20271,20,1295,1349,2644,Oaxaca-SanMiguelMixtepec
1315,Oaxaca,SanPedroMixtepec,20318,20,23381,24955,48336,Oaxaca-SanPedroMixtepec
1316,Oaxaca,SanPedroMixtepec,20319,20,523,551,1074,Oaxaca-SanPedroMixtepec
1375,Oaxaca,SantaCruzMixtepec,20378,20,1624,1801,3425,Oaxaca-SantaCruzMixtepec


In [35]:
# Dropping second repeated municipios
pop.drop([1206, 1316], inplace=True)
pop[pop['municipio'].str.contains('Mixtepec', regex=True)]

Unnamed: 0,entidad,municipio,id_municipio,id_entidad,hombres,mujeres,total,entidad_municipio
1045,Oaxaca,MagdalenaMixtepec,20048,20,662,713,1375,Oaxaca-MagdalenaMixtepec
1120,Oaxaca,SanBernardoMixtepec,20123,20,1313,1428,2741,Oaxaca-SanBernardoMixtepec
1150,Oaxaca,SanGabrielMixtepec,20153,20,2355,2481,4836,Oaxaca-SanGabrielMixtepec
1205,Oaxaca,SanJuanMixtepec,20208,20,3113,3567,6680,Oaxaca-SanJuanMixtepec
1268,Oaxaca,SanMiguelMixtepec,20271,20,1295,1349,2644,Oaxaca-SanMiguelMixtepec
1315,Oaxaca,SanPedroMixtepec,20318,20,23381,24955,48336,Oaxaca-SanPedroMixtepec
1375,Oaxaca,SantaCruzMixtepec,20378,20,1624,1801,3425,Oaxaca-SantaCruzMixtepec


In [36]:
# Merging again
# Merging pop and cp data
cols_cp = ['código', 'asentamiento', 'tipo', 'municipio', 'entidad_municipio']

cp_pop = pd.merge(cp[cols_cp], pop, how='left', left_on='entidad_municipio', right_on='entidad_municipio')
if len(cp) -len(cp_pop) == 0:
    print('Same shapes!')

cp_pop.rename(columns={'municipio_x': 'municipio', 'municipio_y': 'municipio_strip'}, inplace=True)
cp_pop.head()

Same shapes!


Unnamed: 0,código,asentamiento,tipo,municipio,entidad_municipio,entidad,municipio_strip,id_municipio,id_entidad,hombres,mujeres,total
0,1000,San Angel,Colonia,Álvaro Obregón,Ciudad de México-ÁlvaroObregón,Ciudad de México,ÁlvaroObregón,9010,9,355754,394228,749982
1,1010,Los Alpes,Colonia,Álvaro Obregón,Ciudad de México-ÁlvaroObregón,Ciudad de México,ÁlvaroObregón,9010,9,355754,394228,749982
2,1020,Guadalupe Inn,Colonia,Álvaro Obregón,Ciudad de México-ÁlvaroObregón,Ciudad de México,ÁlvaroObregón,9010,9,355754,394228,749982
3,1028,Secretaria de Contraloría y Desarrollo Adminis...,Gran usuario,Álvaro Obregón,Ciudad de México-ÁlvaroObregón,Ciudad de México,ÁlvaroObregón,9010,9,355754,394228,749982
4,1029,INFONAVIT,Gran usuario,Álvaro Obregón,Ciudad de México-ÁlvaroObregón,Ciudad de México,ÁlvaroObregón,9010,9,355754,394228,749982


In [37]:
# Exporting to csv
cp_pop.to_excel('output data/cp-population.xlsx')

#### 3.2.- Merging new dataframe with criminality dataframe

In [38]:
merged = pd.merge(cp_pop, df, on='id_municipio', how='left')

In [39]:
merged[merged['id_municipio'] == 1001]

Unnamed: 0,código,asentamiento,tipo,municipio_x,entidad_municipio,entidad_x,municipio_strip,id_municipio,id_entidad_x,hombres,mujeres,total,id_entidad_y,entidad_y,municipio_y,id_delito,delito,carpetas,tasa,fecha
400896,20000,Zona Centro,Colonia,Aguascalientes,Aguascalientes-Aguascalientes,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190,1,Aguascalientes,Aguascalientes,1100,Homicidio doloso,3,0.323526,2021-01-01
400897,20000,Zona Centro,Colonia,Aguascalientes,Aguascalientes-Aguascalientes,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190,1,Aguascalientes,Aguascalientes,1120,Feminicidio,2,0.418447,2021-01-01
400898,20000,Zona Centro,Colonia,Aguascalientes,Aguascalientes-Aguascalientes,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190,1,Aguascalientes,Aguascalientes,1200,Homicidio culposo,5,0.539210,2021-01-01
400899,20000,Zona Centro,Colonia,Aguascalientes,Aguascalientes-Aguascalientes,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190,1,Aguascalientes,Aguascalientes,2000,Secuestro,0,0.000000,2021-01-01
400900,20000,Zona Centro,Colonia,Aguascalientes,Aguascalientes-Aguascalientes,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190,1,Aguascalientes,Aguascalientes,3000,Extorsión,5,0.539210,2021-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542011,20399,El Turicate,Colonia,Aguascalientes,Aguascalientes-Aguascalientes,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190,1,Aguascalientes,Aguascalientes,5000,Violación,25,2.696052,2021-12-01
542012,20399,El Turicate,Colonia,Aguascalientes,Aguascalientes-Aguascalientes,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190,1,Aguascalientes,Aguascalientes,6000,Violencia familiar,116,12.509679,2021-12-01
542013,20399,El Turicate,Colonia,Aguascalientes,Aguascalientes-Aguascalientes,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190,1,Aguascalientes,Aguascalientes,7000,Trata de personas,0,0.000000,2021-12-01
542014,20399,El Turicate,Colonia,Aguascalientes,Aguascalientes-Aguascalientes,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190,1,Aguascalientes,Aguascalientes,8000,Narcomenudeo,17,1.833315,2021-12-01


In [40]:
# Grouping population information by id_municipio
_ = cp_pop.groupby('id_municipio').agg({'hombres': np.mean,
                                    'mujeres': np.mean,
                                    'total': np.mean})

# Joining
df_pop = pd.merge(df, _, on='id_municipio', how='left')

In [41]:
df_pop.head()

Unnamed: 0,id_entidad,entidad,id_municipio,municipio,id_delito,delito,carpetas,tasa,fecha,hombres,mujeres,total
0,1,Aguascalientes,1001,Aguascalientes,1100,Homicidio doloso,3,0.323526,2021-01-01,425731.0,451459.0,877190.0
1,1,Aguascalientes,1002,Asientos,1100,Homicidio doloso,0,0.0,2021-01-01,22745.0,23719.0,46464.0
2,1,Aguascalientes,1010,El Llano,1100,Homicidio doloso,0,0.0,2021-01-01,9982.0,10263.0,20245.0
3,1,Aguascalientes,1009,Tepezala,1100,Homicidio doloso,0,0.0,2021-01-01,10197.0,10729.0,20926.0
4,1,Aguascalientes,1007,Rincon De Romos,1100,Homicidio doloso,0,0.0,2021-01-01,26693.0,27173.0,53866.0


### 4.- Analysis

#### Select crimes

In [42]:
df_pop['delito'].unique()

array(['Homicidio doloso', 'Feminicidio', 'Homicidio culposo',
       'Secuestro', 'Extorsión', 'Robo con violencia', 'Robo de vehículo',
       'Robo a casa habitación', 'Robo a negocio',
       'Robo a transeúnte total', 'Robo en transporte público',
       'Violación', 'Violencia familiar', 'Trata de personas',
       'Narcomenudeo', 'Lesiones dolosas'], dtype=object)

In [43]:
if limit_crimes == True:
    df_pop = df_pop[(df_pop['delito'] == 'Extorsión') |
                     (df_pop['delito'] == 'Robo a negocio') |
                     (df_pop['delito'] == 'Secuestro')]

# Aggregating crimes by id_municipio
crimes_by_municipio = df_pop.groupby('id_municipio').agg({'carpetas': np.sum, 'total': np.mean})

# Generating crimes per 10000 people
crimes_by_municipio['crimes_10k'] = (crimes_by_municipio['carpetas'] / crimes_by_municipio['total']) * 10000

# Adding names of places
crimes_by_municipio = pd.merge(crimes_by_municipio, df[['entidad', 'municipio', 'id_municipio']],
                                how='left', on='id_municipio')
    


In [44]:
crimes_by_municipio = crimes_by_municipio.groupby('id_municipio').agg({'carpetas': np.mean, 'total': np.mean,
                                                                        'crimes_10k': np.mean, 'entidad': stats.mode,
                                                                        'municipio':stats.mode})

In [45]:
# Extracting only value without frequency
crimes_by_municipio['entidad'] = crimes_by_municipio['entidad'].apply(lambda x: x[0][0])
crimes_by_municipio['municipio'] = crimes_by_municipio['municipio'].apply(lambda x: x[0][0])


In [46]:
# Rearanging columns
crimes_by_municipio = crimes_by_municipio[['entidad', 'municipio', 'carpetas', 'total', 'crimes_10k']]

#### *Clean dataset aggregated by id_municipio*

In [47]:
# Clean dataset
crimes_by_municipio

Unnamed: 0_level_0,entidad,municipio,carpetas,total,crimes_10k
id_municipio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Aguascalientes,Aguascalientes,11091,877190.0,126.437830
1002,Aguascalientes,Asientos,411,46464.0,88.455579
1003,Aguascalientes,Calvillo,304,56048.0,54.239224
1004,Aguascalientes,Cosio,148,15577.0,95.011876
1005,Aguascalientes,Jesus Maria,1498,120405.0,124.413438
...,...,...,...,...,...
32054,Zacatecas,Villa Hidalgo,45,19155.0,23.492561
32055,Zacatecas,Villanueva,174,30240.0,57.539683
32056,Zacatecas,Zacatecas,1942,146147.0,132.879909
32057,Zacatecas,Trancoso,178,19413.0,91.691135


### X.- Coordinates data set

In [48]:
# Imoorting location lat, lon dataset
locations = pd.read_excel('data/coordenadas_municipios.xlsx')
locations.head()


Unnamed: 0,id_municipio,lat,lon
0,1001,21.879823,-102.296047
1,1002,22.238317,-102.089275
2,1003,21.846907,-102.718751
3,1004,22.366409,-102.300044
4,1005,21.961273,-102.343416


#### Coinciding id values

In [49]:
# Creating list of unique ids in each dataset
unique_id_crimes_by_municipio = crimes_by_municipio.index.unique()
unique_id_locations = locations['id_municipio'].unique()

# Len of the lists
len_id_crimes_by_municipio = len(crimes_by_municipio.index.unique())
len_id_locations = len(locations['id_municipio'].unique())

for i in unique_id_crimes_by_municipio:
    if i not in unique_id_locations:
        print('id_municipio {} does not exist in locations data'.format(i))

print('No missing values')

if len_id_df == len_id_locations:
    print('Equal lenghts')

print('Ready to merge crimes and locations')

No missing values
Ready to merge crimes and locations


#### Merging location information

In [50]:
crimes_by_municipio = crimes_by_municipio.merge(locations, how='left', on='id_municipio')
crimes_by_municipio

Unnamed: 0,id_municipio,entidad,municipio,carpetas,total,crimes_10k,lat,lon
0,1001,Aguascalientes,Aguascalientes,11091,877190.0,126.437830,21.879823,-102.296047
1,1002,Aguascalientes,Asientos,411,46464.0,88.455579,22.238317,-102.089275
2,1003,Aguascalientes,Calvillo,304,56048.0,54.239224,21.846907,-102.718751
3,1004,Aguascalientes,Cosio,148,15577.0,95.011876,22.366409,-102.300044
4,1005,Aguascalientes,Jesus Maria,1498,120405.0,124.413438,21.961273,-102.343416
...,...,...,...,...,...,...,...,...
2452,32054,Zacatecas,Villa Hidalgo,45,19155.0,23.492561,22.357088,-101.712599
2453,32055,Zacatecas,Villanueva,174,30240.0,57.539683,22.354259,-102.883726
2454,32056,Zacatecas,Zacatecas,1942,146147.0,132.879909,22.776096,-102.571836
2455,32057,Zacatecas,Trancoso,178,19413.0,91.691135,22.735389,-102.366038


#### Creating scales

In [51]:
data_map_municipios = crimes_by_municipio
data_map_municipios = data_map_municipios.dropna()

limits = []
for x in range(0, 101, 20):
    limits.append(np.percentile(data_map_municipios['crimes_10k'], x))

data_map_municipios['class'] = data_map_municipios['crimes_10k'].apply(lambda x:
                                        0 if (x >= limits[0] and x < limits[1])
                                        else
                                        (1 if (x > limits[1] and x < limits[2]) 
                                        else 
                                        (2 if (x > limits[2] and x < limits[3])
                                        else 
                                        (3 if (x > limits[3] and x < limits[4])
                                        else 4))))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_map_municipios['class'] = data_map_municipios['crimes_10k'].apply(lambda x:


In [52]:
data_map_municipios.head()

Unnamed: 0,id_municipio,entidad,municipio,carpetas,total,crimes_10k,lat,lon,class
0,1001,Aguascalientes,Aguascalientes,11091,877190.0,126.43783,21.879823,-102.296047,4
1,1002,Aguascalientes,Asientos,411,46464.0,88.455579,22.238317,-102.089275,4
2,1003,Aguascalientes,Calvillo,304,56048.0,54.239224,21.846907,-102.718751,3
3,1004,Aguascalientes,Cosio,148,15577.0,95.011876,22.366409,-102.300044,4
4,1005,Aguascalientes,Jesus Maria,1498,120405.0,124.413438,21.961273,-102.343416,4


#### Map

In [53]:
import folium

map = folium.Map(location = [20.693943, -100.985880], zoom_start=5.3)

data1 = data_map_municipios

data = data_map_municipios

for state in data1['entidad'].unique():
    data = data1[data1['entidad'] == state]

    colors = ['forestgreen', 'lime', 'yellow', 'lightcoral', 'firebrick']
    for cluster in range(5):
        color_cluster = colors[cluster]
        for point in range(len(data[data['class']==cluster])):
            folium.CircleMarker(location=[data.iloc[point]['lat'], data.iloc[point]['lon']],
            radius=8,
            color=color_cluster,
            fill_color=color_cluster,
            fill=True
            ).add_to(map)



map.save('map_colors.html')
map

In [54]:
data[data['municipio'] == 'Puebla']
  

Unnamed: 0,id_municipio,entidad,municipio,carpetas,total,crimes_10k,lat,lon,class


In [55]:
import folium
from folium.plugins import HeatMap

data['map_input'] = data.apply(lambda x: [x['lat'], x['lon'], x['crimes_10k']], axis=1)
map_data = data['map_input']

# Create map object
map = folium.Map(location = [20.693943, -100.985880], zoom_start=5.3)

HeatMap(map_data).add_to(map)

map.save('map1.html')

map

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['map_input'] = data.apply(lambda x: [x['lat'], x['lon'], x['crimes_10k']], axis=1)


In [56]:
# Exporting data map to excel
data.to_csv('output data/data_map_mun.csv')

#### Aggregating on neighborhood scope

In [57]:
# Importing location of neighborhoods
locations = pd.read_excel('data/codigos-postales-mexico.xlsx')

# Cleaning
# Column names
locations.columns = [x.strip().lower() for x in locations.columns]
locations = locations.rename(columns={'código': 'código postal',
                                        'estado': 'entidad'})

# CDMX for DF
locations = locations.replace('Distrito Federal', 'Ciudad de México')

# Stripping municipio
locations['municipio_strip'] = locations['municipio'].replace(' ', '', regex=True)

# Generating municipio_entidad
locations['entidad_municipio'] = locations['entidad'] + '-' + locations['municipio']

# Converting into lowercase
locations['entidad_municipio'] = locations['entidad_municipio'].apply(lambda x: x.lower())

locations.head()

Unnamed: 0,código postal,asentamiento,tipo,municipio,ciudad,entidad,municipio_strip,entidad_municipio
0,1000,San Angel,Colonia,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón,ciudad de méxico-álvaro obregón
1,1010,Los Alpes,Colonia,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón,ciudad de méxico-álvaro obregón
2,1020,Guadalupe Inn,Colonia,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón,ciudad de méxico-álvaro obregón
3,1028,Secretaria de Contraloría y Desarrollo Adminis...,Gran usuario,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón,ciudad de méxico-álvaro obregón
4,1029,INFONAVIT,Gran usuario,Álvaro Obregón,Ciudad de México,Ciudad de México,ÁlvaroObregón,ciudad de méxico-álvaro obregón


In [58]:
# Cleaning data dataset
data = data[['id_municipio', 'entidad', 'municipio', 'carpetas', 'total',
       'crimes_10k', 'class']]

# Changing abbreviations
data = data.replace({'Dr.': 'Doctor',
                    'Gral.': 'General'},
                    regex=True)

data['entidad_municipio'] = data['entidad'] + '-' + data['municipio']

data['entidad_municipio'] = data['entidad_municipio'].apply(lambda x: x.lower())


data

Unnamed: 0,id_municipio,entidad,municipio,carpetas,total,crimes_10k,class,entidad_municipio
2399,32001,Zacatecas,Apozol,20,6086.0,32.862307,2,zacatecas-apozol
2400,32002,Zacatecas,Apulco,5,4738.0,10.552976,1,zacatecas-apulco
2401,32003,Zacatecas,Atolinga,12,2427.0,49.443758,3,zacatecas-atolinga
2402,32004,Zacatecas,Benito Juarez,9,3990.0,22.556391,2,zacatecas-benito juarez
2403,32005,Zacatecas,Calera,387,45204.0,85.611893,4,zacatecas-calera
2404,32006,Zacatecas,Cañitas De Felipe Pescador,22,8393.0,26.21232,2,zacatecas-cañitas de felipe pescador
2405,32007,Zacatecas,Concepcion Del Oro,130,12944.0,100.432633,4,zacatecas-concepcion del oro
2406,32008,Zacatecas,Cuauhtemoc,40,12590.0,31.771247,2,zacatecas-cuauhtemoc
2407,32009,Zacatecas,Chalchihuites,17,11416.0,14.891381,1,zacatecas-chalchihuites
2408,32010,Zacatecas,Fresnillo,2628,230865.0,113.832759,4,zacatecas-fresnillo


Coinciding values entidad_municipio

In [59]:
# Creating list of unique ids in each dataset
unique_mun_locations = locations['entidad_municipio'].unique()
unique_mun_data = data['entidad_municipio'].unique()

# Len of the lists
len_mun_locations = len(locations['entidad_municipio'].unique())
len_mun_data = len(data['entidad_municipio'].unique())

for i in unique_mun_data:
    if i not in unique_mun_locations:
        print('entidad_municipio {} does not exist in locations data'.format(i))

#for i in unique_mun_locations:
 #   if i not in unique_mun_data:
  #      print('entidad_municipio {} does not exist in data data'.format(i))

print('No missing values')

if len_mun_locations == len_mun_data:
    print('Equal lenghts')

print('Ready to merge crimes and data')

entidad_municipio zacatecas-benito juarez does not exist in locations data
entidad_municipio zacatecas-concepcion del oro does not exist in locations data
entidad_municipio zacatecas-cuauhtemoc does not exist in locations data
entidad_municipio zacatecas-trinidad garcia de la cadena does not exist in locations data
entidad_municipio zacatecas-general francisco r. murguia does not exist in locations data
entidad_municipio zacatecas-el plateado de joaquin amaro does not exist in locations data
entidad_municipio zacatecas-general panfilo natera does not exist in locations data
entidad_municipio zacatecas-jimenez del teul does not exist in locations data
entidad_municipio zacatecas-nochistlan de mejia does not exist in locations data
entidad_municipio zacatecas-noria de angeles does not exist in locations data
entidad_municipio zacatecas-panuco does not exist in locations data
entidad_municipio zacatecas-rio grande does not exist in locations data
entidad_municipio zacatecas-susticacan doe

In [60]:
# Cleaning acentos
locations = locations.replace({'Á': 'A', 'á': 'a',
                    'É': 'E', 'é': 'e',
                    'Í': 'I', 'í': 'i',
                    'Ó': 'O', 'ó': 'o',
                    'Ú': 'U', 'ú': 'u'},
                    regex=True)

data = data.replace({'Á': 'A', 'á': 'a',
                    'É': 'E', 'é': 'e',
                    'Í': 'I', 'í': 'i',
                    'Ó': 'O', 'ó': 'o',
                    'Ú': 'U', 'ú': 'u'},
                    regex=True)
locations

Unnamed: 0,código postal,asentamiento,tipo,municipio,ciudad,entidad,municipio_strip,entidad_municipio
0,1000,San Angel,Colonia,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon
1,1010,Los Alpes,Colonia,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon
2,1020,Guadalupe Inn,Colonia,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon
3,1028,Secretaria de Contraloria y Desarrollo Adminis...,Gran usuario,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon
4,1029,INFONAVIT,Gran usuario,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon
...,...,...,...,...,...,...,...,...
143219,99993,Cuxpala,Pueblo,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada,zacatecas-moyahua de estrada
143220,99994,Vicente Guerrero,Pueblo,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada,zacatecas-moyahua de estrada
143221,99998,Palmarejo,Rancheria,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada,zacatecas-moyahua de estrada
143222,99998,Jesus Maria,Rancheria,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada,zacatecas-moyahua de estrada


In [61]:
# Creating list of unique ids in each dataset
unique_mun_locations = locations['entidad_municipio'].unique()
unique_mun_data = data['entidad_municipio'].unique()

# Len of the lists
len_mun_locations = len(locations['entidad_municipio'].unique())
len_mun_data = len(data['entidad_municipio'].unique())

for i in unique_mun_data:
    if i not in unique_mun_locations:
        print('entidad_municipio {} does not exist in locations data'.format(i))

#for i in unique_mun_locations:
 #   if i not in unique_mun_data:
  #      print('entidad_municipio {} does not exist in data data'.format(i))

print('No missing values')

if len_mun_locations == len_mun_data:
    print('Equal lenghts')

print('Ready to merge crimes and data')

No missing values
Ready to merge crimes and data


In [62]:
# Replacing data mannually
locations['entidad_municipio'] = locations['entidad_municipio'].replace({
                                                                        'guanajuato-silao de la victoria': 'guanajuato-silao',
                                                                        'jalisco-san pedro tlaquepaque': 'jalisco-tlaquepaque',
                                                                        'mexico-acambay de ruiz castañeda': 'mexico-acambay',
                                                                        'morelos-tlaltizapan de zapata': 'morelos-tlaltizapan',
                                                                        'chihuahua-dr. belisario dominguez': 'chihuahua-doctor belisario dominguez',
                                                                        'nuevo leon-el carmen': 'nuevo leon-carmen'
                                                                        })

In [63]:
# Creating list of unique ids in each dataset
unique_mun_locations = locations['entidad_municipio'].unique()
unique_mun_data = data['entidad_municipio'].unique()

# Len of the lists
len_mun_locations = len(locations['entidad_municipio'].unique())
len_mun_data = len(data['entidad_municipio'].unique())

for i in unique_mun_data:
    if i not in unique_mun_locations:
        print('entidad_municipio {} does not exist in locations data'.format(i))

for i in unique_mun_locations:
    if i not in unique_mun_data:
        print('entidad_municipio {} does not exist in data data'.format(i))

print('No missing values')

if len_mun_locations == len_mun_data:
    print('Equal lenghts')

print('Ready to merge crimes and data')

entidad_municipio ciudad de mexico-alvaro obregon does not exist in data data
entidad_municipio ciudad de mexico-azcapotzalco does not exist in data data
entidad_municipio ciudad de mexico-benito juarez does not exist in data data
entidad_municipio ciudad de mexico-coyoacan does not exist in data data
entidad_municipio ciudad de mexico-cuajimalpa de morelos does not exist in data data
entidad_municipio ciudad de mexico-cuauhtemoc does not exist in data data
entidad_municipio ciudad de mexico-gustavo a. madero does not exist in data data
entidad_municipio ciudad de mexico-iztacalco does not exist in data data
entidad_municipio ciudad de mexico-iztapalapa does not exist in data data
entidad_municipio ciudad de mexico-la magdalena contreras does not exist in data data
entidad_municipio ciudad de mexico-miguel hidalgo does not exist in data data
entidad_municipio ciudad de mexico-milpa alta does not exist in data data
entidad_municipio ciudad de mexico-tlahuac does not exist in data data
e

In [64]:
data[data['entidad_municipio'].str.contains('oaxaca-zapotitlan')]

Unnamed: 0,id_municipio,entidad,municipio,carpetas,total,crimes_10k,class,entidad_municipio


In [65]:
locations[locations['entidad_municipio'].str.contains('oaxaca-zapotitlan')]


Unnamed: 0,código postal,asentamiento,tipo,municipio,ciudad,entidad,municipio_strip,entidad_municipio
91154,69060,Zapotitlan Palmas,Rancheria,Zapotitlan Palmas,,Oaxaca,ZapotitlanPalmas,oaxaca-zapotitlan palmas
91176,69120,Zapotitlan Lagunas,Pueblo,Zapotitlan Lagunas,,Oaxaca,ZapotitlanLagunas,oaxaca-zapotitlan lagunas
91177,69123,Cerro los Cuates,Rancheria,Zapotitlan Lagunas,,Oaxaca,ZapotitlanLagunas,oaxaca-zapotitlan lagunas
91178,69123,El Naranjo,Rancheria,Zapotitlan Lagunas,,Oaxaca,ZapotitlanLagunas,oaxaca-zapotitlan lagunas
91179,69123,Barrio San Isidro,Barrio,Zapotitlan Lagunas,,Oaxaca,ZapotitlanLagunas,oaxaca-zapotitlan lagunas
91180,69123,El Palmon Largo,Rancheria,Zapotitlan Lagunas,,Oaxaca,ZapotitlanLagunas,oaxaca-zapotitlan lagunas
91181,69123,La Soledad,Rancheria,Zapotitlan Lagunas,,Oaxaca,ZapotitlanLagunas,oaxaca-zapotitlan lagunas
91182,69123,San Pedro Cuaxoxocatla,Rancheria,Zapotitlan Lagunas,,Oaxaca,ZapotitlanLagunas,oaxaca-zapotitlan lagunas
91183,69123,San Miguel Hidalgo,Rancheria,Zapotitlan Lagunas,,Oaxaca,ZapotitlanLagunas,oaxaca-zapotitlan lagunas
91184,69123,Guadalupe Buenos Aires,Rancheria,Zapotitlan Lagunas,,Oaxaca,ZapotitlanLagunas,oaxaca-zapotitlan lagunas


In [66]:
# Merging
colonias = pd.merge(locations, data, how='left', on='entidad_municipio')
# CLeaning columns
colonias.drop(columns=['entidad_y', 'municipio_y'], inplace=True)
colonias.rename(columns={'entidad_x': 'entidad', 'municipio_x': 'municipio'}, inplace=True)
colonias.head()

Unnamed: 0,código postal,asentamiento,tipo,municipio,ciudad,entidad,municipio_strip,entidad_municipio,id_municipio,carpetas,total,crimes_10k,class
0,1000,San Angel,Colonia,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon,,,,,
1,1010,Los Alpes,Colonia,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon,,,,,
2,1020,Guadalupe Inn,Colonia,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon,,,,,
3,1028,Secretaria de Contraloria y Desarrollo Adminis...,Gran usuario,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon,,,,,
4,1029,INFONAVIT,Gran usuario,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon,,,,,


Adding coordenates for every zip code

In [67]:
# Importing data
coordenates_cp = pd.read_csv('data/coordenates_colonias.csv', usecols=[0,3,6,8,9])
# Cleaning columns
coordenates_cp.columns = ['id_entidad', 'id_municipio', 'localidad', 'lat', 'lon']
coordenates_cp.columns = [x.lower().strip() for x in coordenates_cp.columns]
# Generating id_municipio full
coordenates_cp['id_municipio'] = coordenates_cp['id_entidad']*1000 + coordenates_cp['id_municipio']
coordenates_cp

Unnamed: 0,id_entidad,id_municipio,localidad,lat,lon
0,1,1001,Aguascalientes,21.879823,-102.296047
1,1,1001,Granja Adelita,21.871875,-102.373531
2,1,1001,Agua Azul,21.883756,-102.357122
3,1,1001,Rancho Alegre,21.854599,-102.372746
4,1,1001,Los Arbolitos [Rancho],21.780181,-102.357295
...,...,...,...,...,...
304216,32,32058,San Isidro,21.486425,-103.337268
304217,32,32058,San José,21.516539,-103.456568
304218,32,32058,San Miguel Tepetitlán,21.504213,-103.335932
304219,32,32058,San Rafael,21.527595,-103.372259


In [68]:
# Creating list of unique ids in each coordenates_cpset
unique_mun_colonias = colonias['id_municipio'].unique()
unique_mun_coordenates_cp = coordenates_cp['id_municipio'].unique()

# Len of the lists
len_mun_colonias = len(colonias['id_municipio'].unique())
len_mun_coordenates_cp = len(coordenates_cp['id_municipio'].unique())

for i in unique_mun_coordenates_cp:
    if i not in unique_mun_colonias:
        print('id_municipio {} does not exist in colonias coordenates_cp'.format(i))

for i in unique_mun_colonias:
    if i not in unique_mun_coordenates_cp:
        print('id_municipio {} does not exist in coordenates_cp coordenates_cp'.format(i))


if len_mun_colonias == len_mun_coordenates_cp:
    print('Equal lenghts')


id_municipio 1001 does not exist in colonias coordenates_cp
id_municipio 1002 does not exist in colonias coordenates_cp
id_municipio 1003 does not exist in colonias coordenates_cp
id_municipio 1004 does not exist in colonias coordenates_cp
id_municipio 1005 does not exist in colonias coordenates_cp
id_municipio 1006 does not exist in colonias coordenates_cp
id_municipio 1007 does not exist in colonias coordenates_cp
id_municipio 1008 does not exist in colonias coordenates_cp
id_municipio 1009 does not exist in colonias coordenates_cp
id_municipio 1010 does not exist in colonias coordenates_cp
id_municipio 1011 does not exist in colonias coordenates_cp
id_municipio 2001 does not exist in colonias coordenates_cp
id_municipio 2002 does not exist in colonias coordenates_cp
id_municipio 2003 does not exist in colonias coordenates_cp
id_municipio 2004 does not exist in colonias coordenates_cp
id_municipio 2005 does not exist in colonias coordenates_cp
id_municipio 3001 does not exist in colo

In [69]:
colonias

Unnamed: 0,código postal,asentamiento,tipo,municipio,ciudad,entidad,municipio_strip,entidad_municipio,id_municipio,carpetas,total,crimes_10k,class
0,1000,San Angel,Colonia,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon,,,,,
1,1010,Los Alpes,Colonia,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon,,,,,
2,1020,Guadalupe Inn,Colonia,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon,,,,,
3,1028,Secretaria de Contraloria y Desarrollo Adminis...,Gran usuario,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon,,,,,
4,1029,INFONAVIT,Gran usuario,Alvaro Obregon,Ciudad de Mexico,Ciudad de Mexico,AlvaroObregon,ciudad de mexico-alvaro obregon,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
143219,99993,Cuxpala,Pueblo,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada,zacatecas-moyahua de estrada,32033.0,9.0,3947.0,22.802128,2.0
143220,99994,Vicente Guerrero,Pueblo,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada,zacatecas-moyahua de estrada,32033.0,9.0,3947.0,22.802128,2.0
143221,99998,Palmarejo,Rancheria,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada,zacatecas-moyahua de estrada,32033.0,9.0,3947.0,22.802128,2.0
143222,99998,Jesus Maria,Rancheria,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada,zacatecas-moyahua de estrada,32033.0,9.0,3947.0,22.802128,2.0


Merging colonias with coordenates_cp

In [70]:
data_map_cp = pd.merge(data_map_municipios.drop(columns=['lat', 'lon']), coordenates_cp, how='right', on='id_municipio')
data_map_cp.head()

Unnamed: 0,id_municipio,entidad,municipio,carpetas,total,crimes_10k,class,id_entidad,localidad,lat,lon
0,1001,Aguascalientes,Aguascalientes,11091.0,877190.0,126.43783,4.0,1,Aguascalientes,21.879823,-102.296047
1,1001,Aguascalientes,Aguascalientes,11091.0,877190.0,126.43783,4.0,1,Granja Adelita,21.871875,-102.373531
2,1001,Aguascalientes,Aguascalientes,11091.0,877190.0,126.43783,4.0,1,Agua Azul,21.883756,-102.357122
3,1001,Aguascalientes,Aguascalientes,11091.0,877190.0,126.43783,4.0,1,Rancho Alegre,21.854599,-102.372746
4,1001,Aguascalientes,Aguascalientes,11091.0,877190.0,126.43783,4.0,1,Los Arbolitos [Rancho],21.780181,-102.357295


#### Mapping by zip code

In [71]:
import folium

data = data_map_cp

#data = data[data['municipio'] == 'Puebla']

data = data.iloc[192200: 192202]

data1 = data

map = folium.Map(location = [data.iloc[0]['lat'], data.iloc[0]['lon']], zoom_start=8)



for localidad in data1['localidad'].unique():
    data = data1[data1['localidad'] == localidad]

    colors = ['forestgreen', 'lime', 'yellow', 'lightcoral', 'firebrick']
    for cluster in range(5):
        color_cluster = colors[cluster]
        for point in range(len(data[data['class']==cluster])):
            folium.CircleMarker(location=[data.iloc[point]['lat'], data.iloc[point]['lon']],
            radius=8,
            color=color_cluster,
            fill_color=color_cluster,
            fill=True
            ).add_to(map)

data = data1

map.save('map_colonias.html')

In [73]:
# Exporting output
data_map_cp.to_csv('output data/data_map_cp.csv')