## Crimes by municipality Mexico
Focus crimes: Secuestro, Extorsión, Robo a negocio


In [134]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import regex as re

%matplotlib inline

### 1.- Importing data

In [135]:
# Database with crimes information
df = pd.read_csv('delitos-datos-abiertos.csv')
# Database with population information
pop = pd.read_csv('poblaciones_2015.csv', skiprows=4, encoding='latin-1', skipfooter=6, engine='python')
# Database with cp information
cp = pd.read_excel('codigos-postales-mexico.xlsx')

# Formatting column names
df.columns = [x.lower().strip() for x in df.columns]
pop.columns = [x.lower().strip() for x in pop.columns]
cp.columns = [x.lower().strip() for x in cp.columns]

# Formatting date columns
df['fecha'] = pd.to_datetime(df['fecha'], infer_datetime_format=True)

# Renaming columns
df.rename(columns={'inegi_entidad': 'id_entidad',
                    'inegi_municipio': 'id_municipio'}, inplace=True)

pop.drop(columns='id_municipio', inplace=True)

pop.rename(columns={'estado': 'entidad',
                    'cve_inegi': 'id_municipio',
                    'id_estado': 'id_entidad'}, inplace=True)
                    
cp.rename(columns={'estado': 'entidad'}, inplace=True)

df.head()

Unnamed: 0,id_entidad,entidad,id_municipio,municipio,id_delito,delito,carpetas,tasa,fecha
0,1,Aguascalientes,1001,Aguascalientes,1100,Homicidio doloso,3,0.323526,2021-01-01
1,1,Aguascalientes,1002,Asientos,1100,Homicidio doloso,0,0.0,2021-01-01
2,1,Aguascalientes,1010,El Llano,1100,Homicidio doloso,0,0.0,2021-01-01
3,1,Aguascalientes,1009,Tepezala,1100,Homicidio doloso,0,0.0,2021-01-01
4,1,Aguascalientes,1007,Rincon De Romos,1100,Homicidio doloso,0,0.0,2021-01-01


In [136]:
pop.head()

Unnamed: 0,entidad,municipio,id_municipio,id_entidad,hombres,mujeres,total
0,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190
1,Aguascalientes,Asientos,1002,1,22745,23719,46464
2,Aguascalientes,Calvillo,1003,1,27298,28750,56048
3,Aguascalientes,Cosío,1004,1,7552,8025,15577
4,Aguascalientes,Jesús María,1005,1,60135,60270,120405


In [137]:
df.tail()

Unnamed: 0,id_entidad,entidad,id_municipio,municipio,id_delito,delito,carpetas,tasa,fecha
471739,32,Zacatecas,32047,Teul De Gonzalez Ortega,9000,Lesiones dolosas,1,17.430714,2021-12-01
471740,32,Zacatecas,32007,Concepcion Del Oro,9000,Lesiones dolosas,1,7.334067,2021-12-01
471741,32,Zacatecas,32008,Cuauhtemoc,9000,Lesiones dolosas,0,0.0,2021-12-01
471742,32,Zacatecas,32016,General Panfilo Natera,9000,Lesiones dolosas,1,4.203977,2021-12-01
471743,32,Zacatecas,32056,Zacatecas,9000,Lesiones dolosas,23,15.266466,2021-12-01


In [138]:
pop.tail()

Unnamed: 0,entidad,municipio,id_municipio,id_entidad,hombres,mujeres,total
2452,Zacatecas,Villa Hidalgo,32054,32,9433,9722,19155
2453,Zacatecas,Villanueva,32055,32,14793,15447,30240
2454,Zacatecas,Zacatecas,32056,32,70855,75292,146147
2455,Zacatecas,Trancoso,32057,32,9505,9908,19413
2456,Zacatecas,Santa María de la Paz *,32058,32,1305,1351,2656


In [139]:
cp.head()

Unnamed: 0,código,asentamiento,tipo,municipio,ciudad,entidad
0,1000,San Angel,Colonia,Álvaro Obregón,Ciudad de México,Distrito Federal
1,1010,Los Alpes,Colonia,Álvaro Obregón,Ciudad de México,Distrito Federal
2,1020,Guadalupe Inn,Colonia,Álvaro Obregón,Ciudad de México,Distrito Federal
3,1028,Secretaria de Contraloría y Desarrollo Adminis...,Gran usuario,Álvaro Obregón,Ciudad de México,Distrito Federal
4,1029,INFONAVIT,Gran usuario,Álvaro Obregón,Ciudad de México,Distrito Federal


### 1.1 Cleaning data

In [108]:
# Cleaning  * from names in pop df
pop['municipio'] = pop['municipio'].replace(' *', '', regex=True)
pop['municipio'] = pop['municipio'].replace('\*', '', regex=True)

# Changing Distrito Federal for CDMX
pop['entidad'] = pop['entidad'].replace('Distrito Federal', 'Ciudad de México')
df['entidad'] = df['entidad'].replace('Distrito Federal', 'Ciudad de México')
cp['entidad'] = cp['entidad'].replace('Distrito Federal', 'Ciudad de México')

pop.tail()

Unnamed: 0,entidad,municipio,id_municipio,id_entidad,hombres,mujeres,total
2452,Zacatecas,VillaHidalgo,32054,32,9433,9722,19155
2453,Zacatecas,Villanueva,32055,32,14793,15447,30240
2454,Zacatecas,Zacatecas,32056,32,70855,75292,146147
2455,Zacatecas,Trancoso,32057,32,9505,9908,19413
2456,Zacatecas,SantaMaríadelaPaz,32058,32,1305,1351,2656


In [109]:
# Separating words in pop df
string = 'VillaHidalgo'
words = re.findall('[A-Z][a-z]*', string)
string2 = ' '.join(words)
print(string2)

# Deleting spaces
string = 'Villa Hidalgo'
string3 = string.replace(' ', '')
string3

Villa Hidalgo


'VillaHidalgo'

In [110]:
# Applying functions
pop['municipio_sep'] = pop['municipio'].apply(lambda x: ' '.join(re.findall('[A-Z][a-zÀ-ÿ]*', x)))
cp['municipio_strip'] = cp['municipio'].apply(lambda x: x.replace(' ', ''))

In [111]:
cp

Unnamed: 0,código,asentamiento,tipo,municipio,ciudad,entidad,municipio_strip
0,1000,San Angel,Colonia,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón
1,1010,Los Alpes,Colonia,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón
2,1020,Guadalupe Inn,Colonia,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón
3,1028,Secretaria de Contraloría y Desarrollo Adminis...,Gran usuario,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón
4,1029,INFONAVIT,Gran usuario,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón
...,...,...,...,...,...,...,...
143219,99993,Cuxpala,Pueblo,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada
143220,99994,Vicente Guerrero,Pueblo,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada
143221,99998,Palmarejo,Ranchería,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada
143222,99998,Jesús Maria,Ranchería,Moyahua de Estrada,,Zacatecas,MoyahuadeEstrada


### 2.- Exploring data

#### 2.1.- Exploring datatypes

In [112]:
df.dtypes

id_entidad               int64
entidad                 object
id_municipio             int64
municipio               object
id_delito                int64
delito                  object
carpetas                 int64
tasa                   float64
fecha           datetime64[ns]
dtype: object

In [113]:
pop.dtypes

entidad          object
municipio        object
id_municipio      int64
id_entidad        int64
hombres           int64
mujeres           int64
total             int64
municipio_sep    object
dtype: object

In [114]:
cp.dtypes

código              int64
asentamiento       object
tipo               object
municipio          object
ciudad             object
entidad            object
municipio_strip    object
dtype: object

#### 2.2.- Cleaning data to merge

Checking id_municipio

In [115]:
# Creating list of unique ids in each dataset
unique_id_df = df['id_municipio'].unique()
unique_id_pop = pop['id_municipio'].unique()

# Len of the lists
len_id_df = len(df['id_municipio'].unique())
len_id_pop = len(pop['id_municipio'].unique())

print(len_id_df)
print(len_id_pop)

for i in unique_id_df:
    if i not in unique_id_pop:
        m = df[df['id_municipio'] == i].iloc[0]['municipio']
        print('id {} ({}) does not exist in pop data'.format(i, m))

print('\n')


2457
2457
id 7101 (Tuxtla Gutierrez) does not exist in pop data
id 7109 (Yajalon) does not exist in pop data
id 7110 (San Lucas) does not exist in pop data
id 7119 (Santiago El Pinar) does not exist in pop data
id 7114 (Benemerito De Las Americas) does not exist in pop data
id 7107 (Villa Corzo) does not exist in pop data
id 7117 (Montecristo De Guerrero) does not exist in pop data
id 7105 (Union Juarez) does not exist in pop data
id 7111 (Zinacantan) does not exist in pop data
id 7118 (San Andres Duraznal) does not exist in pop data
id 7102 (Tuxtla Chico) does not exist in pop data
id 7116 (Marques De Comillas) does not exist in pop data
id 7100 (Tumbala) does not exist in pop data
id 7108 (Villaflores) does not exist in pop data
id 7103 (Tuzantan) does not exist in pop data
id 7104 (Tzimol) does not exist in pop data
id 7113 (Aldama) does not exist in pop data
id 7112 (San Juan Cancuc) does not exist in pop data
id 7115 (Maravilla Tenejapa) does not exist in pop data
id 7106 (Venusti

In [116]:
for i in unique_id_pop:
    if i not in unique_id_df:
        m = pop[pop['id_municipio'] == i].iloc[0]['municipio']
        print('id {} ({}) does not exist in crimes data'.format(i, m))

id 70100 (Tumbalá) does not exist in crimes data
id 70101 (TuxtlaGutiérrez) does not exist in crimes data
id 70102 (TuxtlaChico) does not exist in crimes data
id 70103 (Tuzantán) does not exist in crimes data
id 70104 (Tzimol) does not exist in crimes data
id 70105 (UniónJuárez) does not exist in crimes data
id 70106 (VenustianoCarranza) does not exist in crimes data
id 70107 (VillaCorzo) does not exist in crimes data
id 70108 (Villaflores) does not exist in crimes data
id 70109 (Yajalón) does not exist in crimes data
id 70110 (SanLucas) does not exist in crimes data
id 70111 (Zinacantán) does not exist in crimes data
id 70112 (SanJuanCancuc) does not exist in crimes data
id 70113 (Aldama) does not exist in crimes data
id 70114 (BeneméritodelasAméricas) does not exist in crimes data
id 70115 (MaravillaTenejapa) does not exist in crimes data
id 70116 (MarquésdeComillas) does not exist in crimes data
id 70117 (MontecristodeGuerrero) does not exist in crimes data
id 70118 (SanAndrésDurazn

In [117]:
# Fixing misslabeled data
for i in unique_id_pop:
    if i not in unique_id_df:
        to_replace_with = i - 70000 + 7000
        print('{} replaced with {}'.format(i, to_replace_with))
        pop['id_municipio'] = pop['id_municipio'].replace(i, to_replace_with)

70100 replaced with 7100
70101 replaced with 7101
70102 replaced with 7102
70103 replaced with 7103
70104 replaced with 7104
70105 replaced with 7105
70106 replaced with 7106
70107 replaced with 7107
70108 replaced with 7108
70109 replaced with 7109
70110 replaced with 7110
70111 replaced with 7111
70112 replaced with 7112
70113 replaced with 7113
70114 replaced with 7114
70115 replaced with 7115
70116 replaced with 7116
70117 replaced with 7117
70118 replaced with 7118
70119 replaced with 7119


In [118]:
# Creating list of unique ids in each dataset
unique_id_df = df['id_municipio'].unique()
unique_id_pop = pop['id_municipio'].unique()

# Len of the lists
len_id_df = len(df['id_municipio'].unique())
len_id_pop = len(pop['id_municipio'].unique())

for i in unique_id_pop:
    if i not in unique_id_df:
        print('id {} does not exist in crimes data'.format(i))

for i in unique_id_df:
    if i not in unique_id_pop:
        print('id {} does not exist in crimes data'.format(i))

print('No missing values')

if len_id_df == len_id_pop:
    print('Equal lenghts')

print('Ready to merge crimes and pop')


No missing values
Equal lenghts
Ready to merge crimes and pop


Checking municipio column

In [119]:
# Creating list of unique ids in each dataset
unique_mun_cp = cp['municipio_strip'].unique()
unique_mun_pop = pop['municipio'].unique()

# Len of the lists
len_mun_cp = len(cp['municipio_strip'].unique())
len_mun_pop = len(pop['municipio'].unique())

print(len_mun_cp)
print(len_mun_pop)
print(len_mun_pop - len_mun_cp)

print('\n')

for i in unique_mun_cp:
    if i not in unique_mun_pop:
        print('Municipio {}  does not exist in pop data'.format(i))



2318
2317
-1


Municipio TezoatlándeSegurayLuna  does not exist in pop data
Municipio SanJuanMixtepec-Dto.08-  does not exist in pop data
Municipio SanJuanMixtepec-Dto.26-  does not exist in pop data
Municipio SanPedroMixtepec-Dto.26-  does not exist in pop data
Municipio SanPedroMixtepec-Dto.22-  does not exist in pop data
Municipio Medellín  does not exist in pop data


In [120]:
# Cleaning -Dto.xx- from name in cp database
cp['municipio_strip'] = cp['municipio_strip'].replace('\-.+\..+', '', regex=True)

# Creating list of unique ids in each dataset
unique_mun_cp = cp['municipio_strip'].unique()
unique_mun_pop = pop['municipio'].unique()

# Len of the lists
len_mun_cp = len(cp['municipio_strip'].unique())
len_mun_pop = len(pop['municipio'].unique())

print(len_mun_cp)
print(len_mun_pop)
print(len_mun_pop - len_mun_cp)

print('\n')

for i in unique_mun_cp:
    if i not in unique_mun_pop:
        print('Municipio {}  does not exist in pop data'.format(i))

2316
2317
1


Municipio TezoatlándeSegurayLuna  does not exist in pop data
Municipio Medellín  does not exist in pop data


In [121]:
for i in unique_mun_pop:
    if i not in unique_mun_cp:
        print('Municipio {}  does not exist in cp data'.format(i))

Municipio ZacualpandeAmilpas  does not exist in cp data
Municipio HeroicaVillaTezoatlándeSegurayLuna,CunadelaIndependenciadeOaxaca  does not exist in cp data
Municipio MedellíndeBravo  does not exist in cp data


In [122]:
# Replacing values manually
pop['municipio'] = pop['municipio'].replace('HeroicaVillaTezoatlándeSegurayLuna,CunadelaIndependenciadeOaxaca',
                                            'TezoatlándeSegurayLuna')

pop['municipio'] = pop['municipio'].replace('MedellíndeBravo', 'Medellín')

In [123]:
# Cleaning -Dto.xx- from name in cp database
cp['municipio_strip'] = cp['municipio_strip'].replace('\-.+\..+', '', regex=True)

# Creating list of unique ids in each dataset
unique_mun_cp = cp['municipio_strip'].unique()
unique_mun_pop = pop['municipio'].unique()

# Len of the lists
len_mun_cp = len(cp['municipio_strip'].unique())
len_mun_pop = len(pop['municipio'].unique())

print(len_mun_cp)
print(len_mun_pop)
print(len_mun_pop - len_mun_cp)

print('\n')

for i in unique_mun_pop:
    if i not in unique_mun_cp:
        print('Municipio {} does not exist in cp data'.format(i))

pop[pop['municipio'] == 'ZacualpandeAmilpas']

2316
2317
1


Municipio ZacualpandeAmilpas does not exist in cp data


Unnamed: 0,entidad,municipio,id_municipio,id_entidad,hombres,mujeres,total,municipio_sep
925,Morelos,ZacualpandeAmilpas,17032,17,4550,4820,9370,Zacualpande Amilpas


In [124]:
# Correcting last value
pop['municipio'] = pop['municipio'].replace('ZacualpandeAmilpas', 'Zacualpan')

In [125]:
# Creating list of unique ids in each dataset
unique_mun_cp = cp['municipio_strip'].unique()
unique_mun_pop = pop['municipio'].unique()

# Len of the lists
len_mun_cp = len(cp['municipio_strip'].unique())
len_mun_pop = len(pop['municipio'].unique())

for i in unique_mun_pop:
    if i not in unique_mun_cp:
        print('Municipio {} does not exist in cp data'.format(i))

for i in unique_mun_cp:
    if i not in unique_mun_pop:
        print('Municipio {} does not exist in cp data'.format(i))

print('No missing values')

if len_id_df == len_id_pop:
    print('Equal lenghts')

print('Ready to merge crimes and pop')


No missing values
Equal lenghts
Ready to merge crimes and pop


Checking entidad column

In [126]:
# Creating list of unique ids in each dataset
unique_mun_cp = cp['entidad'].unique()
unique_mun_pop = pop['entidad'].unique()

# Len of the lists
len_mun_cp = len(cp['entidad'].unique())
len_mun_pop = len(pop['entidad'].unique())

for i in unique_mun_pop:
    if i not in unique_mun_cp:
        print('entidad {} does not exist in cp data'.format(i))

for i in unique_mun_cp:
    if i not in unique_mun_pop:
        print('entidad {} does not exist in pop data'.format(i))

print('No missing values')

if len_id_df == len_id_pop:
    print('Equal lenghts')

print('Ready to merge crimes and pop')


No missing values
Equal lenghts
Ready to merge crimes and pop


### 3.- Merging

In [127]:
# Generating column entidad-municipio for pop
pop['entidad_municipio'] = pop['entidad'] + '-' + pop['municipio']
pop.head()

# Generating column entidad-municipio for cp
cp['entidad_municipio'] = cp['entidad'] + '-' + cp['municipio']
cp.head()

Unnamed: 0,código,asentamiento,tipo,municipio,ciudad,entidad,municipio_strip,entidad_municipio
0,1000,San Angel,Colonia,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón,CDMX-Álvaro Obregón
1,1010,Los Alpes,Colonia,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón,CDMX-Álvaro Obregón
2,1020,Guadalupe Inn,Colonia,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón,CDMX-Álvaro Obregón
3,1028,Secretaria de Contraloría y Desarrollo Adminis...,Gran usuario,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón,CDMX-Álvaro Obregón
4,1029,INFONAVIT,Gran usuario,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón,CDMX-Álvaro Obregón


In [133]:
df['entidad'].unique()

array(['Aguascalientes', 'Baja California', 'Baja California Sur',
       'Campeche', 'Coahuila de Zaragoza', 'Colima', 'Chiapas',
       'Chihuahua', 'Ciudad de México', 'Durango', 'Guanajuato',
       'Guerrero', 'Hidalgo', 'Jalisco', 'México', 'Michoacán de Ocampo',
       'Morelos', 'Nayarit', 'Nuevo León', 'Oaxaca', 'Puebla',
       'Querétaro', 'Quintana Roo', 'San Luis Potosí', 'Sinaloa',
       'Sonora', 'Tabasco', 'Tamaulipas', 'Tlaxcala',
       'Veracruz de Ignacio de la Llave', 'Yucatán', 'Zacatecas'],
      dtype=object)

In [129]:
df.head()

Unnamed: 0,id_entidad,entidad,id_municipio,municipio,id_delito,delito,carpetas,tasa,fecha
0,1,Aguascalientes,1001,Aguascalientes,1100,Homicidio doloso,3,0.323526,2021-01-01
1,1,Aguascalientes,1002,Asientos,1100,Homicidio doloso,0,0.0,2021-01-01
2,1,Aguascalientes,1010,El Llano,1100,Homicidio doloso,0,0.0,2021-01-01
3,1,Aguascalientes,1009,Tepezala,1100,Homicidio doloso,0,0.0,2021-01-01
4,1,Aguascalientes,1007,Rincon De Romos,1100,Homicidio doloso,0,0.0,2021-01-01


In [130]:
pop.head()

Unnamed: 0,entidad,municipio,id_municipio,id_entidad,hombres,mujeres,total,municipio_sep,entidad_municipio
0,Aguascalientes,Aguascalientes,1001,1,425731,451459,877190,Aguascalientes,Aguascalientes-Aguascalientes
1,Aguascalientes,Asientos,1002,1,22745,23719,46464,Asientos,Aguascalientes-Asientos
2,Aguascalientes,Calvillo,1003,1,27298,28750,56048,Calvillo,Aguascalientes-Calvillo
3,Aguascalientes,Cosío,1004,1,7552,8025,15577,Cosío,Aguascalientes-Cosío
4,Aguascalientes,JesúsMaría,1005,1,60135,60270,120405,Jesús María,Aguascalientes-JesúsMaría


In [131]:
cp.head()

Unnamed: 0,código,asentamiento,tipo,municipio,ciudad,entidad,municipio_strip,entidad_municipio
0,1000,San Angel,Colonia,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón,CDMX-Álvaro Obregón
1,1010,Los Alpes,Colonia,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón,CDMX-Álvaro Obregón
2,1020,Guadalupe Inn,Colonia,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón,CDMX-Álvaro Obregón
3,1028,Secretaria de Contraloría y Desarrollo Adminis...,Gran usuario,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón,CDMX-Álvaro Obregón
4,1029,INFONAVIT,Gran usuario,Álvaro Obregón,Ciudad de México,CDMX,ÁlvaroObregón,CDMX-Álvaro Obregón
