Author: Hector

Notice: This data is cummulative by year.

In [1]:
import pandas as pd
from glob import glob
import os
import numpy as np

In [2]:
# Finding all xls files in the directory of delitos


files = glob("../../data/raw/Delitos Tipo 1/*/*.xls")
files.extend(glob("../../data/raw/Delitos Tipo 1/*/*.xlsx"))

In [3]:
# When importing each xls, a new column will be created to identify the year-month by its file name
df = pd.DataFrame()
for f in files:
    
    tmp = pd.read_excel(f, sheet_name="MUNICIPIOS")
    # Dropping rows with all nans before assigning year_month value
    tmp.dropna(how='all', inplace=True)
    # Dropping columns with all nans as well
    tmp.dropna(axis=1, how='all', inplace=True)
    
    # In the case that the columns have inconsistent names,
    # they will be renamed appropriately.
    tmp.rename({"Trata Humana":"Trata Hum.",
                "Agresión Grave":"Agr. Grave",
                "Violación":"Viol.",
                "Apropiación Ilegal":"Apr. I",
                "Escalamiento":"Esc.",
                "Hurto Auto":"H. Auto",
                "Asesinato":"Ases.",
                }, axis=1, inplace=True)
    
    year_month = os.path.basename(f).removeprefix("Policia_DelitosTipoI_").removesuffix(".xls").removesuffix(".xlsx")
    year_month = year_month[:4] + '-' + year_month[4:]
    tmp['Date'] = year_month
    
    df = pd.concat([df, tmp])


In [4]:
df.reset_index(inplace=True, drop=True)

In [5]:
# Will drop TOTAL rows from district
df = df[~(df['Distrito'] == 'TOTAL')]

In [6]:
# Will drop randomly inserted row that repeats the header
df = df[~(df['Distrito'] == 'Distrito')]

In [7]:
df.reset_index(inplace=True, drop=True)

In [8]:
df.columns
# Some column names have been shortened so they must be merged
# since they represent the same values. This rename will be done
# before concattenating the dataframes.

Index(['Distrito', 'Tipo I', 'Ases.', 'Viol.', 'Robo', 'Agr. Grave', 'Esc.',
       'Apr. I', 'H. Auto', 'Date', 'Trata Hum.', 'Unnamed: 10', 'Unnamed: 9',
       'Unnamed: 20', 'AREA'],
      dtype='object')

In [9]:
assoc_columns = {"Trata Humana":"Trata Hum.",
                "Agresión Grave":"Agr. Grave",
                "Violación":"Viol.",
                "Apropiación Ilegal":"Apr. I",
                "Escalamiento":"Esc.",
                "Hurto Auto":"H. Auto",
                "Asesinato":"Ases.",
                }

In [10]:
df.isna().sum()

Distrito          21
Tipo I            21
Ases.            181
Viol.            333
Robo              77
Agr. Grave        26
Esc.              28
Apr. I            21
H. Auto          107
Date               0
Trata Hum.      5201
Unnamed: 10    10315
Unnamed: 9     10298
Unnamed: 20    10304
AREA           10239
dtype: int64

In [11]:
# Trata humana has too many missing values to be significant
df.drop('Trata Hum.', axis=1, inplace=True)

In [12]:
df

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date,Unnamed: 10,Unnamed: 9,Unnamed: 20,AREA
0,Adjuntas,117,0,0,6,5,32,69,5,2014-12,,,,
1,Aguada,323,4,2,19,13,78,194,13,2014-12,,,,
2,Aguadilla,736,5,2,30,24,280,374,21,2014-12,,,,
3,Aguas Buenas,261,6,0,28,14,102,89,22,2014-12,,,,
4,Aibonito,232,8,0,14,15,62,104,29,2014-12,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10312,Vega Baja,155,5,1,9,14,38,74,14,2020-04,,,,
10313,Vieques,47,3,0,0,11,16,15,2,2020-04,,,,
10314,Villalba,14,0,0,1,5,3,5,0,2020-04,,,,
10315,Yabucoa,32,0,2,0,8,9,12,1,2020-04,,,,


In [13]:
# sorted(list(df['Date'].unique()))

In [14]:
df[~df['Distrito'].isna()]

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date,Unnamed: 10,Unnamed: 9,Unnamed: 20,AREA
0,Adjuntas,117,0,0,6,5,32,69,5,2014-12,,,,
1,Aguada,323,4,2,19,13,78,194,13,2014-12,,,,
2,Aguadilla,736,5,2,30,24,280,374,21,2014-12,,,,
3,Aguas Buenas,261,6,0,28,14,102,89,22,2014-12,,,,
4,Aibonito,232,8,0,14,15,62,104,29,2014-12,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10312,Vega Baja,155,5,1,9,14,38,74,14,2020-04,,,,
10313,Vieques,47,3,0,0,11,16,15,2,2020-04,,,,
10314,Villalba,14,0,0,1,5,3,5,0,2020-04,,,,
10315,Yabucoa,32,0,2,0,8,9,12,1,2020-04,,,,


In [15]:
df[~(df['AREA'].isna())]
# Area was oddly added in the Policia_DelitosTipoI_201612 spreadsheet

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date,Unnamed: 10,Unnamed: 9,Unnamed: 20,AREA
5636,Adjuntas,78,0.0,0.0,2.0,18,30,28,0.0,2016-12,,,,UTUADO
5637,Aguada,256,2.0,1.0,14.0,41,78,112,8.0,2016-12,,,,AGUADILLA
5638,Aguadilla,598,9.0,2.0,27.0,92,174,281,13.0,2016-12,,,,AGUADILLA
5639,Aguas Buenas,201,8.0,1.0,24.0,14,44,81,29.0,2016-12,,,,CAGUAS
5640,Aibonito,206,6.0,0.0,13.0,16,52,105,14.0,2016-12,,,,AIBONITO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5709,Vega Baja,936,6.0,3.0,76.0,35,262,467,87.0,2016-12,,,,BAYAMON
5710,Vieques,161,2.0,2.0,1.0,27,51,77,1.0,2016-12,,,,FAJARDO
5711,Villalba,138,1.0,1.0,1.0,28,35,67,5.0,2016-12,,,,PONCE
5712,Yabucoa,212,5.0,1.0,13.0,26,66,97,4.0,2016-12,,,,HUMACAO


In [16]:
df.drop('AREA',axis=1,inplace=True)

In [17]:
# df[~(df['Unnamed: 5'].isna())]
# There are some weird cases where the header did
# not extract properly and the column positions shifted.
# This seems to happen when the first row in the MUNICIPIOS
# sheet is empty, so the header is not extracted properly.
# Straightforward fix is to remove the empty row from the
# spreadsheet directly and manually.

In [18]:
df[~(df['Unnamed: 10'].isna())]
# Unnamed: 10 seems to contain the data of Tipo 1 of some
# of the spreadsheets I converted from pdf to xls online.
# This happens because some of the columns are merged in the
# spreadsheet so the easiest solution is to fix it directly on
# the sheets by merging those cases.
# Once fixing that, it can be dropped

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date,Unnamed: 10,Unnamed: 9,Unnamed: 20
1716,,,,,,,,,,2015-11,,,
6416,,,,,,,,,,2016-03,,,


In [19]:
df[~(df['Unnamed: 9'].isna())]
# Nothing significant here. Can be dropped.

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date,Unnamed: 10,Unnamed: 9,Unnamed: 20
2185,,,,,,,,,,2013-12,,,
3200,,,,,,,,,,2012-06,,,
3825,,,,,,,,,,2011-01,,,
3904,,,,,,,,,,2011-02,,,
3983,,,,,,,,,,2011-03,,,
4062,,,,,,,,,,2011-04,,,
4141,,,,,,,,,,2011-05,,,
4766,,,,,,,,,,2010-01,,,
4845,,,,,,,,,,2010-09,,,
4924,,,,,,,,,,2010-02,,,


In [20]:
df.drop('Unnamed: 9', axis=1, inplace=True)

In [21]:
df[~(df['Unnamed: 10'].isna())]

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date,Unnamed: 10,Unnamed: 20
1716,,,,,,,,,,2015-11,,
6416,,,,,,,,,,2016-03,,


In [22]:
df.drop('Unnamed: 10', axis=1, inplace=True)

In [23]:
df[~(df['Unnamed: 20'].isna())]
# Sometimes a 3 slips here, this column can be dropped

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date,Unnamed: 20
2134,Florida,160,4,0,7,7,69,68,5,2013-12,3.0
3149,Florida,60,0,0,4,0,33,19,4,2012-06,3.0
3853,Florida,34,7,0,2,0,17,6,2,2011-02,3.0
3932,Florida,37,7,0,2,0,19,7,2,2011-03,3.0
4011,Florida,42,7,0,2,0,23,8,2,2011-04,3.0
4090,Florida,55,7,0,2,1,34,8,3,2011-05,3.0
4873,Florida,24,0,0,0,0,15,6,3,2010-02,3.0
4952,Florida,32,0,0,1,0,21,7,3,2010-03,3.0
5110,Florida,47,1,0,4,0,25,11,6,2010-04,3.0
5189,Florida,61,1,0,4,0,34,14,8,2010-05,3.0


In [24]:
df.drop('Unnamed: 20', axis=1,inplace=True)

In [25]:
df[~(df['Distrito'].isna())]
# All rows where Distrito is nan can be dropped.

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date
0,Adjuntas,117,0,0,6,5,32,69,5,2014-12
1,Aguada,323,4,2,19,13,78,194,13,2014-12
2,Aguadilla,736,5,2,30,24,280,374,21,2014-12
3,Aguas Buenas,261,6,0,28,14,102,89,22,2014-12
4,Aibonito,232,8,0,14,15,62,104,29,2014-12
...,...,...,...,...,...,...,...,...,...,...
10312,Vega Baja,155,5,1,9,14,38,74,14,2020-04
10313,Vieques,47,3,0,0,11,16,15,2,2020-04
10314,Villalba,14,0,0,1,5,3,5,0,2020-04
10315,Yabucoa,32,0,2,0,8,9,12,1,2020-04


In [26]:
df=df[~(df['Distrito'].isna())]

In [27]:
df.reset_index(inplace=True, drop=True)

In [28]:
df

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date
0,Adjuntas,117,0,0,6,5,32,69,5,2014-12
1,Aguada,323,4,2,19,13,78,194,13,2014-12
2,Aguadilla,736,5,2,30,24,280,374,21,2014-12
3,Aguas Buenas,261,6,0,28,14,102,89,22,2014-12
4,Aibonito,232,8,0,14,15,62,104,29,2014-12
...,...,...,...,...,...,...,...,...,...,...
10291,Vega Baja,155,5,1,9,14,38,74,14,2020-04
10292,Vieques,47,3,0,0,11,16,15,2,2020-04
10293,Villalba,14,0,0,1,5,3,5,0,2020-04
10294,Yabucoa,32,0,2,0,8,9,12,1,2020-04


In [29]:
df = df.sort_values(by='Date')

In [30]:
df.reset_index(inplace=True, drop=True)

In [31]:
df.isna().sum()

Distrito        0
Tipo I          0
Ases.         160
Viol.         312
Robo           56
Agr. Grave      5
Esc.            7
Apr. I          0
H. Auto        86
Date            0
dtype: int64

In [32]:
df[df['Ases.'].isna()]

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date
706,Culebra,58,,,,2,17,39,,2010-10
710,Guánica,137,,,10,10,45,67,5,2010-10
715,Hatillo,395,,,30,4,143,122,96,2010-10
721,Adjuntas,258,,,15,9,126,100,8,2010-10
726,Añasco,157,,,15,7,83,41,11,2010-10
...,...,...,...,...,...,...,...,...,...,...
8022,Maricao,26,,,,3,6,17,,2018-07
8025,Lares,98,,1,5,21,27,40,4,2018-07
8028,Naranjito,156,,,7,8,32,90,19,2018-07
8030,Patillas,54,,,2,18,8,26,,2018-07


In [33]:
pd.to_datetime(df['Date'])

0       2010-01-01
1       2010-01-01
2       2010-01-01
3       2010-01-01
4       2010-01-01
           ...    
10291   2020-12-01
10292   2020-12-01
10293   2020-12-01
10294   2020-12-01
10295   2020-12-01
Name: Date, Length: 10296, dtype: datetime64[ns]

In [34]:
df.dtypes
# Need to change some of the datatypes to be numeric

Distrito      object
Tipo I        object
Ases.         object
Viol.         object
Robo          object
Agr. Grave    object
Esc.          object
Apr. I        object
H. Auto       object
Date          object
dtype: object

In [35]:
df['Tipo I'] = pd.to_numeric(df['Tipo I'])
df['Ases.'] = pd.to_numeric(df['Ases.'])
df['Viol.'] = pd.to_numeric(df['Viol.'])
df['Robo'] = pd.to_numeric(df['Robo'])
df['Agr. Grave'] = pd.to_numeric(df['Agr. Grave'])
df['Esc.'] = pd.to_numeric(df['Esc.'])
df['Apr. I'] = pd.to_numeric(df['Apr. I'])
df['H. Auto'] = pd.to_numeric(df['H. Auto'])

In [36]:
df.dtypes

Distrito       object
Tipo I          int64
Ases.         float64
Viol.         float64
Robo          float64
Agr. Grave    float64
Esc.          float64
Apr. I          int64
H. Auto       float64
Date           object
dtype: object

In [37]:
df['Date'] = pd.to_datetime(df['Date'])

In [38]:
# Making sure we strip all distrito names of any whitespaces
df['Distrito'] = df['Distrito'].str.strip()

In [39]:
print(sorted(df['Distrito'].unique()))
print(len(df['Distrito'].unique()))
# All Distritos are unique

['Adjuntas', 'Aguada', 'Aguadilla', 'Aguas Buenas', 'Aibonito', 'Arecibo', 'Arroyo', 'Añasco', 'Barceloneta', 'Barranquitas', 'Bayamón', 'Cabo Rojo', 'Caguas', 'Camuy', 'Canóvanas', 'Carolina', 'Cataño', 'Cayey', 'Ceiba', 'Ciales', 'Cidra', 'Coamo', 'Comerío', 'Corozal', 'Culebra', 'Dorado', 'Fajardo', 'Florida', 'Guayama', 'Guayanilla', 'Guaynabo', 'Gurabo', 'Guánica', 'Hatillo', 'Hormigueros', 'Humacao', 'Isabela', 'Jayuya', 'Juana Díaz', 'Juncos', 'Lajas', 'Lares', 'Las Marías', 'Las Piedras', 'Loiza', 'Luquillo', 'Manatí', 'Maricao', 'Maunabo', 'Mayagüez', 'Moca', 'Morovis', 'Naguabo', 'Naranjito', 'Orocovis', 'Patillas', 'Peñuelas', 'Ponce', 'Quebradillas', 'Rincón', 'Rio Grande', 'Sabana Grande', 'Salinas', 'San Germán', 'San Juan', 'San Lorenzo', 'San Sebastián', 'Santa Isabel', 'Toa Alta', 'Toa Baja', 'Trujillo Alto', 'Utuado', 'Vega Alta', 'Vega Baja', 'Vieques', 'Villalba', 'Yabucoa', 'Yauco']
78


In [40]:
df[df['Tipo I'] < 0]

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date


In [41]:
def fill_missing(x):
    
    # set nans from first row equal to 0
    # first = x.index[0]
    # x.loc[first] = x.loc[first].fillna(0)
    
    columns = ['Tipo I','Ases.','Viol.','Robo','Agr. Grave','Esc.','Apr. I','H. Auto']
    
    
    x[columns] = x[columns].interpolate() #first linear
    
    x[columns] = x[columns].interpolate(method='ffill') # then forward fill
    x[columns] = x[columns].interpolate(method='bfill') # then backward fill
    
    
    return x
    

df_interpolated = df.groupby(['Distrito', df.Date.dt.year]).apply(fill_missing)

In [42]:
df_interpolated.isna().sum()

Distrito      0
Tipo I        0
Ases.         0
Viol.         0
Robo          0
Agr. Grave    0
Esc.          0
Apr. I        0
H. Auto       0
Date          0
dtype: int64

In [43]:
df_interpolated

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date
0,Adjuntas,34,0.0,0.0,0.0,1.0,16.0,16,1.0,2010-01-01
1,Ponce,262,11.0,0.0,28.0,27.0,50.0,125,21.0,2010-01-01
2,Peñuelas,12,0.0,1.0,2.0,1.0,2.0,5,1.0,2010-01-01
3,Patillas,17,0.0,0.0,1.0,3.0,8.0,5,0.0,2010-01-01
4,Orocovis,34,0.0,0.0,0.0,2.0,25.0,6,1.0,2010-01-01
...,...,...,...,...,...,...,...,...,...,...
10291,Orocovis,75,1.0,1.0,1.0,11.0,23.0,35,3.0,2020-12-01
10292,Patillas,66,2.0,2.0,2.0,21.0,17.0,21,1.0,2020-12-01
10293,Ponce,754,24.0,3.0,32.0,186.0,125.0,365,18.0,2020-12-01
10294,Mayagüez,241,15.0,4.0,8.0,58.0,47.0,93,16.0,2020-12-01


In [44]:
df_interpolated[df_interpolated['Tipo I'] < 0]

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date


In [45]:
df_interpolated.dtypes

Distrito              object
Tipo I                 int64
Ases.                float64
Viol.                float64
Robo                 float64
Agr. Grave           float64
Esc.                 float64
Apr. I                 int64
H. Auto              float64
Date          datetime64[ns]
dtype: object

In [46]:
df_interpolated['Ases.'] = df_interpolated['Ases.'].astype(int)
df_interpolated['Viol.'] = df_interpolated['Viol.'].astype(int)
df_interpolated['Robo'] = df_interpolated['Robo'].astype(int)
df_interpolated['Agr. Grave'] = df_interpolated['Agr. Grave'].astype(int)
df_interpolated['Esc.'] = df_interpolated['Esc.'].astype(int)
df_interpolated['H. Auto'] = df_interpolated['H. Auto'].astype(int)

In [47]:
df_interpolated

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date
0,Adjuntas,34,0,0,0,1,16,16,1,2010-01-01
1,Ponce,262,11,0,28,27,50,125,21,2010-01-01
2,Peñuelas,12,0,1,2,1,2,5,1,2010-01-01
3,Patillas,17,0,0,1,3,8,5,0,2010-01-01
4,Orocovis,34,0,0,0,2,25,6,1,2010-01-01
...,...,...,...,...,...,...,...,...,...,...
10291,Orocovis,75,1,1,1,11,23,35,3,2020-12-01
10292,Patillas,66,2,2,2,21,17,21,1,2020-12-01
10293,Ponce,754,24,3,32,186,125,365,18,2020-12-01
10294,Mayagüez,241,15,4,8,58,47,93,16,2020-12-01


In [48]:
# df[df['Distrito'] == 'Vega Baja']['Tipo I'].diff()

In [49]:
# Exporting both interpolated and non interpolated versions of the dataframe
df.to_csv('../../data/clean/DelitosTipo1/DelitosTipo1-2010-2020.csv')
df_interpolated.to_csv('../../data/clean/DelitosTipo1/DelitosTipo1-2010-2020(interpolado).csv')

In [50]:
df_interpolated['Date'] = pd.to_datetime(df_interpolated['Date'])

In [51]:
# Turning data from cumulative into monthly deltas.
def year_to_month(x):
    # Turning into monthly deltas
    cols =['Tipo I','Ases.', 'Viol.', 'Robo', 'Agr. Grave','Esc.', 'Apr. I', 'H. Auto']
    
    x[cols] = x[cols].diff()
    
    
    return x

df_interpolated_deltas = df_interpolated.groupby(['Distrito', df_interpolated.Date.dt.year]).apply(year_to_month)

In [52]:
# Checking for negatives before setting them to nan
cols_to_check = ['Tipo I', 'Ases.', 'Viol.', 'Robo', 
                 'Agr. Grave', 'Esc.' , 'Apr. I' ,'H. Auto']
for col in cols_to_check:
    print(df_interpolated_deltas[df_interpolated_deltas[col] < 0])

           Distrito  Tipo I  Ases.  Viol.  Robo  Agr. Grave  Esc.  Apr. I  \
508          Juncos    -1.0    0.0    0.0   4.0         1.0  -4.0    -5.0   
546         Florida    -9.0    0.0    0.0   2.0         1.0   4.0    -8.0   
617        Orocovis    -1.0    0.0    0.0   0.0        -2.0   3.0     0.0   
706         Culebra    -1.0    0.0    0.0   0.0         0.0   0.0    -1.0   
1505       Villalba    -7.0    0.0    0.0   2.0         4.0   9.0   -23.0   
2121        Isabela   -21.0    0.0    0.0   2.0        -2.0   0.0   -20.0   
2129      Aguadilla   -16.0   -1.0    0.0  -1.0         1.0  13.0   -31.0   
2193           Moca   -93.0    0.0    0.0   0.0        -3.0 -49.0   -41.0   
2202         Aguada   -66.0   -1.0    0.0  -2.0         2.0   4.0   -61.0   
4528        Culebra    -1.0    0.0    0.0  -5.0         1.0  -1.0     3.0   
7657     Las Marías    -2.0    0.0    0.0   0.0         0.0   2.0    -4.0   
9097  Trujillo Alto  -276.0  -11.0   -1.0 -37.0       -49.0 -32.0  -128.0   

In [53]:
for col in cols_to_check:
    df_interpolated_deltas.loc[df_interpolated_deltas[col] < 0, col] = np.nan

In [54]:
df_interpolated_deltas = df_interpolated_deltas.groupby(['Distrito']).apply(fill_missing)

In [55]:
df_interpolated_deltas

Unnamed: 0,Distrito,Tipo I,Ases.,Viol.,Robo,Agr. Grave,Esc.,Apr. I,H. Auto,Date
0,Adjuntas,34.0,0.0,0.0,4.0,1.0,12.0,16.0,1.0,2010-01-01
1,Ponce,243.0,4.0,0.0,26.0,14.0,52.0,123.0,24.0,2010-01-01
2,Peñuelas,12.0,0.0,0.0,2.0,0.0,6.0,2.0,2.0,2010-01-01
3,Patillas,13.0,0.0,0.0,0.0,4.0,6.0,3.0,0.0,2010-01-01
4,Orocovis,22.0,0.0,0.0,0.0,0.0,10.0,12.0,0.0,2010-01-01
...,...,...,...,...,...,...,...,...,...,...
10291,Orocovis,11.0,0.0,0.0,0.0,2.0,5.0,3.0,1.0,2020-12-01
10292,Patillas,8.0,0.0,0.0,0.0,6.0,1.0,1.0,0.0,2020-12-01
10293,Ponce,71.0,0.0,1.0,6.0,10.0,13.0,40.0,1.0,2020-12-01
10294,Mayagüez,34.0,2.0,0.0,1.0,6.0,6.0,18.0,1.0,2020-12-01


In [56]:
# Checking for negatives again
for col in cols_to_check:
    print(df_interpolated_deltas[df_interpolated_deltas[col] < 0])

Empty DataFrame
Columns: [Distrito, Tipo I, Ases., Viol., Robo, Agr. Grave, Esc., Apr. I, H. Auto, Date]
Index: []
Empty DataFrame
Columns: [Distrito, Tipo I, Ases., Viol., Robo, Agr. Grave, Esc., Apr. I, H. Auto, Date]
Index: []
Empty DataFrame
Columns: [Distrito, Tipo I, Ases., Viol., Robo, Agr. Grave, Esc., Apr. I, H. Auto, Date]
Index: []
Empty DataFrame
Columns: [Distrito, Tipo I, Ases., Viol., Robo, Agr. Grave, Esc., Apr. I, H. Auto, Date]
Index: []
Empty DataFrame
Columns: [Distrito, Tipo I, Ases., Viol., Robo, Agr. Grave, Esc., Apr. I, H. Auto, Date]
Index: []
Empty DataFrame
Columns: [Distrito, Tipo I, Ases., Viol., Robo, Agr. Grave, Esc., Apr. I, H. Auto, Date]
Index: []
Empty DataFrame
Columns: [Distrito, Tipo I, Ases., Viol., Robo, Agr. Grave, Esc., Apr. I, H. Auto, Date]
Index: []
Empty DataFrame
Columns: [Distrito, Tipo I, Ases., Viol., Robo, Agr. Grave, Esc., Apr. I, H. Auto, Date]
Index: []


In [57]:
df_interpolated_deltas.isna().sum()

Distrito      0
Tipo I        0
Ases.         0
Viol.         0
Robo          0
Agr. Grave    0
Esc.          0
Apr. I        0
H. Auto       0
Date          0
dtype: int64

In [58]:
df_interpolated_deltas.to_csv('../../data/clean/DelitosTipo1/DelitosTipo1-2010-2020_deltas_mensuales(interpolado).csv')

In [59]:
pueblos = list(df_interpolated_deltas.Distrito.unique())

In [60]:
with open("pueblos.txt", 'w') as outfile:
    outfile.writelines((str(i)+'\n' for i in pueblos))