In [1]:
import pandas as pd
import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import calendar
# Show full cell contents and every column when a DataFrame is displayed.
# `None` is the supported "no limit" value for max_colwidth; the legacy `-1`
# was deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
from wordcloud import WordCloud, STOPWORDS
import folium
from folium import plugins
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
import geopandas as gpd

ModuleNotFoundError: No module named 'pandas_profiling'

# 1. Merge datasets

In [None]:
# Load the 2017-2018 extract; this file is pipe-delimited.
data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/NUSE 934 611(M) 2017-2018.dsv'
data2018=pd.read_csv(data_location,delimiter="|")

In [None]:
# Load the January 2019 extract; this file is semicolon-delimited.
data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/NUSE 934-611-611M ENERO2019.csv'
data2019=pd.read_csv(data_location,delimiter=";")

In [None]:
# Stack both extracts and keep only the rows whose TIPO_DETALLE is '934 - RIÑA'.
frames = [data2018, data2019]
data = pd.concat(frames)
# reset_index(drop=True) renumbers the rows and returns a new frame in one step.
# The earlier in-place reset_index/drop pair operated on a .loc slice of `data`,
# which triggers SettingWithCopyWarning and needs a throw-away 'index' column.
merged_nuse = data.loc[data['TIPO_DETALLE'] == '934 - RIÑA'].reset_index(drop=True)

In [None]:
# Persist the merged, filtered frame without writing the row index.
merged_nuse.to_csv(r'/Users/anamaria/Desktop/dev/security_project/datasets/merged_nuse.csv',index=None)

# 2. Rebuild missing data

In [2]:
# Lookup table from locality code (COD_LOCALIDAD) to locality name (LOCALIDAD).
# Codes 1..20 are consecutive, so they are generated from the ordered name list;
# 99 is the extra entry used for rows without a location.
_localidad_names_in_code_order = [
    'USAQUEN', 'CHAPINERO', 'SANTA FE', 'SAN CRISTOBAL', 'USME',
    'TUNJUELITO', 'BOSA', 'KENNEDY', 'FONTIBON', 'ENGATIVA',
    'SUBA', 'BARRIOS UNIDOS', 'TEUSAQUILLO', 'LOS MARTIRES', 'ANTONIO NARIÑO',
    'PUENTE ARANDA', 'CANDELARIA', 'RAFAEL URIBE URIBE', 'CIUDAD BOLIVAR', 'SUMAPAZ',
]
localidadCodDictionaryNuse = dict(enumerate(_localidad_names_in_code_order, start=1))
localidadCodDictionaryNuse[99] = 'SIN LOCALIZACION'

## Methods to rebuild

In [3]:
import import_ipynb
import ws_address
from selenium.common.exceptions import TimeoutException
import re
import unidecode

importing Jupyter notebook from ws_address.ipynb


In [4]:
def find_between( s, first, last ):
    """Return the text of `s` located between `first` and the next `last`.

    The search for `last` starts right after the first occurrence of `first`.
    An empty string is returned when either delimiter cannot be found.
    """
    first_at = s.find( first )
    if first_at < 0:
        return ""
    content_start = first_at + len( first )
    content_end = s.find( last, content_start )
    if content_end < 0:
        return ""
    return s[content_start:content_end]

In [5]:
# Field labels in the order in which they are searched for in the scraped text.
# Each field's value is the text between its label and the following label.
tags = [
    "Dirección ingresada: ",
    "Dirección encontrada: ",
    "Tipo dirección: ",
    "Código postal: ",
    "Sector catastral: ",
    "UPZ: ",
    "Localidad: ",
    "Latitud: ",
    "Longitud: ",
    "CHIP: ",
]
def parse_address_ws(ws_result):
    """Split the scraped result text into a dict keyed by field label.

    Keys are the labels in `tags` with ': ' removed. The last label ("CHIP: ")
    only marks the end of the "Longitud" value and gets no entry of its own.
    A value is "" when find_between cannot locate its delimiters.
    """
    return {
        label.replace(': ',''): find_between(ws_result, label, next_label)
        for label, next_label in zip(tags, tags[1:])
    }

In [6]:
def assign_upz(original_df,index,UPZ_ws_field):
    """Write the UPZ code and name parsed from `UPZ_ws_field` into row `index`.

    The code is the text inside the first pair of parentheses and the name is
    everything before the first ' ('. They are stored in 'COD_UPZ' and 'UPZ'.
    """
    upz_code = find_between(UPZ_ws_field, '(', ')')
    upz_name = find_between(UPZ_ws_field, '', ' (')
    original_df.at[index,'COD_UPZ'] = upz_code
    original_df.at[index,'UPZ'] = upz_name

In [7]:
def get_cod_localidad(localidad_name):
    """Return the first code in localidadCodDictionaryNuse whose name equals `localidad_name`.

    Raises:
        IndexError: if no entry has that name. The exception type is the one the
            previous list-indexing implementation raised; the message now names
            the value that could not be resolved.
    """
    for code, name in localidadCodDictionaryNuse.items():
        if name == localidad_name:
            return code
    raise IndexError("Unknown localidad name: {!r}".format(localidad_name))

In [17]:
def rebuild_location_in_nuse(original_df, index, driver):
    """Fill the location columns of row `index` from a web lookup of its address.

    The address in 'STR_DIRECCION_INCIDENTE' is passed to
    ws_address.web_scrap_address, after which ws_address.delete_address is
    called with the same address (presumably to clear the search; confirm in
    ws_address).

    Returns:
        "Not processed" when the lookup answers "Not found";
        "Error loading address" when the address echoed back differs from the
        one sent (the row is left untouched);
        "Rebuilt" after LATITUD, LONGITUD, LOCALIDAD, COD_LOCALIDAD,
        SEC_CATASTRAL, COD_UPZ and UPZ have been written.
    """
    address = original_df.at[index,'STR_DIRECCION_INCIDENTE']
    print(address)
    result_ws = ws_address.web_scrap_address(driver,address)
    ws_address.delete_address(driver,address)

    if result_ws == "Not found":
        return "Not processed"

    parsed_result = parse_address_ws(result_ws)
    print(parsed_result)
    if parsed_result["Dirección ingresada"] != address:
        return "Error loading address"

    original_df.at[index,'LATITUD'] = float(parsed_result['Latitud'])
    original_df.at[index,'LONGITUD'] = float(parsed_result['Longitud'])

    # Accents are stripped so the name matches localidadCodDictionaryNuse;
    # 'ANTONIO NARIÑO' is kept verbatim because the dictionary spells it with Ñ.
    localidad = parsed_result['Localidad']
    if localidad != 'ANTONIO NARIÑO':
        localidad = unidecode.unidecode(localidad)
    original_df.at[index,'LOCALIDAD'] = localidad

    original_df.at[index,'COD_LOCALIDAD'] = int(get_cod_localidad(original_df.at[index,'LOCALIDAD']))
    original_df.at[index,'SEC_CATASTRAL'] = parsed_result['Sector catastral']
    assign_upz(original_df,index,parsed_result['UPZ'])
    return "Rebuilt"


In [9]:
def rebuild_address_in_nuse(original_df, index):
    """Try to recover the incident address of row `index` from its LOG_TEXT.

    LOG_TEXT is searched with the module-level `address_regex`; a match is
    cleaned with clean_address and stored, stripped, in
    'STR_DIRECCION_INCIDENTE'. When nothing usable is found the column is set
    to 'ND'.

    Returns:
        "Rebuilt" when a usable address was stored, "Not processed" otherwise.
        A match that clean_address rejects (it returns 'ND') counts as
        "Not processed", so the caller's counters reflect real recoveries.
    """
    log_text = original_df.at[index,'LOG_TEXT']
    # An empty LOG_TEXT cell is not a string (typically NaN after read_csv) and
    # would make re.search raise TypeError; treat it as "no address found".
    address_found = re.search(address_regex, log_text) if isinstance(log_text, str) else None

    parsed_address = 'ND' if address_found is None else clean_address(address_found).strip()
    original_df.at[index,'STR_DIRECCION_INCIDENTE'] = parsed_address

    if parsed_address == 'ND':
        return "Not processed"
    print(parsed_address)
    return "Rebuilt"

def clean_address(address_found):
    """Turn a regex match into a candidate address string, or 'ND'.

    Args:
        address_found: a match object; its full matched text is used.

    Returns:
        The matched text up to the first ',,,', with commas replaced by spaces
        and cut at the earliest of the characters ~ / * ( ). If fewer than 20%
        of the remaining characters are digits, 'ND' is returned instead.
    """
    candidate = address_found.group().split(',,,')[0].replace(',', ' ')

    # Splitting on a character that is absent leaves the text unchanged, so
    # applying every split in turn cuts at whichever marker appears first.
    for marker in ('~', '/', '*', '(', ')'):
        candidate = candidate.split(marker)[0]

    digit_count = sum(1 for ch in candidate if ch in '0123456789')
    if digit_count / len(candidate) < 0.2:
        return 'ND'
    return candidate


## Implement rebuild methods

In [None]:
# Reload the merged frame written at the end of section 1.
data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/merged_nuse.csv'
merged_nuse=pd.read_csv(data_location,delimiter=",")

In [None]:
# Per-column quality summary: dtype plus the number of cells equal to '-',
# to the empty string, to a single space, and the number of NaN cells.
pd.DataFrame({"Tipo de dato":merged_nuse.dtypes.values,
              "Celdas con valor '-'":(merged_nuse == '-').sum().values,
              "Celdas con valor ''":(merged_nuse == '').sum().values,
              "Celdas con valor ' '":(merged_nuse == ' ').sum().values,
              "Celdas vacías": merged_nuse.isna().sum().values},
             index=merged_nuse.columns)

### Rebuild address through log_text

In [None]:
# Try to rebuild missing addresses from the LOG_TEXT field.
# Rows to process are those whose address is the '-' placeholder; their index
# labels are collected so the rebuild loop can write back into merged_nuse.
df_empty_locations_without_address = merged_nuse.loc[merged_nuse['STR_DIRECCION_INCIDENTE'] == '-']
list_idx_rebuild_address = list(df_empty_locations_without_address.index.values)

In [None]:
# Number of rows whose address has to be rebuilt.
len(list_idx_rebuild_address)

In [None]:
# Pattern searched in LOG_TEXT by rebuild_address_in_nuse: one or more street
# prefixes (CL, DG, KR, TV), a whitespace character, digits, then any text up
# to the last ',,'. It is a raw string because '\s' and '\d' are invalid escape
# sequences in an ordinary string literal and trigger a warning.
address_regex = r'(CL|DG|KR|TV)+\s\d+.*(,,)'
registers_to_process = len(list_idx_rebuild_address)
rebuilt_registers = 0
registers_not_processed = 0
other_condition_counter = 0

for index in list_idx_rebuild_address:
    state = rebuild_address_in_nuse(merged_nuse, index)

    if state == "Rebuilt":
        rebuilt_registers += 1
    elif state == "Not processed":
        registers_not_processed += 1
    else:
        other_condition_counter += 1

# Report the totals once instead of printing two lines per processed row.
print('Rebuilt registers: ',rebuilt_registers,'/',registers_to_process)
print('Registers not processed: ',registers_not_processed, '/', registers_to_process)

In [None]:
# Persist the frame after the address rebuild, without the row index.
merged_nuse.to_csv(r'/Users/anamaria/Desktop/dev/security_project/datasets/rebuild_address_nuse_18112019.csv',index=None)

In [None]:
# Per-column quality summary after the address rebuild: dtype, cells equal to
# '-', cells equal to 'ND', and NaN cells.
pd.DataFrame({"Tipo de dato":merged_nuse.dtypes.values,
              "Celdas con valor '-'":(merged_nuse == '-').sum().values,
              "Celdas con valor 'ND'":(merged_nuse == 'ND').sum().values,
              "Celdas vacías": merged_nuse.isna().sum().values},
             index=merged_nuse.columns)

### Rebuild location through address

In [10]:
# Input for the location rebuild.
# NOTE(review): the active path is the file written at the end of section 3
# ("Standardise"), not the address-rebuild output saved just above; the latter
# is kept as the commented-out alternative. Confirm the intended execution order.
#data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/rebuild_address_nuse_18112019.csv'
data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/standardise_result_nuse_27112019.csv'
df_input = pd.read_csv(data_location,delimiter=",")

In [11]:
# One subset per location column, holding the rows where that column is '-'.
# df1..df6 follow the column order below.
_placeholder_columns = ('COD_UPZ', 'UPZ', 'COD_SEC_CATAST', 'SEC_CATASTRAL', 'COD_BARRIO', 'BARRIO')
df1, df2, df3, df4, df5, df6 = (
    df_input.loc[df_input[column_name] == '-'] for column_name in _placeholder_columns
)

In [12]:
# True when the '-' placeholder occurs on exactly the same rows in all six columns.
all(df1.equals(other_subset) for other_subset in (df2, df3, df4, df5, df6))

True

In [13]:
# Try to rebuild 'sector catastral', 'UPZ', 'localidad', 'latitud', 'longitud' from the address.
# Candidates are the rows with '-' location fields (df1) whose address is not 'ND'.
df_empty_locations_with_address = df1.loc[df1['STR_DIRECCION_INCIDENTE'] != 'ND']
list_idx_rebuild_location = list(df_empty_locations_with_address.index.values)
len(list_idx_rebuild_location)

755

In [None]:
#Rebuild 'sector catastral', 'UPZ', 'localidad', 'latitud', 'longitud' using web scraping
# Work on a copy: a plain `df_output = df_input` binds both names to the same
# frame, so every write made by rebuild_location_in_nuse would also change
# df_input and re-running the cell would not start from the loaded data.
df_output = df_input.copy()
url='https://mapas.bogota.gov.co'
driver = ws_address.web_scrap_page(url)
registers_to_process = len(list_idx_rebuild_location)
rebuilt_registers = 0
registers_not_processed = 0
other_condition_counter = 0
# Index labels whose lookup echoed back a different address than the one sent.
idx_error_loading_address = []

for index in list_idx_rebuild_location:
    state = rebuild_location_in_nuse(df_output, index, driver)

    if state == "Rebuilt":
        rebuilt_registers += 1
    elif state == "Not processed":
        registers_not_processed += 1
    elif state == "Error loading address":
        idx_error_loading_address.append(index)
    else:
        other_condition_counter += 1

    # Progress is printed per row because each lookup goes through the browser.
    print('Rebuilt registers: ',rebuilt_registers,'/',registers_to_process)
    print('Registers not processed: ',registers_not_processed, '/', registers_to_process)

In [20]:
# Index labels that returned "Error loading address" in the loop above.
idx_error_loading_address

[163]

In [None]:
#Iterate over idx_error_loading_address to rebuild the location again if necessary

In [None]:
# Final counters of the location rebuild: rebuilt, not processed, any other state.
print(rebuilt_registers)
print(registers_not_processed)
print(other_condition_counter)

In [None]:
# Persist the frame after the location rebuild, without the row index.
df_output.to_csv(r'/Users/anamaria/Desktop/dev/security_project/datasets/rebuild_locations_nuse_29112019.csv',index=None)

In [None]:
# Per-column quality summary after the location rebuild: dtype, cells equal to
# '-', cells equal to 'ND', and NaN cells.
pd.DataFrame({"Tipo de dato":df_output.dtypes.values,
              "Celdas con valor '-'":(df_output == '-').sum().values,
              "Celdas con valor 'ND'":(df_output == 'ND').sum().values,
              "Celdas vacías": df_output.isna().sum().values},
             index=df_output.columns)

In [None]:
#Assign ND to the location fields of the rows in df_empty_locations_without_address:
#'SEC_CATASTRAL', 'UPZ', 'COD_UPZ', 'LATITUD', 'LONGITUD', 'LOCALIDAD', 'COD_LOCALIDAD'

In [2]:
# Load a later location-rebuild output.
# NOTE(review): this path is under /home/combios, unlike the /Users/anamaria
# paths used above, and the file name carries a different date stamp than the
# file saved by the previous cells; confirm this is the intended input.
data_location = '/home/combios/Documents/amreyesp/clean_nuse_data/rebuild_locations_nuse_29012020.csv'
df_input = pd.read_csv(data_location,delimiter=",")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Registers without address or coordinates cannot be rebuilt.
# They are the rows whose address is 'ND' and whose LATITUD and LONGITUD are both -1.
df_empty_locations_without_address = df_input.loc[(df_input['STR_DIRECCION_INCIDENTE'] == 'ND') & (df_input['LATITUD']==-1) & (df_input['LONGITUD']==-1)]
list_idx_not_rebuild = list(df_empty_locations_without_address.index.values)
len(list_idx_not_rebuild)

3139

In [9]:
# Copy instead of aliasing, so the placeholder writes that follow do not also
# modify df_input (same convention as the other df_input.copy() cells).
df_output = df_input.copy()

In [12]:
# Mark the location fields of rows that cannot be rebuilt.
# 99 is the code of 'SIN LOCALIZACION' in localidadCodDictionaryNuse, so it
# belongs in COD_LOCALIDAD. It was previously written to LATITUD, which left
# COD_LOCALIDAD unset and stored an out-of-range latitude.
location_placeholders = {
    'SEC_CATASTRAL': 'ND',
    'UPZ': 'ND',
    'COD_UPZ': 'ND',
    'LOCALIDAD': 'SIN LOCALIZACION',
    'COD_LOCALIDAD': 99,
}
# One label-based assignment per column replaces the row-by-row .at loop.
for column_name, placeholder in location_placeholders.items():
    df_output.loc[list_idx_not_rebuild, column_name] = placeholder

In [13]:
# Persist the frame with the placeholders applied, without the row index.
df_output.to_csv(r'/home/combios/Documents/amreyesp/clean_nuse_data/rebuild_nuse_29012020.csv',index=None)

# 3. Standardise

In [4]:
# Input for the standardise step.
# NOTE(review): the file name refers to an index range (30k to 39k), which
# suggests a partial file; confirm it is the intended input.
data_location = '/home/combios/Documents/amreyesp/rebuild_locations_nuse_idx_30kto39k_23012020.csv'
df_input = pd.read_csv(data_location,delimiter=",")

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Create a timestamp column to handle time ranges in the unique-event process.
# HORA is converted to text and left-padded with zeros to four characters
# before being appended to FECHA and parsed by pd.to_datetime.
df_input['time_stamp']=pd.to_datetime(df_input['FECHA'] + ' ' + df_input["HORA"].astype(str).str.rjust(4,'0'))

In [5]:
# Per-column quality summary of the standardise input: dtype, cells equal to
# '-', cells equal to 'ND', and NaN cells.
pd.DataFrame({"Tipo de dato":df_input.dtypes.values,
              "Celdas con valor '-'":(df_input == '-').sum().values,
              "Celdas con valor 'ND'":(df_input == 'ND').sum().values,
              "Celdas vacías": df_input.isna().sum().values},
             index=df_input.columns)

  result = method(y)


Unnamed: 0,Tipo de dato,Celdas con valor '-',Celdas con valor 'ND',Celdas vacías
STR_NUMERO_INTERNO,object,0,0,0
FECHA,object,0,0,0
HORA,int64,0,0,0
ANIO,int64,0,0,0
MES,int64,0,0,0
COD_LOCALIDAD,object,3,0,0
LOCALIDAD,object,3,0,0
COD_UPZ,object,28041,0,45
UPZ,object,28041,0,45
COD_SEC_CATAST,object,51950,0,0


## 3.1 One register per event: event that occurs within 400 mts radius and 20 minutes time interval

### Find duplicated events

In [None]:
import time, datetime
# Length of the time window in minutes (find_duplicated_events passes it to
# datetime.timedelta(minutes=...)).
time_offset = 20
# Amount added to and subtracted from LATITUD/LONGITUD to build the search
# square in find_duplicated_events. Presumably decimal degrees (TODO confirm);
# the section title speaks of a 400 m radius, which should be checked against
# this value.
coor_offset = 0.001

In [None]:
def find_duplicated_events(df, row):
    """Collect the rows of `df` that share a time/space window with `row`.

    A candidate qualifies when its 'time_stamp' lies in
    [row time, row time + time_offset minutes) and its (LONGITUD, LATITUD)
    point satisfies `within` for the square obtained by moving coor_offset in
    each direction from the coordinates of `row`. `time_offset` and
    `coor_offset` are read from module scope.

    Returns:
        dict mapping the index label of every qualifying row to its
        'STR_NUMERO_INTERNO' value.
    """
    event_time = row['time_stamp']
    event_lat = row['LATITUD']
    event_lon = row['LONGITUD']
    current_point=Point(event_lon,event_lat)  # constructed but not used below

    window_end = event_time + datetime.timedelta(minutes = time_offset)
    in_time_window = (df['time_stamp'] >= event_time) & (df['time_stamp'] < window_end)
    df_event_time = df.loc[in_time_window]

    south, north = event_lat-coor_offset, event_lat+coor_offset
    west, east = event_lon-coor_offset, event_lon+coor_offset
    # Corners as (lon, lat) pairs, in the same order as before.
    polygon_event = Polygon([(east, south), (west, south), (west, north), (east, north)])

    return {
        candidate_idx: candidate['STR_NUMERO_INTERNO']
        for candidate_idx, candidate in df_event_time.iterrows()
        if Point(candidate['LONGITUD'],candidate['LATITUD']).within(polygon_event)
    }

In [None]:
# Work on a copy so df_input keeps the loaded values.
df_output = df_input.copy()

In [None]:
# For every row, store the dict returned by find_duplicated_events in 'dup_event'.
# Each call filters the whole frame again, so the cost grows with the square of the row count.
df_output['dup_event'] = df_output.apply (lambda row: find_duplicated_events(df_output, row), axis=1)

In [None]:
# Persist the frame with the 'dup_event' column; the dicts are written as text.
df_output.to_csv(r'/Users/anamaria/Desktop/dev/security_project/datasets/standardise_find_dup_event_nuse_26112019.csv',index=None)

### Delete duplicated events: preserve the first event on dup_event column

In [None]:
# Reload the frame saved with the 'dup_event' column.
data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/standardise_find_dup_event_nuse_26112019.csv'
df_input = pd.read_csv(data_location,delimiter=",")

In [None]:
# Print the shape first and leave the summary table as the last expression.
# With the print at the end, the table was built but never displayed.
print(df_input.shape)
# Per-column quality summary: dtype, cells equal to '-', cells equal to 'ND', NaN cells.
pd.DataFrame({"Tipo de dato":df_input.dtypes.values,
              "Celdas con valor '-'":(df_input == '-').sum().values,
              "Celdas con valor 'ND'":(df_input == 'ND').sum().values,
              "Celdas vacías": df_input.isna().sum().values},
             index=df_input.columns)

In [None]:
# Get the index of the registers that should be deleted.
# For every row, the first index of its dup_event group is preserved if it has
# not been classified yet; every other unclassified index of the group is
# marked as repeated.
import ast
df = df_input
list_idx_repeated = []
list_idx_preserved = []
registers_to_process = len(df)
list_idx_processed =[]
# Set mirror of list_idx_processed: membership tests stay O(1) instead of
# scanning an ever-growing list for every candidate index.
processed_lookup = set()
counter_processed = 0

for dup_event_text in df['dup_event']:
    # 'dup_event' holds the text form of a dict after the CSV round trip.
    current_dup_events = list(ast.literal_eval(dup_event_text).keys())

    # The emptiness check avoids an IndexError on a row whose group is empty.
    if current_dup_events and current_dup_events[0] not in processed_lookup:
        first_idx = current_dup_events.pop(0)
        list_idx_preserved.append(first_idx)
        list_idx_processed.append(first_idx)
        processed_lookup.add(first_idx)

    for idx_event in current_dup_events:
        if idx_event not in processed_lookup:
            list_idx_repeated.append(idx_event)
            list_idx_processed.append(idx_event)
            processed_lookup.add(idx_event)

    counter_processed += 1

# Report once at the end instead of printing a line per row.
print('Registers processed: ',counter_processed,'/',registers_to_process)


In [None]:
# Print the counts used to check, by eye, that the preserved/repeated
# classification was successful.
print(len(list_idx_repeated)+len(list_idx_preserved))
print(len(list_idx_processed))
join_list = list_idx_preserved + list_idx_repeated

import collections
# Distinct indices in first-seen order.
uniq = list(dict.fromkeys(join_list))
seen = set(uniq)

print(len(uniq))

# Occurrence count per index; an index is recorded as duplicated at the moment
# its second occurrence is met.
lst = join_list
uniqItems = {}
dupItems = []
for item in lst:
    uniqItems[item] = uniqItems.get(item, 0) + 1
    if uniqItems[item] == 2:
        dupItems.append(item)

print(len(dupItems))

In [None]:
# Work on a copy so df_input keeps the loaded values.
df_output = df_input.copy()

In [None]:
# Remove the repeated rows and the helper columns, then renumber.
# reset_index() keeps the old labels in an 'index' column, which is saved with
# the frame and removed later in the "Verify and enrich" section.
df_output = (
    df_output
    .drop(list_idx_repeated)
    .drop(columns=['dup_event','time_stamp'])
    .reset_index()
)

In [None]:
# Persist the standardised frame, without the row index.
df_output.to_csv(r'/Users/anamaria/Desktop/dev/security_project/datasets/standardise_result_nuse_27112019.csv',index=None)

In [None]:
#save lists
MyFile=open('/Users/anamaria/Desktop/dev/security_project/datasets/list_idx_preserved_27112019.txt','w')
MyList=map(lambda x: str(x)+'\n', list_idx_preserved)
MyFile.writelines(MyList)
MyFile.close()

In [None]:
# Save the repeated indices, one per line.
# The `with` block closes the file even if a write fails.
with open('/Users/anamaria/Desktop/dev/security_project/datasets/list_idx_repeated_27112019.txt','w') as MyFile:
    MyFile.writelines(str(x)+'\n' for x in list_idx_repeated)

# 4. Normalise

In [None]:
# Input for the normalise step.
# NOTE(review): the active path is the location-rebuild output; the
# standardise output is kept as the commented-out alternative. Confirm which
# file the pipeline is meant to continue from.
#data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/standardise_result_nuse_27112019.csv'
data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/rebuild_locations_nuse_29112019.csv'
df_input = pd.read_csv(data_location,delimiter=",")

In [None]:
# Shape, then the per-column quality summary: dtype, cells equal to '-',
# cells equal to 'ND', and NaN cells.
print(df_input.shape)
pd.DataFrame({"Tipo de dato":df_input.dtypes.values,
              "Celdas con valor '-'":(df_input == '-').sum().values,
              "Celdas con valor 'ND'":(df_input == 'ND').sum().values,
              "Celdas vacías": df_input.isna().sum().values},
             index=df_input.columns)

### Verify FECHA

In [None]:
# FECHA must match nn-www-nn: one or two digits, a three-character token,
# one or two digits.
# NOTE(review): the earlier comment described a four-digit year (nn-www-nnnn),
# but the pattern only accepts one or two trailing digits; confirm which one
# matches the data.
# Raw string: '\d' and '\w' are invalid escapes in an ordinary string literal.
regex_fecha = r'^\d{1,2}-\w{3}-\d{1,2}$'
df_input['FECHA'].str.contains(regex_fecha, regex=True).all()

### Verify HORA

In [None]:
# HORA is a number between 0 and 2359.
print(df_input['HORA'].between(0,2359).all())

# Zero-padded to four characters, it must also be a valid 24-hour HHMM time.
# The alternatives are grouped explicitly: in '^[0-2][0-9][0-5]|[0-9]$' the
# '|' splits the whole pattern, so any value ending in a digit matched and the
# check could never fail.
regex_hora = r'^(?:[01][0-9]|2[0-3])[0-5][0-9]$'
df_input['HORA'].apply(str).str.zfill(4).str.contains(regex_hora, regex=True).all()

### Verify ANIO

In [None]:
# ANIO must be a number between 2017 and 2019 (both bounds included).
df_input['ANIO'].between(2017,2019).all()

### Verify MES

In [None]:
# MES must be a number between 1 and 12 (both bounds included).
df_input['MES'].between(1,12).all()

### Verify COD_LOCALIDAD - LOCALIDAD

In [None]:
# Count rows per (COD_LOCALIDAD, LOCALIDAD) pair so mismatched pairs can be spotted by eye.
# STR_NUMERO_INTERNO is only the column being counted; the count is shown as 'Frecuencia'.
var_aux = 'STR_NUMERO_INTERNO'
df_input.groupby(['COD_LOCALIDAD','LOCALIDAD']).agg({var_aux:'count'}).reset_index().rename(columns={var_aux:'Frecuencia'})

### Verify LATITUD, LONGITUD

In [None]:
# Coordinates should be in Bogotá: load the GeoJSON used by check_bog_location.
json_file="/Users/anamaria/Desktop/dev/security_project/assets/bogota_polygon.geojson"
bog_loc=gpd.read_file(json_file)

In [None]:
# Work on a copy so df_input keeps the loaded values.
df_output=df_input.copy()

In [None]:
def check_bog_location(df, row):
    """Return True when the (LONGITUD, LATITUD) point of `row` is contained in
    the first geometry of the module-level `bog_loc`, False otherwise.

    `df` is accepted but not used.
    """
    current_point = Point(row['LONGITUD'], row['LATITUD'])
    # contains() yields one flag per geometry in bog_loc; only the first is read.
    contained_flags = bog_loc.geometry.contains(current_point)
    return bool(contained_flags[0])


In [None]:
# Flag every row with the result of check_bog_location in the 'in_bogota?' column.
df_output['in_bogota?'] = df_output.apply (lambda row: check_bog_location(df_output, row), axis=1)

In [None]:
# Row counts: total, inside, outside, outside with both coordinates equal to
# -1, and outside with neither coordinate equal to -1.
inside_bogota = df_output['in_bogota?'] == True
outside_bogota = df_output['in_bogota?'] == False
lat_is_sentinel = df_output['LATITUD']==-1
lon_is_sentinel = df_output['LONGITUD']==-1

print(len(df_output))
print(len(df_output.loc[inside_bogota]))
print(len(df_output.loc[outside_bogota]))
print(len(df_output.loc[outside_bogota & lat_is_sentinel & lon_is_sentinel]))
print(len(df_output.loc[outside_bogota & ~lat_is_sentinel & ~lon_is_sentinel]))

In [None]:
# Display the rows flagged as outside Bogotá.
df_output.loc[(df_output['in_bogota?'] == False)]

In [None]:
# Persist the frame with the 'in_bogota?' flag, before the outside rows are dropped.
df_output.to_csv(r'/Users/anamaria/Desktop/dev/security_project/datasets/normalise_find_out_bogota_nuse_29112019.csv',index=None)

In [None]:
# Get the index of the registers outside Bogotá and drop them; the final
# expression shows whether every remaining row is flagged as inside.
outside_mask = df_output['in_bogota?'] == False
list_index_out_bogota = df_output.index[outside_mask]
df_output = df_output.drop(index=list_index_out_bogota)
df_output['in_bogota?'].all()

In [None]:
# Persist the normalised frame, without the row index.
df_output.to_csv(r'/Users/anamaria/Desktop/dev/security_project/datasets/normalise_result_nuse_29112019.csv',index=None)

# 5. De-duplicate

In [None]:
# Reload the normalised frame saved at the end of section 4.
data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/normalise_result_nuse_29112019.csv'
df_input = pd.read_csv(data_location,delimiter=",")

In [None]:
# Shape, then the per-column quality summary: dtype, cells equal to '-',
# cells equal to 'ND', and NaN cells.
print(df_input.shape)
pd.DataFrame({"Tipo de dato":df_input.dtypes.values,
              "Celdas con valor '-'":(df_input == '-').sum().values,
              "Celdas con valor 'ND'":(df_input == 'ND').sum().values,
              "Celdas vacías": df_input.isna().sum().values},
             index=df_input.columns)

### Verify there are no identical rows

In [None]:
# Number of rows that repeat an earlier row in every column.
print("Filas duplicadas",df_input.duplicated().sum())

### Verify unique STR_NUMERO_INTERNO

In [None]:
# True when no STR_NUMERO_INTERNO value occurs more than once.
df_input['STR_NUMERO_INTERNO'].is_unique

In [None]:
# Persist the frame for the next step; df_input is written as loaded, since
# the checks above do not modify it.
df_input.to_csv(r'/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_nuse_29112019.csv',index=None)

# 6. Verify and enrich

In [None]:
# Reload the frame saved at the end of section 5.
data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_nuse_29112019.csv'
df_input = pd.read_csv(data_location,delimiter=",")

In [None]:
# Shape, then the per-column quality summary: dtype, cells equal to '-',
# cells equal to 'ND', and NaN cells.
print(df_input.shape)
pd.DataFrame({"Tipo de dato":df_input.dtypes.values,
              "Celdas con valor '-'":(df_input == '-').sum().values,
              "Celdas con valor 'ND'":(df_input == 'ND').sum().values,
              "Celdas vacías": df_input.isna().sum().values},
             index=df_input.columns)

In [None]:
# Work on a copy so df_input keeps the loaded values.
df_output=df_input.copy()

### Verify columns with empty or anomalous values

In [None]:
# Check COD_UPZ, UPZ, SEC_CATASTRAL with '-' values
df1 = df_output.loc[df_output['COD_UPZ']=='-']
df2 = df_output.loc[df_output['UPZ']=='-']
df3 = df_output.loc[df_output['SEC_CATASTRAL']=='-']
df1.equals(df2) and df1.equals(df3)

In [None]:
# Check COD_UPZ, UPZ, SEC_CATASTRAL with empty values
df_output.loc[df_output['COD_UPZ'].isna(),'COD_UPZ'] = '-'
df_output.loc[df_output['UPZ'].isna(),'UPZ'] = '-'
df_output.loc[df_output['SEC_CATASTRAL'].isna(),'SEC_CATASTRAL'] = '-'

In [None]:
# Check ESTADO_INCIDENTE with empty values
df_output['ESTADO_INCIDENTE'].value_counts()
#rebuild empty values with 'CERRADO'
df_output.loc[df_output['ESTADO_INCIDENTE'].isna(),'ESTADO_INCIDENTE'] = 'CERRADO'

In [None]:
# Check BARRIO and COD_BARRIO with '-' values
df1 = df_output.loc[df_output['BARRIO']=='-']
df2 = df_output.loc[df_output['COD_BARRIO']=='-']
df3 = df_output.loc[df_output['COD_SEC_CATAST']=='-']
df1.equals(df2) and df1.equals(df3)

In [None]:
# Check STR_DIRECCION_INCIDENTE with 'ND' values
# Rows whose address is 'ND'. display() is needed because only the last
# expression of a cell is shown automatically.
display(df_output.loc[(df_output['STR_DIRECCION_INCIDENTE'] == 'ND')])
# Of those, the rows whose locality code is 99 ('SIN LOCALIZACION').
df_output.loc[(df_output['STR_DIRECCION_INCIDENTE'] == 'ND') & (df_output['COD_LOCALIDAD'] == 99)]

### Delete additional columns created during the cleaning process

In [None]:
# Remove the helper columns added by earlier steps, then renumber the rows.
# reset_index() stores the previous labels in a new 'index' column, which the
# next cell removes.
df_output = df_output.drop(columns=['index','in_bogota?']).reset_index()

In [None]:
# Remove the 'index' column created by the reset_index() call in the previous cell.
df_output.drop(columns=['index'],inplace=True)

In [None]:
# Shape, then the per-column quality summary of the final frame: dtype, cells
# equal to '-', cells equal to 'ND', and NaN cells.
print(df_output.shape)
pd.DataFrame({"Tipo de dato":df_output.dtypes.values,
              "Celdas con valor '-'":(df_output == '-').sum().values,
              "Celdas con valor 'ND'":(df_output == 'ND').sum().values,
              "Celdas vacías": df_output.isna().sum().values},
             index=df_output.columns)

In [None]:
# Persist the final frame of the pipeline, without the row index.
df_output.to_csv(r'/Users/anamaria/Desktop/dev/security_project/datasets/verify_enrich_nuse_29112019.csv',index=None)