In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import calendar
# Show the full content of every cell and every column when a frame is displayed.
# `None` means "no limit"; the legacy value -1 for max_colwidth was deprecated in
# pandas 1.0 and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
import geopandas as gpd

# 1. Merge datasets

In [None]:
# Merge is not required

# 2. Rebuild missing data

In [None]:
data_location = '/u01/user8/Documents/Riñas/RNMC/data_2019_11/df_riñas_rnmc.csv'
df_input=pd.read_csv(data_location,delimiter=",")

In [None]:
pd.DataFrame({"Tipo de dato":df_input.dtypes.values,
              "Celdas con valor '-'":(df_input == '-').sum().values,
              "Celdas con valor ''":(df_input == '').sum().values,
              "Celdas con valor ' '":(df_input == ' ').sum().values,
              "Celdas vacías": df_input.isna().sum().values},
             index=df_input.columns)

## Built-in methods

In [None]:
def create_Subdata(df,variables_array):
    """Count the rows of `df` for each combination of the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    variables_array : list
        Column labels to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group, sorted by the grouping keys: the grouping columns
        followed by the row count in a column named "Frecuencia".
    """
    conteo = df.groupby(variables_array).size()
    tabla = conteo.to_frame("Frecuencia")
    return tabla.sort_index().reset_index()

In [None]:
def inp_dist(df,columna,vacio='-',random_state=None):
    """Fill the missing cells of one column by sampling its observed distribution.

    Cells equal to `vacio` are treated as missing, together with cells that are
    already NaN. Every missing cell receives a value drawn at random from the
    non-missing values of the same column, each value being chosen with a
    probability proportional to its frequency. `df` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is completed (any index is supported).
    columna : str
        Label of the column to complete.
    vacio : object, default '-'
        Placeholder that marks a missing cell.
    random_state : int, optional
        Seed for a reproducible draw. When omitted, NumPy's global random
        state is used, as before.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the column has missing cells but no observed value to sample from.
    """
    # Work on a new Series and assign it back: an in-place replace on
    # df[columna] may act on a temporary object and never reach df.
    serie = df[columna].replace(vacio, np.nan)
    faltantes = serie.isna()
    n_faltantes = int(faltantes.sum())
    if n_faltantes == 0:
        return

    # value_counts() ignores NaN, so only observed values are candidates.
    frecuencias = serie.value_counts()
    if frecuencias.empty:
        raise ValueError(
            "Column %r has no observed values to sample from" % (columna,)
        )
    probabilidades = frecuencias.to_numpy(dtype=float) / frecuencias.sum()

    generador = np.random if random_state is None else np.random.RandomState(random_state)
    muestras = generador.choice(frecuencias.index.to_numpy(), p=probabilidades, size=n_faltantes)

    # Boolean-mask assignment is label-safe; positional writes through .values
    # are only correct when the index happens to be 0..n-1.
    serie.loc[faltantes] = muestras
    df[columna] = serie

## Define variables to rebuild

### Structured location fields: COD_UPZ, NOMBRE_UPZ, COD_SCAT, NOMBRE_SECTOR_CAT, COD_BARRIO, NOMBRE_BARRIO

#### 1. Can a missing structured location field be rebuilt using the information of another structured field?
#### No, since all the missing structured location fields belong to the same subset of records

In [None]:
# One filtered frame per structured location field: the rows of df_input where
# that field holds the '-' placeholder.
df1, df2, df3, df4, df5, df6 = [
    df_input.loc[df_input[campo] == '-']
    for campo in (
        'COD_UPZ', 'NOMBRE_UPZ',
        'COD_SCAT', 'NOMBRE_SECTOR_CAT',
        'COD_BARRIO', 'NOMBRE_BARRIO',
    )
]

In [None]:
df1.equals(df2) and df1.equals(df3) and df1.equals(df4) and df1.equals(df5) and df1.equals(df6)

#### 2. Can a missing structured location field be rebuilt using the latitude and longitude fields?
#### No, these records have their latitude and longitude fields set to 0

In [None]:
df1['LATITUD'].unique()

In [None]:
df1['LONGITUD'].unique()

#### 3. Can a missing structured location field be rebuilt from other non-structured fields such as TIPO_SITIO, STR_RELATO_HECHOS or DESCRIPCION_COMPORTAMIENTO?
#### No, the non-structured fields don't contain address or location information

In [None]:
df1['TIPO_SITIO'].unique()

In [None]:
import re
# Look for street-address patterns in the free-text narrative of the rows whose
# structured location is missing.
# Raw string: '\s' and '\d' are regex classes, not Python string escapes.
address_regex = r'(CL|CALLE|DG|DIAG|KR|CR|CRA|TV|TRANS)+\s\d+.*(,,)'
address_pattern = re.compile(address_regex)
list_address = []
for relato in df1['STR_RELATO_HECHOS'].unique():
    # A narrative may be NaN (a float) rather than text; searching it would raise TypeError.
    address_found = address_pattern.search(relato) if isinstance(relato, str) else None
    # Store the matched text (or None), not the Match object: Match objects never
    # compare equal, so they cannot be de-duplicated afterwards.
    list_address.append(address_found.group(0) if address_found else None)
    

In [None]:
#unique elements on list_address
mylist = list(set(list_address))
mylist

In [None]:
df1['DESCRIPCION_COMPORTAMIENTO'].unique()

### Conclusion: the missing structured location fields (COD_UPZ, NOMBRE_UPZ, COD_SCAT, NOMBRE_SECTOR_CAT, COD_BARRIO, NOMBRE_BARRIO) cannot be rebuilt

#### Assign "ND" (No disponible) to structured location missing fields

In [None]:
# Work on an independent copy: a plain assignment would only alias df_input,
# so every later in-place change to df_output would also alter the loaded data.
df_output = df_input.copy()

In [None]:
# Replace the '-' placeholder with 'ND' (No disponible) in every structured location field.
for campo_ubicacion in ('COD_UPZ', 'NOMBRE_UPZ', 'COD_SCAT',
                        'NOMBRE_SECTOR_CAT', 'COD_BARRIO', 'NOMBRE_BARRIO'):
    df_output.loc[df_output[campo_ubicacion] == '-', campo_ubicacion] = 'ND'

## =====================================================================

### RANGO_EDAD_1

#### 1. Can RANGO_EDAD_1 be rebuilt from another field?
#### No, the other fields don't have information related to the offender's age

In [None]:
df7 = df_output.loc[df_output['RANGO_EDAD_1'] == '-']
df7.head(2)

#### 2. Rebuild considering the variable distribution on dataset

In [None]:
subdata = create_Subdata(df_output,["RANGO_EDAD_1"])
subdata

In [None]:
columna = 'RANGO_EDAD_1'
inp_dist(df_output,columna)

In [None]:
subdata = create_Subdata(df_output,["RANGO_EDAD_1"])
subdata

## ======================================================================

### MEDIDA_CORRECTIVA_CODIGO

#### 1. Can the field be rebuilt from other records that have the same título, artículo... cod_comportamiento?
#### No, other records don't share the same values for these fields

In [None]:
df8 = df_output.loc[df_output['MEDIDA_CORRECTIVA_CODIGO'] == '-']
df8.head(2)

In [None]:
print(df8['TITULO'].unique())
print(df8['CAPITULO'].unique())
print(df8['ARTICULO'].unique())
print(df8['DESCRIPCION_ARTICULO'].unique())
print(df8['COD_COMPORTAMIENTO'].unique())
print(df8['DESCRIPCION_COMPORTAMIENTO'].unique())

In [None]:
df9 = df8[(df8['TITULO'] == 'Titulo 3') & (df8['CAPITULO'] == 'Capitulo 1') & (df8['ARTICULO'] == 27) & (df8['DESCRIPCION_ARTICULO'] == 'Comportamientos que ponen en riesgo la vida e integridad. Corregido por el art. 1, Decreto 555 de 2017. Los siguientes comportamientos ponen en riesgo la vida e integridad de las personas, y, por lo tanto, son contrarios a la convivencia:') & (df8['COD_COMPORTAMIENTO'] == 'Articulo 27') & (df8['DESCRIPCION_COMPORTAMIENTO'] == 'Comportamientos que ponen en riesgo la vida e integridad. Corregido por el art. 1, Decreto 555 de 2017. Los siguientes comportamientos ponen en riesgo la vida e integridad de las personas, y, por lo tanto, son contrarios a la convivencia:')]
df9['MEDIDA_CORRECTIVA_CODIGO'].unique()

#### 2. Rebuild considering the variable distribution on dataset

In [None]:
subdata = create_Subdata(df_output,["MEDIDA_CORRECTIVA_CODIGO"])
subdata

In [None]:
columna = 'MEDIDA_CORRECTIVA_CODIGO'
inp_dist(df_output,columna)

In [None]:
subdata = create_Subdata(df_output,["MEDIDA_CORRECTIVA_CODIGO"])
subdata

## ======================================================================

### STR_RELATO_HECHOS

#### 1. Can the variable be rebuilt from other fields?
#### No. Strategy: 'ND' (No disponible) is assigned to the empty field

In [None]:
df_output.loc[df_output['STR_RELATO_HECHOS'].isna()]

In [None]:
# Fill every empty narrative with "ND", selecting the rows by condition rather
# than by a hard-coded row label that goes stale as soon as the data changes.
df_output.loc[df_output['STR_RELATO_HECHOS'].isna(), 'STR_RELATO_HECHOS'] = "ND"

## ======================================================================

### ACTIVIDAD_COMERCIAL, RAZON_SOCIAL

#### 1. Can the variables be rebuilt from other records with the same 'tipo_sitio' value?
#### No

In [None]:
df9 = df_output.loc[df_output["ACTIVIDAD_COMERCIAL"] == '-']
df10 = df_output.loc[df_output["RAZON_SOCIAL"] == '-']
df9.equals(df10)

In [None]:
print(df_output['TIPO_SITIO'].unique())
print(df9['TIPO_SITIO'].unique())

In [None]:
df11 = df_output.loc[df_output["ACTIVIDAD_COMERCIAL"] != '-']
print(len(df11))
df11.head(2)

#### 2. Assign 'ND' (No disponible) to variables with '-' value

In [None]:
df_output.loc[df_output['ACTIVIDAD_COMERCIAL']=='-','ACTIVIDAD_COMERCIAL'] = 'ND'

In [None]:
df_output.loc[df_output['RAZON_SOCIAL']=='-','RAZON_SOCIAL'] = 'ND'

## Check final status of variables after rebuild process

In [None]:
pd.DataFrame({"Tipo de dato":df_output.dtypes.values,
              "Celdas con valor '-'":(df_output == '-').sum().values,
              "Celdas con valor 'ND'":(df_output == 'ND').sum().values,
              "Celdas vacías": df_output.isna().sum().values},
             index=df_output.columns)

In [None]:
df_output.to_csv(r'/u01/user8/Documents/Riñas/RNMC/data_2019_11/rebuild_rnmc_05022020.csv',index=None)

# 3. Standardise

In [None]:
data_location = '/u01/user8/Documents/Riñas/RNMC/data_2019_11/rebuild_rnmc_05022020.csv'
df_input = pd.read_csv(data_location,delimiter=",")

In [None]:
pd.DataFrame({"Tipo de dato":df_input.dtypes.values,
              "Celdas con valor '-'":(df_input == '-').sum().values,
              "Celdas con valor 'ND'":(df_input == 'ND').sum().values,
              "Celdas vacías": df_input.isna().sum().values},
             index=df_input.columns)

### 3.1 Unique record per riña event: since NUM_ID_HECHOS is a unique identifier, the number of rows in the DataFrame is expected to match the number of unique NUM_ID_HECHOS values

In [None]:
print(len(df_input))
print(len(df_input['EXPEDIENTE'].unique()))
print(len(df_input['COMPARENDO'].unique()))
print(len(df_input['NUM_ID_HECHOS'].unique()))

In [None]:
# Identify the duplicated NUM_ID_HECHOS
df_id_hechos_duplicated = df_input[df_input.duplicated(['NUM_ID_HECHOS'])]
list_id_hechos_duplicated = df_id_hechos_duplicated['NUM_ID_HECHOS'].tolist()
list_id_hechos_duplicated[0:10]

In [None]:
#Difference among duplicated pairs NUM_ID_HECHOS seems to be RANGO_EDAD variable
df_input.loc[df_input['NUM_ID_HECHOS']==list_id_hechos_duplicated[1]]

In [None]:
# Repeated NUM_ID_HECHOS are related with RANGO_EDAD missing registers that were rebuilt? No
df7.loc[df7['NUM_ID_HECHOS']==list_id_hechos_duplicated[123]]

In [None]:
# Check whether the rows that share a duplicated NUM_ID_HECHOS differ only in RANGO_EDAD.
# The column list passed to duplicated() is the set of columns compared;
# RANGO_EDAD_1 is not part of it.
list_differ_not_age = []
for id_num in list_id_hechos_duplicated:
    # All rows recorded under this event id.
    df = df_input.loc[df_input['NUM_ID_HECHOS']==id_num]
    # Rows that repeat an earlier row of the same id on every compared column.
    df_duplicated = df[df.duplicated(['EXPEDIENTE','COMPARENDO','FECHA','HORA','ANIO','MES','LATITUD','LONGITUD','COD_LOCALIDAD','NOMBRE_LOCALIDAD','COD_UPZ','NOMBRE_UPZ','COD_SCAT','NOMBRE_SECTOR_CAT','COD_BARRIO','NOMBRE_BARRIO','TIPO_SITIO','TITULO','CAPITULO','ARTICULO','DESCRIPCION_ARTICULO','COD_COMPORTAMIENTO','DESCRIPCION_COMPORTAMIENTO','TIPO_PRIORIZACION','MEDIDA_CORRECTIVA_CODIGO','ES_PEDAGOGICO','STR_RELATO_HECHOS','ACTIVIDAD_COMERCIAL','RAZON_SOCIAL','NUM_ID_HECHOS'])]
    # No repeats: the rows of this id differ in at least one of the compared columns.
    if len(df_duplicated) == 0:
        list_differ_not_age.append(id_num)
len(list_differ_not_age)   

In [None]:
list_differ_not_age

In [None]:
df_input.loc[df_input['NUM_ID_HECHOS'] == 1993080]

In [None]:
print(len(df_input))
print(len(df_id_hechos_duplicated))

In [None]:
df_clean = df_input.drop_duplicates(subset='NUM_ID_HECHOS', keep='first')
print(len(df_clean) == len(df_input['NUM_ID_HECHOS'].unique()))
print((df_clean['NUM_ID_HECHOS'].unique() == df_input['NUM_ID_HECHOS'].unique()).all())
df_clean[df_clean.duplicated(['NUM_ID_HECHOS'])]

In [None]:
df_output = df_clean
df_output.to_csv(r'/u01/user8/Documents/Riñas/RNMC/data_2019_11/standardise_by_num_id_hechos_rnmc_06022020.csv',index=None)

## 3.2 One record per event: events that occur within a 400 m radius and a 20-minute time interval are treated as the same event

In [None]:
data_location = '/u01/user8/Documents/Riñas/RNMC/data_2019_11/standardise_by_num_id_hechos_rnmc_06022020.csv'
df_input = pd.read_csv(data_location,delimiter=",")

In [None]:
# Create a timestamp column to handle time ranges in the unique-event process.
# HORA is cast to text and left-padded with zeros to 4 characters before being appended to FECHA.
# NOTE(review): no explicit `format=` is given, so parsing relies on pandas' inference —
# presumably FECHA is YYYY-MM-DD and HORA is HHMM; confirm.
df_input['time_stamp']=pd.to_datetime(df_input['FECHA'] + ' ' + df_input["HORA"].astype(str).str.rjust(4,'0'))

In [None]:
df_input.columns

### Find duplicated events

In [None]:
import time, datetime
# Length of the time window; find_duplicated_events uses it as a number of minutes.
time_offset = 20
# Half-side of the square search area, in the units of LATITUD/LONGITUD.
# NOTE(review): the section heading mentions a 400 m radius; if the coordinates are
# decimal degrees, 0.001 is roughly 111 m — confirm the intended value.
coor_offset = 0.001

In [None]:
def find_duplicated_events(df, row, time_window_minutes=None, coord_tolerance=None):
    """Find the events of `df` that fall in the same space-time window as `row`.

    An event matches when both conditions hold:

    * its `time_stamp` lies in the half-open interval
      [row time_stamp, row time_stamp + time window), so the search only
      looks forward in time;
    * its LATITUD and LONGITUD are strictly closer than the tolerance to the
      coordinates of `row` (an open square: points exactly on the border are
      excluded, as with the polygon `within` test this replaces).

    The comparison is done with vectorized column masks instead of building a
    polygon and testing every candidate row in a Python loop.

    Parameters
    ----------
    df : pandas.DataFrame
        Events to search; needs the columns 'time_stamp', 'LATITUD',
        'LONGITUD' and 'NUM_ID_HECHOS'.
    row : pandas.Series or mapping
        Reference event; needs 'time_stamp', 'LATITUD' and 'LONGITUD'.
    time_window_minutes : number, optional
        Length of the time window in minutes. Defaults to the module-level
        `time_offset`.
    coord_tolerance : number, optional
        Half-side of the search square, in coordinate units. Defaults to the
        module-level `coor_offset`.

    Returns
    -------
    dict
        {index label in `df`: NUM_ID_HECHOS} for every matching event, in the
        row order of `df`.
    """
    if time_window_minutes is None:
        time_window_minutes = time_offset
    if coord_tolerance is None:
        coord_tolerance = coor_offset

    current_time = row['time_stamp']
    current_lat = row['LATITUD']
    current_lon = row['LONGITUD']
    limit_time_interval = current_time + pd.Timedelta(minutes=time_window_minutes)

    in_time = (df['time_stamp'] >= current_time) & (df['time_stamp'] < limit_time_interval)
    in_lat = (df['LATITUD'] > current_lat - coord_tolerance) & (df['LATITUD'] < current_lat + coord_tolerance)
    in_lon = (df['LONGITUD'] > current_lon - coord_tolerance) & (df['LONGITUD'] < current_lon + coord_tolerance)

    # to_dict() yields plain Python keys/values, so the text form written to CSV
    # can be read back with ast.literal_eval.
    return df.loc[in_time & in_lat & in_lon, 'NUM_ID_HECHOS'].to_dict()

In [None]:
df_output = df_input.copy()

In [None]:
df_output['dup_event'] = df_output.apply (lambda row: find_duplicated_events(df_output, row), axis=1)

In [None]:
df_output.to_csv('/u01/user8/Documents/Riñas/RNMC/data_2019_11/standardise_find_dup_spatio_temporal_events_rnmc_06022020.csv',index=None)

### Delete duplicated events: preserve the first event on dup_event column

In [None]:
data_location = '/u01/user8/Documents/Riñas/RNMC/data_2019_11/standardise_find_dup_spatio_temporal_events_rnmc_06022020.csv'
df_input = pd.read_csv(data_location,delimiter=",")

In [None]:
pd.DataFrame({"Tipo de dato":df_input.dtypes.values,
              "Celdas con valor '-'":(df_input == '-').sum().values,
              "Celdas con valor 'ND'":(df_input == 'ND').sum().values,
              "Celdas vacías": df_input.isna().sum().values},
             index=df_input.columns)

In [None]:
df_input.iloc[[4]]

In [None]:
df_input.iloc[[8]]

In [None]:
#Get index of registers that should be deleted
import ast
df = df_input
list_idx_repeated = []
list_idx_preserved = []
registers_to_process = len(df)
list_idx_processed = []
# Set twin of list_idx_processed: constant-time membership tests instead of
# scanning an ever-growing list for every index.
set_idx_processed = set()
counter_processed = 0

for dup_event_text in df['dup_event']:
    # dup_event was written to the CSV as the text form of a dict
    # {row index: NUM_ID_HECHOS}; only the row indexes are needed here.
    current_dup_events = list(ast.literal_eval(dup_event_text).keys())

    for position, idx_event in enumerate(current_dup_events):
        if idx_event in set_idx_processed:
            continue
        # The first index of the group is the event to keep; any other index
        # not seen before is a repetition of it.
        if position == 0:
            list_idx_preserved.append(idx_event)
        else:
            list_idx_repeated.append(idx_event)
        list_idx_processed.append(idx_event)
        set_idx_processed.add(idx_event)

    counter_processed += 1

# Single summary line instead of one printed line per row.
print('Registers processed: ',counter_processed,'/',registers_to_process)


In [None]:
#check (quantitatively) that the identification of preserved and repeated event indexes was successful
import collections

join_list = list_idx_preserved + list_idx_repeated
print(len(list_idx_repeated)+len(list_idx_preserved))
print(len(list_idx_processed))

# Distinct indexes, in order of first appearance.
uniq = list(dict.fromkeys(join_list))
seen = set(uniq)
print(len(uniq))

# Occurrences per index; an index is recorded as duplicated when it shows up
# for the second time.
lst = join_list
uniqItems = collections.Counter()
dupItems = []
for x in lst:
    uniqItems[x] += 1
    if uniqItems[x] == 2:
        dupItems.append(x)
uniqItems = dict(uniqItems)

print(len(dupItems))

In [None]:
df_output = df_input.copy()

In [None]:
# Remove the rows whose index labels were collected in list_idx_repeated.
df_output=df_output.drop(list_idx_repeated)
# Drop the helper columns created for the duplicate-event search.
df_output.drop(columns=['dup_event','time_stamp'],inplace=True)
# reset_index without drop=True keeps the previous row labels as a new 'index' column.
df_output.reset_index(inplace=True)

In [None]:
len(df_output)

In [None]:
#save lists
# One preserved index per line; the context manager closes the file even if writing fails.
with open('/u01/user8/Documents/Riñas/RNMC/data_2019_11/standardise_dup_events_list_idx_preserved_06022020.txt','w') as MyFile:
    MyFile.writelines(str(idx)+'\n' for idx in list_idx_preserved)

In [None]:
#save lists
# One repeated index per line; the context manager closes the file even if writing fails.
with open('/u01/user8/Documents/Riñas/RNMC/data_2019_11/standardise_dup_events_list_idx_repeated_06022020.txt','w') as MyFile:
    MyFile.writelines(str(idx)+'\n' for idx in list_idx_repeated)

In [None]:
pd.DataFrame({"Tipo de dato":df_output.dtypes.values,
              "Celdas con valor '-'":(df_output == '-').sum().values,
              "Celdas con valor 'ND'":(df_output == 'ND').sum().values,
              "Celdas vacías": df_output.isna().sum().values},
             index=df_output.columns)

In [None]:
df_output.to_csv(r'/u01/user8/Documents/Riñas/RNMC/data_2019_11/standardise_result_rnmc_06022020.csv',index=None)

# 4. Normalise

In [None]:
data_location = '/home/combios/Documents/amreyesp/clean_rnmc_data/standardise_result_rnmc_06022020.csv'
df_input = pd.read_csv(data_location,delimiter=",")

In [None]:
print(df_input.shape)
pd.DataFrame({"Tipo de dato":df_input.dtypes.values,
              "Celdas con valor '-'":(df_input == '-').sum().values,
              "Celdas con valor 'ND'":(df_input == 'ND').sum().values,
              "Celdas vacías": df_input.isna().sum().values},
             index=df_input.columns)

### Verify FECHA

In [None]:
# FECHA must have the form YYYY-mm-dd.
# Raw string so that '\d' reaches the regex engine untouched; in a plain string
# literal it is an invalid escape sequence that newer Python versions warn about.
regex_fecha = r'^\d{4}-\d{2}-\d{2}$'
df_input['FECHA'].str.contains(regex_fecha, regex=True).all()

### Verify HORA

In [None]:
# It´s a number between 0 and 2359
print(df_input['HORA'].between(0,2359).all())

# HORA is an HHMM clock value: hours 0-23 followed by minutes 00-59.
# Leading zeros are optional because HORA is a number turned into text
# (e.g. '930' for 09:30, '5' for 00:05).
# The alternatives are wrapped in one group so that ^ and $ anchor all of them;
# left ungrouped, `^A|B$` accepts any text that merely ends in a digit.
regex_hora = r'^(?:(?:[01]?[0-9]|2[0-3])[0-5][0-9]|[0-5]?[0-9])$'
df_input['HORA'].apply(str).str.contains(regex_hora, regex=True).all()

### Verify ANIO

In [None]:
# It´s a number between 2017 and 2019
df_input['ANIO'].between(2017,2019).all()

### Verify MES

In [None]:
# It´s a number between 1 and 12
df_input['MES'].between(1,12).all()

### Verify COD_LOCALIDAD - NOMBRE_LOCALIDAD

In [None]:
var_aux = 'NUM_ID_HECHOS'
df_input.groupby(['COD_LOCALIDAD','NOMBRE_LOCALIDAD']).agg({var_aux:'count'}).reset_index().rename(columns={var_aux:'Frecuencia'})

In [None]:
df_output = df_input.copy()

In [None]:
df_output.loc[df_output['NOMBRE_LOCALIDAD'] == 'ANTONIO NARI?O', 'NOMBRE_LOCALIDAD'] = "ANTONIO NARIÑO"

### Verify LATITUD, LONGITUD

In [None]:
# Should be in Bogotá
json_file="/home/combios/Documents/amreyesp/security_project/assets/bogota_polygon.geojson"
bog_loc=gpd.read_file(json_file)

In [None]:
def check_bog_location(df, row):
    """Tell whether the coordinates of `row` lie inside the geometry labelled 0 of `bog_loc`.

    Parameters
    ----------
    df : pandas.DataFrame
        Not used by the function.
    row : pandas.Series or mapping
        Must provide 'LATITUD' and 'LONGITUD'.

    Returns
    -------
    bool
        True when that geometry of the module-level `bog_loc` contains the point.
    """
    # The point is built as (longitude, latitude).
    current_point = Point(row['LONGITUD'], row['LATITUD'])
    # contains() gives one result per geometry; only the entry labelled 0 is consulted.
    return True if bog_loc.geometry.contains(current_point)[0] else False


In [None]:
df_output['in_bogota?'] = df_output.apply (lambda row: check_bog_location(df_output, row), axis=1)

In [None]:
print(len(df_output))
print(len(df_output.loc[df_output['in_bogota?'] == True]))
print(len(df_output.loc[df_output['in_bogota?'] == False]))
print(len(df_output.loc[(df_output['in_bogota?'] == False) & (df_output['LATITUD']==-1) & (df_output['LONGITUD']==-1)]))
print(len(df_output.loc[(df_output['in_bogota?'] == False) & (df_output['LATITUD']!=-1) & (df_output['LONGITUD']!=-1)]))

In [None]:
df_output.loc[(df_output['in_bogota?'] == False)]

In [None]:
df_output.to_csv(r'/home/combios/Documents/amreyesp/clean_rnmc_data/normalise_find_out_bogota_RNMC_12022020.csv',index=None)

In [None]:
#Get index of registers out of Bogota and drop it
list_index_out_bogota=df_output[(df_output['in_bogota?'] == False)].index
df_output=df_output.drop(list_index_out_bogota)
df_output['in_bogota?'].all()

In [None]:
# Discard the helper columns, then renumber the rows from 0 without keeping the
# old labels as a column.
df_output.drop(columns=['index','in_bogota?'],inplace=True)
df_output.reset_index(drop=True,inplace=True)
print(len(df_output))

### other variables: TIPO_SITIO, RANGO_EDAD_1, TIPO_PRIORIZACION, ES_PEDAGOGICO, STR_RELATO_HECHOS, ACTIVIDAD_COMERCIAL, RAZON_SOCIAL

In [None]:
df_output['TIPO_SITIO'].unique()

In [None]:
df_output['RANGO_EDAD_1'].unique()

In [None]:
df_output['TIPO_PRIORIZACION'].unique()

In [None]:
df_output['ES_PEDAGOGICO'].unique()

In [None]:
df_output['STR_RELATO_HECHOS'].unique()

In [None]:
df_output['ACTIVIDAD_COMERCIAL'].unique()

In [None]:
df_output['RAZON_SOCIAL'].unique()

### normativity variables: TITULO, CAPITULO, ARTICULO, DESCRIPCION_ARTICULO

In [None]:
df_output['TITULO'].unique()

In [None]:
df_output['CAPITULO'].unique()

In [None]:
df_output['ARTICULO'].unique()

In [None]:
df_output['DESCRIPCION_ARTICULO'].unique()

In [None]:
df_output['COD_COMPORTAMIENTO'].unique()

In [None]:
df_output['DESCRIPCION_COMPORTAMIENTO'].unique()

In [None]:
df_output['MEDIDA_CORRECTIVA_CODIGO'].unique()

In [None]:
df_output.to_csv(r'/home/combios/Documents/amreyesp/clean_rnmc_data/normalise_result_rnmc_12022020.csv',index=None)

# 5. De-duplicate

In [None]:
data_location = '/home/combios/Documents/amreyesp/clean_rnmc_data/normalise_result_rnmc_12022020.csv'
df_input = pd.read_csv(data_location,delimiter=",")

In [None]:
print(df_input.shape)
pd.DataFrame({"Tipo de dato":df_input.dtypes.values,
              "Celdas con valor '-'":(df_input == '-').sum().values,
              "Celdas con valor 'ND'":(df_input == 'ND').sum().values,
              "Celdas vacías": df_input.isna().sum().values},
             index=df_input.columns)

### Verify there are no identical rows

In [None]:
print("Filas duplicadas",df_input.duplicated().sum())

### Verify unique NUM_ID_HECHOS

In [None]:
len(df_input) == len(df_input['NUM_ID_HECHOS'].unique())

In [None]:
df_input.to_csv(r'/home/combios/Documents/amreyesp/clean_rnmc_data/deduplicate_rnmc_12022020.csv',index=None)

# 6. Verify and enrich

In [None]:
data_location = '/home/combios/Documents/amreyesp/clean_rnmc_data/deduplicate_rnmc_12022020.csv'
df_input = pd.read_csv(data_location,delimiter=",")

In [None]:
print(df_input.shape)
pd.DataFrame({"Tipo de dato":df_input.dtypes.values,
              "Celdas con valor '-'":(df_input == '-').sum().values,
              "Celdas con valor 'ND'":(df_input == 'ND').sum().values,
              "Celdas vacías": df_input.isna().sum().values},
             index=df_input.columns)

In [None]:
df_output=df_input.copy()

### Verify columns with empty or anomalous values

In [None]:
# Check ACTIVIDAD_COMERCIAL, RAZON_SOCIAL  with 'ND' values
df1 = df_output.loc[df_output['ACTIVIDAD_COMERCIAL']=='ND']
df2 = df_output.loc[df_output['RAZON_SOCIAL']=='ND']
df1.equals(df2)

### Delete additional columns created during the cleaning process

In [None]:
# compare columns of original dataset vs cleaned dataset

In [None]:
df_original = pd.read_csv('/home/combios/Documents/amreyesp/clean_rnmc_data/df_riñas_rnmc.csv', delimiter=',')

In [None]:
(df_original.columns == df_output.columns).all()

In [None]:
print(df_output.shape)
pd.DataFrame({"Tipo de dato":df_output.dtypes.values,
              "Celdas con valor '-'":(df_output == '-').sum().values,
              "Celdas con valor 'ND'":(df_output == 'ND').sum().values,
              "Celdas vacías": df_output.isna().sum().values},
             index=df_output.columns)

In [None]:
df_output.to_csv(r'/home/combios/Documents/amreyesp/clean_rnmc_data/verify_enrich_rnmc_12022020.csv',index=None)