In [1]:
import glob
import json
import os

import pandas as pd
import numpy as np

from dotenv import load_dotenv


pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 150)

Environment variables:

In [2]:
load_dotenv('envvars')

ROOT_DATA_PATH = os.environ.get('ROOT_DATA_PATH')
RAW_DATA_PATH = os.environ.get('RAW_DATA_PATH') or f'{ROOT_DATA_PATH}/raw'
CLEAN_DATA_PATH = os.environ.get('CLEAN_DATA_PATH') or f'{ROOT_DATA_PATH}/clean'

Reading data:

In [3]:
def read_csv(file, nrows=None):
    df = pd.read_csv(file, sep=';', encoding="ISO-8859-1", nrows=nrows)

    return df


files = glob.glob(f'{RAW_DATA_PATH}/*CARTERA*.csv')
dfs = [read_csv(f) for f in files]

We need to rename columns from all datasets considering there are spacing characters such as tabs and new lines in some columns' names, and there are typos that lead to different column names in multiple files containing the same variable.

In [4]:
with open(f'{ROOT_DATA_PATH}/dict-renaming-raw-columns.json', 'r') as f:
    renaming_dict = json.load(f)

for df in dfs:
    df.columns = pd.Series(df.columns).replace(renaming_dict)

df = pd.concat(dfs)
df.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


(790696, 68)

In [5]:
df.head()

Unnamed: 0,BUS_REGION,CALIFICACION_CIERRE,CAPITAL_VEN,CEDULA,CELULAR,COD_LINEA,COD_MODALIDAD,COD_TIPOCLIENTE,COD_TIPO_CLIENTE,COMISION,CORREO,CUOTAS_PACTADAS,CUOTAS_PENDIENTES,DIAS_VENCIDO,DIRECCION,EDAD,EJECUTIVO_ACTUAL,ESTRATO,EST_CIVIL,FACTORRH,FECHA_UTL_ACTUALIZACION,FEC_APROBA,FEC_DESEMBOLSO,FEC_NACIMIENTO,FEC_PROXIMO_PAGO,FEC_SOLICITUD,FEC_ULT_PAGO,GARNTIA_REAL,GENERO,ID_CLIENTE,INTERES_VEN,LINEA,MODALIDAD,MONTO,MORA,MUJER_CABEZA,MUNICIPIO,MUNICIPIO_CLIENTE,NIVEL_ESTUDIO,NOMBRE,NOM_TIPOCLIENTE,NRO_SOLICITUD,OBLIGACION,OTROS,PAGARE,PERIODICIDAD,PORCENTAJE_PAGO,PROFESION,RANGO_PAGO,REGION,REGION_1,REGION_REAL,SALDO_OBLIGACION,SEGURO_VIDA,SUCURSAL,SUCURSALES,SUCURSAL_1,SUCURSAL_COD,SUCURSAL_REAL,TASA_N_A_M_V,TASA_PERIODICA,TEL_FIJO,TIPO,TIPO_CREDITO,UBICACION_CLIENTE,VALOR_CUOTA,VENCIDA,VENCIMIENTO_FINAL
0,,A,-,,,GER,403.0,,1.0,0,,8.0,8.0,0.0,,50.0,,1.0,,,,13/02/2017,13/02/2017,7/05/1967,13/05/2017,3/02/2017,,SIN GARANTIAS REALES,Femenino,FA1303,-,GERMINA,CREDITO RURAL INDIVIDUAL - GERMINA,2950000,-,N,,PAZ DE ARIPORO,Primaria,,Microfinanciero,178000100.0,178000100.0,0,178000070,Trimestral,######,GANADERIA,,REGION NORTE,,,2950000,-,PAZ DE ARIPORO,,,8,,31.67,8.13,,10.0,NUEVO,RURAL,520344,-,13/02/2019
1,,A,-,,,CRE,400.0,,1.0,0,,12.0,10.0,0.0,,64.0,,,,,,24/02/2017,24/02/2017,22/03/1953,24/05/2017,20/02/2017,24/04/2017,SIN GARANTIAS REALES,Femenino,FA12443,"(1,837)",CRECER,DESARROLLO EMPRESARIAL CRECER (PYME),1000000,-,Y,,SABANALARGA,Primaria,,Microfinanciero,1711000000.0,1711000000.0,0,1711000059,Mensual,85.98,SIN PROFESION,,REGION SUR,,,859814,-,VILLANUEVA,,,4,,35.29,2.94,,10.0,NUEVO,URBANA,103305,-,24/02/2018
2,,A,-,,,CRE,400.0,,1.0,-6772,,12.0,10.0,0.0,,31.0,,2.0,,,,13/02/2017,13/02/2017,5/10/1985,13/05/2017,2/02/2017,17/04/2017,SIN GARANTIAS REALES,Masculino,FA11330,-,CRECER,DESARROLLO EMPRESARIAL CRECER (PYME),2900000,-,N,,YOPAL,Universitaria,,Microfinanciero,171000200.0,171000100.0,0,171000109,Mensual,85.98,SIN PROFESION,,REGION CENTRO,,,2493454,"(1,474)",YOPAL,,,1,,35.29,2.94,,10.0,RETANQUEADO,URBANA,299587,-,13/02/2018
3,,A,(60),,,CRE,400.0,,1.0,0,,18.0,16.0,0.0,,29.0,,2.0,,,,13/02/2017,13/02/2017,20/06/1987,13/05/2017,2/02/2017,12/04/2017,SIN GARANTIAS REALES,Femenino,FA19832,(123),CRECER,DESARROLLO EMPRESARIAL CRECER (PYME),2950000,-,N,,GRANADA,Universitaria,,Microfinanciero,176000000.0,176000000.0,0,176000028,Mensual,91.61,SIN PROFESION,,REGION META,,,2702516,-,GRANADA,,,6,,35.29,2.94,,10.0,RETANQUEADO,URBANA,223070,-,13/08/2018
4,,A,-,,,MCA,404.0,,1.0,0,,18.0,16.0,0.0,,29.0,,2.0,,,,24/02/2017,24/02/2017,24/08/1987,24/05/2017,21/02/2017,24/04/2017,SIN GARANTIAS REALES,Masculino,FA6630,(6),MI CASA,MICROVIVIENDA,2000000,-,N,,VILLANUEVA,Secundaria,,Microfinanciero,1711000000.0,1711000000.0,0,1711000061,Mensual,91.18,SIN PROFESION,,REGION SUR,,,1823637,-,VILLANUEVA,,,6,,33.61,2.8,,10.0,NUEVO,URBANA,143997,-,24/08/2018


## Draft

## SUCURSAL_COD

In [6]:
sucurs = df['SUCURSAL_COD'].replace("#N/D", np.nan).fillna(-1).astype(int).astype('category')

In [7]:
sucurs.cat.categories

Int64Index([-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64')