In [1]:
import os

import pandas as pd
import numpy as np

from dotenv import load_dotenv

from core_ds4a_project import cleaning, columns as project_columns, datasets, location

%load_ext autoreload
%autoreload 1
%aimport core_ds4a_project, core_ds4a_project.cleaning, core_ds4a_project.columns, core_ds4a_project.datasets, core_ds4a_project.location

pd.set_option("display.max_columns", None)

Environment variables:

In [2]:
load_dotenv('envvars')

ROOT_DATA_PATH = os.environ.get('ROOT_DATA_PATH')
RAW_DATA_PATH = os.environ.get('RAW_DATA_PATH') or f'{ROOT_DATA_PATH}/raw'

Reading data:

In [3]:
df = (
    datasets.read_cartera(
        dir_path=RAW_DATA_PATH,
        clean_all=False,
        clean_ids=False,
    )
    .drop(columns=project_columns.CARTERA_USELESS_COLUMNS)
    .sort_values(by=['FECHA_CIERRE', 'OBLIGACION'])
)

df.columns.sort_values()

Index(['CALIFICACION_CIERRE', 'CAPITAL_VEN', 'CLIENTE', 'COD_LINEA',
       'COD_MODALIDAD', 'COMISION', 'CUOTAS_PACTADAS', 'CUOTAS_PENDIENTES',
       'DIAS_VENCIDO', 'FECHA_APROBA', 'FECHA_CIERRE', 'FECHA_DESEMBOLSO',
       'FECHA_PROXIMO_PAGO', 'FECHA_SOLICITUD', 'FECHA_ULT_PAGO',
       'FECHA_VENCIMIENTO_FINAL', 'GARANTIA_REAL', 'INTERES_VEN', 'LINEA',
       'MODALIDAD', 'MONTO', 'MORA', 'MUNICIPIO_CLIENTE', 'NRO_SOLICITUD',
       'OBLIGACION', 'OTROS', 'PAGARE', 'PERIODICIDAD_PAGO', 'PORCENTAJE_PAGO',
       'REGION', 'SALDO', 'SEGURO_VIDA', 'SUCURSAL_COD', 'SUCURSAL_REAL',
       'TASA_ANUAL', 'TASA_PERIODICA', 'TIPO', 'TIPO_CLIENTE',
       'TIPO_CLIENTE_COD', 'TIPO_CREDITO', 'TIPO_UBICACION', 'VALOR_CUOTA',
       'VENCIDA'],
      dtype='object')

Notes about CARTERA columns:
- Reading raw CARTERA is problematic as same columns are named differently across files. It's ideal to read dataset normalizing columns names.
- CARTERA_USELESS_COLUMNS is explained in [columns_in_datasets.ipynb](./columns_in_datasets.ipynb)
- FECHA_CIERRE is a datetime column that is created in dataset reading function by extracting corresponding value from each file name.

Presentational columns:

In [4]:
interest_cols = pd.Index([
    'TIPO_CREDITO',
    'CLIENTE',
    'OBLIGACION',
    'FECHA_CIERRE',
    'VALOR_CUOTA',
    'CUOTAS_PACTADAS',
    'CUOTAS_PENDIENTES',
    'PORCENTAJE_PAGO',
    'MONTO',
    'SALDO',
    'CALIFICACION_CIERRE',
    'VENCIDA',
    'DIAS_VENCIDO',
    'CAPITAL_VEN',
    'INTERES_VEN',
    'MORA',
    'FECHA_SOLICITUD', 'FECHA_APROBA', 'FECHA_DESEMBOLSO',
    'FECHA_ULT_PAGO',
    'FECHA_CIERRE',
    'FECHA_PROXIMO_PAGO',
    'PERIODICIDAD_PAGO',
    'FECHA_VENCIMIENTO_FINAL',
])


PRESENT_COLS = pd.Index([
    'CLIENTE',
    'OBLIGACION',
    'TIPO_CREDITO',
    'FECHA_DESEMBOLSO',
    'FECHA_CIERRE',
    # 'MONTO',
    'SALDO',
    'PORCENTAJE_PAGO',
    'CUOTAS_PENDIENTES',
    # 'CALIFICACION_CIERRE',
])


## OBLIGACION

Remove non-sense records associated to OBLIGACION NaN values:

In [5]:
ind_na = df['OBLIGACION'].isna()
discarded_df = df.loc[ind_na, PRESENT_COLS]
df = df[~ind_na]

discarded_df

Unnamed: 0,CLIENTE,OBLIGACION,TIPO_CREDITO,FECHA_DESEMBOLSO,FECHA_CIERRE,SALDO,PORCENTAJE_PAGO,CUOTAS_PENDIENTES
29462,#N/D,,,,2018-04-30,,,
29463,#N/D,,,,2018-04-30,,,
29464,#N/D,,,,2018-04-30,44799785497,,
29465,#N/D,,,,2018-04-30,,,
29466,#N/D,,,,2018-04-30,INDICADOR,,
538505,,,,,2020-03-31,,,
538506,,,,,2020-03-31,,,
538507,,,,,2020-03-31,,,
538508,,,,,2020-03-31,45270281317,,
538509,,,,,2020-03-31,,,


Casting OBLIGACION to integers:

In [6]:
temp_series = df['OBLIGACION']
cleaning.cast_float_to_int_in_place(df, columns=['OBLIGACION'])

pd.concat([temp_series, df['OBLIGACION']], axis=1)

Unnamed: 0,OBLIGACION,OBLIGACION.1
198611,2.917900e+04,29179
198021,3.082700e+04,30827
198517,3.300600e+04,33006
198023,3.331300e+04,33313
198758,3.376700e+04,33767
...,...,...
141006,2.112000e+09,2112000110
141080,2.112000e+09,2112000111
139345,2.112000e+09,2112000112
141124,2.112000e+09,2112000113


Validating there is a single OBLIGACION per FECHA_CIERRE:

In [7]:
counts_obligacion_per_cierre = df.groupby(by=['FECHA_CIERRE', 'OBLIGACION']).size()

assert (counts_obligacion_per_cierre == 1).all(), "There are multiple OBLIGACION per FECHA_CIERRE"

## Dropping inconsistent records

Dropping inconsistent records with PORCENTAJE_PAGO=NaN that are first records for corresponding OBLIGACION, considering these first records are duplicated:

In [8]:
ind_porcentaje_pago_na = df['PORCENTAJE_PAGO'].isna()

ind_porcentaje_pago_na.sum()

2

In [9]:
obligaciones = df.loc[ind_porcentaje_pago_na, 'OBLIGACION'].drop_duplicates()

df[PRESENT_COLS].query('OBLIGACION.isin(@obligaciones)').groupby('OBLIGACION').apply(lambda x: x.sort_values(by='FECHA_CIERRE').head(4))

Unnamed: 0_level_0,Unnamed: 1_level_0,CLIENTE,OBLIGACION,TIPO_CREDITO,FECHA_DESEMBOLSO,FECHA_CIERRE,SALDO,PORCENTAJE_PAGO,CUOTAS_PENDIENTES
OBLIGACION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
191003960,184339,,191003960,SIN_PERFIL,30/12/2019,2019-12-31,80000000,,
191003960,255552,,191003960,SIN_PERFIL,30/12/2019,2020-01-31,80000000,######,60.0
191003960,326401,,191003960,SIN_PERFIL,30/12/2019,2020-02-29,79345781,99.18,59.0
191003960,538425,,191003960,SIN_PERFIL,30/12/2019,2020-03-31,78677169,98.35,58.0
1914000001,777801,,1914000001,,09/27/2019,2019-09-30,2000000,,
1914000001,714894,FA60042,1914000001,NUEVO,27/09/2019,2019-10-31,2000000,######,15.0
1914000001,658969,FA60042,1914000001,NUEVO,27/09/2019,2019-11-30,1899700,94.99,14.0
1914000001,173130,FA60042,1914000001,NUEVO,27/09/2019,2019-12-31,1795444,89.77,13.0


In [10]:
count_prev = df.shape[0]
df = df[~ind_porcentaje_pago_na]

count_prev, df.shape[0]

(790679, 790677)

Remove single inconsistent record having CLIENTE=FA8913 and OBLIGACION=178000341, considering such OBLIGACION corresponds to another credit consistently paid by CLIENTE=FA19354:

In [11]:
df.query('(CLIENTE == "FA8913") & (OBLIGACION == 178000341)')[PRESENT_COLS]

Unnamed: 0,CLIENTE,OBLIGACION,TIPO_CREDITO,FECHA_DESEMBOLSO,FECHA_CIERRE,SALDO,PORCENTAJE_PAGO,CUOTAS_PENDIENTES
3221,FA8913,178000341,RETANQUEADO,17/04/2017,2017-04-30,2900000,######,8.0


In [12]:
(
    df
    .query('OBLIGACION == 178000341')
    [PRESENT_COLS]
    .drop_duplicates(subset=PRESENT_COLS[PRESENT_COLS != "FECHA_CIERRE"])
)

Unnamed: 0,CLIENTE,OBLIGACION,TIPO_CREDITO,FECHA_DESEMBOLSO,FECHA_CIERRE,SALDO,PORCENTAJE_PAGO,CUOTAS_PENDIENTES
3221,FA8913,178000341,RETANQUEADO,17/04/2017,2017-04-30,2900000,######,8.0
554494,FA19354,178000341,RETANQUEADO,18/05/2017,2017-05-31,6000000,######,8.0
83365,FA19354,178000341,RETANQUEADO,18/05/2017,2017-08-31,5438574,90.64,7.0
625554,FA19354,178000341,RETANQUEADO,18/05/2017,2017-11-30,4831499,80.52,6.0
291316,FA19354,178000341,RETANQUEADO,18/05/2017,2018-02-28,4175063,69.58,5.0
577548,FA19354,178000341,RETANQUEADO,18/05/2017,2018-05-31,3465253,57.75,4.0
98670,FA19354,178000341,RETANQUEADO,18/05/2017,2018-08-31,2697728,44.96,3.0
647240,FA19354,178000341,RETANQUEADO,18/05/2017,2018-11-30,1867796,31.13,2.0


In [13]:
count_records_prev = df.shape[0]

df = df.query('~((CLIENTE == "FA8913") & (OBLIGACION == 178000341))')

count_records_prev, df.shape[0]

(790677, 790676)

## CLIENTE

Set CLIENTE "#N/D" values equal to NaN values:

In [14]:
ind = df['CLIENTE'].str.match('#N/D').fillna(False)
df.loc[ind, 'CLIENTE'] = np.nan

### Update CLIENTE NaN

Update some CLIENTE NaN values based on their OBLIGACION, considering there are other records for such OBLIGACION that present CLIENTE value. In other words, some records with same OBLIGACION value present a maximum of two different values for CLIENTE: an actual value and a NaN value.

In [15]:
cliente_obligacion_df = df[['CLIENTE', 'OBLIGACION']].drop_duplicates()

all_one_cliente_per_obligacion = (cliente_obligacion_df.dropna().groupby('OBLIGACION').size() == 1).all()
assert all_one_cliente_per_obligacion, "There are multiple CLIENT for single OBLIGACION"

obligacion_size_ss = cliente_obligacion_df.groupby('OBLIGACION').size()
max_cliente_per_obligacion = obligacion_size_ss.max()

all_one_cliente_per_obligacion, max_cliente_per_obligacion

(True, 2)

In [16]:
obligacion_size_ss.sort_values()

OBLIGACION
29179        1
192004587    1
192004588    1
192004589    1
192004590    1
            ..
191004091    2
191004089    2
191004085    2
191004078    2
191003510    2
Length: 56859, dtype: int64

In [17]:
obligacion_ids = obligacion_size_ss[obligacion_size_ss == 2].index
cliente_obligacion_ss = (
    cliente_obligacion_df
    .set_index('OBLIGACION')
    ['CLIENTE']
    .loc[obligacion_ids]
)
cliente_obligacion_ss

OBLIGACION
51889         FA28397
51889             NaN
52803             NaN
52803         FA31733
54786         FA22478
               ...   
1914000193     FA7339
1914000203     FA7383
1914000203        NaN
1914000216        NaN
1914000216    FA23297
Name: CLIENTE, Length: 14506, dtype: object

Half of previous CLIENT values in `cliente_obligacion_ss` are NaN values and the other half are the corresponding definitions:

In [18]:
ind_na_ss = cliente_obligacion_ss.isna()

[ind_na_ss.sum(), (~ind_na_ss).sum()]

[7253, 7253]

In [19]:
pd.concat([cliente_obligacion_ss[ind_na_ss], cliente_obligacion_ss[~ind_na_ss]], axis=1)

Unnamed: 0_level_0,CLIENTE,CLIENTE
OBLIGACION,Unnamed: 1_level_1,Unnamed: 2_level_1
51889,,FA28397
52803,,FA31733
54786,,FA22478
59097,,FA34019
60040,,FA9326
...,...,...
1914000185,,FA17423
1914000186,,FA31
1914000193,,FA7339
1914000203,,FA7383


Querying any of previous OBLIGACION values shows that CLIENTE can be correctly updated as records correspond to same OBLIGACION.

Define missing CLIENTE based on OBLIGACION:

In [20]:
definitions_ss = cliente_obligacion_ss[~ind_na_ss]
definitions_ss

OBLIGACION
51889         FA28397
52803         FA31733
54786         FA22478
59097         FA34019
60040          FA9326
               ...   
1914000185    FA17423
1914000186       FA31
1914000193     FA7339
1914000203     FA7383
1914000216    FA23297
Name: CLIENTE, Length: 7253, dtype: object

In [21]:
ind_na = df['CLIENTE'].isna()
ind_in_definition = df['OBLIGACION'].isin(definitions_ss.index)

ind = ind_na & ind_in_definition
ind_cliente_na_not_updated = ind_na & ~ind_in_definition

prev = df.loc[ind, 'CLIENTE']
df.loc[ind, 'CLIENTE'] = definitions_ss.loc[df.loc[ind, 'OBLIGACION']].values

pd.concat([prev, df.loc[ind, 'CLIENTE']], axis=1)

Unnamed: 0,CLIENTE,CLIENTE.1
201279,,FA31733
277420,,FA31733
489599,,FA31733
9864,,FA31733
580429,,FA4329
...,...,...
538394,,FA7373
538145,,FA7288
537890,,FA31
538389,,FA7383


Dropping records whose CLIENTE NaN values couldn't be updated as all records for corresponding OBLIGACION present CLIENTE NaN values:

In [22]:
(ind_cliente_na_not_updated == df['CLIENTE'].isna()).all()

True

In [23]:
(
    df
    .loc[ind_cliente_na_not_updated, ['CLIENTE', 'OBLIGACION']]
    .drop_duplicates()
    .set_index('OBLIGACION')
    .sort_index()
)

Unnamed: 0_level_0,CLIENTE
OBLIGACION,Unnamed: 1_level_1
188002312,
191003364,
191003838,
191003839,
191003840,
...,...
2112000106,
2112000107,
2112000108,
2112000109,


In [24]:
ind_cliente_na_not_updated.sum()

442

In [25]:
count_prev = df.shape[0]
df = df.dropna(subset='CLIENTE')

count_prev, df.shape[0]

(790676, 790234)

## FECHA_*

### FECHA_DESEMBOLSO

There is a single record with FECHA_DESEMBOLSO that do not complain with date format DAY/MONTH/YEAR having FECHA_DESEMBOLSO="09/27/2019" and OBLIGACION=1914000001. This record has been previously dropped as it has PORCENTAJE=NaN. In case previous dropping is not executed, this record is removed here:

In [26]:
ind_day_month_year = df['FECHA_DESEMBOLSO'].str.match(r"[0-3]?\d/[01]?\d/\d{4}")

if ((~ind_day_month_year).any()):
    display(df.loc[~ind_day_month_year, PRESENT_COLS])

    df = df[ind_day_month_year]

### FECHA_PROXIMO_PAGO

Question: FECHA_PROXIMO_PAGO is not consistent

In [27]:
df[interest_cols].query('OBLIGACION == 1914000185')

Unnamed: 0,TIPO_CREDITO,CLIENTE,OBLIGACION,FECHA_CIERRE,VALOR_CUOTA,CUOTAS_PACTADAS,CUOTAS_PENDIENTES,PORCENTAJE_PAGO,MONTO,SALDO,CALIFICACION_CIERRE,VENCIDA,DIAS_VENCIDO,CAPITAL_VEN,INTERES_VEN,MORA,FECHA_SOLICITUD,FECHA_APROBA,FECHA_DESEMBOLSO,FECHA_ULT_PAGO,FECHA_CIERRE.1,FECHA_PROXIMO_PAGO,PERIODICIDAD_PAGO,FECHA_VENCIMIENTO_FINAL
326112,NUEVO,FA17423,1914000185,2020-02-29,158965.0,18.0,18.0,######,2000000,2000000,A,-,0.0,0.0,-41344.0,0.0,18/02/2020,20/02/2020,20/02/2020,20/02/2020,2020-02-29,10/04/2020,Mensual,10/09/2021
525379,NUEVO,FA17423,1914000185,2020-03-31,158965.0,18.0,18.0,######,2000000,2000000,A,-,0.0,0.0,0.0,0.0,18/02/2020,20/02/2020,20/02/2020,20/02/2020,2020-03-31,10/04/2020,Mensual,10/09/2021
47122,NUEVO,FA17423,1914000185,2020-04-30,158965.0,18.0,18.0,######,2000000,2000000,A,-,20.0,78390.0,64000.0,2059.0,18/02/2020,20/02/2020,20/02/2020,20/02/2020,2020-04-30,10/04/2020,Mensual,10/09/2021
608845,NUEVO,FA17423,1914000185,2020-05-31,158965.0,18.0,18.0,######,2000000,2000000,A,-,0.0,0.0,162784.0,6740.0,18/02/2020,20/02/2020,20/02/2020,20/02/2020,2020-05-31,10/08/2020,Mensual,10/01/2022
463348,NUEVO,FA17423,1914000185,2020-06-30,158965.0,18.0,18.0,######,2000000,2000000,A,-,0.0,0.0,162784.0,6740.0,18/02/2020,20/02/2020,20/02/2020,20/02/2020,2020-06-30,10/08/2020,Mensual,10/01/2022
392761,NUEVO,FA17423,1914000185,2020-07-31,158965.0,18.0,18.0,######,2000000,2000000,A,-,0.0,0.0,250251.0,6740.0,18/02/2020,20/02/2020,20/02/2020,20/02/2020,2020-07-31,10/08/2020,Mensual,10/01/2022
121376,NUEVO,FA17423,1914000185,2020-08-31,158965.0,18.0,18.0,######,2000000,2000000,A,-,20.0,78390.0,314251.0,8665.0,18/02/2020,20/02/2020,20/02/2020,20/02/2020,2020-08-31,10/08/2020,Mensual,10/01/2022
787107,NUEVO,FA17423,1914000185,2020-09-30,158965.0,18.0,18.0,######,2000000,2000000,B,2000000,50.0,159872.0,272211.0,187.0,18/02/2020,20/02/2020,20/02/2020,6/10/2020,2020-09-30,10/08/2020,Mensual,10/01/2022
730286,NUEVO,FA17423,1914000185,2020-10-31,158965.0,18.0,18.0,######,2000000,2000000,B,2000000,80.0,244567.0,248871.0,7330.0,18/02/2020,20/02/2020,20/02/2020,44110,2020-10-31,10/08/2020,Mensual,10/01/2022
668120,NUEVO,FA17423,1914000185,2020-11-30,158965.0,18.0,18.0,######,2000000,2000000,C,2000000,110.0,332602.0,305045.0,18876.0,18/02/2020,20/02/2020,20/02/2020,6/10/2020,2020-11-30,10/08/2020,Mensual,10/01/2022


In [28]:
cols = [
    'CLIENTE',
    'OBLIGACION',
    'FECHA_CIERRE',
    'FECHA_ULT_PAGO',
    'FECHA_PROXIMO_PAGO',
    'PERIODICIDAD_PAGO',
    'CALIFICACION_CIERRE',
]

df[cols].query('OBLIGACION == 171001161').sort_values(by='FECHA_CIERRE').iloc[-18:]

Unnamed: 0,CLIENTE,OBLIGACION,FECHA_CIERRE,FECHA_ULT_PAGO,FECHA_PROXIMO_PAGO,PERIODICIDAD_PAGO,CALIFICACION_CIERRE
659943,FA1992,171001161,2019-11-30,26/10/2019,1/10/2019,Trimestral,A
169904,FA1992,171001161,2019-12-31,30/12/2019,1/01/2020,Trimestral,A
249634,FA1992,171001161,2020-01-31,30/12/2019,1/01/2020,Trimestral,A
324792,FA1992,171001161,2020-02-29,30/12/2019,1/01/2020,Trimestral,A
534398,FA1992,171001161,2020-03-31,43829,1/01/2020,Trimestral,B
54720,FA1992,171001161,2020-04-30,30/12/2019,1/01/2020,Trimestral,D
605751,FA1992,171001161,2020-05-31,11/05/2020,10/07/2020,Mensual,D
460057,FA1992,171001161,2020-06-30,11/05/2020,10/07/2020,Mensual,D
391257,FA1992,171001161,2020-07-31,25/07/2020,10/08/2020,Mensual,D
121620,FA1992,171001161,2020-08-31,16/08/2020,10/09/2020,Mensual,D


### FECHA_ULT_PAGO

Question: what do numeric values mean?

FECHA_ULT_PAGO have numeric values instead of dates:

In [29]:
ind_dates = df['FECHA_ULT_PAGO'].str.match(r'^\d{1,2}/\d{1,2}/\d{4}$').fillna(False)
ind_na = df['FECHA_ULT_PAGO'].isna()
ind_numeric = df['FECHA_ULT_PAGO'].str.match(r'^\d+$').fillna(False)

count_dates = ind_dates.sum()
count_na = ind_na.sum()
count_numeric = ind_numeric.sum()

{
    "total_records": df.shape[0],
    "sum_counts": count_dates + count_numeric + count_na,
    "count_dates": count_dates,
    "count_na": count_na,
    "count_numeric": count_numeric,
}

{'total_records': 790234,
 'sum_counts': 790234,
 'count_dates': 736186,
 'count_na': 36382,
 'count_numeric': 17666}

In [30]:
df.loc[ind_numeric, 'FECHA_ULT_PAGO'].value_counts()

42735    412
42794    113
43890    100
43496     93
43738     86
        ... 
43884      1
44233      1
42975      1
42944      1
43166      1
Name: FECHA_ULT_PAGO, Length: 1243, dtype: int64

Set FECHA_ULT_PAGO numeric values as NaN:

In [31]:
df.loc[ind_numeric, 'FECHA_ULT_PAGO'] = np.nan

### Casting dates

In [32]:
DATE_COLS = df.columns[df.columns.str.contains('^FECHA_')]

df[DATE_COLS].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 790234 entries, 198611 to 141072
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   FECHA_SOLICITUD          790234 non-null  object        
 1   FECHA_APROBA             790234 non-null  object        
 2   FECHA_DESEMBOLSO         790234 non-null  object        
 3   FECHA_ULT_PAGO           736186 non-null  object        
 4   FECHA_PROXIMO_PAGO       790234 non-null  object        
 5   FECHA_VENCIMIENTO_FINAL  790234 non-null  object        
 6   FECHA_CIERRE             790234 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(6)
memory usage: 48.2+ MB


In [33]:
cleaning.cast_dates_in_place(df, exclude=['FECHA_CIERRE'])

df[DATE_COLS].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 790234 entries, 198611 to 141072
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   FECHA_SOLICITUD          790234 non-null  datetime64[ns]
 1   FECHA_APROBA             790234 non-null  datetime64[ns]
 2   FECHA_DESEMBOLSO         790234 non-null  datetime64[ns]
 3   FECHA_ULT_PAGO           736186 non-null  datetime64[ns]
 4   FECHA_PROXIMO_PAGO       790234 non-null  datetime64[ns]
 5   FECHA_VENCIMIENTO_FINAL  790234 non-null  datetime64[ns]
 6   FECHA_CIERRE             790234 non-null  datetime64[ns]
dtypes: datetime64[ns](7)
memory usage: 48.2 MB


## Considerations of older credits

There are unique credits with part of their history not registered as CARTERA dataset starts at 2017-01:

In [34]:
date_cierre_min = df['FECHA_CIERRE'].min()

date_cierre_min

Timestamp('2017-01-31 00:00:00')

In [35]:
date_max_older = date_cierre_min  - pd.tseries.offsets.MonthEnd(1)

date_max_older

Timestamp('2016-12-31 00:00:00')

In [36]:
ind_older = df['FECHA_DESEMBOLSO'] < "2017-01-01"

ind_older.sum(), ind_older.equals(df['FECHA_DESEMBOLSO'] <= date_max_older)

(182940, True)

In [37]:
df.loc[ind_older, 'OBLIGACION'].drop_duplicates()

198611    29179
198021    30827
198517    33006
198023    33313
198758    33767
          ...  
210689    71158
198144    71159
208001    71160
197888    71161
210696    71162
Name: OBLIGACION, Length: 15291, dtype: int32

## CALIFICACION_CIERRE

CALIFICACION_CIERRE has no NaN nor invalid values:

In [38]:
df['CALIFICACION_CIERRE'].isna().any()

False

In [39]:
df['CALIFICACION_CIERRE'].value_counts(dropna=False)

A    664608
E     79871
B     20835
C     13756
D     11164
Name: CALIFICACION_CIERRE, dtype: int64

Most records in CARTERA have good rating:

In [40]:
ind_calificacion_a = df['CALIFICACION_CIERRE'] == 'A'

{
    "count_records": df.shape[0],
    "count_calificacion_a": ind_calificacion_a.sum(),
}


{'count_records': 790234, 'count_calificacion_a': 664608}

Draft:

In [41]:
df['CALIFICACION_CIERRE'].to_frame().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 790234 entries, 198611 to 141072
Data columns (total 1 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   CALIFICACION_CIERRE  790234 non-null  object
dtypes: object(1)
memory usage: 12.1+ MB


In [42]:
calificacion_cat_type = pd.api.types.CategoricalDtype(categories=['A', 'B', 'C', 'D', 'E'], ordered=True)

df['CALIFICACION_CIERRE'].astype(calificacion_cat_type).to_frame().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 790234 entries, 198611 to 141072
Data columns (total 1 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   CALIFICACION_CIERRE  790234 non-null  category
dtypes: category(1)
memory usage: 6.8 MB


PENDING example defaulting not in CASTIGO

In [43]:
cols = [
    'CLIENTE',
    'OBLIGACION',
    'FECHA_CIERRE',
    'FECHA_ULT_PAGO',
    'FECHA_PROXIMO_PAGO',
    'PERIODICIDAD_PAGO',
    'TASA_PERIODICA',
    'CALIFICACION_CIERRE',
    'VALOR_CUOTA',
    'SALDO',
    'PORCENTAJE_PAGO',
]

df[cols].query('OBLIGACION == 171001161').sort_values(by='FECHA_CIERRE').iloc[-18:]

Unnamed: 0,CLIENTE,OBLIGACION,FECHA_CIERRE,FECHA_ULT_PAGO,FECHA_PROXIMO_PAGO,PERIODICIDAD_PAGO,TASA_PERIODICA,CALIFICACION_CIERRE,VALOR_CUOTA,SALDO,PORCENTAJE_PAGO
659943,FA1992,171001161,2019-11-30,2019-10-26,2019-10-01,Trimestral,8.13,A,1514357.0,3568783,35.69
169904,FA1992,171001161,2019-12-31,2019-12-30,2020-01-01,Trimestral,813.0,A,1514357.0,2668512,26.69
249634,FA1992,171001161,2020-01-31,2019-12-30,2020-01-01,Trimestral,8.13,A,1514357.0,2668512,26.69
324792,FA1992,171001161,2020-02-29,2019-12-30,2020-01-01,Trimestral,8.13,A,1514357.0,2668512,26.69
534398,FA1992,171001161,2020-03-31,NaT,2020-01-01,Trimestral,8.13,B,1514357.0,2668512,26.69
54720,FA1992,171001161,2020-04-30,2019-12-30,2020-01-01,Trimestral,8.13,D,1514357.0,2668512,26.69
605751,FA1992,171001161,2020-05-31,2020-05-11,2020-07-10,Mensual,2.64,D,245083.0,2668512,26.69
460057,FA1992,171001161,2020-06-30,2020-05-11,2020-07-10,Mensual,2.64,D,245083.0,2668512,26.69
391257,FA1992,171001161,2020-07-31,2020-07-25,2020-08-10,Mensual,2.64,D,245083.0,2498961,24.99
121620,FA1992,171001161,2020-08-31,2020-08-16,2020-09-10,Mensual,2.64,D,245083.0,2324934,23.25


In [44]:
# castigo_df = datasets.read_castigada_xlsx(dir_path=RAW_DATA_PATH, clean=True)
# castigo_df.query('OBLIGACION == "171001161"')

## CAPITAL_VEN, VENCIDA

In [45]:
df[['CAPITAL_VEN', 'VENCIDA']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 790234 entries, 198611 to 141072
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   CAPITAL_VEN  790234 non-null  object
 1   VENCIDA      790234 non-null  object
dtypes: object(2)
memory usage: 18.1+ MB


Question: what is this?

In [46]:
df['CAPITAL_VEN'].value_counts()

0            235010
 -           151928
1000000         259
-12000          127
700000.0        121
              ...  
 726,332          1
 141,883          1
 (13,947)         1
 (7,554)          1
-106625           1
Name: CAPITAL_VEN, Length: 211052, dtype: int64

In [47]:
df['VENCIDA'].value_counts()

 -             660566
 1,000,000        493
 2,000,000        455
 3,000,000        303
 1,500,000        255
                ...  
 374,612            1
 330,412            1
 571,201            1
 4,077,953          1
 1,946,714          1
Name: VENCIDA, Length: 26481, dtype: int64

In [48]:
ind_capital_vencido_zero = df['CAPITAL_VEN'] == 0
ind_capital_vencido_dash = df['CAPITAL_VEN'] == ' -   '
ind_vencida_dash = df['VENCIDA'] == ' -   '
# ind_dias_vencido_zero = df['DIAS_VENCIDO'] == 0

ind_capital_vencido_good_1 = (
    ind_capital_vencido_zero
    & ind_capital_vencido_dash
)
ind_capital_vencido_good_2 = (
    ind_capital_vencido_zero
    | ind_capital_vencido_dash
)

ind_capital_vencido_zero.sum(), ind_capital_vencido_dash.sum(), ind_capital_vencido_good_1.sum(), ind_capital_vencido_good_2.sum(), ind_vencida_dash.sum()

(235010, 151928, 0, 386938, 660566)

## DIAS_VENCIDO

In [49]:
df['DIAS_VENCIDO'].isna().any()

False

In [50]:
temp_ss = df['DIAS_VENCIDO']
cleaning.cast_float_to_int_in_place(df, columns=['DIAS_VENCIDO'])

pd.concat([temp_ss, df['DIAS_VENCIDO']], axis=1)

Unnamed: 0,DIAS_VENCIDO,DIAS_VENCIDO.1
198611,419.0,419
198021,659.0,659
198517,569.0,569
198023,588.0,588
198758,329.0,329
...,...,...
139681,0.0,0
141006,0.0,0
141080,0.0,0
139345,0.0,0


## COMISION

In [51]:
df['COMISION'].isna().sum()

0

Question: negative COMISION?

In [52]:
df['COMISION'].value_counts(dropna=False)

0            409238
 -           211431
-7438           166
-14875          140
-22313          132
              ...  
 (22,177)         1
 (330)            1
 (7,778)          1
 (10,192)         1
-10820            1
Name: COMISION, Length: 95707, dtype: int64

In [53]:
ind_comision_dash = df['COMISION'] == ' -   '
ind_comision_zero = df['COMISION'] == 0

{
    "count_comision_dash": ind_comision_dash.sum(),
    "count_comision_zero": ind_comision_zero.sum(),
}

{'count_comision_dash': 211431, 'count_comision_zero': 409238}

In [54]:
comision_per_obligacion = df[['OBLIGACION', 'COMISION']].groupby('OBLIGACION').nunique()['COMISION'].sort_values()

comision_per_obligacion

OBLIGACION
2112000114     1
196002322      1
196002323      1
196002325      1
196002327      1
              ..
172001013     43
172000340     44
68770         45
68115         46
173000058     47
Name: COMISION, Length: 56533, dtype: int64

In [55]:
comision_per_obligacion[comision_per_obligacion == 47]

OBLIGACION
173000058    47
Name: COMISION, dtype: int64

In [56]:
df.query('OBLIGACION == 173000058')

Unnamed: 0,TIPO,NRO_SOLICITUD,OBLIGACION,PAGARE,CLIENTE,TIPO_CLIENTE_COD,TIPO_CLIENTE,REGION,MUNICIPIO_CLIENTE,TIPO_UBICACION,MONTO,FECHA_SOLICITUD,FECHA_APROBA,FECHA_DESEMBOLSO,VALOR_CUOTA,CUOTAS_PACTADAS,CUOTAS_PENDIENTES,TASA_ANUAL,TASA_PERIODICA,PERIODICIDAD_PAGO,CALIFICACION_CIERRE,COD_LINEA,LINEA,COD_MODALIDAD,MODALIDAD,SALDO,VENCIDA,DIAS_VENCIDO,CAPITAL_VEN,INTERES_VEN,MORA,SEGURO_VIDA,COMISION,OTROS,FECHA_ULT_PAGO,FECHA_PROXIMO_PAGO,FECHA_VENCIMIENTO_FINAL,GARANTIA_REAL,PORCENTAJE_PAGO,TIPO_CREDITO,SUCURSAL_COD,FECHA_CIERRE,SUCURSAL_REAL
270532,10.0,173000075.0,173000058,173000058.0,FA10913,4.0,Codeudor no cliente,REGION META,ACACIAS,URBANA,10000000,2017-02-15,2017-02-21,2017-02-21,454799.0,42.0,42.0,35.29,2.94,Mensual,A,CRE,CRECER,400.0,DESARROLLO EMPRESARIAL CRECER (PYME),10000000,-,0,-,-,-,-,0,0,NaT,2017-03-21,2020-08-21,SIN GARANTIAS REALES,######,NUEVO,,2017-02-28,
482808,10.0,173000075.0,173000058,173000058.0,FA10913,4.0,Codeudor no cliente,REGION META,ACACIAS,URBANA,10000000,2017-02-15,2017-02-21,2017-02-21,454799.0,42.0,41.0,35.29,2.94,Mensual,A,CRE,CRECER,400.0,DESARROLLO EMPRESARIAL CRECER (PYME),9888993,-,0,-,-,-,"(4,807)",0,0,2017-03-24,2017-04-21,2020-08-21,SIN GARANTIAS REALES,98.89,NUEVO,3.0,2017-03-31,
3294,10.0,173000075.0,173000058,173000058.0,FA10913,4.0,Codeudor no cliente,REGION META,ACACIAS,URBANA,10000000,2017-02-15,2017-02-21,2017-02-21,454799.0,42.0,41.0,35.29,2.94,Mensual,A,CRE,CRECER,400.0,DESARROLLO EMPRESARIAL CRECER (PYME),9888993,-,9,114767,290819,1222,276,44625,0,2017-03-24,2017-04-21,2020-08-21,SIN GARANTIAS REALES,98.89,NUEVO,8.0,2017-04-30,
554736,10.0,173000075.0,173000058,173000058.0,FA10913,4.0,Codeudor no cliente,REGION META,ACACIAS,URBANA,10000000,2017-02-15,2017-02-21,2017-02-21,454799.0,42.0,39.0,35.29,2.94,Mensual,A,CRE,CRECER,400.0,DESARROLLO EMPRESARIAL CRECER (PYME),9655571,-,0,-,-,-,"(5,083)","(6,084)",-,2017-05-31,2017-06-21,2020-08-21,SIN GARANTIAS REALES,96.56,NUEVO,3.0,2017-05-31,ACACIAS
413674,10.0,173000075.0,173000058,173000058.0,FA10913,4.0,Codeudor no cliente,REGION META,ACACIAS,URBANA,10000000,2017-02-15,2017-02-21,2017-02-21,454799.0,42.0,39.0,35.29,2.94,Mensual,A,CRE,CRECER,400.0,DESARROLLO EMPRESARIAL CRECER (PYME),9655571,-,9,122674,283954,1452,-,37533,-,2017-05-31,2017-06-21,2020-08-21,SIN GARANTIAS REALES,96.56,NUEVO,3.0,2017-06-30,ACACIAS
349999,10.0,173000075.0,173000058,173000058.0,FA10913,4.0,Codeudor no cliente,REGION META,ACACIAS,URBANA,10000000,2017-02-15,2017-02-21,2017-02-21,454799.0,42.0,38.0,35.29,2.94,Mensual,A,CRE,CRECER,400.0,DESARROLLO EMPRESARIAL CRECER (PYME),9532897,-,9,126829,280346,-,4202,43088,-,2017-07-31,2017-07-21,2020-08-21,SIN GARANTIAS REALES,95.33,NUEVO,3.0,2017-07-31,ACACIAS
81049,10.0,173000075.0,173000058,173000058.0,FA10913,4.0,Codeudor no cliente,REGION META,ACACIAS,URBANA,10000000,2017-02-15,2017-02-21,2017-02-21,454799.0,42.0,37.0,35.29,2.94,Mensual,A,CRE,CRECER,400.0,DESARROLLO EMPRESARIAL CRECER (PYME),9406068,-,9,131125,276616,931,3920,43088,-,2017-08-25,2017-08-21,2020-08-21,SIN GARANTIAS REALES,94.06,NUEVO,3.0,2017-08-31,ACACIAS
739055,10.0,173000075.0,173000058,173000058.0,FA10913,4.0,Codeudor no cliente,REGION META,ACACIAS,URBANA,10000000,2017-02-15,2017-02-21,2017-02-21,454799.0,42.0,35.0,35.29,2.94,Mensual,A,CRE,CRECER,400.0,DESARROLLO EMPRESARIAL CRECER (PYME),9139376,-,0,-,-,-,"(3,216)",-,-,2017-09-27,2017-10-21,2020-08-21,SIN GARANTIAS REALES,91.39,NUEVO,3.0,2017-09-30,ACACIAS
682699,10.0,173000075.0,173000058,173000058.0,FA10913,4.0,Codeudor no cliente,REGION META,ACACIAS,URBANA,10000000,2017-02-15,2017-02-21,2017-02-21,454799.0,42.0,35.0,35.29,2.94,Mensual,A,CRE,CRECER,400.0,DESARROLLO EMPRESARIAL CRECER (PYME),9139376,-,9,140158,268774,1824,1867,41975,-,2017-09-27,2017-10-21,2020-08-21,SIN GARANTIAS REALES,91.39,NUEVO,3.0,2017-10-31,ACACIAS
634711,10.0,173000075.0,173000058,173000058.0,FA10913,4.0,Codeudor no cliente,REGION META,ACACIAS,URBANA,10000000,2017-02-15,2017-02-21,2017-02-21,363172.0,60.0,60.0,35.29,2.94,Mensual,A,CRE,CRECER,400.0,DESARROLLO EMPRESARIAL CRECER (PYME),9139376,-,0,-,392278,2487,6950,82759,-,2017-09-27,2017-12-05,2022-11-05,SIN GARANTIAS REALES,91.39,NUEVO,3.0,2017-11-30,ACACIAS


## CUOTAS_PACTADAS, CUOTAS_PENDIENTES

In [57]:
cols = ['CUOTAS_PACTADAS', 'CUOTAS_PENDIENTES']

df[cols].isna().any()

CUOTAS_PACTADAS      False
CUOTAS_PENDIENTES    False
dtype: bool

In [58]:
cols = ['CUOTAS_PACTADAS', 'CUOTAS_PENDIENTES']
temp_df = df[cols]
cleaning.cast_float_to_int_in_place(df, columns=cols)

pd.concat([temp_df, df[cols]], axis=1)

Unnamed: 0,CUOTAS_PACTADAS,CUOTAS_PENDIENTES,CUOTAS_PACTADAS.1,CUOTAS_PENDIENTES.1
198611,1.0,1.0,1,1
198021,12.0,12.0,12,12
198517,6.0,6.0,6,6
198023,1.0,1.0,1,1
198758,5.0,5.0,5,5
...,...,...,...,...
139681,24.0,24.0,24,24
141006,1.0,1.0,1,1
141080,6.0,6.0,6,6
139345,18.0,18.0,18,18


Question: There are records with inconsistent number of CUOTAS. Should remove these 26 credits?

In [59]:
i_inconsistent_cuotas = df['CUOTAS_PACTADAS'] < df['CUOTAS_PENDIENTES']
inconsistent_obligaciones = df.loc[i_inconsistent_cuotas, 'OBLIGACION'].drop_duplicates()

inconsistent_obligaciones.shape[0], inconsistent_obligaciones

(26,
 409357         63308
 409356         67813
 146704     171000052
 145086     171000291
 148213     172000017
 663976     171001362
 56078      179001548
 605927     191003464
 790301     174001300
 780959     179001143
 779360     191003317
 780680     191003369
 779361     191003944
 783932     192003136
 779114     193001575
 788677     193001904
 779113     193001949
 785725     194002121
 780958     196001918
 788722     198003915
 785481     208004782
 783931    1711000981
 782574    1913000341
 779362    1914000074
 474096     201004683
 470001    1912001200
 Name: OBLIGACION, dtype: int32)

In [60]:
df.loc[i_inconsistent_cuotas, interest_cols]

Unnamed: 0,TIPO_CREDITO,CLIENTE,OBLIGACION,FECHA_CIERRE,VALOR_CUOTA,CUOTAS_PACTADAS,CUOTAS_PENDIENTES,PORCENTAJE_PAGO,MONTO,SALDO,CALIFICACION_CIERRE,VENCIDA,DIAS_VENCIDO,CAPITAL_VEN,INTERES_VEN,MORA,FECHA_SOLICITUD,FECHA_APROBA,FECHA_DESEMBOLSO,FECHA_ULT_PAGO,FECHA_CIERRE.1,FECHA_PROXIMO_PAGO,PERIODICIDAD_PAGO,FECHA_VENCIMIENTO_FINAL
409357,RENOVADO,FA13219,63308,2017-06-30,1199856,6,7,67.45,8500000,5732916,A,-,0,-,851209,-,2016-05-13,2016-05-13,2016-05-13,NaT,2017-06-30,2017-08-13,Trimestral,2019-02-13
409356,RETANQUEADO,FA17572,67813,2017-06-30,852562,4,5,44.14,6000000,2648166,A,-,0,-,386595,-,2016-09-21,2016-09-21,2016-09-22,2017-04-28,2017-06-30,2018-06-22,Trimestral,2019-06-22
341624,RENOVADO,FA13219,63308,2017-07-31,1199856,6,7,67.45,8500000,5732916,A,-,0,-,851209,-,2016-05-13,2016-05-13,2016-05-13,NaT,2017-07-31,2017-08-13,Trimestral,2019-02-13
346823,RETANQUEADO,FA17572,67813,2017-07-31,852562,4,5,44.14,6000000,2648166,A,-,0,-,"(11,405)",-,2016-09-21,2016-09-21,2016-09-22,2017-07-27,2017-07-31,2018-06-22,Trimestral,2019-06-22
78006,RETANQUEADO,FA17572,67813,2017-08-31,852562,4,5,44.14,6000000,2648166,A,-,0,-,"(11,405)",-,2016-09-21,2016-09-21,2016-09-22,2017-07-27,2017-08-31,2018-06-22,Trimestral,2019-06-22
734707,RETANQUEADO,FA17572,67813,2017-09-30,852562,4,5,44.14,6000000,2648166,A,-,0,-,167247,-,2016-09-21,2016-09-21,2016-09-22,2017-10-05,2017-09-30,2018-06-22,Trimestral,2019-06-22
678480,RETANQUEADO,FA17572,67813,2017-10-31,852562,4,5,44.14,6000000,2648166,A,-,0,-,(753),-,2016-09-21,2016-09-21,2016-09-22,2017-10-05,2017-10-31,2018-06-22,Trimestral,2019-06-22
146704,NUEVO,FA4411,171000052,2017-12-31,258265,2,8,63.15,2500000,1578682,E,1578682,180,1089682,184496,111932,2017-01-27,2017-01-31,2017-01-31,2017-08-16,2017-12-31,2017-06-30,Mensual,2018-02-28
145086,NUEVO,FA23522,171000291,2017-12-31,743348,33,34,88.71,18000000,15967464,B,-,2,287110,439990,1019,2017-03-24,2017-03-28,2017-03-28,2017-12-27,2017-12-31,2017-12-28,Mensual,2020-09-30
148213,NUEVO,FA24957,172000017,2017-12-31,342969,25,34,95.8,7000000,6706105,E,6706105,243,1042916,1489148,129178,2017-01-23,2017-01-27,2017-01-27,2017-06-21,2017-12-31,2017-04-27,Mensual,2020-01-30


In [61]:
df.query('OBLIGACION == 171000052')[interest_cols.drop_duplicates()].sort_values(by=['OBLIGACION','FECHA_CIERRE'])

Unnamed: 0,TIPO_CREDITO,CLIENTE,OBLIGACION,FECHA_CIERRE,VALOR_CUOTA,CUOTAS_PACTADAS,CUOTAS_PENDIENTES,PORCENTAJE_PAGO,MONTO,SALDO,CALIFICACION_CIERRE,VENCIDA,DIAS_VENCIDO,CAPITAL_VEN,INTERES_VEN,MORA,FECHA_SOLICITUD,FECHA_APROBA,FECHA_DESEMBOLSO,FECHA_ULT_PAGO,FECHA_PROXIMO_PAGO,PERIODICIDAD_PAGO,FECHA_VENCIMIENTO_FINAL
211172,NUEVO,FA4411,171000052,2017-01-31,258265,12,12,######,2500000,2500000,A,-,0,-,-,-,2017-01-27,2017-01-31,2017-01-31,NaT,2017-02-28,Mensual,2018-01-31
268148,NUEVO,FA4411,171000052,2017-02-28,258265,12,11,93.09,2500000,2327175,A,-,0,(508),-,-,2017-01-27,2017-01-31,2017-01-31,2017-02-27,2017-03-31,Mensual,2018-01-31
480047,NUEVO,FA4411,171000052,2017-03-31,258265,12,11,93.09,2500000,2327175,A,-,0,177645,68439,-,2017-01-27,2017-01-31,2017-01-31,2017-02-27,2017-03-31,Mensual,2018-01-31
259,NUEVO,FA4411,171000052,2017-04-30,258265,12,10,85.98,2500000,2149530,A,-,0,184188,59919,-,2017-01-27,2017-01-31,2017-01-31,2017-04-07,2017-04-30,Mensual,2018-01-31
551242,NUEVO,FA4411,171000052,2017-05-31,258265,12,9,78.61,2500000,1965342,A,-,0,190427,54947,-,2017-01-27,2017-01-31,2017-01-31,2017-05-10,2017-05-31,Mensual,2018-01-31
409660,NUEVO,FA4411,171000052,2017-06-30,258265,12,8,71,2500000,1774915,A,-,0,196876,48964,233,2017-01-27,2017-01-31,2017-01-31,2017-06-06,2017-06-30,Mensual,2018-01-31
349314,NUEVO,FA4411,171000052,2017-07-31,258265,12,8,71,2500000,1774915,A,-,30,400421,95371,7222,2017-01-27,2017-01-31,2017-01-31,2017-06-06,2017-06-30,Mensual,2018-01-31
80373,NUEVO,FA4411,171000052,2017-08-31,258265,12,8,63.15,2500000,1578682,A,1578682,60,414627,86828,3625,2017-01-27,2017-01-31,2017-01-31,2017-08-16,2017-06-30,Mensual,2018-01-31
735131,NUEVO,FA4411,171000052,2017-09-30,258265,12,8,63.15,2500000,1578682,C,1578682,90,632193,121061,19091,2017-01-27,2017-01-31,2017-01-31,2017-08-16,2017-06-30,Mensual,2018-01-31
678893,NUEVO,FA4411,171000052,2017-10-31,258265,12,8,63.15,2500000,1578682,D,1578682,120,857128,148896,42547,2017-01-27,2017-01-31,2017-01-31,2017-08-16,2017-06-30,Mensual,2018-01-31


In [62]:
df.query('OBLIGACION == 63308')[interest_cols]

Unnamed: 0,TIPO_CREDITO,CLIENTE,OBLIGACION,FECHA_CIERRE,VALOR_CUOTA,CUOTAS_PACTADAS,CUOTAS_PENDIENTES,PORCENTAJE_PAGO,MONTO,SALDO,CALIFICACION_CIERRE,VENCIDA,DIAS_VENCIDO,CAPITAL_VEN,INTERES_VEN,MORA,FECHA_SOLICITUD,FECHA_APROBA,FECHA_DESEMBOLSO,FECHA_ULT_PAGO,FECHA_CIERRE.1,FECHA_PROXIMO_PAGO,PERIODICIDAD_PAGO,FECHA_VENCIMIENTO_FINAL
201216,RENOVADO,FA13219,63308,2017-01-31,1205926,8,7,67.45,8500000,5732916,A,-,0,-,77693,-,2016-05-13,2016-05-13,2016-05-13,NaT,2017-01-31,2017-05-13,Trimestral,2018-11-13
268083,RENOVADO,FA13219,63308,2017-02-28,1199856,6,6,67.45,8500000,5732916,A,-,0,-,537505,-,2016-05-13,2016-05-13,2016-05-13,NaT,2017-02-28,2017-08-13,Trimestral,2018-11-13
479905,RENOVADO,FA13219,63308,2017-03-31,1199856,6,6,67.45,8500000,5732916,A,-,0,-,537505,-,2016-05-13,2016-05-13,2016-05-13,NaT,2017-03-31,2017-08-13,Trimestral,2018-11-13
58,RENOVADO,FA13219,63308,2017-04-30,1199856,6,6,67.45,8500000,5732916,A,-,0,-,537505,-,2016-05-13,2016-05-13,2016-05-13,NaT,2017-04-30,2017-08-13,Trimestral,2018-11-13
551009,RENOVADO,FA13219,63308,2017-05-31,1199856,6,6,67.45,8500000,5732916,A,-,0,-,831383,-,2016-05-13,2016-05-13,2016-05-13,NaT,2017-05-31,2017-08-13,Trimestral,2018-11-13
409357,RENOVADO,FA13219,63308,2017-06-30,1199856,6,7,67.45,8500000,5732916,A,-,0,-,851209,-,2016-05-13,2016-05-13,2016-05-13,NaT,2017-06-30,2017-08-13,Trimestral,2019-02-13
341624,RENOVADO,FA13219,63308,2017-07-31,1199856,6,7,67.45,8500000,5732916,A,-,0,-,851209,-,2016-05-13,2016-05-13,2016-05-13,NaT,2017-07-31,2017-08-13,Trimestral,2019-02-13
73226,RENOVADO,FA13219,63308,2017-08-31,1199856,6,6,58.02,8500000,4931745,A,-,0,-,"(30,434)",-,2016-05-13,2016-05-13,2016-05-13,2017-08-24,2017-08-31,2017-11-13,Trimestral,2019-02-13
734708,RENOVADO,FA13219,63308,2017-09-30,1199856,6,6,58.02,8500000,4931745,A,-,0,-,"(30,434)",-,2016-05-13,2016-05-13,2016-05-13,2017-08-24,2017-09-30,2017-11-13,Trimestral,2019-02-13
678481,RENOVADO,FA13219,63308,2017-10-31,1199856,6,5,47.6,8500000,4046020,A,-,0,"(885,725)","(335,365)",-,2016-05-13,2016-05-13,2016-05-13,2017-10-26,2017-10-31,2018-02-13,Trimestral,2019-02-13


## GARANTIA_REAL

Question: discarded?

In [63]:
rr = df['GARANTIA_REAL'].value_counts(dropna=False)
rr

SIN GARANTIAS REALES                                                                                     790196
HIPOTECAS HIPOTECA CREDITO SC-191005824 Vlr del Bien --->    347470500 Vlr Hipoteca --->    347470500        20
HIPOTECAS HIPOTECA  LOCAL COMERCIAL Vlr del Bien --->     78911008 Vlr Hipoteca --->     78911008            18
Name: GARANTIA_REAL, dtype: int64

Draft

In [64]:
rr.index[1]

'HIPOTECAS HIPOTECA CREDITO SC-191005824 Vlr del Bien --->    347470500 Vlr Hipoteca --->    347470500'

In [65]:
rr.index[2]

'HIPOTECAS HIPOTECA  LOCAL COMERCIAL Vlr del Bien --->     78911008 Vlr Hipoteca --->     78911008'

In [66]:
df.query("GARANTIA_REAL == 'HIPOTECAS HIPOTECA CREDITO SC-191005824 Vlr del Bien --->    347470500 Vlr Hipoteca --->    347470500'")['OBLIGACION'].value_counts()

191003960     19
2112000089     1
Name: OBLIGACION, dtype: int64

In [67]:
df.query("GARANTIA_REAL == 'HIPOTECAS HIPOTECA  LOCAL COMERCIAL Vlr del Bien --->     78911008 Vlr Hipoteca --->     78911008'")['OBLIGACION'].value_counts()

192003327    18
Name: OBLIGACION, dtype: int64

## MODALIDAD

Question: is MODALIDAD manually set?

In [68]:
df[['MODALIDAD', 'COD_MODALIDAD']].drop_duplicates().set_index('COD_MODALIDAD').sort_index()

Unnamed: 0_level_0,MODALIDAD
COD_MODALIDAD,Unnamed: 1_level_1
390.0,MICROVIVIENDA NUEVO DCA
392.0,EMP219 70 DESARROLLO EMPRESARIAL CRECER FNG
392.0,DESARROLLO EMPRESARIAL CRECER (FNG EMP219)
393.0,CREDITO RURAL INDIVIDUAL - GERMINA (FNG EMP226)
393.0,EMP226 75 CREDITO RURAL INDIVIDUAL - GERMINA FNG
394.0,CREDITO RURAL INDIVIDUAL - GERMINA (FNG EMP225)
394.0,EMP225 60 CREDITO RURAL INDIVIDUAL - GERMINA FNG
395.0,CREDITO RURAL INDIVIDUAL - GERMINA NUEVO DCA
396.0,DESARROLLO EMPRESARIAL CRECER NUEVO DCA
397.0,CREDI-DESVARE NEGOCIO SEMESTRAL


## MONTO

> Note: MONTO values in CARTERA and COLOCACION are all equal. See [joining notebook](joining_datasets.ipynb).

In [69]:
df['MONTO'] = df['MONTO'].str.strip()

ind_monto_na = df['MONTO'].isna()
ind_monto_int = df['MONTO'].str.contains('^\d+$')
ind_monto_int_thousands = df['MONTO'].str.replace(' ', '').str.contains(',\d{3}$')

count_monto_int = ind_monto_int.sum()
count_monto_int_thousands = ind_monto_int_thousands.sum()
count_monto_na = ind_monto_na.sum()

count_monto_all = count_monto_na + count_monto_int + count_monto_int_thousands

assert count_monto_all == df.shape[0], "There are values for parsing"

{
    "count_dataset_records": df.shape[0],
    "sum_monto_counts": count_monto_all,
    "count_monto_na": count_monto_na,
    "count_monto_int": count_monto_int,
    "count_monto_int_thousands": count_monto_int_thousands,
}

{'count_dataset_records': 790234,
 'sum_monto_counts': 790234,
 'count_monto_na': 0,
 'count_monto_int': 61013,
 'count_monto_int_thousands': 729221}

In [70]:
# df['MONTO'] = temp_series

temp_series = df['MONTO']
df['MONTO'] = (
    df['MONTO']
    .str.replace(',', '')
    .astype('int64')
)
pd.concat([temp_series, df['MONTO']], axis=1)

Unnamed: 0,MONTO,MONTO.1
198611,24000000,24000000
198021,15000000,15000000
198517,10000000,10000000
198023,5800000,5800000
198758,22000000,22000000
...,...,...
139681,6000000,6000000
141006,1000000,1000000
141080,4000000,4000000
139345,2000000,2000000


## MUNICIPIO_CLIENTE, MUNICIPIO_LAT, MUNICIPIO_LON

Replacing typos and considering "Default" as NaN value:

In [71]:
df['MUNICIPIO_CLIENTE'].isna().any()

False

In [72]:
df['MUNICIPIO_CLIENTE'].value_counts(dropna=False)

YOPAL             125310
VILLAVICENCIO      98396
PAZ DE ARIPORO     61731
ACACIAS            44986
VILLANUEVA         39355
                   ...  
SACAMA                18
Default               12
CHIQUIZA              11
TUTAZA                 6
MESETAS                3
Name: MUNICIPIO_CLIENTE, Length: 79, dtype: int64

In [73]:
replace_dict = {
    'FIRABITOBA': 'FIRAVITOBA',
    'NUCHIA': 'NUNCHIA',
    'Default': np.nan,
}

df['MUNICIPIO_CLIENTE'] = (
    df['MUNICIPIO_CLIENTE']
    .replace(replace_dict)
    .astype('category')
)

df['MUNICIPIO_CLIENTE'].value_counts(dropna=False)

YOPAL             125310
VILLAVICENCIO      98396
PAZ DE ARIPORO     61731
ACACIAS            44986
VILLANUEVA         39355
                   ...  
SACAMA                18
NaN                   12
CHIQUIZA              11
TUTAZA                 6
MESETAS                3
Name: MUNICIPIO_CLIENTE, Length: 77, dtype: int64

Consider there are some clients with multiple MUNICIPIO_CLIENTE:

In [74]:
ss = df[['CLIENTE', 'MUNICIPIO_CLIENTE']].drop_duplicates().groupby('CLIENTE').size()

ss[ss > 1].sort_values()

CLIENTE
FA100      2
FA31191    2
FA3100     2
FA30790    2
FA299      2
          ..
FA130      3
FA16090    3
FA11633    3
FA12232    3
FA7247     3
Length: 533, dtype: int64

Adding MUNICIPIO_LAT and MUNICIPIO_LON:

In [75]:
df = df.merge(location.coords_df, how='left', on='MUNICIPIO_CLIENTE')

In [76]:
(
    df[['MUNICIPIO_CLIENTE', 'MUNICIPIO_LAT', 'MUNICIPIO_LON']]
    .value_counts(dropna=False)
)

MUNICIPIO_CLIENTE  MUNICIPIO_LAT  MUNICIPIO_LON
YOPAL              5.33775        -72.39586        125310
VILLAVICENCIO      4.14200        -73.62664         98396
PAZ DE ARIPORO     5.88148        -71.89167         61731
ACACIAS            3.98695        -73.75797         44986
VILLANUEVA         5.28333        -71.96667         39355
                                                    ...  
SACAMA             6.09908        -72.24880            18
NaN                NaN            NaN                  12
CHIQUIZA           5.60412        -73.48518            11
TUTAZA             6.03228        -72.85639             6
MESETAS            3.38463        -74.04424             3
Length: 77, dtype: int64

## PERIODICIDAD_PAGO

In [77]:
df['PERIODICIDAD_PAGO'].value_counts(dropna=False)

Mensual          633742
Trimestral       156172
Semestral           176
Bimensual           142
Cuatrimestral         2
Name: PERIODICIDAD_PAGO, dtype: int64

In [78]:
df['PERIODICIDAD_PAGO'] = df['PERIODICIDAD_PAGO'].astype('category')

df['PERIODICIDAD_PAGO'].value_counts(dropna=False)

Mensual          633742
Trimestral       156172
Semestral           176
Bimensual           142
Cuatrimestral         2
Name: PERIODICIDAD_PAGO, dtype: int64

Question: how do work changes in PERIODICIDAD_PAGO? Note these changes also alter TASA_PERIODICA and VALOR_CUOTA, but not TASA_ANUAL. Should discard PERIODICIDAD_PAGO in modeling?

In [79]:
comparison_cols = ['CLIENTE', 'OBLIGACION', 'PERIODICIDAD_PAGO']
extra_cols = [
    'FECHA_CIERRE', 'CALIFICACION_CIERRE', 'TASA_ANUAL', 'TASA_PERIODICA', 'VALOR_CUOTA',
    'FECHA_ULT_PAGO', 'FECHA_PROXIMO_PAGO', 'SALDO', 'PORCENTAJE_PAGO',
]
cols = [*comparison_cols, *extra_cols]

period_multiple_df = (
    df
    [cols]
    .drop_duplicates(subset=comparison_cols)
    .sort_values(by='OBLIGACION')
    .reset_index(drop=True)
)

ind = period_multiple_df['OBLIGACION'].drop(columns=extra_cols).duplicated(keep=False)

period_multiple_df[ind].sort_values(by=['OBLIGACION', 'FECHA_CIERRE'])

Unnamed: 0,CLIENTE,OBLIGACION,PERIODICIDAD_PAGO,FECHA_CIERRE,CALIFICACION_CIERRE,TASA_ANUAL,TASA_PERIODICA,VALOR_CUOTA,FECHA_ULT_PAGO,FECHA_PROXIMO_PAGO,SALDO,PORCENTAJE_PAGO
10713,FA23983,66408,Trimestral,2017-01-31,A,26.40,6.75,677517,2017-01-12,2018-05-17,310544,7.76
10712,FA23983,66408,Semestral,2017-02-28,A,26.40,13.95,686511,NaT,2018-02-17,110544,2.76
14490,FA4010,70338,Trimestral,2017-01-31,A,34.80,8.95,1089985,2016-12-31,2017-03-09,6000000,######
14491,FA4010,70338,Mensual,2018-03-31,A,34.80,2.90,532999,2018-03-09,2018-04-20,2802832,46.71
16442,FA1992,171001161,Trimestral,2017-09-30,A,31.68,8.13,1514357,2017-09-26,2018-01-01,10000000,######
...,...,...,...,...,...,...,...,...,...,...,...,...
45036,FA853,208004822,Bimensual,2020-05-31,A,35.28,5.97,700000,NaT,2020-08-20,700000,######
45054,FA19354,208004842,Mensual,2020-02-29,A,35.28,2.94,700000,NaT,2020-04-26,700000,######
45055,FA19354,208004842,Bimensual,2020-05-31,A,35.28,5.97,700000,NaT,2020-08-26,700000,######
51671,FA15474,1910001523,Trimestral,2019-02-28,A,31.68,8.13,546750,NaT,2019-05-18,2500000,######


In [80]:
(period_multiple_df['OBLIGACION'] == 172001891).any()

True

In [81]:
df[cols].query('OBLIGACION == 172001891').sort_values(by='FECHA_CIERRE').iloc[-6:]

Unnamed: 0,CLIENTE,OBLIGACION,PERIODICIDAD_PAGO,FECHA_CIERRE,CALIFICACION_CIERRE,TASA_ANUAL,TASA_PERIODICA,VALOR_CUOTA,FECHA_ULT_PAGO,FECHA_PROXIMO_PAGO,SALDO,PORCENTAJE_PAGO
526828,FA20824,172001891,Mensual,2020-01-31,A,35.28,2.94,154970.0,2020-01-29,2020-02-22,291680,14.58
541312,FA20824,172001891,Mensual,2020-02-29,A,35.28,2.94,154970.0,2020-01-29,2020-02-22,291680,14.58
555915,FA20824,172001891,Mensual,2020-03-31,A,35.28,2.94,154970.0,2020-03-02,2020-03-22,148471,7.42
570407,FA20824,172001891,Trimestral,2020-04-30,A,35.28,9.08,157003.0,2020-04-25,2020-07-22,148471,7.42
584578,FA20824,172001891,Trimestral,2020-05-31,A,35.28,9.08,157003.0,2020-04-25,2020-07-22,148471,7.42
598541,FA20824,172001891,Trimestral,2020-06-30,A,35.28,9.08,157003.0,2020-04-25,2020-07-22,148471,7.42


In [82]:
(period_multiple_df['OBLIGACION'] == 171001161).any()

True

In [83]:
df[cols].query('OBLIGACION == 171001161').sort_values(by='FECHA_CIERRE').iloc[-18:]

Unnamed: 0,CLIENTE,OBLIGACION,PERIODICIDAD_PAGO,FECHA_CIERRE,CALIFICACION_CIERRE,TASA_ANUAL,TASA_PERIODICA,VALOR_CUOTA,FECHA_ULT_PAGO,FECHA_PROXIMO_PAGO,SALDO,PORCENTAJE_PAGO
496319,FA1992,171001161,Trimestral,2019-11-30,A,31.68,8.13,1514357.0,2019-10-26,2019-10-01,3568783,35.69
511014,FA1992,171001161,Trimestral,2019-12-31,A,3168.0,813.0,1514357.0,2019-12-30,2020-01-01,2668512,26.69
525610,FA1992,171001161,Trimestral,2020-01-31,A,31.68,8.13,1514357.0,2019-12-30,2020-01-01,2668512,26.69
540174,FA1992,171001161,Trimestral,2020-02-29,A,31.68,8.13,1514357.0,2019-12-30,2020-01-01,2668512,26.69
554832,FA1992,171001161,Trimestral,2020-03-31,B,31.68,8.13,1514357.0,NaT,2020-01-01,2668512,26.69
569402,FA1992,171001161,Trimestral,2020-04-30,D,31.68,8.13,1514357.0,2019-12-30,2020-01-01,2668512,26.69
583616,FA1992,171001161,Mensual,2020-05-31,D,31.68,2.64,245083.0,2020-05-11,2020-07-10,2668512,26.69
597682,FA1992,171001161,Mensual,2020-06-30,D,31.68,2.64,245083.0,2020-05-11,2020-07-10,2668512,26.69
611297,FA1992,171001161,Mensual,2020-07-31,D,31.68,2.64,245083.0,2020-07-25,2020-08-10,2498961,24.99
624667,FA1992,171001161,Mensual,2020-08-31,D,31.68,2.64,245083.0,2020-08-16,2020-09-10,2324934,23.25


## PORCENTAJE_PAGO

In [84]:
df['PORCENTAJE_PAGO'] = df['PORCENTAJE_PAGO'].str.strip()

In [85]:
ind_porcentaje_pago_na = df['PORCENTAJE_PAGO'].isna()
ind_pago_missing = df['PORCENTAJE_PAGO'].str.match('######').fillna(False)
ind_pago_first = df['PORCENTAJE_PAGO'].str.match('######').fillna(False)
ind_porcentaje_pago_correct = df['PORCENTAJE_PAGO'].str.match('^\d{1,2}\.\d{1,2}$').fillna(False)

ind_porcentaje_pago_two_dec = df['PORCENTAJE_PAGO'].str.match('^\.\d{1,2}$').fillna(False)
ind_porcentaje_pago_two_dig = df['PORCENTAJE_PAGO'].str.match('^\d{1,2}$').fillna(False)

count_porcentaje_pago_na = ind_porcentaje_pago_na.sum()
count_porcentaje_pago_first = ind_pago_first.sum()
count_porcentaje_pago_correct = ind_porcentaje_pago_correct.sum()
count_porcentaje_pago_two_dig = ind_porcentaje_pago_two_dig.sum()
count_porcentaje_pago_two_dec = ind_porcentaje_pago_two_dec.sum()

sum_counts_fecha_ult_pago = (
    count_porcentaje_pago_na
    + count_porcentaje_pago_first
    + count_porcentaje_pago_correct
    + count_porcentaje_pago_two_dig
    + count_porcentaje_pago_two_dec
)

{
    "count_records": df.shape[0],
    "sum_counts_fecha_ult_pago": sum_counts_fecha_ult_pago,
    "count_porcentaje_pago_na": count_porcentaje_pago_na,
    "count_porcentaje_pago_first": count_porcentaje_pago_first,
    "count_porcentaje_pago_correct": count_porcentaje_pago_correct,
    "count_porcentaje_pago_two_dec": count_porcentaje_pago_two_dec,
    "count_porcentaje_pago_two_dig": count_porcentaje_pago_two_dig,
}


{'count_records': 790234,
 'sum_counts_fecha_ult_pago': 790234,
 'count_porcentaje_pago_na': 0,
 'count_porcentaje_pago_first': 75840,
 'count_porcentaje_pago_correct': 709919,
 'count_porcentaje_pago_two_dec': 2002,
 'count_porcentaje_pago_two_dig': 2473}

> Note: records with PORCENTAJE_PAGO NaN values were previously removed.

In [86]:
ind = ind_porcentaje_pago_two_dec
temp_series = df.loc[ind, 'PORCENTAJE_PAGO']
df.loc[ind, 'PORCENTAJE_PAGO'] = (
    temp_series
    .str.replace(r'^.', '', regex=True)
    .astype(int)
    )

pd.concat([temp_series, df.loc[ind, 'PORCENTAJE_PAGO']], axis=1)

Unnamed: 0,PORCENTAJE_PAGO,PORCENTAJE_PAGO.1
46569,.55,55
46583,.08,8
46621,.19,19
46634,.76,76
46686,.48,48
...,...,...
784288,.77,77
784532,.48,48
784724,.22,22
785621,.05,5


In [87]:
df['PORCENTAJE_PAGO'] = df['PORCENTAJE_PAGO'].replace('######', 100).astype(float)

df['PORCENTAJE_PAGO']

0           4.32
1          44.78
2          24.76
3          15.07
4          20.25
           ...  
790229     48.98
790230    100.00
790231    100.00
790232    100.00
790233     91.06
Name: PORCENTAJE_PAGO, Length: 790234, dtype: float64

## REGION

Question: is REGION manually set? There are multiple REGION for each OBLIGACION

In [88]:
columns = ['CLIENTE', 'OBLIGACION', 'REGION']
d2 = (
    df
    [['FECHA_CIERRE', *columns]]
    .drop_duplicates(subset=columns)
)

ind = d2['OBLIGACION'].drop(columns=['FECHA_CIERRE']).duplicated(keep=False)

d3 = d2[ind].sort_values(by=['CLIENTE', 'OBLIGACION', 'FECHA_CIERRE'])
d3

Unnamed: 0,FECHA_CIERRE,CLIENTE,OBLIGACION,REGION
510559,2019-11-30,FA1,1914000083,REGION CRENTRO BOYACA
624357,2020-07-31,FA1,1914000083,CENTRO BOYACA
569160,2020-03-31,FA10,1914000230,REGION CRENTRO BOYACA
624488,2020-07-31,FA10,1914000230,CENTRO BOYACA
61925,2017-04-30,FA100,1711000162,REGION SUR
...,...,...,...,...
254925,2018-06-30,FA9997,68225,VILLAVICENCIO
148168,2017-10-31,FA9998,172000796,REGION VILLAVICENCIO
258497,2018-06-30,FA9998,172000796,VILLAVICENCIO
45420,2017-03-31,FA9999,172000133,REGION VILLAVICENCIO


In [89]:
columns = ['CLIENTE', 'OBLIGACION', 'REGION']
d2 = (
    df
    [['FECHA_CIERRE', *columns]]
    .drop_duplicates(subset=columns)
)

ind = d2['OBLIGACION'].drop(columns=['FECHA_CIERRE']).duplicated(keep=False)

d3 = d2[ind].sort_values(by=['CLIENTE', 'OBLIGACION', 'FECHA_CIERRE'])
d3

Unnamed: 0,FECHA_CIERRE,CLIENTE,OBLIGACION,REGION
510559,2019-11-30,FA1,1914000083,REGION CRENTRO BOYACA
624357,2020-07-31,FA1,1914000083,CENTRO BOYACA
569160,2020-03-31,FA10,1914000230,REGION CRENTRO BOYACA
624488,2020-07-31,FA10,1914000230,CENTRO BOYACA
61925,2017-04-30,FA100,1711000162,REGION SUR
...,...,...,...,...
254925,2018-06-30,FA9997,68225,VILLAVICENCIO
148168,2017-10-31,FA9998,172000796,REGION VILLAVICENCIO
258497,2018-06-30,FA9998,172000796,VILLAVICENCIO
45420,2017-03-31,FA9999,172000133,REGION VILLAVICENCIO


In [90]:
d4 = (
    d3
    .assign(
        REGION=(
            d3['REGION']
            .str.replace(r'^REGION\s', '', regex=True)
            .replace('CRENTRO BOYACA', 'CENTRO BOYACA', regex=True)
            .replace('CENTRO NORTE', 'NORTE', regex=True)
        )
    )
    .drop_duplicates(subset=columns))
ind = d4.duplicated(keep=False, subset=['OBLIGACION'])
d4[ind]

Unnamed: 0,FECHA_CIERRE,CLIENTE,OBLIGACION,REGION
90923,2017-06-30,FA10407,175000127,META
261179,2018-06-30,FA10407,175000127,VILLAVICENCIO
177943,2017-12-31,FA10408,175000336,META
261362,2018-06-30,FA10408,175000336,VILLAVICENCIO
75884,2017-05-31,FA10410,175000120,META
...,...,...,...,...
256935,2018-06-30,FA8756,171001331,NORTE
257949,2018-06-30,FA907,171002402,NORTE
284816,2018-08-31,FA907,171002402,CENTRO
15140,2017-01-31,FA9303,71010,CENTRO


## SALDO

In [91]:
df['SALDO'] = df['SALDO'].str.strip()

ind_saldo_na = df['SALDO'].isna()
ind_saldo_int = df['SALDO'].str.replace(' ', '').str.contains('^\d+$')
ind_saldo_int_thousands = df['SALDO'].str.replace(' ', '').str.contains(',\d{3}$')

count_saldo_int = ind_saldo_int.sum()
count_saldo_int_comma = ind_saldo_int_thousands.sum()
count_saldo_na = ind_saldo_na.sum()

count_saldo_all = count_saldo_na + count_saldo_int + count_saldo_int_comma

assert count_saldo_all == df.shape[0], "There are values missing parsing"

{
    "count_dataset_records": df.shape[0],
    "sum_saldo_counts": count_saldo_all,
    "count_saldo_na": count_saldo_na,
    "count_saldo_int": count_saldo_int,
    "count_saldo_int_comma": count_saldo_int_comma,
}

{'count_dataset_records': 790234,
 'sum_saldo_counts': 790234,
 'count_saldo_na': 0,
 'count_saldo_int': 443,
 'count_saldo_int_comma': 789791}

In [92]:
# df['SALDO'] = temp_series

temp_series = df['SALDO']
df['SALDO'] = (
    df['SALDO']
    .str.replace(',', '')
    .astype('int64')
)
pd.concat([temp_series, df['SALDO']], axis=1)

Unnamed: 0,SALDO,SALDO.1
0,1037679,1037679
1,6716353,6716353
2,2476048,2476048
3,873989,873989
4,4454037,4454037
...,...,...
790229,6000000,6000000
790230,1000000,1000000
790231,4000000,4000000
790232,2000000,2000000


## SUCURSAL_COD

NOTE: SUCURSAL_COD in COLOCACION dataset has no null values but in CARTERA dataset some null values are denoted by "#N/D"

Replacing "#N/D" as NaN value and casting duplicated string codes into integer codes:

In [93]:
df['SUCURSAL_COD'].value_counts(dropna=False)

1       113914
8        97492
2        79730
9        57454
3        56657
6        54680
4        49603
11       41219
NaN      30984
5        26240
12       22633
1        22210
7        21478
10       16648
8        15624
2        14085
13       10096
4         9392
6         9240
3         8972
11        7594
14        5552
9         5305
7         4901
5         4507
10        2326
12         774
13         494
14         351
#N/D        79
Name: SUCURSAL_COD, dtype: int64

In [94]:
df['SUCURSAL_COD'] = (
    df['SUCURSAL_COD']
    .replace('#N/D', np.nan)
)

ind_str = df['SUCURSAL_COD'].apply(lambda x: isinstance(x, str))

all_str_numeric = (
    df
    .loc[ind_str, 'SUCURSAL_COD']
    .str.match(r'\d+')
    .all()
)
assert all_str_numeric, "There are non-numeric SUCURSAL_COD"

df.loc[ind_str, 'SUCURSAL_COD'] = (
    df.loc[ind_str, 'SUCURSAL_COD']
    .astype(int)
)

df['SUCURSAL_COD'] = df['SUCURSAL_COD'].astype('category')

df['SUCURSAL_COD'].value_counts(dropna=False)

1      136124
8      113116
2       93815
3       65629
6       63920
9       62759
4       58995
11      48813
NaN     31063
5       30747
7       26379
12      23407
10      18974
13      10590
14       5903
Name: SUCURSAL_COD, dtype: int64

Question: SUCURSAL_COD in this case refers to each payment individually?

In [95]:
df2 = (
    df
    [['OBLIGACION', 'SUCURSAL_COD']]
    .drop_duplicates()
    .sort_values(by='OBLIGACION')
    .reset_index(drop=True)
)

ind = df2['OBLIGACION'].duplicated(keep=False)
df2[ind]

Unnamed: 0,OBLIGACION,SUCURSAL_COD
0,29179,
1,29179,1
2,29179,11
3,30827,
4,30827,8
...,...,...
98732,2112000054,8
98733,2112000055,8
98734,2112000055,12
98735,2112000056,12


Consider SUCURSAL_REAL is inconsistent:

In [96]:
df[['SUCURSAL_COD', 'SUCURSAL_REAL']].query('SUCURSAL_COD == 1 & ~SUCURSAL_REAL.isna()').drop_duplicates().head()

Unnamed: 0,SUCURSAL_COD,SUCURSAL_REAL
61941,1,YOPAL
252785,1,VILLAVICENCIO
466267,1,SOGAMOSO
777735,1,ACACIAS
777818,1,AGUAZUL


In [97]:
location.SUCURSAL_COD_DICT

{1: 'YOPAL',
 2: 'VILLAVICENCIO',
 3: 'ACACIAS',
 4: 'AGUAZUL',
 5: 'CUMARAL',
 6: 'GRANADA',
 7: 'MONTERREY',
 8: 'PAZ DE ARIPORO',
 9: 'SOGAMOSO',
 10: 'TAURAMENA',
 11: 'VILLANUEVA',
 12: 'TAME',
 13: 'DUITAMA',
 14: 'TUNJA',
 15: 'SARAVENA'}

## TIPO_CLIENTE, TIPO_CLIENTE_COD

In [98]:
df[['TIPO_CLIENTE', 'TIPO_CLIENTE_COD']].value_counts()

TIPO_CLIENTE         TIPO_CLIENTE_COD
Microfinanciero      1.0                 774918
Codeudor no cliente  4.0                   7457
Mixto                3.0                   6568
Gestion social       2.0                   1243
Fondeador            5.0                     39
Proveedor            7.0                      9
dtype: int64

PENDING: compare these values regarding CONTACTO

## TASA_ANUAL, TASA_PERIODICA

TASA_ANUAL values from records with FECHA_CIERRE="2019-12-21" are multiplied by 10_000 instead of 100 as other TASA_ANUAL values:

In [99]:
df.query('FECHA_CIERRE == "2019-12-31"')[['TASA_ANUAL', 'TASA_PERIODICA']].describe()

Unnamed: 0,TASA_ANUAL,TASA_PERIODICA
count,14617.0,14617.0
mean,3445.729835,398.986044
std,239.854331,215.324348
min,2400.0,200.0
25%,3168.0,280.0
50%,3528.0,294.0
75%,3528.0,320.0
max,3960.0,908.0


In [100]:
df.query('FECHA_CIERRE != "2019-12-31"')[['TASA_ANUAL', 'TASA_PERIODICA']].describe()

Unnamed: 0,TASA_ANUAL,TASA_PERIODICA
count,775617.0,775617.0
mean,33.427864,3.840373
std,3.114186,2.052753
min,12.0,1.0
25%,31.68,2.8
50%,33.6,2.94
75%,35.28,3.2
max,39.6,13.95


In [101]:
ind = df['FECHA_CIERRE'] == "2019-12-31"

df.loc[ind, ['FECHA_CIERRE', 'TASA_ANUAL', 'TASA_PERIODICA']]

Unnamed: 0,FECHA_CIERRE,TASA_ANUAL,TASA_PERIODICA
510562,2019-12-31,2640.0,675.0
510563,2019-12-31,3360.0,280.0
510564,2019-12-31,3360.0,280.0
510565,2019-12-31,2640.0,675.0
510566,2019-12-31,3120.0,260.0
...,...,...,...
525174,2019-12-31,3528.0,294.0
525175,2019-12-31,3840.0,320.0
525176,2019-12-31,3528.0,294.0
525177,2019-12-31,3840.0,320.0


In [102]:
df.loc[ind, ['TASA_ANUAL', 'TASA_PERIODICA']] /= 100

df.loc[ind, ['FECHA_CIERRE', 'TASA_ANUAL', 'TASA_PERIODICA']]

Unnamed: 0,FECHA_CIERRE,TASA_ANUAL,TASA_PERIODICA
510562,2019-12-31,26.40,6.75
510563,2019-12-31,33.60,2.80
510564,2019-12-31,33.60,2.80
510565,2019-12-31,26.40,6.75
510566,2019-12-31,31.20,2.60
...,...,...,...
525174,2019-12-31,35.28,2.94
525175,2019-12-31,38.40,3.20
525176,2019-12-31,35.28,2.94
525177,2019-12-31,38.40,3.20


In [103]:
df[['TASA_ANUAL']].describe()

Unnamed: 0,TASA_ANUAL
count,790234.0
mean,33.446906
std,3.105546
min,12.0
25%,31.68
50%,33.6
75%,35.28
max,39.6


## TIPO_CREDITO

In [104]:
df['TIPO_CREDITO'].value_counts()

NUEVO          340590
RENOVADO       201444
RETANQUEADO    158237
PARALELO        77904
SIN PERFIL       9446
SIN_PERFIL       2613
Name: TIPO_CREDITO, dtype: int64

In [105]:
df['TIPO_CREDITO'] = df['TIPO_CREDITO'].replace('SIN PERFIL', 'SIN_PERFIL')
df['TIPO_CREDITO'].value_counts()

NUEVO          340590
RENOVADO       201444
RETANQUEADO    158237
PARALELO        77904
SIN_PERFIL      12059
Name: TIPO_CREDITO, dtype: int64

## TIPO_UBICACION

In [106]:
df['TIPO_UBICACION'].value_counts(dropna=False)

URBANA    463571
RURAL     326663
Name: TIPO_UBICACION, dtype: int64

In [107]:
(df['TIPO_UBICACION'] == df['TIPO_UBICACION'].str.strip()).all()

True

## Dropping columns

In [108]:
df = df.drop(columns=project_columns.CARTERA_DISCARDED_COLUMNS)

## Validating cleaning code

In [109]:
df2 = datasets.read_cartera(dir_path=RAW_DATA_PATH, clean_all=True)

In [110]:
sort_by = ['FECHA_CIERRE', 'OBLIGACION']

assert cleaning.compare_dataframes(df, df2, sort_by=sort_by), "Dataframes are different"

Differences (if any):

In [111]:
cleaning.compare_dataframes_diff(df, df2, sort_by)

{}