# Cleaning COLOCACION dataset

Imports:

In [95]:
import datetime
import os

import pandas as pd
import numpy as np

from dotenv import load_dotenv

from core_ds4a_project import cleaning, columns as project_columns, datasets

%load_ext autoreload
%autoreload 1
%aimport core_ds4a_project, core_ds4a_project.cleaning, core_ds4a_project.columns, core_ds4a_project.datasets

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [96]:
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_columns", 60)

Environment variables:

In [97]:
load_dotenv('envvars')

ROOT_DATA_PATH = os.environ.get('ROOT_DATA_PATH')
RAW_DATA_PATH = os.environ.get('RAW_DATA_PATH') or f'{ROOT_DATA_PATH}/raw'

Reading data:

In [98]:
df = (
    datasets.read_colocacion_xlsx(RAW_DATA_PATH, raw=True)
)

df.shape

(42009, 55)

## Schema of columns for raw data

In [99]:
df[df.columns.sort_values()].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42009 entries, 0 to 42008
Data columns (total 55 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Año_Contabiliza       41974 non-null  object        
 1   Cedula                42009 non-null  int64         
 2   Celular               42009 non-null  object        
 3   Cod_Destinacion       42009 non-null  object        
 4   Cod_Modalidad         42009 non-null  object        
 5   Cod_Sucursal          42009 non-null  int64         
 6   Cod_linea             42009 non-null  object        
 7   Cod_usuario_apr       41974 non-null  object        
 8   Codeudor              42009 non-null  object        
 9   Destinacion           42009 non-null  object        
 10  Dias Ciclo_credito    41974 non-null  object        
 11  Direccion             42009 non-null  object        
 12  Doc_contabiliza       41974 non-null  float64       
 13  Edad            

## Shifting row 879 in raw data

Element at index 879 contains column OBSERVACIONES (column index 23) wrongly splitted into two: self and next column; shifting subsequent columns values towards right one position and creating an unnamed column "Unnamed: 54" at last with a single non-null value:

In [100]:
df.index[~(df.iloc[:, -1].isnull())]

Int64Index([879], dtype='int64')

In [101]:
df.iloc[878:881, -5:]

Unnamed: 0,Dias Ciclo_credito,creditos_vigentes,Tipo_Credito,Estado,Unnamed: 54
878,1,1.0,NUEVO,Contabilizado,
879,Arriendo,0.0,1,NUEVO,Contabilizado
880,2,1.0,NUEVO,Contabilizado,


Fixing shift and updating dataframe by eliminating last unnamed column:

In [102]:
prev_count_cols = df.shape[1]

df.iloc[879, 23] = df.iloc[879, 23] + df.iloc[879, 24]
df.iloc[879, 24:] = df.iloc[879, 24:].shift(-1)
df = df.iloc[:, :-1]

prev_count_cols, df.shape[1]

(55, 54)

In [103]:
df.iloc[878:881, -5:]

Unnamed: 0,Tipo Vivienda,Dias Ciclo_credito,creditos_vigentes,Tipo_Credito,Estado
878,Propia,1.0,1.0,NUEVO,Contabilizado
879,Arriendo,0.0,1.0,NUEVO,Contabilizado
880,Propia,2.0,1.0,NUEVO,Contabilizado


## Renaming columns

In [104]:
df.columns = cleaning.normalize_columns_name(df.columns)

df.columns.sort_values()  # sorting for presentational purposes only

Index(['ANO_CONTABILIZA', 'CEDULA', 'CELULAR', 'CODEUDOR', 'COD_DESTINACION',
       'COD_LINEA', 'COD_MODALIDAD', 'COD_USUARIO_APR', 'CORREO',
       'CREDITOS_VIGENTES', 'DESTINACION', 'DIAS_CICLO_CREDITO', 'DIRECCION',
       'DOC_CONTABILIZA', 'EDAD', 'ESTADO', 'ESTADO_CIVIL', 'ESTRATO',
       'EST_SOLICITUD', 'FECHA_APROBA', 'FECHA_DESEMBOLSO', 'FECHA_NACIMIENTO',
       'FECHA_PAGO', 'FECHA_SOLICITUD', 'GENERO', 'IDENTI_EJECUTIVO', 'LINEA',
       'MODALIDAD', 'MONTO', 'MUNICIPIO', 'NIVEL_ESTUDIOS', 'NOMBRE',
       'NOMBRE_EJECUTIVO', 'NOMBRE_USUARIO', 'NRO_CUOTAS', 'OBLIGACION',
       'OBSERVACIONES', 'PERIODICIDAD_PAGO', 'PROFESION', 'REGION',
       'SOLICITUD', 'SUCURSAL', 'SUCURSAL_COD', 'SUCURSAL_CONTABILIZA',
       'TASA_ANUAL', 'TEL_FIJO', 'TIPO_CREDITO', 'TIPO_DOCUMENTO',
       'TIPO_OBLIGA', 'TIPO_UBICACION', 'TIPO_VIVIENDA', 'VALOR_CUOTA',
       'VALOR_DESEMBOLSADO', 'VALOR_REFINANCIADO'],
      dtype='object')

## Categorical columns

### CODEUDOR

CODEUDOR column has following possible forms of values containing sensitive data such as identity document number:
- "SIN CODEUDORES" (no cosigner).
- Cosigners ID and name (maximum two cosigners).
- Cosigners ID, name, and payment calification (maximum two cosigners).

In [105]:
i_ids = df['CODEUDOR'].str.contains('\d+')
i_cosign_cal = df['CODEUDOR'].str.contains('CALIFICACION')
i_no_cosign = df['CODEUDOR'].str.match('SIN CODEUDORES')

i_cosign_no_cal = ~i_no_cosign & ~i_cosign_cal

count_no_cosign = i_no_cosign.sum()
count_cosign_cal = i_cosign_cal.sum()
count_cosign_no_cal = i_cosign_no_cal.sum()

count_sum = count_no_cosign + count_cosign_cal + count_cosign_no_cal

{
    "total_records": df.shape[0],
    "sum_codeudor_counts": count_sum,
    "count_cosign": count_cosign_no_cal,
    "count_cosign_with_cal": count_cosign_cal,
    "count_no_cosign": count_no_cosign,
    "all_cosign_cal_have_ids": i_ids[i_cosign_cal].all(),
    "all_cosign_no_cal_have_ids": i_ids[i_cosign_no_cal].all()
}


{'total_records': 42009,
 'sum_codeudor_counts': 42009,
 'count_cosign': 6304,
 'count_cosign_with_cal': 8806,
 'count_no_cosign': 26899,
 'all_cosign_cal_have_ids': True,
 'all_cosign_no_cal_have_ids': True}

Update values as following:
- "SIN CODEUDORES" as "SIN_CODEUDORES" (replace space for future column handling).
- Records with no cosigner calification as "CODEUDOR_NUM", where NUM is the number of cosigners.
- Records with cosigner calification as "CODEUDOR_CAL", where CAL is the best cosigner calification (i.e. minimum letter).
- Records with invalid cosigner calification as if they had no calification: "CODEUDOR_NUM".

In [106]:
CALIFICATIONS = set(['A', 'B', 'C', 'D', 'E'])


def format_codeudor_no_cal(x):
    return f'COUDEUDOR_{len(x)}'


def format_codeudor_cal(x):
    valid_cals = CALIFICATIONS.intersection(x)
    if len(valid_cals) == 0:
        return format_codeudor_no_cal(x)

    return f'CODEUDOR_{min(CALIFICATIONS.intersection(x))}'


In [107]:
cals_ss = (
    df
    .loc[i_cosign_cal, 'CODEUDOR']
    .str.findall(r'(?:CALIFICACION:\s)(?P<calificacion>\w)')
)

cals_ss

8           [B]
9           [A]
14          [M]
16          [B]
22       [A, E]
          ...  
41994    [E, E]
41996       [B]
42001       [B]
42002       [B]
42004       [B]
Name: CODEUDOR, Length: 8806, dtype: object

Question: are these typos? how is it the process of calification? how is the relation of CODEUDOR with defaulting?

In [108]:
unknown_cals = set()
for val in cals_ss:
    diff = set(val).difference(CALIFICATIONS)
    unknown_cals = unknown_cals.union(diff)

unknown_cals

{'I', 'M'}

Question: verify meaning of variable composition: rates, numbers

In [109]:
df.loc[i_no_cosign, 'CODEUDOR'] = 'SIN_CODEUDOR'
df.loc[i_cosign_no_cal, 'CODEUDOR'] = (
    df
    .loc[i_cosign_no_cal, 'CODEUDOR']
    .str.findall('\d+')
    .apply(format_codeudor_no_cal)
)

df.loc[i_cosign_cal, 'CODEUDOR'] = cals_ss.apply(format_codeudor_cal)


# Validation with commas
# ddb = df.loc[i_cosign, 'CODEUDOR'].strip().str.replace(', ,', ',')
# ddc2 = ddb.str.split(',').rename('VALUES')
# ddd2 = ddc2.apply(lambda x: len(x) if len(x[-1]) > 0 else len(x) - 1).rename('LEN')
# ddd22 = ddb.str.findall('\d+').apply(lambda x: len(x)).rename('LEN_1')
# dde2 = pd.concat([ddb, ddc2, ddd2, ddd22], axis=1)
# print((dde2.iloc[:, -1] == dde2.iloc[:, -2]).all())
# dde2

df['CODEUDOR'].value_counts(dropna=False)

SIN_CODEUDOR    26899
COUDEUDOR_1      7067
CODEUDOR_B       4235
CODEUDOR_A       1939
CODEUDOR_E       1769
COUDEUDOR_2       100
Name: CODEUDOR, dtype: int64

In [110]:
df['CODEUDOR'].sample(5)

9737     SIN_CODEUDOR
4396     SIN_CODEUDOR
14001    SIN_CODEUDOR
13381    SIN_CODEUDOR
28023    SIN_CODEUDOR
Name: CODEUDOR, dtype: object

In [111]:
df.loc[22, 'CODEUDOR']

'CODEUDOR_A'

### COD_DESTINACION, DESTINACION

COD_DESTINACION and DESTINACION don't have NaN values nor invalid values:

In [112]:
df[['COD_DESTINACION', 'DESTINACION']].isna().any()

COD_DESTINACION    False
DESTINACION        False
dtype: bool

In [113]:
df[['COD_DESTINACION', 'DESTINACION']].drop_duplicates().sort_values(by='COD_DESTINACION')

Unnamed: 0,COD_DESTINACION,DESTINACION
7,AF,ACTIVOS FIJOS
3807,COS,COSECHA
0,CT,CAPITAL DE TRABAJO
13,EME,EMERGENCIA
12630,GRR,GARANTIA REAL
31239,MO,MANO DE OBRA
31043,MT,MATERIALES
12,MX,MIXTO
294,NEG,NEGOCIO


### COD_LINEA, LINEA

COD_LINEA and LINEA don't have NaN values nor invalid values:

In [114]:
df[['COD_LINEA', 'LINEA']].isna().any()

COD_LINEA    False
LINEA        False
dtype: bool

In [115]:
df[['COD_LINEA', 'LINEA']].value_counts(dropna=False).sort_index()

COD_LINEA  LINEA          
CRD        CREDITO DIGITAL      348
CRE        CRECER             18033
CRN        CREDINEGOCIO         188
FID        FIDELIZACION        1874
GER        GERMINA            15166
GRR        GARANTIA REAL          2
MCA        MI CASA             6392
UNE        UNETE                  6
dtype: int64

### COD_MODALIDAD, MODALIDAD

COD_MODALIDAD and MODALIDAD don't have NaN values nor invalid values:

In [116]:
df[['COD_MODALIDAD', 'MODALIDAD']].isna().any()

COD_MODALIDAD    False
MODALIDAD        False
dtype: bool

In [117]:
df[['COD_MODALIDAD', 'MODALIDAD']].value_counts()

COD_MODALIDAD  MODALIDAD                                               
400            DESARROLLO EMPRESARIAL CRECER (COMISION PYME) RENOVACION    11128
403            CREDITO RURAL INDIVIDUAL GERMINA                             9872
404            MICROVIVIENDA                                                6069
410            DESARROLLO EMPRESARIAL CRECER (COMISION PYME) NUEVO          4166
412            CREDITO RURAL INDIVIDUAL - GERMINA NUEVO                     3189
398            CREDI-DESVARE EMERGENCIA TRIMESTRAL                          1650
396            DESARROLLO EMPRESARIAL CRECER NUEVO DCA                      1629
395            CREDITO RURAL INDIVIDUAL - GERMINA NUEVO DCA                  935
399            CRÉDI COSECHA                                                 492
479            MICROCREDITO DIGITAL                                          348
506            MUJER URBANO                                                  346
392            EMP219 70 DESARROLLO E

### ESTADO_CIVIL

In [118]:
df['ESTADO_CIVIL'].value_counts(dropna=False)

Union libre    16591
Casado         11305
Soltero        11008
Divorciado      1627
Viudo           1321
Separado          91
NaN               50
Otro              16
Name: ESTADO_CIVIL, dtype: int64

### ESTRATO

PENDING: as categorical?

ESTRATO column has some NaN values, so, it is not possible to cast the column to integer type:

In [119]:
df['ESTRATO'].value_counts(dropna=False)

1.0    21165
2.0    17584
3.0     3102
4.0       85
NaN       63
5.0        9
6.0        1
Name: ESTRATO, dtype: int64

### GENERO

GENERO don't have NaN values nor invalid values:

In [120]:
df['GENERO'].isna().any()

False

In [121]:
df['GENERO'].value_counts(dropna=False)

Femenino     23603
Masculino    18406
Name: GENERO, dtype: int64

### NIVEL_ESTUDIOS

NIVEL_ESTUDIOS has some NaN values but no invalid values:

In [122]:
df['NIVEL_ESTUDIOS'].value_counts(dropna=False)

Secundaria         17665
Primaria           16818
Tecnica             3848
Universitaria       2127
Tecnológica         1032
Analfabetismo        262
No escolarizado      141
Especializacion       86
Maestria              11
No indica              7
Magister               6
NaN                    6
Name: NIVEL_ESTUDIOS, dtype: int64

In [123]:
(
    cleaning.normalize_columns_name(df['NIVEL_ESTUDIOS'])
    .replace({
        'MAGISTER': 'MAESTRIA',
    })
    .value_counts(dropna=False)
)

SECUNDARIA         17665
PRIMARIA           16818
TECNICA             3848
UNIVERSITARIA       2127
TECNOLOGICA         1032
ANALFABETISMO        262
NO_ESCOLARIZADO      141
ESPECIALIZACION       86
MAESTRIA              17
NO_INDICA              7
NaN                    6
Name: NIVEL_ESTUDIOS, dtype: int64

### PERIODICIDAD_PAGO

PERIODICIDAD_PAGO has some NaN values but no invalid values:

In [124]:
df['PERIODICIDAD_PAGO'].value_counts(dropna=False)

Mensual          32936
Trimestral        9007
NaN                 35
Bimensual           30
Cuatrimestral        1
Name: PERIODICIDAD_PAGO, dtype: int64

Draft:

In [125]:
i_na = df['PERIODICIDAD_PAGO'].isna()
i_rechazado = df['ESTADO'].str.match('RECHAZADO')

[
    i_na.sum(),
    i_rechazado.sum(),
    (i_na & i_rechazado).sum(),
]

[35, 0, 0]

In [126]:
izero = df['VALOR_CUOTA'] == 0
(i_na & ~i_rechazado).sum(), (i_na & ~i_rechazado & izero).sum()

(35, 35)

### PROFESION

In [127]:
df['PROFESION'].value_counts(dropna=False)

SIN PROFESION           13187
DESCONOCIDA             11933
AGRICULTURA              5227
GANADERIA                4379
ADMON DE NEGOCIOS        2272
                        ...  
QUIMICA FARMACEUTICA        2
FONOAUDIOLOGIA              2
DISEÑO INDUSTRIAL           2
ING. ADMINISTRATIVA         2
FISICA                      1
Name: PROFESION, Length: 95, dtype: int64

### SUCURSAL, SUCURSAL_COD (also in CARTERA)

> Note: SUCURSAL_COD in COLOCACION dataset has no null values but in CARTERA dataset some null values are denoted by "#N/D"

Question: what is SUCURSAL? In joining_datasets.ipynb it is seemed that there are multiple SUCURSAL values for same CLIENTE

In [128]:
df[['SUCURSAL_COD']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42009 entries, 0 to 42008
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   SUCURSAL_COD  42009 non-null  int64
dtypes: int64(1)
memory usage: 328.3 KB


In [129]:
df['SUCURSAL_COD'].isna().any()

False

In [130]:
df['SUCURSAL_COD'].value_counts(dropna=False)

1     6787
8     6054
2     5067
9     4278
3     3644
6     3442
4     2704
11    2704
12    1889
5     1441
7     1389
10    1036
13     989
14     585
Name: SUCURSAL_COD, dtype: int64

### TIPO_CREDITO (also in CARTERA)

Question: SIN PERFIL?

In [131]:
df['TIPO_CREDITO'].value_counts(dropna=False)

NUEVO          17817
RENOVADO       10262
RETANQUEADO     8033
PARALELO        4631
SIN PERFIL      1231
NaN               35
Name: TIPO_CREDITO, dtype: int64

### TIPO_UBICACION (also in CONTACTO)

No NaN values, all other are valid:

In [132]:
df['TIPO_UBICACION'].isna().any()

False

In [133]:
df['TIPO_UBICACION'].value_counts(dropna=False)

URBANA    23969
RURAL     18040
Name: TIPO_UBICACION, dtype: int64

### TIPO_VIVIENDA (also in CONTACTO)

No NaN values, all other are valid:

In [134]:
df['TIPO_VIVIENDA'].isna().any()

False

In [135]:
df['TIPO_VIVIENDA'].value_counts(dropna=False)

Propia                   25727
Familiar                  8998
Arriendo                  6244
Otra                       912
SIN VIVIENDA               112
Inmueble con Hipoteca       16
Name: TIPO_VIVIENDA, dtype: int64

In [136]:
df['TIPO_VIVIENDA'] = cleaning.clean_tipo_vivienda(df['TIPO_VIVIENDA'])

df['TIPO_VIVIENDA'].value_counts(dropna=False)

PROPIA          25727
FAMILIAR         8998
ARRIENDO         6244
OTRA              912
SIN_VIVIENDA      112
HIPOTECA           16
Name: TIPO_VIVIENDA, dtype: int64

## Dates columns

### FECHA_NACIMIENTO (also in CONTACTO)

In [137]:
df['FECHA_NACIMIENTO'].apply(lambda x: isinstance(x, datetime.datetime)).all()

True

In [138]:
df['FECHA_NACIMIENTO'] = pd.to_datetime(df['FECHA_NACIMIENTO'])

### FECHA_PAGO

Question: inconsistent column when compared to dates in CARTERA dataset

In [139]:
df['FECHA_PAGO']

0        2021-10-02 00:00:00
1        2021-12-02 00:00:00
2        2021-10-02 00:00:00
3        2021-10-02 00:00:00
4        2021-12-02 00:00:00
                ...         
42004    2017-04-18 00:00:00
42005    2017-02-17 00:00:00
42006    2017-02-17 00:00:00
42007    2017-02-17 00:00:00
42008    2017-04-17 00:00:00
Name: FECHA_PAGO, Length: 42009, dtype: object

### Casting dates

Some dates columns are automatically parsed when reading with Pandas considering such columns have been setup with date format directly in .xlsx file (see [this GitHub issue](https://github.com/pandas-dev/pandas/issues/29217))

In [140]:
DATE_COLS = df.columns[df.columns.str.match('^FECHA_')]

df.columns[df.columns.str.match('^FECHA_')]

Index(['FECHA_SOLICITUD', 'FECHA_APROBA', 'FECHA_DESEMBOLSO', 'FECHA_PAGO',
       'FECHA_NACIMIENTO'],
      dtype='object')

In [141]:
df[DATE_COLS].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42009 entries, 0 to 42008
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   FECHA_SOLICITUD   42009 non-null  datetime64[ns]
 1   FECHA_APROBA      42009 non-null  datetime64[ns]
 2   FECHA_DESEMBOLSO  41974 non-null  datetime64[ns]
 3   FECHA_PAGO        41974 non-null  object        
 4   FECHA_NACIMIENTO  42009 non-null  datetime64[ns]
dtypes: datetime64[ns](4), object(1)
memory usage: 1.6+ MB


In [142]:
exclude = df[DATE_COLS].select_dtypes('datetime64').columns

exclude

Index(['FECHA_SOLICITUD', 'FECHA_APROBA', 'FECHA_DESEMBOLSO',
       'FECHA_NACIMIENTO'],
      dtype='object')

In [143]:
cleaning.cast_dates_in_place(df, exclude=exclude)

df[DATE_COLS].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42009 entries, 0 to 42008
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   FECHA_SOLICITUD   42009 non-null  datetime64[ns]
 1   FECHA_APROBA      42009 non-null  datetime64[ns]
 2   FECHA_DESEMBOLSO  41974 non-null  datetime64[ns]
 3   FECHA_PAGO        41974 non-null  datetime64[ns]
 4   FECHA_NACIMIENTO  42009 non-null  datetime64[ns]
dtypes: datetime64[ns](5)
memory usage: 1.6 MB


## Numeric columns

### ANO_CONTABILIZA

In [144]:
df['ANO_CONTABILIZA'].drop_duplicates()

0        2021
804       NaN
6347     2020
13754    2019
23826    2018
33290    2017
Name: ANO_CONTABILIZA, dtype: object

In [145]:
(df['ANO_CONTABILIZA'] == df['FECHA_APROBA'].dt.year).all()

False

In [146]:
ind = df['ANO_CONTABILIZA'] == df['FECHA_APROBA'].dt.year

df.loc[~ind, ['ANO_CONTABILIZA', *DATE_COLS]]

Unnamed: 0,ANO_CONTABILIZA,FECHA_SOLICITUD,FECHA_APROBA,FECHA_DESEMBOLSO,FECHA_PAGO,FECHA_NACIMIENTO
804,,2021-08-06,2021-08-09,NaT,NaT,1960-02-22
1586,,2021-06-29,2021-07-12,NaT,NaT,1997-09-24
2202,,2021-06-15,2021-06-18,NaT,NaT,2001-12-31
2583,,2021-05-31,2021-05-31,NaT,NaT,2001-11-13
2685,,2021-05-26,2021-05-28,NaT,NaT,1986-10-29
3803,,2021-04-14,2021-04-16,NaT,NaT,1966-11-11
5211,,2021-02-22,2021-02-24,NaT,NaT,1997-03-21
6840,,2020-12-14,2020-12-15,NaT,NaT,1986-01-24
10020,,2020-08-06,2020-08-06,NaT,NaT,1972-03-25
11115,,2020-05-20,2020-05-20,NaT,NaT,1963-02-01


### MONTO, VALOR_DESEMBOLSADO

In [147]:
(df['MONTO'] - df['VALOR_DESEMBOLSADO']).abs().describe()

count    4.200900e+04
mean     2.172544e+05
std      6.567472e+05
min      0.000000e+00
25%      0.000000e+00
50%      1.033600e+04
75%      6.664100e+04
max      2.100000e+07
dtype: float64

In [148]:
ind = (df['MONTO'] - df['VALOR_DESEMBOLSADO']) > 685618
df.loc[ind, ['MONTO', 'VALOR_DESEMBOLSADO']].describe()

Unnamed: 0,MONTO,VALOR_DESEMBOLSADO
count,4201.0,4201.0
mean,6548248.0,4892358.0
std,5452141.0,4641675.0
min,1000000.0,0.0
25%,3120000.0,2186787.0
50%,5000000.0,3507993.0
75%,8000000.0,5867098.0
max,80000000.0,78851470.0


In [149]:
df[['MONTO', 'VALOR_DESEMBOLSADO']]

Unnamed: 0,MONTO,VALOR_DESEMBOLSADO
0,4500000,4478698
1,2000000,1984347
2,1100000,1061583
3,1000000,985933
4,2500000,2483434
...,...,...
42004,8000000,8000000
42005,7000000,7000000
42006,3000000,3000000
42007,1000000,1000000


### CREDITOS_VIGENTES

Question: does this number includes current new credit?

In [150]:
df['CREDITOS_VIGENTES'].value_counts(dropna=False)

NaN    21395
1.0    17617
2.0     2880
3.0      111
4.0        6
Name: CREDITOS_VIGENTES, dtype: int64

In [151]:
df['CREDITOS_VIGENTES'].describe()

count    20614.000000
mean         1.151353
std          0.375457
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          4.000000
Name: CREDITOS_VIGENTES, dtype: float64

### DIAS_CICLO_CREDITO

Question: what is this?

In [152]:
df['DIAS_CICLO_CREDITO'].value_counts()

1     12399
2      7720
0      6387
3      4441
4      4143
5      3117
6      1905
7       922
8       358
9       141
10      120
11      106
12       64
14       48
13       42
15       12
21        9
17        9
18        6
22        6
19        4
16        4
28        2
36        2
25        1
52        1
20        1
34        1
23        1
38        1
26        1
Name: DIAS_CICLO_CREDITO, dtype: int64

### EDAD

All values are integer type with no invalid values:

In [153]:
df['EDAD'].apply(lambda x: isinstance(x, int)).all()

True

In [154]:
df['EDAD'] = df['EDAD'].astype(int)

In [155]:
df['EDAD'].describe()

count    42009.000000
mean        46.125426
std         13.262953
min         19.000000
25%         35.000000
50%         46.000000
75%         56.000000
max         78.000000
Name: EDAD, dtype: float64

### NRO_CUOTAS

In [156]:
df['NRO_CUOTAS']

0        24
1         6
2        12
3        12
4         6
         ..
42004    10
42005    36
42006    18
42007     6
42008     4
Name: NRO_CUOTAS, Length: 42009, dtype: int64

In [157]:
df['NRO_CUOTAS'].isna().any()

False

In [158]:
df['NRO_CUOTAS'].value_counts(dropna=False)

12    9402
18    7206
24    6344
6     2439
8     2225
      ... 
71       1
69       1
62       1
57       1
53       1
Name: NRO_CUOTAS, Length: 70, dtype: int64

### TASA_ANUAL

All values are non-NaN with no invalid values:

In [159]:
df['TASA_ANUAL'].isna().any()

False

In [160]:
df['TASA_ANUAL'].describe()

count    42009.000000
mean        34.283402
std          2.954325
min         12.000000
25%         31.680000
50%         35.280000
75%         35.280000
max         39.600000
Name: TASA_ANUAL, dtype: float64

### VALOR_CUOTA

In [161]:
df['VALOR_CUOTA'].isna().any()

False

In [162]:
df['VALOR_CUOTA'].astype(float).describe()

count    4.200900e+04
mean     3.907145e+05
std      4.241920e+05
min      0.000000e+00
25%      1.581840e+05
50%      2.377460e+05
75%      4.470430e+05
max      1.128652e+07
Name: VALOR_CUOTA, dtype: float64

Question: valor_cuota zero? manually set?

In [163]:
izero = df['VALOR_CUOTA'] == 0
[
    izero.sum(),
    i_rechazado.sum(),
    (izero & i_rechazado).sum()
]
# df.loc[izero, 'VALOR_CUOTA']

[35, 0, 0]

In [164]:
# df[izero]
# raise ValueError


### VALOR_REFINANCIADO

All values are non-NaN and valid:

In [165]:
df['VALOR_REFINANCIADO'].isna().any()

False

Most of the values are zero:

In [166]:
ind_zero = df['VALOR_REFINANCIADO'] == 0

{
    "count_zero": ind_zero.sum(),
    "count_nonzero": (~ind_zero).sum(),
}

{'count_zero': 33434, 'count_nonzero': 8575}

Stats non-zero values:

In [167]:
df.loc[~ind_zero, 'VALOR_REFINANCIADO'].astype(float).describe()

count    8.575000e+03
mean     9.492007e+05
std      1.100428e+06
min      1.000000e+00
25%      3.325230e+05
50%      6.366620e+05
75%      1.163212e+06
max      2.000000e+07
Name: VALOR_REFINANCIADO, dtype: float64

Question: lower payments

In [168]:
ind_lower = df.loc[~ind_zero, 'VALOR_REFINANCIADO'] <= 10000

df.loc[~ind_zero, 'VALOR_REFINANCIADO'][ind_lower].sort_values()

26115       1
11171      40
41788      72
39690     129
41377     171
         ... 
33336    8298
8413     8330
40823    9496
40107    9551
40868    9817
Name: VALOR_REFINANCIADO, Length: 92, dtype: int64

## Text columns

### OBSERVACIONES

In [169]:
df[['OBSERVACIONES']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42009 entries, 0 to 42008
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   OBSERVACIONES  41142 non-null  object
dtypes: object(1)
memory usage: 328.3+ KB


Question: why 200 characters?

In [170]:
df['OBSERVACIONES'].dropna().str.len().describe()

count    41142.000000
mean       196.576345
std         12.971626
min          8.000000
25%        200.000000
50%        200.000000
75%        200.000000
max        200.000000
Name: OBSERVACIONES, dtype: float64

Draft:

In [171]:
# 0, 60, 15685, 32571
df['OBSERVACIONES'].iloc[32571]

'CLIENTE NUEVO, HISTORIAL CREDITICIO CON PAGOS NORMALES, ACTIVIDAD GANADERIA Y CULTIVO DE PLATANO, CON 31 AÑOS DE EXPERIENCIA EN LA ACTIVIDAD, PREDIO PROPIO, SOLICITA PARA COMPRA DE TORO REPRODUCTOR Y '

In [172]:
df['OBSERVACIONES'].dropna().str.contains('^CLIENTE').sum()

36706

In [173]:
(
    df['OBSERVACIONES']
    .dropna()
    .str.findall(r'^CLIENTE\s+\w+\s\w+')
    .apply(lambda x: x[0] if len(x) > 0 else [])
    .replace({
        'CLIENTE DE RENOVACION': 'CLIENTE RENOVACION',
        'CLIENTE DE RETANQUEO': 'CLIENTE RETANQUEO'
    })
    .value_counts()
)

[]                         22165
CLIENTE RENOVACION          2607
CLIENTE DE LA               1781
CLIENTE RETANQUEO           1644
CLIENTE NUEVO CON           1224
                           ...  
CLIENTE RENOVACION C0ON        1
CLIENTE  4 CREDITO             1
CLIENTE   N UEVO               1
CLIENTE NUEVO REGIONA          1
CLIENTE HA CANCELADO           1
Name: OBSERVACIONES, Length: 1438, dtype: int64

## Discarded columns

We drop data that is useless for bussiness analysis considering context domain and/or information is sparse.

### DOC_CONTABILIZA

In [174]:
df['DOC_CONTABILIZA'].drop_duplicates()

0        13001030.0
1        13001034.0
2        13001036.0
3        13001038.0
4        13001040.0
            ...    
42004     8000003.0
42005    10000001.0
42006     8000001.0
42007     1000001.0
42008     1000003.0
Name: DOC_CONTABILIZA, Length: 36267, dtype: float64

### ESTADO, EST_SOLICITUD

ESTADO and EST_SOLICITUD are discarded as most values are CONTABILIZADO:

Question: Contabilizado?

In [175]:
df['ESTADO'].value_counts(dropna=False)

Contabilizado    41974
Rechazado           27
Aprobado             8
Name: ESTADO, dtype: int64

In [176]:
df['EST_SOLICITUD'].value_counts(dropna=False)

C    41974
R       27
A        8
Name: EST_SOLICITUD, dtype: int64

### TIPO_OBLIGA

TIPO_OBLIGA is useless as there is only one single category value:

In [177]:
df['TIPO_OBLIGA'].drop_duplicates()

0    10
Name: TIPO_OBLIGA, dtype: int64

## Unknown columns

Question: What is this? vendor user?

In [178]:
df['COD_USUARIO_APR'].drop_duplicates()

0             CHADOR
7             RAMCLA
8             MEDARI
15            INOHEN
22            VEGJUL
24            SANCAM
36            PEDMAR
38            AGUWIL
76            PATSAN
120           APPMOV
804              NaN
11473         PIRJEN
11484    SIN-USUARIO
12763         HERCLA
15170         VELEDI
15294         PARALC
15599         PATJOS
29441         SANJOH
32647         NORMAR
Name: COD_USUARIO_APR, dtype: object

## Discarding columns

In [179]:
df = (
    df
    .drop(columns=project_columns.COLOCACION_DISCARDED_COLUMNS)
    .drop(columns=project_columns.COLOCACION_LOCATION_COLUMNS)
)

## Schema of columns for clean data

In [180]:
df[df.columns.sort_values()].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42009 entries, 0 to 42008
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ANO_CONTABILIZA     41974 non-null  object        
 1   CODEUDOR            42009 non-null  object        
 2   COD_DESTINACION     42009 non-null  object        
 3   COD_LINEA           42009 non-null  object        
 4   COD_MODALIDAD       42009 non-null  object        
 5   CREDITOS_VIGENTES   20614 non-null  float64       
 6   DESTINACION         42009 non-null  object        
 7   DIAS_CICLO_CREDITO  41974 non-null  object        
 8   EDAD                42009 non-null  int32         
 9   ESTADO_CIVIL        41959 non-null  object        
 10  ESTRATO             41946 non-null  float64       
 11  FECHA_APROBA        42009 non-null  datetime64[ns]
 12  FECHA_DESEMBOLSO    41974 non-null  datetime64[ns]
 13  FECHA_NACIMIENTO    42009 non-null  datetime64

## Validating cleaning code

In [181]:
df2 = datasets.read_colocacion_xlsx(dir_path=RAW_DATA_PATH, clean=True)

In [182]:
sort_by = ['OBLIGACION']

assert cleaning.compare_dataframes(df, df2, sort_by=sort_by), "Dataframes are different"

Differences (if any):

In [183]:
cleaning.compare_dataframes_diff(df, df2, sort_by)

{}