In [1]:
import dask
import dask.dataframe as dd
import numpy as np

In [2]:
# Column dtypes passed explicitly to the CSV reader: three columns are read as
# float64 and six as generic object columns.
_float_columns = ('age', 'antiguedad', 'renta')
_object_columns = (
    'indrel_1mes',
    'conyuemp',
    'ult_fec_cli_1t',
    'tiprel_1mes',
    'canal_entrada',
    'segmento',
)
dtypes = {
    **dict.fromkeys(_float_columns, 'float64'),
    **dict.fromkeys(_object_columns, 'object'),
}

# Build the dask DataFrame over the CSV using the dtype overrides above.
df = dd.read_csv(
    "Final_cleaned.csv",
    dtype=dtypes,
    low_memory=False,
)

In [3]:
# 1. Missing values
# Per-column null counts, materialised once; only columns with at least one
# null are printed.
missing = df.isnull().sum().compute()
has_missing = missing > 0
print("Missing values:\n", missing[has_missing])

Missing values:
 ind_empleado             27734
pais_residencia          27734
sexo                     27804
fecha_alta               27734
ind_nuevo                27734
antiguedad               27734
indrel                   27734
indresi                  27734
indext                   27734
indfall                  27734
tipodom                  27735
cod_prov                 93591
nomprov                  93591
ind_actividad_cliente    27734
ind_nomina_ult1          16063
ind_nom_pens_ult1        16063
dtype: int64


In [4]:
# 2. Dtypes
# Report the dtype of every column in the loaded frame.
column_dtypes = df.dtypes
print("\nDtypes:\n", column_dtypes)


Dtypes:
 fecha_dato               string[pyarrow]
ncodpers                         float64
ind_empleado             string[pyarrow]
pais_residencia          string[pyarrow]
sexo                     string[pyarrow]
age                              float64
fecha_alta               string[pyarrow]
ind_nuevo                        float64
antiguedad                       float64
indrel                           float64
ult_fec_cli_1t           string[pyarrow]
indrel_1mes              string[pyarrow]
tiprel_1mes              string[pyarrow]
indresi                  string[pyarrow]
indext                   string[pyarrow]
conyuemp                 string[pyarrow]
canal_entrada            string[pyarrow]
indfall                  string[pyarrow]
tipodom                          float64
cod_prov                         float64
nomprov                  string[pyarrow]
ind_actividad_cliente            float64
renta                            float64
segmento                 string[pyarrow]
ind_ah

In [7]:
# 3. Duplicates
key_cols = ['ncodpers', 'fecha_dato']

# One row per (ncodpers, fecha_dato) pair, with the group size in a 'count'
# column (reset_index() names the size column 0, hence the rename).
dupe_counts = (
    df.groupby(key_cols)
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
)

# Number of key pairs that appear more than once.
is_repeated = dupe_counts['count'] > 1
duplicates = dupe_counts[is_repeated]['count'].count().compute()

print(f"\nDuplicate customer-date combinations: {duplicates}")


Duplicate customer-date combinations: 0


In [8]:
# 4. Outliers (renta, age)
# Summary statistics for the numeric columns of interest.
# NOTE(review): the recorded output for this cell shows an antiguedad minimum
# of -999999, which looks like a sentinel value rather than a real measurement
# — confirm and handle it before using this column.
numeric_cols = ['renta', 'age', 'antiguedad']
stats = df[numeric_cols].describe().compute()
print("\nSummary stats:\n", stats)


Summary stats:
              renta           age    antiguedad
count  1.364731e+07  1.364731e+07  1.361958e+07
mean   1.200119e+05  4.024009e+01  7.659194e+01
std    6.040065e+04  1.707727e+01  1.671807e+03
min    1.202730e+03  1.800000e+01 -9.999990e+05
25%    1.012124e+05  2.900000e+01  3.500000e+01
50%    1.192053e+05  4.400000e+01  1.050000e+02
75%    1.702766e+05  6.100000e+01  1.910000e+02
max    2.736639e+05  1.000000e+02  2.560000e+02


In [9]:
# 5. Segment balance
# Row count per distinct value of 'segmento'.
segment_counts = df['segmento'].value_counts().compute()
print("\nSegment counts:\n", segment_counts)


Segment counts:
 segmento
02 - PARTICULARES     8149588
03 - UNIVERSITARIO    4935579
01 - TOP               562142
Name: count, dtype: int64[pyarrow]


In [10]:
# 6. Fecha_dato validation
# Evaluate min and max in a single dask.compute call so the CSV-backed graph
# is executed once instead of once per statistic.
# NOTE(review): fecha_dato is a string column in the dtype listing, so min/max
# compare text; that equals chronological order only for ISO-style YYYY-MM-DD
# values (which the recorded output shows) — confirm for all rows.
fecha_dato = df['fecha_dato']
date_min, date_max = dask.compute(fecha_dato.min(), fecha_dato.max())
print("\nDate range:\n", date_min, " to ", date_max)


Date range:
 2015-01-28  to  2016-05-28


In [11]:
# 7. Product column validation
# Product indicator columns are the ones named ind_*_ult1.
product_cols = [col for col in df.columns if col.startswith('ind_') and col.endswith('_ult1')]

# Build the unique-value graph for every product column first, then evaluate
# them all in one dask.compute call. Calling .compute() inside the loop would
# re-execute the CSV-backed graph once per column.
unique_per_col = dask.compute(*(df[col].dropna().unique() for col in product_cols))

# Each product column is expected to hold only 0/1 flags (nulls were dropped
# above); print a warning for any column containing something else.
for col, vals in zip(product_cols, unique_per_col):
    if not set(vals).issubset({0, 1}):
        print(f"Warning: Unexpected values in {col}: {vals}")