# Exploratory Data Analysis

In [1]:
import dask.dataframe as dd
import numpy as np

In [2]:
# Explicit column dtypes for the CSV load: three numeric measures are read as
# float64, and the remaining code/label/date columns use the generic 'object' dtype.
dtypes = {
    **dict.fromkeys(('age', 'antiguedad', 'renta'), 'float64'),
    **dict.fromkeys(
        (
            'indrel_1mes',
            'conyuemp',
            'ult_fec_cli_1t',
            'tiprel_1mes',
            'canal_entrada',
            'segmento',
        ),
        'object',
    ),
}

# Open the cleaned CSV as a Dask DataFrame, applying the explicit per-column
# dtypes from `dtypes` instead of relying on type inference for those columns.
df = dd.read_csv(
    "Final_cleaned.csv",
    dtype=dtypes,
    low_memory=False,
)

In [6]:
# Maps the dataset's original Spanish column names to English names.
# Written in keyword form; every source column name is a valid identifier.
rename_dict = dict(
    # Identity and demographics
    ncodpers='customer_id',
    ind_empleado='employee_index',
    pais_residencia='country_of_residence',
    sexo='gender',
    age='age',  # already English; kept so the mapping lists every column
    # Relationship with the bank
    fecha_alta='registration_date',
    ind_nuevo='new_customer',
    antiguedad='customer_seniority',
    indrel='customer_type',
    ult_fec_cli_1t='last_primary_date',
    indrel_1mes='initial_customer_type',
    tiprel_1mes='initial_relation',
    # Residency, household and acquisition channel
    indresi='resident_index',
    indext='foreigner_index',
    conyuemp='spouse_index',
    canal_entrada='joining_channel',
    indfall='deceased',
    # Address and geography
    tipodom='address_type',
    cod_prov='province_code',
    nomprov='province_name',
    # Activity, income and segmentation
    ind_actividad_cliente='active_customer',
    renta='income',
    segmento='customer_segment',
)

# Apply the English column names from `rename_dict` for readability.
df = df.rename(columns=rename_dict)

# Decode the sex codes into readable labels.
# "V" = Varón (male); "H" = Hembra (female). The two labels must not be
# swapped, otherwise every gender-based breakdown downstream is inverted.
# NOTE(review): confirm the code meanings against the data provider's dictionary.
df['gender'] = df['gender'].replace({'H': 'Female', 'V': 'Male'})

# 1: Customer Profile Analysis

In [4]:
# Customer profile overview. Each entry pairs the heading to print with a lazy
# Dask result; nothing is computed until the loop below reaches that entry.
profile_reports = [
    # Gender distribution
    ("\n--- Gender Distribution ---", df['gender'].value_counts()),
    # Age distribution
    ("\n--- Age Summary ---", df['age'].describe()),
    # Customer segment distribution
    ("\n--- Customer Segment Distribution ---", df['customer_segment'].value_counts()),
    # Country of residence (ten most frequent)
    (
        "\n--- Top 10 Countries by Customer Count ---",
        df['country_of_residence'].value_counts().nlargest(10),
    ),
    # Customer seniority (in months)
    ("\n--- Customer Seniority Summary ---", df['customer_seniority'].describe()),
]

# Print each heading followed by its materialised summary, in listed order.
for heading, lazy_summary in profile_reports:
    print(heading)
    print(lazy_summary.compute())


--- Gender Distribution ---
gender
H    6195253
V    7424252
Name: count, dtype: int64[pyarrow]

--- Age Summary ---
count    1.364731e+07
mean     4.024009e+01
std      1.707727e+01
min      1.800000e+01
25%      2.900000e+01
50%      4.400000e+01
75%      6.100000e+01
max      1.000000e+02
Name: age, dtype: float64

--- Customer Segment Distribution ---
customer_segment
02 - PARTICULARES     8149588
03 - UNIVERSITARIO    4935579
01 - TOP               562142
Name: count, dtype: int64[pyarrow]

--- Top 10 Countries by Customer Count ---
country_of_residence
ES    13553710
FR        5161
AR        4835
DE        4625
GB        4605
US        3651
CO        3526
IT        2947
RO        2931
MX        2573
Name: count, dtype: int64[pyarrow]

--- Customer Seniority Summary ---
count    1.361958e+07
mean     7.659194e+01
std      1.671807e+03
min     -9.999990e+05
25%      3.500000e+01
50%      1.050000e+02
75%      1.910000e+02
max      2.560000e+02
Name: customer_seniority, dtype: float64

In [5]:
# Negative seniority values are invalid (the summary minimum is -999999),
# so replace them with NaN and keep every non-negative value unchanged.
seniority = df['customer_seniority']
is_negative = seniority < 0
df['customer_seniority'] = seniority.mask(is_negative, np.nan)

# Re-summarise to confirm the minimum is no longer negative.
print(df['customer_seniority'].describe().compute())

count    1.361954e+07
mean     7.938226e+01
std      6.637447e+01
min      0.000000e+00
25%      3.500000e+01
50%      1.050000e+02
75%      1.910000e+02
max      2.560000e+02
Name: customer_seniority, dtype: float64
