## Librerías

In [1]:
import pandas as pd

## Datos

In [2]:
# Load the raw cardio dataset, forcing an explicit dtype for every column.
df = pd.read_csv(
    filepath_or_buffer='../data/raw/cardio_train.csv',
    sep=',',
    encoding='utf-8',
    dtype={
        'id': int,
        # Gender is read as text: "1" = woman, "2" = man
        # (per the usual encoding of this dataset).
        'gender': str,
        # Target: 0 = no disease, 1 = disease.
        'cardio': int,
        # Every remaining column is parsed as float:
        #   age          age in days
        #   height       height
        #   weight       weight
        #   ap_hi        systolic blood pressure
        #   ap_lo        diastolic blood pressure
        #   cholesterol  1 = normal, 2 = high, 3 = very high
        #   gluc         glucose: 1 = normal, 2 = high, 3 = very high
        #   smoke        0 = non-smoker, 1 = smoker
        #   alco         0 = does not drink, 1 = drinks
        #   active       0 = inactive, 1 = active
        **dict.fromkeys(
            (
                'age', 'height', 'weight', 'ap_hi', 'ap_lo',
                'cholesterol', 'gluc', 'smoke', 'alco', 'active',
            ),
            float,
        ),
    },
)

In [3]:
# Preview the first rows to confirm the file parsed with the expected columns.
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393.0,2,168.0,62.0,110.0,80.0,1.0,1.0,0.0,0.0,1.0,0
1,1,20228.0,1,156.0,85.0,140.0,90.0,3.0,1.0,0.0,0.0,1.0,1
2,2,18857.0,1,165.0,64.0,130.0,70.0,3.0,1.0,0.0,0.0,0.0,1
3,3,17623.0,2,169.0,82.0,150.0,100.0,1.0,1.0,0.0,0.0,1.0,1
4,4,17474.0,1,156.0,56.0,100.0,60.0,1.0,1.0,0.0,0.0,0.0,0


In [4]:
# Turn the text-coded gender ("2" -> hombre, "1" -> mujer) into two float
# indicator columns, then discard the row identifier and the raw gender column.
# NOTE: `df` is rebound here, so this cell is not idempotent: re-running it
# without reloading the data raises KeyError because 'gender' is already gone.
df = df.assign(
    hombre=lambda frame: frame['gender'].eq("2").astype(float),
    mujer=lambda frame: frame['gender'].eq("1").astype(float),
).drop(columns=['id', 'gender'])

df.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,hombre,mujer
0,18393.0,168.0,62.0,110.0,80.0,1.0,1.0,0.0,0.0,1.0,0,1.0,0.0
1,20228.0,156.0,85.0,140.0,90.0,3.0,1.0,0.0,0.0,1.0,1,0.0,1.0
2,18857.0,165.0,64.0,130.0,70.0,3.0,1.0,0.0,0.0,0.0,1,0.0,1.0
3,17623.0,169.0,82.0,150.0,100.0,1.0,1.0,0.0,0.0,1.0,1,1.0,0.0
4,17474.0,156.0,56.0,100.0,60.0,1.0,1.0,0.0,0.0,0.0,0,0.0,1.0


In [5]:
# Summarize column dtypes and non-null counts after the gender encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  float64
 1   height       70000 non-null  float64
 2   weight       70000 non-null  float64
 3   ap_hi        70000 non-null  float64
 4   ap_lo        70000 non-null  float64
 5   cholesterol  70000 non-null  float64
 6   gluc         70000 non-null  float64
 7   smoke        70000 non-null  float64
 8   alco         70000 non-null  float64
 9   active       70000 non-null  float64
 10  cardio       70000 non-null  int64  
 11  hombre       70000 non-null  float64
 12  mujer        70000 non-null  float64
dtypes: float64(12), int64(1)
memory usage: 6.9 MB


## Preparación de datos

In [6]:
# NOTE(review): same df.info() call as the preceding cell with no change to
# `df` in between, so the output is identical.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  float64
 1   height       70000 non-null  float64
 2   weight       70000 non-null  float64
 3   ap_hi        70000 non-null  float64
 4   ap_lo        70000 non-null  float64
 5   cholesterol  70000 non-null  float64
 6   gluc         70000 non-null  float64
 7   smoke        70000 non-null  float64
 8   alco         70000 non-null  float64
 9   active       70000 non-null  float64
 10  cardio       70000 non-null  int64  
 11  hombre       70000 non-null  float64
 12  mujer        70000 non-null  float64
dtypes: float64(12), int64(1)
memory usage: 6.9 MB


In [7]:
# Count rows per target class to see how balanced 'cardio' is.
df['cardio'].value_counts()

cardio
0    35021
1    34979
Name: count, dtype: int64

## Balancear el DataSet

In [8]:
from imblearn.over_sampling import RandomOverSampler

# Split the frame into predictor columns and the binary target.
X = df.drop('cardio', axis=1)
y = df['cardio']

# Randomly duplicate rows of the smaller class until both classes have the
# same count. The fixed random_state makes the selection of duplicated rows,
# and therefore the dataset saved later, identical on every Restart & Run All.
# NOTE(review): if a train/test split happens after this step, duplicated rows
# can land on both sides of the split; confirm against the downstream notebook.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Reassemble predictors and target into a single DataFrame.
df_balanceados = pd.concat([X_resampled, y_resampled], axis=1)

In [9]:
# Check the per-class row counts of the resampled frame.
df_balanceados['cardio'].value_counts()

cardio
0    35021
1    35021
Name: count, dtype: int64

In [10]:
# Persist the balanced dataset in Parquet format.
df_balanceados.to_parquet(
    path='../data/processed/dataset_cardio_balan.parquet',
)