<a href="https://colab.research.google.com/github/juanpajaro/nube_analisis_palabras/blob/main/prueba_pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [5]:
def generate_cardiovascular_risk_dataset(num_patients=100):
    """
    Generates a synthetic dataset of possible patients with cardiovascular risk.

    Args:
        num_patients (int): The number of patients to generate.

    Returns:
        pd.DataFrame: A DataFrame containing patient data and cardiovascular risk.
    """
    np.random.seed(42)  # for reproducibility

    data = {
        'age': np.random.randint(30, 80, num_patients),
        'gender': np.random.choice(['Male', 'Female'], num_patients),
        'blood_pressure_systolic': np.random.randint(100, 180, num_patients),
        'blood_pressure_diastolic': np.random.randint(60, 120, num_patients),
        'cholesterol': np.random.randint(150, 300, num_patients),
        'bmi': np.random.uniform(18.0, 40.0, num_patients),
        'smoking': np.random.choice([0, 1], num_patients, p=[0.7, 0.3]), # 0: No, 1: Yes
        'diabetes': np.random.choice([0, 1], num_patients, p=[0.85, 0.15]), # 0: No, 1: Yes
        'family_history': np.random.choice([0, 1], num_patients, p=[0.6, 0.4]), # 0: No, 1: Yes
    }

    df = pd.DataFrame(data)

    # Simulate cardiovascular risk based on some factors (simplified)
    # Higher age, blood pressure, cholesterol, bmi, smoking, diabetes, family history increase risk
    risk_score = (df['age'] * 0.1 +
                  df['blood_pressure_systolic'] * 0.05 +
                  df['cholesterol'] * 0.02 +
                  df['bmi'] * 0.5 +
                  df['smoking'] * 10 +
                  df['diabetes'] * 15 +
                  df['family_history'] * 12)

    df['cardiovascular_risk'] = (risk_score > risk_score.median()).astype(int) # 1 for higher risk, 0 for lower risk

    return df

In [6]:
# Example usage:
df_patients = generate_cardiovascular_risk_dataset(num_patients=200)
display(df_patients.head())

Unnamed: 0,age,gender,blood_pressure_systolic,blood_pressure_diastolic,cholesterol,bmi,smoking,diabetes,family_history,cardiovascular_risk
0,68,Male,168,108,258,33.333152,1,0,1,1
1,58,Male,160,95,154,39.939622,1,0,0,1
2,44,Female,147,71,268,37.725426,1,0,0,1
3,72,Female,118,100,182,30.671965,0,0,0,0
4,37,Female,103,78,267,38.182703,0,1,0,1


In [9]:
df.columns

NameError: name 'df' is not defined