In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Cargar los datos
data = pd.read_csv('./Sleep_health_and_lifestyle_dataset.csv')

# Dividir la columna 'Blood Pressure' en dos columnas: presión sistólica y diastólica
data[['Systolic Pressure', 'Diastolic Pressure']] = data['Blood Pressure'].str.split('/', expand=True)

# Convertir los valores a números enteros
data['Systolic Pressure'] = data['Systolic Pressure'].astype(int)
data['Diastolic Pressure'] = data['Diastolic Pressure'].astype(int)

# Eliminar la columna original 'Blood Pressure'
data.drop(columns=['Blood Pressure'], inplace=True)

# Dividir los datos en características (X) y el objetivo (y)
X = data.drop(columns=['Person ID', 'Stress Level'])
y = data['Stress Level']

# Definir qué columnas son categóricas
categorical_columns = ['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder']

# Crear un objeto ColumnTransformer para aplicar one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ],
    remainder='passthrough'
)

# Aplicar la transformación a las características
X_encoded = preprocessor.fit_transform(X)

# Inicializar el modelo de bosque aleatorio
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Entrenar el modelo
rf.fit(X_encoded, y)

# Obtener la importancia de las características
feature_importances = rf.feature_importances_

# Obtener el nombre de las características después de la codificación one-hot
encoded_feature_names = preprocessor.get_feature_names_out(input_features=X.columns)

# Crear un DataFrame para mostrar la importancia de las características
feature_importance_df = pd.DataFrame({
    'Feature': encoded_feature_names,
    'Importance': feature_importances
})

# Ordenar las características por su importancia
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Mostrar las características más importantes
print(feature_importance_df)

# Crear una columna para los niveles de estrés basada en la descripción dada
data['Stress Level Range'] = pd.cut(data['Stress Level'], bins=[0, 3, 6, 8], labels=['No estresado', 'Estresado', 'Extremadamente estresado'])

# Calcular la importancia promedio de las características para cada nivel de estrés
importance_by_stress_level = {}
for stress_level in data['Stress Level Range'].unique():
    importance_by_stress_level[stress_level] = feature_importance_df[data['Stress Level Range'] == stress_level]['Importance'].mean()

# Mostrar la importancia promedio de las características para cada nivel de estrés
for stress_level, importance in importance_by_stress_level.items():
    print(f'Importancia promedio de las características para {stress_level}: {importance}')



                                 Feature  Importance
22           remainder__Quality of Sleep    0.761536
21             remainder__Sleep Duration    0.100287
24                 remainder__Heart Rate    0.086146
23    remainder__Physical Activity Level    0.011024
25                remainder__Daily Steps    0.010225
20                        remainder__Age    0.007745
5                 cat__Occupation_Lawyer    0.007238
27         remainder__Diastolic Pressure    0.002045
3                 cat__Occupation_Doctor    0.002029
26          remainder__Systolic Pressure    0.001696
10             cat__Occupation_Scientist    0.001553
1                       cat__Gender_Male    0.001236
0                     cat__Gender_Female    0.001052
2             cat__Occupation_Accountant    0.000938
4               cat__Occupation_Engineer    0.000859
17          cat__Sleep Disorder_Insomnia    0.000792
7                  cat__Occupation_Nurse    0.000614
14       cat__BMI Category_Normal Weight    0.

  importance_by_stress_level[stress_level] = feature_importance_df[data['Stress Level Range'] == stress_level]['Importance'].mean()
  importance_by_stress_level[stress_level] = feature_importance_df[data['Stress Level Range'] == stress_level]['Importance'].mean()
  importance_by_stress_level[stress_level] = feature_importance_df[data['Stress Level Range'] == stress_level]['Importance'].mean()
