In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb
import optuna

* Carga de datos

In [11]:
df = pd.read_csv('student_sleep_patterns.csv')
df.head()

Unnamed: 0,Student_ID,Age,Gender,University_Year,Sleep_Duration,Study_Hours,Screen_Time,Caffeine_Intake,Physical_Activity,Sleep_Quality,Weekday_Sleep_Start,Weekend_Sleep_Start,Weekday_Sleep_End,Weekend_Sleep_End
0,1,24,Other,2nd Year,7.7,7.9,3.4,2,37,10,14.16,4.05,7.41,7.06
1,2,21,Male,1st Year,6.3,6.0,1.9,5,74,2,8.73,7.1,8.21,10.21
2,3,22,Male,4th Year,5.1,6.7,3.9,5,53,5,20.0,20.47,6.88,10.92
3,4,24,Other,4th Year,6.3,8.6,2.8,4,55,9,19.82,4.08,6.69,9.42
4,5,20,Male,4th Year,4.7,2.7,2.7,0,85,3,20.98,6.12,8.98,9.01


* Tratamiento de valores nulos

In [12]:
df.isnull().sum()

Student_ID             0
Age                    0
Gender                 0
University_Year        0
Sleep_Duration         0
Study_Hours            0
Screen_Time            0
Caffeine_Intake        0
Physical_Activity      0
Sleep_Quality          0
Weekday_Sleep_Start    0
Weekend_Sleep_Start    0
Weekday_Sleep_End      0
Weekend_Sleep_End      0
dtype: int64

No se encontraron variables nulas en el dataset

* Deteccion de outliers

In [13]:
# Identificar columnas
categorical_columns = ['Gender', 'University_Year']
numerical_columns = ['Age', 'Study_Hours', 'Screen_Time', 'Caffeine_Intake', 
                        'Physical_Activity', 'Weekday_Sleep_Start', 'Weekend_Sleep_Start', 
                        'Weekday_Sleep_End', 'Weekend_Sleep_End']
    
def detect_outliers(df, columns):
    outliers = {}
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers[col] = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
    return outliers

outliers = detect_outliers(df, numerical_columns)
print("Outliers detectados:", outliers)

Outliers detectados: {'Age': 0, 'Study_Hours': 0, 'Screen_Time': 0, 'Caffeine_Intake': 0, 'Physical_Activity': 0, 'Weekday_Sleep_Start': 0, 'Weekend_Sleep_Start': 0, 'Weekday_Sleep_End': 0, 'Weekend_Sleep_End': 0}


In [14]:
# Convertir columna de tiempo a minutos

def convert_time_to_minutes(time):
    hours = int(time)
    minutes = (time - hours) * 60
    return hours * 60 + minutes

df['Weekday_Sleep_Start'] = df['Weekday_Sleep_Start'].apply(convert_time_to_minutes)
df['Weekend_Sleep_Start'] = df['Weekend_Sleep_Start'].apply(convert_time_to_minutes)
df['Weekday_Sleep_End'] = df['Weekday_Sleep_End'].apply(convert_time_to_minutes)
df['Weekend_Sleep_End'] = df['Weekend_Sleep_End'].apply(convert_time_to_minutes)


* Definicion de variable objetivo

In [15]:
X = df.drop(['Student_ID', 'Sleep_Duration'], axis=1)
y = df['Sleep_Duration']

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear preprocessor con ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_columns),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_columns)
    ])

* Definicion de modelos de prediccion a ser usados

In [16]:
# 2. Definir modelos para comparación
models = {
    'Linear_Regression': LinearRegression(),
    'KNN': KNeighborsRegressor(),
    'Decision_Tree': DecisionTreeRegressor(random_state=42),
    'Random_Forest': RandomForestRegressor(random_state=42),
    'SVR': SVR(),
    'XGBoost': xgb.XGBRegressor(random_state=42),
    'LightGBM': lgb.LGBMRegressor(random_state=42)
}

def evaluate_models(X_train, X_test, y_train, y_test):
    results = {}
    for name, model in models.items():
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])
        
        # Validación cruzada
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        cv_rmse_scores = np.sqrt(-cv_scores)
        
        # Entrenar y predecir
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        
        results[name] = {
            'MSE': mean_squared_error(y_test, y_pred),
            'MAE': mean_absolute_error(y_test, y_pred),
            'R2': r2_score(y_test, y_pred),
            'CV_RMSE_Mean': cv_rmse_scores.mean(),
            'CV_RMSE_Std': cv_rmse_scores.std()
        }
    
    return results

# Ejecutar evaluación de modelos
model_results = evaluate_models(X_train, X_test, y_train, y_test)

# Imprimir resultados
for name, metrics in model_results.items():
    print(f"\nModelo: {name}")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 617
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 16
[LightGBM] [Info] Start training from score 6.542187
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 622
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 16
[LightGBM] [Info] Start training from score 6.512500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 626
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 16
[LightGBM] [Info] Start training fro

* Optimizacion de hiperparametros con optuna

In [17]:
def objective(trial):
    # Definir hiperparámetros para Random Forest
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5)
    }
    
    model = RandomForestRegressor(**params, random_state=42)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # Validación cruzada
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    return np.sqrt(-scores).mean()

# Crear estudio de Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Imprimir mejores hiperparámetros
print("\nMejores Hiperparámetros (Optuna):")
print(study.best_params)

# Entrenar modelo final con mejores hiperparámetros
best_rf = RandomForestRegressor(**study.best_params, random_state=42)
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', best_rf)
])

final_pipeline.fit(X_train, y_train)
final_predictions = final_pipeline.predict(X_test)

[I 2024-12-01 13:40:53,012] A new study created in memory with name: no-name-e4acf4f1-5448-45b2-901d-71a4b11769fa
[I 2024-12-01 13:40:54,848] Trial 0 finished with value: 1.5288704153191426 and parameters: {'n_estimators': 120, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 0 with value: 1.5288704153191426.
[I 2024-12-01 13:40:56,572] Trial 1 finished with value: 1.5016145286820375 and parameters: {'n_estimators': 165, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 1 with value: 1.5016145286820375.
[I 2024-12-01 13:40:58,275] Trial 2 finished with value: 1.4779694608222156 and parameters: {'n_estimators': 240, 'max_depth': 2, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 2 with value: 1.4779694608222156.
[I 2024-12-01 13:40:59,590] Trial 3 finished with value: 1.495813895086528 and parameters: {'n_estimators': 159, 'max_depth': 4, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 2 with value: 1.


Mejores Hiperparámetros (Optuna):
{'n_estimators': 294, 'max_depth': 2, 'min_samples_split': 6, 'min_samples_leaf': 4}


* Entrenamiento con el modelo optimizado

In [18]:
best_rf = RandomForestRegressor(**study.best_params, random_state=42)
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', best_rf)
])

final_pipeline.fit(X_train, y_train)
final_predictions = final_pipeline.predict(X_test)

# Métricas finales
print("\nMétricas del Modelo Optimizado:")
print(f"MSE: {mean_squared_error(y_test, final_predictions)}")
print(f"MAE: {mean_absolute_error(y_test, final_predictions)}")
print(f"R2: {r2_score(y_test, final_predictions)}")


Métricas del Modelo Optimizado:
MSE: 2.444833987098717
MAE: 1.401313899396839
R2: -0.015168345482429624
