In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
dataset_path = "../data/student_depression.csv"
df = pd.read_csv(dataset_path)

# Convertir 'Sleep Duration' a número
def convert_sleep_duration(duration):
    """Convert a raw 'Sleep Duration' label into a number of hours.

    Quotes and the word "hours" are stripped first, then the label is
    interpreted as follows:

    * ``"5-6"``            -> midpoint of the two bounds (``5.5``)
    * contains "Less than" -> ``4`` (fixed value, the number in the label is ignored)
    * contains "More than" -> ``9`` (fixed value, the number in the label is ignored)
    * a bare number        -> that number as a ``float``

    Parameters
    ----------
    duration : object
        Raw cell value; anything ``pd.isnull`` treats as missing is accepted.

    Returns
    -------
    float, int or None
        The numeric duration, or ``None`` when the value is missing or
        cannot be interpreted (so one odd label never aborts a whole
        ``Series.apply``).
    """
    if pd.isnull(duration):
        return None

    text = str(duration).replace("'", "").replace("hours", "").strip()

    if '-' in text:
        bounds = text.split('-')
        # Only a well-formed "<low>-<high>" pair is treated as a range;
        # anything else falls through to the remaining rules instead of raising.
        if len(bounds) == 2:
            try:
                low, high = float(bounds[0]), float(bounds[1])
            except ValueError:
                pass
            else:
                return (low + high) / 2

    if "Less than" in text:
        return 4
    if "More than" in text:
        return 9

    try:
        return float(text)
    except ValueError:
        # Non-numeric label: report it as missing.
        return None

# Replace the textual sleep-duration labels with numeric values
sleep_hours = df['Sleep Duration'].apply(convert_sleep_duration)
df['Sleep Duration'] = sleep_hours

# Remove irrelevant columns; names absent from the frame are skipped
columns_to_drop = ['Job Satisfaction', 'id', 'City', 'Degree', 'Profession']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Remove rows with missing values BEFORE encoding. If the encoder saw a
# missing value it would (depending on the scikit-learn version) either fail
# or turn it into an ordinary integer code, and a later dropna() could no
# longer detect that row as incomplete. The explicit copy makes the column
# assignments below act on an independent frame.
df = df.dropna().copy()

# Encode categorical variables as integer codes.
# A separate encoder is fitted per column and kept in `label_encoders`, so
# every column's label -> code mapping stays available (e.g. for
# inverse_transform). `label_encoder` ends up bound to the encoder of the
# last column, as before.
categorical_columns = [
    'Gender',
    'Dietary Habits',
    'Have you ever had suicidal thoughts ?',
    'Financial Stress',
    'Family History of Mental Illness',
]
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# Target variable and predictor variables
y = df["Depression"]
X = df.drop(columns="Depression")

# Data split: 20% of the rows are held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model training (fit() returns the fitted estimator itself)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluation on the held-out test split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Acuracy real del modelo: {accuracy * 100:.2f}%")


Acuracy real del modelo: 84.11%


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load dataset
dataset_path = "../data/student_depression.csv"
df = pd.read_csv(dataset_path)

# Convertir 'Sleep Duration'
def convert_sleep_duration(duration):
    """Convert a raw 'Sleep Duration' label into a number of hours.

    Quotes and the word "hours" are stripped first, then the label is
    interpreted as follows:

    * ``"5-6"``            -> midpoint of the two bounds (``5.5``)
    * contains "Less than" -> ``4`` (fixed value, the number in the label is ignored)
    * contains "More than" -> ``9`` (fixed value, the number in the label is ignored)
    * a bare number        -> that number as a ``float``

    Parameters
    ----------
    duration : object
        Raw cell value; anything ``pd.isnull`` treats as missing is accepted.

    Returns
    -------
    float, int or None
        The numeric duration, or ``None`` when the value is missing or
        cannot be interpreted (so one odd label never aborts a whole
        ``Series.apply``).
    """
    if pd.isnull(duration):
        return None

    text = str(duration).replace("'", "").replace("hours", "").strip()

    if '-' in text:
        bounds = text.split('-')
        # Only a well-formed "<low>-<high>" pair is treated as a range;
        # anything else falls through to the remaining rules instead of raising.
        if len(bounds) == 2:
            try:
                low, high = float(bounds[0]), float(bounds[1])
            except ValueError:
                pass
            else:
                return (low + high) / 2

    if "Less than" in text:
        return 4
    if "More than" in text:
        return 9

    try:
        return float(text)
    except ValueError:
        # Non-numeric label: report it as missing.
        return None

# Replace the textual sleep-duration labels with numeric values
df['Sleep Duration'] = df['Sleep Duration'].apply(convert_sleep_duration)

# Clean: discard unused columns (missing names are ignored), then drop
# every row that still contains a missing value
unused_columns = ["Job Satisfaction", "id", "City", "Degree", "Profession"]
df = df.drop(columns=unused_columns, errors='ignore').dropna()

# Encode: turn each categorical column into integer codes
le = LabelEncoder()
categorical_columns = [
    'Gender',
    'Dietary Habits',
    'Have you ever had suicidal thoughts ?',
    'Financial Stress',
    'Family History of Mental Illness',
]
for col in categorical_columns:
    df[col] = le.fit_transform(df[col])

# Separate the target from the predictors, then split into train/test sets
y = df['Depression']
X = df.drop(columns='Depression')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Train the model (fit() returns the fitted estimator itself)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Baseline accuracy on the test split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy base: {accuracy:.4f}")


Accuracy base: 0.8405


In [15]:
# Change the model (fewer trees): a forest of 10 estimators fitted on the
# same training split
model2 = RandomForestClassifier(n_estimators=10, random_state=42).fit(X_train, y_train)

# Compute the new accuracy on the same test split
y_pred2 = model2.predict(X_test)
accuracy2 = accuracy_score(y_true=y_test, y_pred=y_pred2)
print(f"Accuracy con menos árboles: {accuracy2:.4f}")


Accuracy con menos árboles: 0.8261


In [16]:
# Only a portion of the data: the first 1000 rows (by position) of the
# training split
X_train_small = X_train.iloc[:1000]
y_train_small = y_train.iloc[:1000]

model3 = RandomForestClassifier(random_state=42).fit(X_train_small, y_train_small)

# Score against the full, unchanged test split
y_pred3 = model3.predict(X_test)
accuracy3 = accuracy_score(y_true=y_test, y_pred=y_pred3)
print(f"Accuracy con menos datos de entrenamiento: {accuracy3:.4f}")


Accuracy con menos datos de entrenamiento: 0.8299
