In [84]:
import os
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.optim as optim
from matplotlib import pyplot
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline

!ls
%mkdir data
!ls

data  drive  sample_data
mkdir: cannot create directory ‘data’: File exists
data  drive  sample_data


In [85]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [86]:
import shutil
shutil.copy("/content/drive/MyDrive/sis_420/lab_repaso/02_regersion_logistica/exams.csv","/content/data/data.csv")

'/content/data/data.csv'

In [87]:

data = pd.read_csv( os.path.join('data', 'data.csv') )
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [88]:
# Codificar variables categóricas
columnas_categoricas = data.select_dtypes(include=['object']).columns
label_encoders = {}
for columna in columnas_categoricas:
    le = LabelEncoder()
    data[columna] = le.fit_transform(data[columna])
    label_encoders[columna] = le

In [89]:
#Reorganizar las columnas para mantener la consistencia con el modelo
column_order = ["gender", "race/ethnicity", "parental level of education", "lunch", "math score", "reading score", "writing score", "test preparation course"]
data = data[column_order]

In [90]:
X = data.iloc[:, :7].values #Las primeras siete columnas, que son nuestras X
y = data.iloc[:, 7].values.reshape(-1, 1) #La ultima columna, Que es nuestra y

In [91]:
#creamos instacias de la clase para normalizar las x, y
scaler_x = StandardScaler()
scaler_y = StandardScaler()

# Normalizar las características (X). fit_transform ajusta el escalador y transforma los datos.
X = scaler_x.fit_transform(X)
y = scaler_y.fit_transform(y.reshape(-1, 1))

In [92]:
# Separar datos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor  = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor  = torch.tensor(y_test, dtype=torch.float32)


In [93]:

# Definir el modelo de regresión logística
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_size):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.linear(x)
        out = self.sigmoid(out)
        return out


In [94]:

# Configurar el modelo
input_size = X_train.shape[1]  # Número de características de entrada
model = LogisticRegressionModel(input_size)
criterion = nn.BCELoss()  # Pérdida para un problema de clasificación binaria
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Entrenamiento del modelo
num_epochs = 1000
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train)

    # Calcular la pérdida
    loss = criterion(outputs, y_train)

    # Backward pass y optimización
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 100 == 0:
    # Imprimir la pérdida en cada epoch
      print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evalúa el modelo en el conjunto de prueba
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    test_loss = criterion(y_pred, y_test_tensor)
    print(f'Pérdida en el conjunto de prueba: {test_loss.item():.2f}')

Epoch [100/1000], Loss: 0.6194
Epoch [200/1000], Loss: 0.5188
Epoch [300/1000], Loss: 0.4386
Epoch [400/1000], Loss: 0.3707
Epoch [500/1000], Loss: 0.3104
Epoch [600/1000], Loss: 0.2554
Epoch [700/1000], Loss: 0.2041
Epoch [800/1000], Loss: 0.1557
Epoch [900/1000], Loss: 0.1095
Epoch [1000/1000], Loss: 0.0652
Pérdida en el conjunto de prueba: 0.12


In [95]:
# Obtener 10 filas aleatorias de los datos de prueba
random_indices = random.sample(range(len(X_test)), 10)
threshold = .51 #umbral

print('Nº Test | Real | Predecido')
for i in random_indices:
    # Obtener la entrada y la etiqueta real de una fila
    X_sample = X_test_tensor[i]
    y_sample = y_test_tensor[i]

    # Hacer predicciones en la fila de muestra
    with torch.no_grad():
        prediction = model(X_sample)

    # Convertir la probabilidad en clase binaria usando un umbral
    prediction_final = 1 if prediction > threshold else 0
    real = 1 if y_sample.item() > threshold else 0
    print(f'{i + 1 }     |  {real}  | {prediction_final}')



Nº Test | Real | Predecido
92     |  1  | 0
151     |  1  | 1
130     |  1  | 1
198     |  1  | 1
45     |  0  | 0
64     |  1  | 0
35     |  1  | 1
155     |  1  | 0
178     |  1  | 1
16     |  0  | 1
