In [36]:
import numpy as np
import pandas as pd 
import matplotlib as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import torch 
import torch.optim
import torch.nn as nn
from  torch.utils.data import DataLoader,Dataset

import time

In [7]:
# Load the wine data from the local CSV file and preview the first rows.
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [None]:
# Report the DataFrame dimensions and the number of missing values per column.
print(df.shape)
print("Número de valores nulos:\n", df.isnull().sum())


(4898, 12)
Número de valores nulos:
 fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [15]:
# Check that every column has a numeric dtype.
print(df.dtypes)

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object


In [18]:
# Split the frame into the feature columns and the 'quality' target column.
x = df.drop(columns='quality')
y = df['quality'].copy()

# Show both shapes to confirm the split (labels first, then features).
print(y.shape, x.shape, sep='\n')

(4898,)
(4898, 11)


In [22]:
# Hold out 20% of the rows as the test set. A fixed random_state makes the
# shuffled split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=42
)

In [23]:
# Standardize the features: fit the scaler on the training rows only and reuse
# the same fitted statistics to transform the test rows.
# NOTE(review): the validation subset is carved out of x_train in a later cell,
# after this fit, so the scaler statistics also include the validation rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [24]:
# Carve a validation set (20% of the current training rows) out of the
# training data; a fixed random_state keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, shuffle=True, random_state=42
)

In [25]:
# Dataset wrapper that a DataLoader can index into.
class WineDataset(Dataset):
    """Pairs a feature matrix with its targets, both stored as float32 tensors."""

    def __init__(self, x, y):
        """Convert the features `x` and the targets `y` to float32 tensors.

        `y` is first turned into a NumPy float32 array, so array-like inputs
        that NumPy can convert are accepted for the targets.
        """
        self.x = torch.tensor(x, dtype=torch.float32)
        targets = np.array(y).astype(np.float32)
        self.y = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        """Number of samples, i.e. the size of the first feature dimension."""
        return self.x.size(0)

    def __getitem__(self, idx):
        """Return the `(features, target)` pair stored at position `idx`."""
        features, target = self.x[idx], self.y[idx]
        return features, target

In [26]:
# Wrap each data split in a WineDataset.
train_dataset = WineDataset(x_train, y_train)
val_dataset = WineDataset(x_val,y_val)
test_dataset = WineDataset(x_test,y_test) # Dataset for the test split

In [29]:
class WineQualityModel(nn.Module):
    """Fully connected regressor: input -> 16 -> 32 -> 1, with ReLU between layers."""

    def __init__(self, input_shape):
        """Build the three linear layers.

        Args:
            input_shape: number of input features used for the prediction.
        """
        super().__init__()

        self.fc1 = nn.Linear(input_shape, 16)
        self.fc2 = nn.Linear(16, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        """Map a batch of feature rows to one predicted value per row.

        Args:
            x: float tensor whose last dimension equals `input_shape`.

        Returns:
            Tensor with a last dimension of size 1 (no activation on the output).
        """
        # torch.relu applies the activation to the tensor. The previous
        # nn.ReLU(tensor) only *constructed* a module (the tensor was taken as
        # the `inplace` argument), so the next linear layer received a module.
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [32]:
# One input unit per feature column of x (the 'quality' target was dropped from x).
model = WineQualityModel(input_shape=x.shape[1])

In [None]:
# Training hyperparameters, optimizer and loss used by the training loop.
learning_rate = 0.01
epochs = 400 # Number of training epochs

optimizer = torch.optim.SGD(model.parameters(),lr=learning_rate ) # SGD over the model parameters with the chosen learning rate
loss_fn = nn.MSELoss() # Cost function: mean squared error

In [None]:
def train(model, train_dataloader, optimizer, loss_fn):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module to optimize; it is switched to training mode.
        train_dataloader: iterable yielding `(features, targets)` batches.
        optimizer: optimizer that updates the parameters of `model`.
        loss_fn: callable `loss_fn(predictions, targets)` returning a scalar tensor.

    Returns:
        Sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: if `train_dataloader` yields no batches.
    """
    model.train()
    epoch_loss = 0.0
    n_batches = 0
    for x_batch, y_batch in train_dataloader:
        optimizer.zero_grad()

        outputs = model(x_batch)

        # Targets are reshaped into a column so they line up with the model
        # output (assumes the model emits one value per sample).
        batch_loss = loss_fn(outputs, y_batch.reshape(-1, 1))

        batch_loss.backward()
        optimizer.step()

        epoch_loss += batch_loss.item()
        n_batches += 1

    if n_batches == 0:
        raise ValueError("train_dataloader yielded no batches")

    # Divide by the batch count, not by the last batch index (which is one
    # less and is zero when there is a single batch).
    return epoch_loss / n_batches

In [35]:
def evaluation(model, val_dataloader, loss_fn):
    """Compute the mean per-batch loss on a dataloader without updating the model.

    Args:
        model: module to evaluate; it is switched to evaluation mode.
        val_dataloader: iterable yielding `(features, targets)` batches.
        loss_fn: callable `loss_fn(predictions, targets)` returning a scalar tensor.

    Returns:
        Sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: if `val_dataloader` yields no batches.
    """
    model.eval()
    epoch_loss = 0.0
    n_batches = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for x_batch, y_batch in val_dataloader:
            outputs = model(x_batch)

            # Targets are reshaped into a column so they line up with the
            # model output (assumes the model emits one value per sample).
            batch_loss = loss_fn(outputs, y_batch.reshape(-1, 1))

            epoch_loss += batch_loss.item()
            n_batches += 1

    if n_batches == 0:
        raise ValueError("val_dataloader yielded no batches")

    # Divide by the batch count, not by the last batch index (which is one
    # less and is zero when there is a single batch).
    return epoch_loss / n_batches


In [37]:
def training_evaluation_loop(epochs, model, train_dataloader, val_dataloader, optimizer, loss_fn, log_every=10):
    """Train for `epochs` epochs, evaluating on the validation data after each one.

    Args:
        epochs: number of epochs to run.
        model: module passed to `train` and `evaluation`.
        train_dataloader: batches used for training.
        val_dataloader: batches used for evaluation.
        optimizer: optimizer passed to `train`.
        loss_fn: loss callable passed to `train` and `evaluation`.
        log_every: print both losses every this many epochs; a falsy value
            (0 or None) disables the periodic printout.

    Returns:
        Tuple `(loss_values_train, loss_values_val)`, each a list with one
        loss value per epoch.
    """
    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments.
    start = time.perf_counter()

    loss_values_train = []
    loss_values_val = []

    for epoch in range(epochs):
        loss_train = train(model, train_dataloader, optimizer, loss_fn)
        loss_values_train.append(loss_train)

        loss_val = evaluation(model, val_dataloader, loss_fn)
        loss_values_val.append(loss_val)

        # Periodic progress report (epochs are shown 1-based).
        if log_every and (epoch + 1) % log_every == 0:
            print(f'Epoch {epoch + 1}/{epochs} - loss_train: {loss_train:.4f} - loss_val: {loss_val:.4f}')

    total_time = time.perf_counter() - start

    print(f'Total training time: {total_time}')

    return loss_values_train, loss_values_val

In [38]:
def predictions(model, test_dataloader):
    """Run the model over a dataloader and collect predictions and true values.

    Args:
        model: module to evaluate; it is switched to evaluation mode.
        test_dataloader: iterable yielding `(features, targets)` batches.

    Returns:
        Tuple `(predictions, real_values)` of NumPy arrays: the model outputs
        stacked row-wise over all batches, and the targets concatenated into
        one flat array.

    Raises:
        ValueError: if `test_dataloader` yields no batches.
    """
    predicted_batches = []
    real_batches = []

    model.eval()
    with torch.no_grad():
        for x_batch, y_batch in test_dataloader:
            outputs = model(x_batch)
            # Tensor -> NumPy conversion is `.numpy()`; tensors have no `.np()`.
            predicted_batches.append(outputs.detach().cpu().numpy())
            real_batches.append(y_batch.detach().cpu().numpy())

    if not predicted_batches:
        raise ValueError("test_dataloader yielded no batches")

    predicted = np.vstack(predicted_batches)
    real_values = np.hstack(real_batches)

    # Regression metric: mean squared error between predicted and real values.
    # Both arrays are flattened first so a column of predictions is compared
    # element-wise with the flat targets instead of being broadcast. Skipped
    # when the element counts differ (e.g. a model with several outputs).
    if predicted.size == real_values.size:
        mse = float(np.mean((predicted.reshape(-1) - real_values.reshape(-1)) ** 2))
        print(f'Test MSE: {mse:.4f}')

    return predicted, real_values