# KNN

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer
import numpy as np

# Load the Excel workbook into a DataFrame.
df = pd.read_excel('DATA2.xlsx')

# Optional (disabled): drop specific rows by ID and dump the whole frame.
# ids_to_drop = [36, 44, 53]
# df = df[~df['ID'].isin(ids_to_drop)]  # assumes the ID column is named 'ID'
# with pd.option_context('display.max_rows', None,
#                        'display.max_columns', None,
#                        'display.precision', 3):
#     print(df)

# Show the first rows to verify the load.
print("DataFrame original:")
print(df.head())

# Remove the text columns before imputing.
# NOTE(review): 'Variedad' is dropped here, but the next cell reads knn.xlsx
# and indexes df['Variedad'] -- confirm whether that column should be kept.
df = df.drop(columns=['Variedad', 'Método', 'Fuente'])

# Only numeric columns take part in the imputation.
df_numeric = df.select_dtypes(include=['float64', 'int64'])

# KNN imputer; tune the number of neighbours as needed.
knn_imputer = KNNImputer(n_neighbors=5)

# fit_transform returns a bare ndarray, so the original row labels must be
# re-attached explicitly. Without `index=` the new frame gets a fresh
# RangeIndex and the label-aligned assignment below would silently write NaN
# into every row whose label no longer matches (e.g. after filtering rows).
df_imputed_numeric = pd.DataFrame(
    knn_imputer.fit_transform(df_numeric),
    columns=df_numeric.columns,
    index=df_numeric.index,
)

# Merge the imputed numeric columns back with any remaining non-numeric ones.
df_imputed = df.copy()
df_imputed[df_numeric.columns] = df_imputed_numeric

# Show the first rows of the imputed frame.
print("\nDataFrame imputado con KNN:")
print(df_imputed.head())

# Persist the imputed frame to a new Excel file.
df_imputed.to_excel('knn.xlsx', index=False)

# GAN

In [None]:
### Variedad String -> Int
import pandas as pd

df = pd.read_excel("knn.xlsx")

# Assign each distinct variety an integer code, in order of first appearance.
variedades_unicas = df['Variedad'].unique()
variedad_a_numero = dict(zip(variedades_unicas, range(len(variedades_unicas))))

# Replace the text labels with their integer codes.
df['Variedad'] = df['Variedad'].map(variedad_a_numero)

# Report the label -> code table.
print("Enteros asignados a cada variedad:")
for nombre in variedad_a_numero:
    print(f"{nombre}: {variedad_a_numero[nombre]}")

# Report how many rows carry each code.
conteo_variedades = df['Variedad'].value_counts()
print("\nConteo de cada variedad:")
print(conteo_variedades)

# Show the frame with the encoded column.
print("\nDataFrame con la columna numérica:")
print(df)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Scale every feature to [0, 1]; the first column of the frame is left out.
real_data = df
scaler = MinMaxScaler()
feature_frame = real_data.iloc[:, 1:]
real_data_scaled = scaler.fit_transform(feature_frame)

# Float32 tensor holding the scaled features.
real_tensor = torch.tensor(real_data_scaled, dtype=torch.float32)

# Position of "Puntaje en taza" inside the scaled matrix: one less than its
# position in the frame, because the first column was left out above.
puntaje_index = real_data.columns.get_loc("Puntaje en taza") - 1
# Conditioning signal as a column vector (one value per row).
condition_data = real_tensor[:, puntaje_index].reshape(-1, 1)

# Pair each sample with its condition and serve them in shuffled mini-batches.
batch_size = 64
training_pairs = TensorDataset(real_tensor, condition_data)
dataloader = DataLoader(training_pairs, batch_size=batch_size, shuffle=True)

# Conditional generator
class Generator(nn.Module):
    """Map a latent vector plus a condition vector to one synthetic sample.

    The network is Linear -> BatchNorm1d -> ReLU for hidden widths 128 and 256,
    followed by a Linear projection to ``output_dim`` and a Sigmoid, so every
    output value lies in [0, 1].

    Args:
        input_dim: Size of the latent vector ``z``.
        condition_dim: Size of the condition vector.
        output_dim: Number of values produced per sample.
    """

    def __init__(self, input_dim, condition_dim, output_dim):
        super().__init__()
        layers = []
        width_in = input_dim + condition_dim
        for width_out in (128, 256):
            layers.append(nn.Linear(width_in, width_out))
            layers.append(nn.BatchNorm1d(width_out))
            layers.append(nn.ReLU())
            width_in = width_out
        layers.append(nn.Linear(width_in, output_dim))
        layers.append(nn.Sigmoid())
        self.model = nn.Sequential(*layers)

    def forward(self, z, condition):
        """Concatenate ``z`` and ``condition`` along dim 1 and run the network."""
        joined = torch.cat([z, condition], dim=1)
        return self.model(joined)

# Conditional discriminator
class Discriminator(nn.Module):
    """Score a (sample, condition) pair with a single value in [0, 1].

    The network is Linear(256) -> LeakyReLU(0.2) -> Dropout(0.3) ->
    Linear(128) -> LeakyReLU(0.2) -> Linear(1) -> Sigmoid.

    Args:
        input_dim: Number of values in each sample.
        condition_dim: Size of the condition vector.
    """

    def __init__(self, input_dim, condition_dim):
        super().__init__()
        joint_dim = input_dim + condition_dim
        stack = [
            nn.Linear(joint_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, x, condition):
        """Concatenate ``x`` and ``condition`` along dim 1 and run the network."""
        return self.model(torch.cat([x, condition], dim=1))

# Network dimensions: 10-dimensional latent space, one conditioning value
# ("Puntaje en taza"), and one output per scaled feature column.
input_dim, condition_dim = 10, 1
output_dim = real_data_scaled.shape[1]

# Build the generator first, then the discriminator.
generator = Generator(
    input_dim=input_dim,
    condition_dim=condition_dim,
    output_dim=output_dim,
)
discriminator = Discriminator(input_dim=output_dim, condition_dim=condition_dim)

# Both networks use Adam with the same hyper-parameters; the loss is binary
# cross-entropy on the discriminator's sigmoid output.
lr = 0.0002
betas = (0.5, 0.999)
adam_settings = {"lr": lr, "betas": betas}
optimizer_G = optim.Adam(generator.parameters(), **adam_settings)
optimizer_D = optim.Adam(discriminator.parameters(), **adam_settings)
loss_function = nn.BCELoss()

# Training Loop
# Each mini-batch runs one discriminator step followed by one generator step.
num_epochs = 5000
generator.train()
discriminator.train()
loss_D = loss_G = None  # stay None if no batch is ever trained on
for epoch in range(num_epochs):
    for real_samples, conditions in dataloader:
        # Use a local name so the module-level `batch_size` setting is not
        # overwritten by the size of whatever batch happened to come last.
        current_batch = real_samples.size(0)
        if current_batch < 2:
            # BatchNorm1d (used in the generator) cannot compute batch
            # statistics from a single sample in training mode and raises.
            # This happens when len(dataset) % batch_size == 1.
            continue

        # === Train Discriminator ===
        optimizer_D.zero_grad()
        # One-sided label smoothing: only the discriminator's "real" target
        # is softened to 0.9.
        real_labels = torch.full((current_batch, 1), 0.9)
        fake_labels = torch.zeros(current_batch, 1)

        real_predictions = discriminator(real_samples, conditions)
        loss_real = loss_function(real_predictions, real_labels)

        z = torch.randn(current_batch, input_dim)
        fake_samples = generator(z, conditions)
        # detach(): the discriminator step must not back-propagate into the generator.
        fake_predictions = discriminator(fake_samples.detach(), conditions)
        loss_fake = loss_function(fake_predictions, fake_labels)

        loss_D = (loss_real + loss_fake) / 2
        loss_D.backward()
        optimizer_D.step()

        # === Train Generator ===
        optimizer_G.zero_grad()
        fake_predictions = discriminator(fake_samples, conditions)
        # The generator wants fakes classified as fully real, so its target is
        # 1.0. Reusing the smoothed 0.9 target would penalise the generator
        # whenever the discriminator scores a fake above 0.9.
        generator_targets = torch.ones(current_batch, 1)
        loss_G = loss_function(fake_predictions, generator_targets)
        loss_G.backward()
        optimizer_G.step()

    # Log the losses of the last trained batch every 500 epochs.
    if epoch % 500 == 0 and loss_D is not None:
        print(f"Epoch {epoch}/{num_epochs} | Loss_D: {loss_D.item():.4f} | Loss_G: {loss_G.item():.4f}")

# Generate Synthetic Data
num_samples = 10000

# Position of "Variedad" inside the scaled feature matrix (one less than in
# the frame, because the first column was excluded before scaling).
variedad_index = real_data.columns.get_loc("Variedad") - 1

# Draw target "Puntaje en taza" values uniformly over the observed range.
min_puntaje, max_puntaje = real_data["Puntaje en taza"].min(), real_data["Puntaje en taza"].max()
random_conditions = np.random.uniform(min_puntaje, max_puntaje, (num_samples, 1))

# Scale the conditions with the fitted parameters of the "Puntaje en taza"
# feature itself (MinMaxScaler.transform computes X * scale_ + min_ per
# feature). The previous approach placed the values in the LAST column of a
# dummy matrix and then read column `puntaje_index`, which is only correct
# when "Puntaje en taza" happens to be the last feature.
random_conditions_scaled = (
    random_conditions * scaler.scale_[puntaje_index] + scaler.min_[puntaje_index]
)
conditions = torch.tensor(random_conditions_scaled.reshape(-1, 1), dtype=torch.float32)

# Sample in evaluation mode so BatchNorm uses its running statistics instead
# of the statistics of this batch, and skip autograd bookkeeping.
generator.eval()
with torch.no_grad():
    z = torch.randn(num_samples, input_dim)
    synthetic_data = generator(z, conditions).numpy()

# Map the [0, 1] outputs back to the original feature ranges.
synthetic_data = scaler.inverse_transform(synthetic_data)

# "Variedad" is an integer code: snap it to the nearest whole number.
synthetic_data[:, variedad_index] = np.round(synthetic_data[:, variedad_index])

# Save Synthetic Data
synthetic_df = pd.DataFrame(synthetic_data, columns=real_data.columns[1:])
# Cast on the DataFrame column: writing ints back into the float ndarray
# would leave the values stored as floats.
synthetic_df["Variedad"] = synthetic_df["Variedad"].astype(int)
synthetic_df.to_csv("synthetic_data.csv", index=False)
print("Synthetic data generated and saved!")

# Concatenate original dataset with synthetic dataset
combined_data = pd.concat([real_data.iloc[:, 1:], synthetic_df], ignore_index=True)

# Save to Excel
combined_data.to_excel("combined_data.xlsx", index=False)

## Statistical Similarity Check

In [None]:
#### Statistical Similarity Check (Compare Real & Synthetic Data)
###### Checks whether the synthetic data follows the same distribution as the real data.

from scipy.stats import ks_2samp
import seaborn as sns
import matplotlib.pyplot as plt

# Compare exactly the columns that were generated. Selecting by the synthetic
# frame's column names leaves out the ID column without re-slicing
# `real_data`, so this cell can be re-run safely (the old
# `real_data = real_data.iloc[:, 1:]` dropped one more column on every run).
for col in synthetic_df.columns:
    real_vals = real_data[col]
    synthetic_vals = synthetic_df[col]

    # Two-sample Kolmogorov-Smirnov test between real and synthetic values.
    ks_stat, p_value = ks_2samp(real_vals, synthetic_vals)

    print(f"Column: {col} | KS Test p-value: {p_value:.4f}")

    # Overlay the two kernel density estimates for a visual comparison.
    sns.kdeplot(real_vals, label="Real", fill=True)
    sns.kdeplot(synthetic_vals, label="Synthetic", fill=True, linestyle="dashed")
    plt.title(f"Distribution of {col}")
    plt.legend()
    plt.show()

## Reading the p-value:
## p > 0.05: the test finds no significant difference between the real and
##           synthetic distributions (this does not prove they are identical).
## p < 0.05: the distributions differ significantly, i.e. the synthetic
##           column does not match the real one well.

## Limpieza del dataset

In [None]:
##DATASET CON OUTLIERS
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

data = pd.read_excel("dataset_SIN_out.xlsx")

# First rows of the dataset.
print("Primeras filas del dataset:")
print(data.head())

# General information (dtypes, non-null counts). DataFrame.info() prints its
# report itself and returns None, so it must not be wrapped in print().
print("\nInformación del dataset:")
data.info()

# Descriptive statistics.
print("\nEstadísticas descriptivas:")
print(data.describe())

# Missing values per column.
print("\nValores faltantes por columna:")
print(data.isnull().sum())

# Histograms of the numeric variables; adjust figsize to the number of columns.
data.hist(figsize=(12, 10))
plt.show()

# Correlation matrix of the numeric variables. Restricting to numeric columns
# first keeps this working when the frame holds text columns (recent pandas
# versions raise instead of silently skipping them).
correlation_matrix = data.select_dtypes(include="number").corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Matriz de Correlación')
plt.show()

In [None]:
import pandas as pd

# Load the dataset (replace with your own file).
df = pd.read_excel("dataset_SIN_out.xlsx")

# Round every numeric value to 2 decimals as a baseline.
df = df.round(2)

# Columns that need a coarser precision.
columnas_a_redondear1 = ["Cantidad (L)", "Temperatura"]  # 1 decimal
columnas_a_redondear0 = ["Altura", "Tiempo de fermentación"]  # 0 decimals
df[columnas_a_redondear1] = df[columnas_a_redondear1].round(1)
df[columnas_a_redondear0] = df[columnas_a_redondear0].round(0)

# Snap "Puntaje en taza" to the nearest multiple of 0.25. The vectorised form
# propagates missing values as NaN; the per-element built-in round() raises
# ValueError on NaN.
df["Puntaje en taza"] = (df["Puntaje en taza"] * 4).round() / 4

# Save the cleaned dataset (optional).
df.to_csv("sin_outliers_rounded.csv", index=False)

# Show the first and last rows to verify.
print(df.head())
print(df.tail())