# Cria simulação (geração de amostras sintéticas)

In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Configuração da semente para reprodutibilidade
def set_seed(seed=42):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to NumPy, to PyTorch, and (when CUDA is
            available) to every CUDA device.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed once at import time so the run below is repeatable
set_seed()

# Definir a arquitetura da GAN
class Generator(nn.Module):
    """Fully connected generator: latent vector -> synthetic feature row.

    The network is ``input_dim -> 128 -> 256 -> output_dim`` with ReLU after
    each hidden layer and no activation on the output layer.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Widths of the input and the two hidden layers, in order
        widths = (input_dim, 128, 256)
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], output_dim))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the generated features for the latent batch ``x``."""
        generated = self.model(x)
        return generated


class Discriminator(nn.Module):
    """Fully connected discriminator: feature row -> probability of being real.

    The network is ``input_dim -> 256 -> 128 -> 1`` with ReLU after each
    hidden layer and a sigmoid on the single output unit.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Widths of the input and the two hidden layers, in order
        widths = (input_dim, 256, 128)
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], 1))
        layers.append(nn.Sigmoid())
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return one score in [0, 1] per row of ``x``."""
        score = self.model(x)
        return score

# Função para treinar a GAN
def train_gan(real_data, num_features, latent_dim, num_samples, num_epochs=5000, batch_size=32):
    """Fit a small GAN on ``real_data`` and return synthetic rows.

    The data is min-max scaled before training; generated rows are mapped
    back through the same scaler's inverse transform before being returned.

    Args:
        real_data: 2-D array-like of numeric rows (samples x features).
        num_features: Number of columns in ``real_data``; sizes the generator
            output and the discriminator input.
        latent_dim: Length of the noise vector fed to the generator.
        num_samples: Number of synthetic rows to generate.
        num_epochs: Number of training epochs.
        batch_size: Rows drawn (with replacement) for each discriminator step.

    Returns:
        NumPy array of shape ``(num_samples, num_features)``.

    Raises:
        ValueError: If ``real_data`` does not have ``num_features`` columns.
    """
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(real_data)

    # Fail early with a clear message instead of a shape error deep in a matmul
    if scaled.shape[1] != num_features:
        raise ValueError(
            f"real_data tem {scaled.shape[1]} colunas, mas num_features={num_features}"
        )

    # Nothing to generate: skip training; the inverse transform rejects 0 rows
    if num_samples == 0:
        return np.empty((0, num_features), dtype=np.float32)

    real_data = torch.tensor(scaled, dtype=torch.float32)
    num_rows = real_data.size(0)

    generator = Generator(latent_dim, num_features)
    discriminator = Discriminator(num_features)

    criterion = nn.BCELoss()
    optimizer_G = optim.Adam(generator.parameters(), lr=0.0002)
    optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002)

    # Every batch holds exactly batch_size rows, so the targets never change
    real_labels = torch.ones((batch_size, 1))
    fake_labels = torch.zeros((batch_size, 1))
    steps_per_epoch = max(1, num_rows // batch_size)

    for epoch in range(num_epochs):
        # Discriminator updates: real rows vs. generator output
        for _ in range(steps_per_epoch):
            idx = np.random.randint(0, num_rows, batch_size)
            real_samples = real_data[idx]

            noise = torch.randn((batch_size, latent_dim))
            # No generator gradient is needed for the discriminator step
            with torch.no_grad():
                fake_samples = generator(noise)

            real_preds = discriminator(real_samples)
            fake_preds = discriminator(fake_samples)

            loss_real = criterion(real_preds, real_labels)
            loss_fake = criterion(fake_preds, fake_labels)
            loss_D = loss_real + loss_fake

            optimizer_D.zero_grad()
            loss_D.backward()
            optimizer_D.step()

        # NOTE(review): the generator is updated once per epoch while the
        # discriminator is updated steps_per_epoch times; kept as-is, confirm
        # that this imbalance is intended.
        noise = torch.randn((batch_size, latent_dim))
        fake_samples = generator(noise)
        fake_preds = discriminator(fake_samples)
        # The generator is rewarded when its output is classified as real
        loss_G = criterion(fake_preds, real_labels)

        optimizer_G.zero_grad()
        loss_G.backward()
        optimizer_G.step()

        if epoch % 500 == 0 or epoch == num_epochs - 1:
            print(f"Epoch [{epoch}/{num_epochs}] - Loss D: {loss_D.item():.4f}, Loss G: {loss_G.item():.4f}")

    # Sampling is inference only, so no autograd graph is built
    noise = torch.randn((num_samples, latent_dim))
    with torch.no_grad():
        synthetic_data = generator(noise).numpy()
    synthetic_data = scaler.inverse_transform(synthetic_data)
    return synthetic_data

# Função para processar subpastas e gerar saídas
def process_subfolders(input_directory, output_directory, total_samples=10000):
    """Generate a synthetic dataset for every ``.txt`` file under a directory tree.

    Each ``.txt`` file is read as comma-separated data, coerced to numeric
    (rows with any non-numeric or missing value are dropped), used to train a
    GAN via ``train_gan``, and the generated rows are min-max normalised to
    [0, 1] and written under ``output_directory`` with the same relative
    path, file name and column header.

    Args:
        input_directory: Root of the tree to scan for ``.txt`` datasets.
        output_directory: Root under which the generated files are written.
        total_samples: Sample budget per directory; it is split evenly across
            the ``.txt`` files found in that directory.
    """
    for root, _dirs, files in os.walk(input_directory):
        # Only dataset files count toward the per-file sample quota; other
        # files in the folder must not dilute it.
        dataset_files = [name for name in files if name.endswith('.txt')]
        if not dataset_files:
            continue

        relative_path = os.path.relpath(root, input_directory)
        # Avoid a redundant "." segment in output paths for top-level files
        if relative_path == os.curdir:
            output_subdir = output_directory
        else:
            output_subdir = os.path.join(output_directory, relative_path)
        os.makedirs(output_subdir, exist_ok=True)

        samples_per_dataset = total_samples // len(dataset_files)

        for file in dataset_files:
            filepath = os.path.join(root, file)
            try:
                print(f"Lendo arquivo: {filepath}")
                df = pd.read_csv(filepath, delimiter=',')
                df = df.apply(pd.to_numeric, errors='coerce').dropna()
                if df.empty:
                    print(f"Arquivo {file} está vazio após a limpeza.")
                    continue

                num_features = df.shape[1]
                synthetic_data = train_gan(df.values, num_features, latent_dim=10, num_samples=samples_per_dataset)

                # Min-max normalisation of the generated output
                scaler_output = MinMaxScaler(feature_range=(0, 1))
                synthetic_data_normalized = scaler_output.fit_transform(synthetic_data)

                output_path = os.path.join(output_subdir, file)
                synthetic_df = pd.DataFrame(synthetic_data_normalized, columns=df.columns)
                synthetic_df.to_csv(output_path, sep=',', index=False)
                print(f"Arquivo gerado: {output_path}")

            except Exception as e:
                # Best-effort batch run: report the failure and move on to the
                # next file instead of aborting the whole tree.
                print(f"Erro ao processar {file}: {e}")

# Diretórios de entrada e saída
# Baseline: target-encoded, normalised datasets
input_directory = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\1-tratamento-target-encoding-normalizado"
output_directory = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\3-simulacao\1-tratamento"

# GAN-25
input_directory2 = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\2-geracao-variaveis\GAN-25"
output_directory2 = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\3-simulacao\GAN-25"

# GAN-50
input_directory3 = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\2-geracao-variaveis\GAN-50"
output_directory3 = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\3-simulacao\GAN-50"

# GAN-75
input_directory4 = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\2-geracao-variaveis\GAN-75"
output_directory4 = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\3-simulacao\GAN-75"

# MIMIC-25
input_directory5 = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\2-geracao-variaveis\MIMIC-25"
output_directory5 = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\3-simulacao\MIMIC-25"

# MIMIC-50
input_directory6 = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\2-geracao-variaveis\MIMIC-50"
output_directory6 = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\3-simulacao\MIMIC-50"

# MIMIC-75
input_directory7 = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\2-geracao-variaveis\MIMIC-75"
output_directory7 = r"C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\3-simulacao\MIMIC-75"

# Process each input tree and write its outputs, in the order listed
for _source_dir, _target_dir in (
    (input_directory, output_directory),
    (input_directory2, output_directory2),
    (input_directory3, output_directory3),
    (input_directory4, output_directory4),
    (input_directory5, output_directory5),
    (input_directory6, output_directory6),
    (input_directory7, output_directory7),
):
    process_subfolders(_source_dir, _target_dir)


Lendo arquivo: C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\1-tratamento-target-encoding-normalizado\tratamento_china.txt
Epoch [0/5000] - Loss D: 1.3442, Loss G: 0.6790
Epoch [500/5000] - Loss D: 0.3864, Loss G: 2.2634
Epoch [1000/5000] - Loss D: 0.3813, Loss G: 2.5238
Epoch [1500/5000] - Loss D: 0.2655, Loss G: 3.2843
Epoch [2000/5000] - Loss D: 0.3484, Loss G: 3.4396
Epoch [2500/5000] - Loss D: 0.3255, Loss G: 2.7824
Epoch [3000/5000] - Loss D: 0.1266, Loss G: 3.8165
Epoch [3500/5000] - Loss D: 0.1032, Loss G: 4.1187
Epoch [4000/5000] - Loss D: 0.0957, Loss G: 4.3513
Epoch [4500/5000] - Loss D: 0.0801, Loss G: 3.7662
Epoch [4999/5000] - Loss D: 0.2042, Loss G: 4.0304
Arquivo gerado: C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\3-simulacao\1-tratamento\.\tratamento_china.txt
Lendo arquivo: C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\p