In [2]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.utils import shuffle
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from confusionmatrix import *
from train import train
from GAN import *
from math import sqrt
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [3]:


def train(X_train, y_train, X_test, y_test):
  scaler = StandardScaler()
  X_test_scaled = scaler.fit_transform(X_test)
  X_train_scaled = scaler.transform(X_train)

  # KNN
  knn_classifier = KNeighborsClassifier(n_neighbors=int(sqrt(len(X_train))))
  knn_classifier.fit(X_train_scaled, y_train)
  knn_predictions = knn_classifier.predict(X_test_scaled)
  knn = confusionMatrix(y_test, knn_predictions)

  # Logistic Regression
  logreg_classifier = LogisticRegression()
  logreg_classifier.fit(X_train_scaled, y_train)
  logreg_predictions = logreg_classifier.predict(X_test_scaled)
  logreg = confusionMatrix(y_test, logreg_predictions)

  # SVM
  svm_classifier = SVC(kernel='linear')
  svm_classifier.fit(X_train_scaled, y_train)
  svm_predictions = svm_classifier.predict(X_test_scaled)
  svm = confusionMatrix(y_test, svm_predictions)

  # Random Forest
  rf_classifier = RandomForestClassifier(n_estimators=100)
  rf_classifier.fit(X_train_scaled, y_train)
  rf_predictions = rf_classifier.predict(X_test_scaled)
  rf = confusionMatrix(y_test, rf_predictions)

  return [knn, logreg, svm, rf]

In [4]:
import torch.nn as nn

# Generator
class Generator(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size):
          super(Generator, self).__init__()
          layers = []
          prev_size = input_size
          for size in hidden_sizes:
              layers.append(nn.Linear(prev_size, size))
              layers.append(nn.LeakyReLU(0.2))
              if not (size == hidden_sizes[-1]):
                layers.append(nn.Dropout(0.5))
              prev_size = size
          layers.append(nn.Linear(prev_size, output_size))
          layers.append(nn.Tanh())
          self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)

# Discriminator
class Discriminator(nn.Module):
    def __init__(self, input_size, hidden_sizes):
        super(Discriminator, self).__init__()
        layers = []
        prev_size = input_size
        output_size = 1
        for size in hidden_sizes:
          layers.append(nn.Linear(prev_size, size))
          layers.append(nn.LeakyReLU(0.2))
          layers.append(nn.Dropout(0.5))
          prev_size = size
        layers.append(nn.Linear(prev_size, output_size))
        layers.append(nn.Sigmoid())
        self.layers = nn.Sequential(*layers)
    def forward(self, x):
      return self.layers(x)

In [5]:
# Load and preprocess the breast cancer dataset
data = load_breast_cancer()
scaler = StandardScaler()
scaled = scaler.fit_transform(data.data)

train_return = ["knn", "logreg", "svm", "randomforest"]


In [6]:
from sklearn.utils import shuffle
from random import randint

import numpy as np
def join_lists_of_lists(list1, list2):
    if len(list1) != len(list2):
        raise ValueError("Input lists must have the same number of sublists")

    result = []
    for sublist1, sublist2 in zip(list1, list2):
        result.append(sublist1 + sublist2)

    return result


def divide_list_into_k_parts(input_list, k):
    n = len(input_list)
    avg = n // k
    remainder = n % k

    result = []
    start = 0

    for i in range(k):
        end = start + avg + (1 if i < remainder else 0)
        result.append(input_list[start:end])
        start = end
    return result

def kcv(data,target,k,print_info=False,randomize=False):

    random_state = randint(0,100) if randomize else 42
    data, target = shuffle(data,target, random_state=random_state)

    b_rows = []
    m_rows = []

    for i, row in enumerate(data):
        if target[i] == 1:
            # Add row as an element of b_rows
            b_rows += [row]
        else:
            m_rows += [row]


    proportion = len(b_rows)/len(m_rows)
    if (print_info):
      print(f'Benign to Malignant proportion: {proportion} ({len(b_rows)}/{len(m_rows)})')

    # Use a list comprehension to create new arrays with the added element
    b_rows = [np.append(arr, 0) for arr in b_rows]
    m_rows = [np.append(arr, 1) for arr in m_rows]

    b_splits = divide_list_into_k_parts(b_rows, k)
    m_splits = divide_list_into_k_parts(m_rows, k)

    folds = np.array(join_lists_of_lists(b_splits, m_splits), dtype=object)

    partitions = []

    for i in range(k):
        X_test = np.array([arr[:-1] for arr in folds[i]])
        y_test = np.array([arr[-1] for arr in folds[i]])

        X_train = np.concatenate([([arr[:-1] for arr in folds[j]]) for j in range(k) if j != i])
        y_train = np.concatenate([([arr[-1] for arr in folds[j]]) for j in range(k) if j != i])

        # Calculate the mean and standard deviation of each column
        mean = np.mean(X_train, axis=0)
        std_dev = np.std(X_train, axis=0)

        # Subtract the mean from each element and divide by the standard deviation
        training_set = (X_train - mean) / std_dev
        test_set = (X_test - mean) / std_dev


        #training_set, y_train = shuffle(training_set, y_train, random_state=random_state)
        #test_set, y_test = shuffle(test_set, y_test, random_state=random_state)


        if (print_info):
          print(f'Test fold {i + 1}: Instances for training: {len(training_set)}, Instances for testing: {len(test_set)})')
        partitions.append((training_set, y_train, X_test, y_test))


    return partitions

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

losses = []
datasets = []

def generate_data(dataset, hidden_size):
    print(f"Training GAN: {hidden_size}")
    
    input_size = dataset.shape[1]  # Assuming input_size is defined somewhere
    batch_size = 64  # Define your desired batch size
    num_epochs = 2000  # Define your desired number of epochs
    learning_rate = 0.0002  # Define your desired learning rate
    num_samples = 1000  # Define the number of samples you want to generate
    
    # Create the generator and discriminator
    generator = Generator(input_size, hidden_size, input_size).to(device)
    discriminator = Discriminator(input_size, hidden_size).to(device)
    
    # Loss and optimizers
    criterion = nn.BCELoss()
    generator_optimizer = optim.Adam(generator.parameters(), lr=learning_rate)
    discriminator_optimizer = optim.Adam(discriminator.parameters(), lr=learning_rate)

    results = []
    g_loss_list = []
    d_loss_list = []
    for epoch in range(num_epochs):
        # Train Discriminator
        discriminator.zero_grad()
        real_data = torch.tensor(dataset[np.random.choice(dataset.shape[0], batch_size)], dtype=torch.float32).to(device)
        real_labels = torch.ones(batch_size, 1, dtype=torch.float32).to(device)

        fake_data = generator(torch.randn(batch_size, input_size, dtype=torch.float32).to(device))
        fake_labels = torch.zeros(batch_size, 1, dtype=torch.float32).to(device)

        real_outputs = discriminator(real_data)
        fake_outputs = discriminator(fake_data.detach())

        d_loss_real = criterion(real_outputs, real_labels)
        d_loss_fake = criterion(fake_outputs, fake_labels)
        d_loss = d_loss_real + d_loss_fake
        d_loss_list.append(d_loss.item())

        d_loss.backward()
        discriminator_optimizer.step()

        # Train Generator
        generator.zero_grad()
        fake_data = generator(torch.randn(batch_size, input_size, dtype=torch.float32).to(device))
        fake_outputs = discriminator(fake_data)
        g_loss = criterion(fake_outputs, real_labels)
        g_loss_list.append(g_loss.item())
        g_loss.backward()
        generator_optimizer.step()

        # Print progress
        if epoch % 500 == 0:
            #print(f"    Epoch [{epoch}/{num_epochs}] D Loss: {d_loss.item():.4f} G Loss: {g_loss.item():.4f}")
            pass

    # Generate synthetic data
    synthetic_data = generator(torch.randn(num_samples, input_size, dtype=torch.float32).to(device)).detach().cpu().numpy()

    return synthetic_data

# Call generate_data with your dataset and hidden_size
# generated_data = generate_data(your_dataset, your_hidden_size)

In [8]:
def graph_losses(losses_list):
  l = np.array(losses[datasets.index(max(datasets))])
  plt.plot(l[0],label="Generator")
  plt.plot(l[1],label="Discriminator")
  plt.legend(loc='upper right')

  l = np.array(losses[datasets.index(min(datasets))])
  plt.plot(l[0],label="Generator")
  plt.plot(l[1],label="Discriminator")
  plt.legend(loc='upper right')


In [9]:
from sklearn.utils import shuffle

def get_true_false_list(y_train):
  """
  Given the list of targets, return the indexes of positives and negatives
  """
  y_true = []
  y_false = []
  for i in range(len(y_train)):
    if (y_train[i] == 0):
      y_false.append(i)
    else:
      y_true.append(i)
  return y_true, y_false


def create_table(X_train, y_true_indexes, y_false_indexes,hidden_size):
  # Given a table, return a synthetic generated table
  gan_results = []
  true_data = generate_data(X_train[y_true_indexes],hidden_size) # Train a GAN on the positive data
  true_labels = np.ones((true_data.shape[0],1))
  true_table = np.concatenate((true_data,true_labels),axis=1)

  false_data = generate_data(X_train[y_false_indexes],hidden_size) # Train a GAN on the false data
  false_labels = np.zeros((false_data.shape[0],1))
  false_table = np.concatenate((false_data,false_labels),axis=1)

  true_rows =  true_table[np.random.choice(len(true_table), len(y_true_indexes))]
  false_rows =  false_table[np.random.choice(len(false_table),len(y_false_indexes))]

  final_table = np.concatenate((true_rows,false_rows),axis=0)
  final_table = shuffle(final_table)

  return final_table



In [10]:
# GAN parameters
GANS = [
    [256,512],
    [256],
    [128,256],
    [128],
]
# Hyperparameters
input_size = 30 # n atributos
batch_size = 16
learning_rate = 0.0012
num_epochs = 5000
num_samples = 569 # mesma qtt de dados no dataset

Treina as GANs e depois os algoritmos usando os dados reais e sintéticos

In [11]:
repetitions_per_gan = 1
k_number = 5;
folds_results_natural=[]
folds_results_gan_all=[]
def results():
  folds = kcv(data.data, data.target,k_number)
  for i,fold in enumerate(folds):
    print(i)
    X_train, y_train, X_test, y_test = fold
    results = train(X_train, y_train, X_test, y_test)

    y_true_indexes, y_false_indexes = get_true_false_list(y_train)
    for hidden_size in GANS: # For each GAN sizes
      final_table = create_table(X_train,y_true_indexes,y_false_indexes,hidden_size)
      train_results = train(final_table[:,:30], final_table[:,30], X_test, y_test)
      folds_results_gan_all.append((train_results))

    folds_results_natural.append((results))

results()

0
Training GAN: [256, 512]
Training GAN: [256, 512]
Training GAN: [256]
Training GAN: [256]
Training GAN: [128, 256]
Training GAN: [128, 256]
Training GAN: [128]
Training GAN: [128]
1
Training GAN: [256, 512]
Training GAN: [256, 512]
Training GAN: [256]
Training GAN: [256]
Training GAN: [128, 256]
Training GAN: [128, 256]
Training GAN: [128]
Training GAN: [128]
2
Training GAN: [256, 512]
Training GAN: [256, 512]
Training GAN: [256]
Training GAN: [256]
Training GAN: [128, 256]
Training GAN: [128, 256]
Training GAN: [128]
Training GAN: [128]
3
Training GAN: [256, 512]
Training GAN: [256, 512]
Training GAN: [256]
Training GAN: [256]
Training GAN: [128, 256]
Training GAN: [128, 256]
Training GAN: [128]
Training GAN: [128]
4
Training GAN: [256, 512]
Training GAN: [256, 512]
Training GAN: [256]
Training GAN: [256]
Training GAN: [128, 256]
Training GAN: [128, 256]
Training GAN: [128]
Training GAN: [128]


In [12]:
# Comparar os dados sinteticos e naturais
# Mais arquiteturas
def get_attr_from_list(train_return, info):
  knn, logreg, svm, rf = train_return
  return [getattr(model,info) for model in train_return]

Imprime uma tabela para os dados originais, imprimindo avg(stdev) de todos os atributos para cada algorítmo

In [13]:
from prettytable import PrettyTable
import statistics

t = PrettyTable(['Attribute','knn', 'logreg','svm','randomforest'])
print("natural results")
cm_attrs = ["accuracy","precision","recall","f_score","npv"]
for attr in cm_attrs:
  table = [0,0,0,0]
  table2 = [0,0,0,0]

  knnres=[]
  logregres=[]
  svmres=[]
  rfres=[]
  for natural_result in folds_results_natural:
    
    atributes =  (get_attr_from_list(natural_result,attr))
    knnres.append(atributes[0])
    logregres.append(atributes[1])
    svmres.append(atributes[2])
    rfres.append(atributes[3])
    table = [x + y for x, y in zip(table, atributes)]
  table2 =[attr]
  table2.extend([statistics.stdev(knnres),statistics.stdev(logregres),statistics.stdev(svmres),statistics.stdev(rfres)])
  table = [x / k_number for x in table]
  table.insert(0,attr)
  tableText=[attr];
  for i in range(1,len(table)):
    tableText.append("{:.3f}({:.3f})".format(table[i],table2[i]))
  t.add_row(tableText)

t.float_format = '.3'
print(t)





natural results
+-----------+--------------+--------------+--------------+--------------+
| Attribute |     knn      |    logreg    |     svm      | randomforest |
+-----------+--------------+--------------+--------------+--------------+
|  accuracy | 0.961(0.012) | 0.981(0.012) | 0.979(0.013) | 0.956(0.009) |
| precision | 0.995(0.011) | 0.991(0.013) | 0.986(0.013) | 0.948(0.025) |
|   recall  | 0.901(0.032) | 0.957(0.035) | 0.957(0.031) | 0.934(0.010) |
|  f_score  | 0.945(0.018) | 0.973(0.016) | 0.971(0.019) | 0.941(0.012) |
|    npv    | 0.944(0.016) | 0.976(0.020) | 0.975(0.018) | 0.961(0.006) |
+-----------+--------------+--------------+--------------+--------------+


Imprime uma tabela para cada arquitetura da GAN, imprimindo avg(stdev) de todos os atributos para cada algorítmo

In [14]:
from prettytable import PrettyTable
print("gan results")
cm_attrs = ["accuracy","precision","recall","f_score","npv"]
architecture256_512 = folds_results_gan_all[0::4]
architecture256 = folds_results_gan_all[1::4]
architecture128_256 = folds_results_gan_all[2::4]
architecture128 = folds_results_gan_all[3::4]
archAll=[architecture256_512,architecture256,architecture128_256,architecture128]
for arch in archAll:
  t = PrettyTable(['Attribute','knn', 'logreg','svm','randomforest'])

  for attr in cm_attrs:
    table = [0,0,0,0];
    table2 = [0,0,0,0]
    knnres=[]
    logregres=[]
    svmres=[]
    rfres=[]

    for gan_result in arch:
      atributes =  (get_attr_from_list(gan_result,attr))
      knnres.append(atributes[0])
      logregres.append(atributes[1])
      svmres.append(atributes[2])
      rfres.append(atributes[3])
      table = [x + y for x, y in zip(table, atributes)]

    table2 =[attr]
    table2.extend([statistics.stdev(knnres),statistics.stdev(logregres),statistics.stdev(svmres),statistics.stdev(rfres)])
    table = [x / k_number for x in table]
    
    table.insert(0,attr)
    tableText=[attr];
    for i in range(1,len(table)):
      tableText.append("{:.3f}({:.3f})".format(table[i],table2[i]))
    t.add_row(tableText)

  t.float_format = '.3'
  print(t)


gan results
+-----------+--------------+--------------+--------------+--------------+
| Attribute |     knn      |    logreg    |     svm      | randomforest |
+-----------+--------------+--------------+--------------+--------------+
|  accuracy | 0.909(0.022) | 0.917(0.027) | 0.914(0.041) | 0.880(0.050) |
| precision | 0.989(0.026) | 0.988(0.026) | 0.982(0.041) | 0.974(0.024) |
|   recall  | 0.764(0.059) | 0.787(0.064) | 0.783(0.095) | 0.697(0.138) |
|  f_score  | 0.861(0.036) | 0.875(0.044) | 0.869(0.067) | 0.807(0.093) |
|    npv    | 0.877(0.028) | 0.888(0.030) | 0.886(0.045) | 0.850(0.060) |
+-----------+--------------+--------------+--------------+--------------+
+-----------+--------------+--------------+--------------+--------------+
| Attribute |     knn      |    logreg    |     svm      | randomforest |
+-----------+--------------+--------------+--------------+--------------+
|  accuracy | 0.935(0.028) | 0.953(0.013) | 0.951(0.015) | 0.902(0.033) |
| precision | 0.994(0.012)