In [1]:
import contextlib
import sys, os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from neural_net import Neural_Net, train_nn
from tony_dataset import CTGdataset

# Pick the computation device: use CUDA when PyTorch reports it, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# print(f"Computation device: {device}\n")

In [2]:
# STEP 1: LOADING DATASET
CTG_dataset = CTGdataset()

# training vs testing ratio is 6:4; the training size is rounded down and the
# test split receives every remaining sample, so the two sizes always add up.
train_size = int(0.6 * len(CTG_dataset))
test_size = len(CTG_dataset) - train_size

# Use a dedicated, seeded generator so that "Restart Kernel -> Run All" always
# reproduces the same train/test partition without touching the global RNG.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    CTG_dataset, [train_size, test_size], generator=split_generator)

Consider hidden dimensions of 20, 40, 60, 80, 100, and 120.

In [3]:
def generate_data(model, test_dataset):
    """Label every sample of ``test_dataset`` with the model's predicted class.

    The whole dataset is drawn as one shuffled batch, passed through ``model``,
    and the index of the largest output along dimension 1 is taken as the
    predicted label.

    Args:
        model: Callable mapping a batch of samples to per-class scores
            (indexed as ``[sample, class]``). If it is a ``torch.nn.Module``
            it is switched to eval mode for the forward pass and its previous
            train/eval mode is restored afterwards.
        test_dataset: Sized dataset yielding ``(sample, label)`` pairs. The
            labels are ignored.

    Returns:
        Tuple ``(samples, predicted)``: the shuffled batch of samples and the
        predicted class index for each of them, in matching order.

    Raises:
        ValueError: If ``test_dataset`` is empty.
    """
    num_samples = len(test_dataset)
    if num_samples == 0:
        raise ValueError("test_dataset is empty; there are no samples to label")

    # One batch covering the entire dataset; shuffle=True randomises row order.
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=num_samples,
                                              shuffle=True)
    samples, _ = next(iter(test_loader))

    # Inference only: disable stochastic layers (dropout, batch-norm updates)
    # and gradient tracking, then put the model back in the mode we found it in.
    is_module = isinstance(model, torch.nn.Module)
    was_training = model.training if is_module else False
    if is_module:
        model.eval()
    try:
        with torch.no_grad():
            outputs = model(samples)
    finally:
        if is_module:
            model.train(was_training)

    _, predicted = torch.max(outputs, 1)
    return samples, predicted

In [7]:
# Train one network per hidden-layer width and use each to label synthetic data.
# These settings do not depend on the width, so they are defined once up front.
input_dim = 22   # 22 features in case of CTG
output_dim = 10  # 10 class labels in case of CTG, _class = 10
batch_size = 100

for hidden_dim in [20, 40, 60, 80, 100, 120]:
    # Instantiate model class
    model = Neural_Net(input_dim, hidden_dim, output_dim)

    # Every artefact for this width goes into its own directory.
    output_dir = f'synthetic_data/vary_width/dim_{hidden_dim}'
    filepath = f'{output_dir}/test_performance.txt'
    os.makedirs(output_dir, exist_ok=True)

    # Warmstart the neural networks. Anything written to stdout during training
    # is captured in test_performance.txt. redirect_stdout restores sys.stdout
    # when the block exits, even if training raises, so the notebook's own
    # output is never left pointing at a closed file.
    with open(filepath, 'w') as log_file, contextlib.redirect_stdout(log_file):
        train_nn(model, train_dataset, test_dataset, batch_size=batch_size, l_r=0.005, num_epochs=6)
        # num_epochs is chosen to make the test accuracies of all 6 NN models to be less than 50%,
        # so that these models don't end up simply replicating the dataset

    # Generate synthetic data with the neural networks
    x, y = generate_data(model, test_dataset)
    x_df, y_df = pd.DataFrame(x.numpy()), pd.DataFrame(y.numpy())
    x_df.to_csv(f'{output_dir}/x.csv', index=False, header=False)
    y_df.to_csv(f'{output_dir}/y.csv', index=False, header=False)

In [5]:
# Settings consumed by the grid search.
timelimit = 600  # NOTE(review): presumably seconds per run — confirm against the solver call
datasets = [
    'CTG',
    'balance-scale',
    'breast-cancer',
    'car-evaluation',
    'hayes-roth',
    'house-votes-84',
    'soybean-small',
    'spect',
    'tic-tac-toe',
    'monks-1',
    'monks-2',
    'monks-3',
]
alpha = [0, 0.01, 0.1]
depth = list(range(2, 6))  # 2, 3, 4, 5
seeds = [37, 42, 53]