**This code will train a CTGAN and then evaluate its performance just using the synthetic dataset and different regression models.**

The first cell here will just load the libraries we need and the data we will be using.

In [1]:
from CTGAN import *

# Read the full real dataset; the first CSV column becomes the DataFrame index.
all_real_data = pd.read_csv(
    '/Users/jakehirst/Desktop/sfx/sfx_ML_data/New_Crack_Len_FULL_OG_dataframe_2023_11_16.csv',
    index_col=0,
)

# Keep 20% of the rows aside as a test set; random_state is fixed so the same
# split is requested on every run.
train_df, test_df = train_test_split(
    all_real_data,
    test_size=0.2,
    random_state=42,
)

# Hyperparameter grid explored by the cells below: every epoch count is
# combined with every batch size.
epochs_to_try = [100, 500, 1000, 2500, 5000, 7500, 10000]
batch_sizes_to_try = [10, 30, 50, 100, 200, 300, 500]
# Smaller alternative grid (swap in for a shorter search):
# epochs_to_try = [100, 500, 1000, 2500, 5000]
# batch_sizes_to_try = [50, 100, 200, 300, 500]


**This next cell will make the CTGAN by fitting the generator and discriminator to the training dataset. Then, it will sample num_rows from the CTGAN for new synthetic data.**

In [2]:
# synthesizer = train_CTGAN(train_df, 500, 100)
# synthetic_data = synthesizer.sample(1000)

Gen. (-2.53) | Discrim. (0.65): 100%|██████████| 100/100 [00:05<00:00, 17.05it/s]


**Now we will do a grid search over the CTGAN hyperparameters, namely the number of epochs and the batch size.**

In [None]:


# Grid search over the CTGAN hyperparameters: one synthesizer is trained for
# every (epochs, batch_size) combination, and its quality metrics, R2
# performances and pickled synthesizer are saved into a folder of its own.
saving_folder = '/Volumes/Jake_ssd/GANS/hyperparameter_tuning'

for epochs in epochs_to_try:
    for batch_size in batch_sizes_to_try:
        # Each run writes into '<epochs>_epochs_<batch_size>_batch_size'.
        path = saving_folder + f'/{epochs}_epochs_{batch_size}_batch_size'
        # exist_ok=True creates the folder when it is missing and does nothing
        # when it is already there, avoiding a separate exists() check that can
        # race with another process creating the same folder.
        os.makedirs(path, exist_ok=True)
        # NOTE: combinations that already have a folder are NOT skipped, so
        # re-running this cell trains every combination again.

        # Argument order at this call is (data, batch_size, epochs).
        synthesizer = train_CTGAN(train_df, batch_size, epochs)
        # Draw 1000 synthetic rows to score against the real training data.
        synthetic_data = synthesizer.sample(1000)
        save_quality_metrics(train_df, synthetic_data, path + '/quality_metrics.csv')
        analyze_R2_performance(synthesizer, train_df, test_df,
                               path + '/R2_performances')
        save_CTGAN(synthesizer, path + f'/{epochs}_epochs_{batch_size}_batch_size_synthesizer.pkl')
        

**Now we analyze the best hyperparameters based on the Column Shapes score, the Column Pair Trends score, or the sum of both.**

In [4]:
# For every quality metric: gather the score of each (epoch, batch size) run,
# report the best-scoring combination, and show all scores in a 3D scatter plot.
saving_folder = '/Volumes/Jake_ssd/GANS/hyperparameter_tuning'
metric_options = ['Column Shapes', 'Column Pair Trends', 'Quality metric sum']


for metric in metric_options:
    # Start below any possible score so the first run always becomes the
    # initial best, even if every score is zero or negative.
    best_score = float('-inf')
    best_hyperparameters = None
    # One (epoch, batch_size, score) tuple per run, in grid order.
    epochs_batchsizes_scores = []
    for epoch in epochs_to_try:
        for batch_size in batch_sizes_to_try:
            path = saving_folder + f'/{epoch}_epochs_{batch_size}_batch_size'
            # NOTE(review): assumes get_quality_metric loads what it needs from
            # `path` itself -- confirm against its definition.
            score = get_quality_metric(metric, path)
            epochs_batchsizes_scores.append((epoch, batch_size, score))
            # Higher is treated as better; ties keep the earlier combination.
            if score > best_score:
                best_score = score
                best_hyperparameters = (epoch, batch_size)

    # Report the winner so the search result is visible, not just computed.
    print(f'Best {metric} score: {best_score} '
          f'at (epochs, batch size) = {best_hyperparameters}')

    # Transpose the list of tuples into three parallel sequences for plotting.
    x, y, z = zip(*epochs_batchsizes_scores)

    # Create the scatter plot
    scatter = go.Scatter3d(x=x, y=y, z=z, mode='markers')

    # Define the layout; only the z-axis title changes between metrics.
    layout = go.Layout(
        title="Epoch and batch size vs quality metrics for synthetic data",
        scene=dict(
            xaxis=dict(title='epoch'),
            yaxis=dict(title='batch size'),
            zaxis=dict(title=f'{metric} score'),
        ),
        width=800,  # Width of the figure in pixels
        height=600  # Height of the figure in pixels
    )

    # Combine the plot and layout
    fig = go.Figure(data=[scatter], layout=layout)

    # Display the plot
    fig.show()

In [3]:

# For every target label, read the saved R2 performance table of each
# (epoch, batch size) run and collect, per regression model type, the values of
# the 'real_data test r2' and 'synthetic_data test r2' columns. The collected
# scores are then handed to make_performance_figures.
all_labels = ['impact site x', 'impact site y', 'height']
# all_labels = ['impact site x']
# 'poly2' has an entry in model_symbols below but is not part of the models
# analysed here (an earlier list that included it was immediately overwritten).
model_types = ['linear', 'RF', 'lasso', 'ridge', 'GPR']

saving_folder = '/Volumes/Jake_ssd/GANS/hyperparameter_tuning'

# Per-model pair of strings passed to make_performance_figures
# (presumably plot marker symbol and color -- confirm against that function).
model_symbols = {'linear': ('circle', 'red'),
                 'RF': ('square', 'blue'),
                 'lasso': ('diamond', 'green'),
                 'ridge': ('cross', 'orange'),
                 'poly2': ('x', 'purple'),
                 'GPR': ('diamond-open', 'pink')}


for label in all_labels:
    # model type -> list of (epoch, batch_size, real R2, synthetic R2) tuples
    epochs_batchsizes_scores = {model_type: [] for model_type in model_types}
    for epoch in epochs_to_try:
        for batch_size in batch_sizes_to_try:
            path = saving_folder + f'/{epoch}_epochs_{batch_size}_batch_size'
            R2_performances = pd.read_csv(path + f'/R2_performances/{label}.csv')
            # The CSV's unnamed first column is matched against the model type.
            model_column = R2_performances['Unnamed: 0']

            for model_type in model_types:
                model_rows = model_column == model_type
                # Single .loc[rows, column] lookup; .iloc[0] takes the first
                # matching row and raises IndexError if the model is absent.
                real_score_r2 = R2_performances.loc[model_rows, 'real_data test r2'].iloc[0]
                synthetic_score_r2 = R2_performances.loc[model_rows, 'synthetic_data test r2'].iloc[0]
                epochs_batchsizes_scores[model_type].append(
                    (epoch, batch_size, real_score_r2, synthetic_score_r2))

    make_performance_figures(label, epochs_batchsizes_scores, model_symbols)
        


    

**Now we can plot the **