**This code will train a CTGAN and then evaluate its performance just using the synthetic dataset and different regression models.**

The first cell here will just load the libraries we need and the data we will be using.

In [1]:
from CTGAN import *

# Load the full real dataset; the first CSV column is used as the DataFrame index.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
all_real_data = pd.read_csv(
    '/Users/jakehirst/Desktop/sfx/sfx_ML_data/New_Crack_Len_FULL_OG_dataframe_2023_11_16.csv',
    index_col=0,
)

# Hold out 20% of the rows as the test set. random_state is pinned to 42.
# (train_test_split is provided by the `from CTGAN import *` star import.)
train_df, test_df = train_test_split(
    all_real_data,
    test_size=0.2,
    random_state=42,
)

**This next cell will build the CTGAN by fitting the generator and discriminator to the training dataset. Then, it will sample a fixed number of rows (1000 here) from the CTGAN as new synthetic data.**

In [13]:
# Fit one CTGAN on the training split. The positional arguments follow the order
# used by the grid-search cell: train_CTGAN(data, batch_size, epochs).
synthesizer = train_CTGAN(
    train_df,
    500,  # batch size
    100,  # epochs
)

# Draw 1000 synthetic rows from the fitted synthesizer.
synthetic_data = synthesizer.sample(1000)

Gen. (-0.84) | Discrim. (-0.11):  67%|██████▋   | 67/100 [00:04<00:01, 17.77it/s]

**Now we will do a grid search over the CTGAN hyperparameters, namely the number of epochs and the batch size.**

In [None]:
# Hyperparameter grid: every (epochs, batch_size) combination is trained once.
epochs_to_try = [100, 500, 1000, 2500, 5000]
batch_sizes_to_try = [50, 100, 200, 300, 500]

# NOTE(review): absolute, machine-specific path -- consider making it configurable.
saving_folder = '/Volumes/Jake_ssd/GANS/hyperparameter_tuning'

for epochs in epochs_to_try:
    for batch_size in batch_sizes_to_try:
        # One output directory per configuration, e.g. ".../100_epochs_50_batch_size".
        run_name = f'{epochs}_epochs_{batch_size}_batch_size'
        path = saving_folder + f'/{run_name}'
        # exist_ok=True replaces the check-then-create pattern, so re-running the
        # cell with existing output folders does not fail.
        os.makedirs(path, exist_ok=True)

        # Train a fresh synthesizer for this configuration and sample 1000 rows from it.
        synthesizer = train_CTGAN(train_df, batch_size, epochs)
        synthetic_data = synthesizer.sample(1000)

        # Persist the quality metrics, the R2 performance tables and the fitted
        # synthesizer for this run inside its own folder.
        save_quality_metrics(train_df, synthetic_data, path + '/quality_metrics.csv')
        # An earlier, commented-out version of this call also passed `all_labels` and
        # `model_types` keyword arguments (marked TODO for deletion); the call below
        # passes neither.
        analyze_R2_performance(synthesizer, train_df, test_df, path + '/R2_performances')
        save_CTGAN(synthesizer, path + f'/{run_name}_synthesizer.pkl')
        

**Now we analyze the best hyperparameters based on the Column Shapes score, the Column Pair Trends score, or the sum of both.**

In [7]:
# Folder written by the grid-search cell: one sub-folder per (epochs, batch_size) run.
saving_folder = '/Volumes/Jake_ssd/GANS/hyperparameter_tuning'
metric_options = ['Column Shapes', 'Column Pair Trends', 'Quality metric sum']
epochs_to_try = [100, 500, 1000, 2500, 5000]
batch_sizes_to_try = [50, 100, 200, 300, 500]


for metric in metric_options:
    # Start below any possible score so the first run always becomes the
    # current best, even if every score is zero or negative.
    best_score = float('-inf')
    best_hyperparameters = None
    epochs_batchsizes_scores = []
    for epoch in epochs_to_try:
        for batch_size in batch_sizes_to_try:
            path = saving_folder + f'/{epoch}_epochs_{batch_size}_batch_size'
            # get_quality_metric is given the run directory; the previous extra
            # pd.read_csv of quality_metrics.csv was never used and has been removed.
            score = get_quality_metric(metric, path)
            epochs_batchsizes_scores.append((epoch, batch_size, score))
            if score > best_score:
                best_score = score
                best_hyperparameters = (epoch, batch_size)

    # Report the winner; previously it was computed but never shown.
    if best_hyperparameters is None:
        print(f'{metric}: no comparable score found')
    else:
        best_epochs, best_batch_size = best_hyperparameters
        print(f'{metric}: best score {best_score} at epochs={best_epochs}, batch_size={best_batch_size}')

    # Split the (epoch, batch_size, score) tuples into the three plot axes.
    x, y, z = zip(*epochs_batchsizes_scores)

    # Create the scatter plot
    scatter = go.Scatter3d(x=x, y=y, z=z, mode='markers')

    # Define the layout
    layout = go.Layout(
        title="Epoch and batch size vs quality metrics for synthetic data",
        scene=dict(
            xaxis=dict(title='epoch'),
            yaxis=dict(title='batch size'),
            zaxis=dict(title=f'{metric} score'),
        ),
        width=800,  # Width of the figure in pixels
        height=600  # Height of the figure in pixels
    )

    # Combine the plot and layout
    fig = go.Figure(data=[scatter], layout=layout)

    # Display the plot
    fig.show()

In [4]:
from plotly.subplots import make_subplots

# NOTE(review): the original cell first listed ['impact site x', 'impact site y', 'height']
# and immediately overwrote it with the single label below. Only the effective value is
# kept here; restore the full list once R2 tables exist for every label.
all_labels = ['impact site x']
model_types = ['linear', 'RF', 'lasso', 'ridge', 'poly2', 'GPR']
saving_folder = '/Volumes/Jake_ssd/GANS/hyperparameter_tuning'
epochs_to_try = [100, 500, 1000, 2500, 5000]
batch_sizes_to_try = [50, 100, 200, 300, 500]
# Marker (symbol, color) used for each regression model type.
model_symbols = {'linear': ('circle', 'red'),
                 'RF': ('square', 'blue'),
                 'lasso': ('diamond', 'green'),
                 'ridge': ('cross', 'orange'),
                 'poly2': ('x', 'purple'),
                 'GPR': ('diamond-open', 'pink')}

# Score columns read from each R2_performances/<label>.csv table.
REAL_R2_COLUMN = 'real_data test r2'
SYNTHETIC_R2_COLUMN = 'synthetic_data test r2'


def _r2_scores_by_model(r2_table, model_types):
    """Map each model type to its (real R2, synthetic R2) pair from one R2 table.

    Args:
        r2_table: DataFrame holding the two score columns, one row per model.
        model_types: model names to look for.

    Returns:
        dict of model type -> (real_r2, synthetic_r2). Model types without a
        matching row are left out.
    """
    score_columns = (REAL_R2_COLUMN, SYNTHETIC_R2_COLUMN)
    # Prefer a column whose values are model names (e.g. an index saved to the CSV).
    name_column = next(
        (column for column in r2_table.columns
         if column not in score_columns
         and r2_table[column].astype(str).isin(model_types).any()),
        None,
    )
    if name_column is None:
        # NOTE(review): no model-name column found; assumes the rows are stored in
        # the same order as `model_types` -- confirm against analyze_R2_performance.
        names = list(model_types)
    else:
        names = r2_table[name_column].astype(str)

    scores = {}
    for name, real_r2, synthetic_r2 in zip(
            names, r2_table[REAL_R2_COLUMN], r2_table[SYNTHETIC_R2_COLUMN]):
        if name in model_types:
            scores[name] = (real_r2, synthetic_r2)
    return scores


for label in all_labels:
    # Per model type: parallel lists of grid coordinates and scores, so each model
    # becomes a single trace per subplot.
    points = {model_type: {'epoch': [], 'batch_size': [], 'real': [], 'abs_diff': []}
              for model_type in model_types}

    for epoch in epochs_to_try:
        for batch_size in batch_sizes_to_try:
            path = saving_folder + f'/{epoch}_epochs_{batch_size}_batch_size'
            # Read each run's table once (it was previously re-read once per model type).
            R2_performances = pd.read_csv(path + f'/R2_performances/{label}.csv')
            run_scores = _r2_scores_by_model(R2_performances, model_types)
            for model_type, (real_r2, synthetic_r2) in run_scores.items():
                series = points[model_type]
                series['epoch'].append(epoch)
                series['batch_size'].append(batch_size)
                series['real'].append(real_r2)
                series['abs_diff'].append(abs(synthetic_r2 - real_r2))

    # Create subplots: one for real scores and one for abs(difference)
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Real Data Test R2', 'Absolute Difference of Test R2'),
        specs=[[{'type': 'scatter3d'}, {'type': 'scatter3d'}]]  # Specify 3D subplots
    )

    for model_type, (symbol, color) in model_symbols.items():
        series = points.get(model_type)
        if not series or not series['epoch']:
            continue  # no rows were found for this model type

        # Left panel: the real-data R2, matching the subplot title (the previous
        # code plotted the synthetic scores under the "Real Data" title).
        fig.add_trace(
            go.Scatter3d(
                x=series['epoch'], y=series['batch_size'], z=series['real'],
                mode='markers',
                name=model_type,
                legendgroup=model_type,
                marker=dict(symbol=symbol, color=color)
            ),
            row=1, col=1
        )

        # Right panel: |synthetic R2 - real R2| for the same model and run.
        fig.add_trace(
            go.Scatter3d(
                x=series['epoch'], y=series['batch_size'], z=series['abs_diff'],
                mode='markers',
                name=model_type,
                legendgroup=model_type,
                showlegend=False,  # one legend entry per model is enough
                marker=dict(symbol=symbol, color=color)
            ),
            row=1, col=2
        )

    # Lay out and show the figure once per label, after all traces have been added
    # (it was previously re-rendered inside the trace loop).
    fig.update_layout(
        height=600, width=1200, title_text="Comparison of Model Performance",
        scene=dict(
            xaxis=dict(title='epoch'),
            yaxis=dict(title='batch size'),
            zaxis=dict(title='real data test R2'),
        ),
        scene2=dict(
            xaxis=dict(title='epoch'),
            yaxis=dict(title='batch size'),
            zaxis=dict(title='abs(synthetic - real) test R2'),
        ),
    )
    fig.show()
    