**Warning:** this notebook depends on the `tabgan` package. Install it with `pip install tabgan` before running any of the cells below.

See the [tabgan GitHub repo](https://github.com/Diyago/GAN-for-tabular-data) for the library's documentation and examples.

In [None]:
cd ../

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from tabgan.sampler import OriginalGenerator, GANGenerator

import seaborn as sns
import matplotlib.pyplot as plt

from data.processed.pre_process_titanic import pre_process_df

%load_ext lab_black

### Parameters

In [None]:
# Directory holding the raw CSV files, relative to the working directory.
RAW_DATA_DIR = "data/raw"

# Raw train / test CSV locations.
TRAIN_DATA_PATH = f"{RAW_DATA_DIR}/train.csv"
TEST_DATA_PATH = f"{RAW_DATA_DIR}/test.csv"

# Name of the label column, and the numeric feature columns used for plots.
LABEL = "Survived"
NUM_COLUMNS = ["Fare", "Age"]

In [None]:
# CAT_COLUMNS = [f"embarked_{c}" for c in enc.categories_[0]]
# COLUMNS = data.columns.tolist() + CAT_COLUMNS

### 1. Load in the real data and prepare it

In [None]:
# Read the Titanic splits.
# `test` is the pre-processed test file as returned by pre_process_df.
test = pre_process_df(TEST_DATA_PATH)

# `target` is the label column taken straight from the raw training CSV,
# kept as a one-column DataFrame (note the double brackets).
# NOTE(review): this only lines up row-for-row with `train` if pre_process_df
# keeps the raw row order and row count. Confirm against its implementation.
target = pd.read_csv(TRAIN_DATA_PATH)[[LABEL]]

# `train` is the pre-processed training file with the label column removed.
train = pre_process_df(TRAIN_DATA_PATH).drop(columns=LABEL)

### 2. Visualise the data

Use a seaborn pair plot to inspect the distributions of the numeric features and the relationships between them.

In [None]:
# Numeric-only copy of the training features, used by the plots in this section.
# It is bound to `data` rather than `train` for two reasons:
#   * `data` is not defined by any earlier cell, so reading it here would raise
#     NameError on a fresh kernel;
#   * `train` must keep all of its columns, because later cells drop
#     DROP_COLUMNS from it and pass categorical columns to tabgan.
data = train[NUM_COLUMNS].copy()

In [None]:
# Pair plot of the numeric feature columns.
# Requires `data` to be a DataFrame that contains NUM_COLUMNS.
numeric_features = data[NUM_COLUMNS]
sns.pairplot(numeric_features)
plt.show()

In [None]:
# Histogram of the "Age" column. histplot returns the Axes it drew on, so the
# legend is attached to that Axes directly.
age_ax = sns.histplot(data["Age"], label="Real Data")
age_ax.legend()
plt.show()

### 3. tabgan application

In [None]:
# Columns removed from both `train` and `test` before the frames are passed to tabgan.
DROP_COLUMNS = ["Name", "Ticket", "Cabin", "PassengerId"]

In [None]:
# Remove the DROP_COLUMNS columns from both frames.
# `errors="ignore"` makes the cell safe to re-run: on a second execution the
# columns are already gone, and the default `errors="raise"` would fail with
# KeyError.
train = train.drop(columns=DROP_COLUMNS, errors="ignore")
test = test.drop(columns=DROP_COLUMNS, errors="ignore")

In [None]:
# Impute missing values with each column's median.
# SimpleImputer.fit_transform returns a plain NumPy array by default, which
# would silently turn `data` into an ndarray. Other cells index `data` by
# column name and call `data.head()`, so the imputed values are wrapped back
# into a DataFrame with the original column labels and index.
# Requires `data` to be an all-numeric DataFrame (the median strategy rejects
# non-numeric columns).
imputer = SimpleImputer(strategy="median")
data = pd.DataFrame(
    imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
)

In [None]:
# Fill every missing value in both frames with 0.
# Reassignment is used instead of `inplace=True`: it produces the same frames
# without mutating objects that may be referenced elsewhere, and it is the
# form recommended by pandas.
train = train.fillna(0)
test = test.fillna(0)

In [None]:
# GANGenerator example with every constructor / pipeline argument spelled out.

# Columns passed to the generator as `cat_cols`.
categorical_columns = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]

# Settings forwarded as `adversarial_model_params`.
adversarial_params = {
    "metrics": "AUC",
    "max_depth": 2,
    "max_bin": 100,
    "learning_rate": 0.02,
    "random_state": 42,
    "n_estimators": 500,
}

# Settings forwarded as `gan_params`.
gan_training_params = {
    "batch_size": 500,
    "patience": 25,
    "epochs": 500,
}

gan_generator = GANGenerator(
    gen_x_times=100,
    cat_cols=categorical_columns,
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    adversarial_model_params=adversarial_params,
    pregeneration_frac=2,
    only_generated_data=False,
    gan_params=gan_training_params,
)

# Run the generation pipeline on the prepared train / target / test frames.
new_train, new_target = gan_generator.generate_data_pipe(
    train,
    target,
    test,
    deep_copy=True,
    only_adversarial=False,
    use_adversarial=True,
)

In [None]:
# Same pipeline call as the GANGenerator cell, but using OriginalGenerator
# with only `gen_x_times` set.
original_generator = OriginalGenerator(gen_x_times=1.5)
new_train1, new_target1 = original_generator.generate_data_pipe(train, target, test)

In [None]:
# Display the training frame returned by OriginalGenerator.generate_data_pipe.
new_train1

### 4. Comparing the real and fake data distributions

In [None]:
# Wrap the real and generated feature arrays in DataFrames that share the
# numeric column labels.
# NOTE(review): `real_features` and `fake_features` are not defined in any
# visible cell. Confirm where they are supposed to come from.
real_df = pd.DataFrame(data=real_features, columns=NUM_COLUMNS)
fake_df = pd.DataFrame(data=fake_features, columns=NUM_COLUMNS)

In [None]:
# Standardise the generated samples using the statistics of the real data.
# The previous version subtracted the mean of `data` but divided by the
# standard deviation of `fake_data`. Both statistics now come from `data`, so
# real and generated values end up on one shared scale. This matches the
# de-normalisation cell further down, which applies a single mean/stddev pair
# to both frames.
# NOTE(review): `fake_data` is not defined in any visible cell. Confirm its source.
real_mean = np.mean(data, axis=0)
real_std = np.std(data, axis=0)
fake_data = (fake_data - real_mean) / real_std

In [None]:
# Overlay two histograms of one feature on a single Axes.
fig, ax1 = plt.subplots()

feature = "Age"

real_values = data[feature]
# NOTE(review): the series labelled "Fake Data" is an affine transform of the
# real column (0.7 * x + 10), not output from either generator. Confirm whether
# generated samples should be plotted here instead.
placeholder_fake_values = 0.7 * data[feature] + 10

sns.histplot(real_values, label="Real Data", alpha=0.5, ax=ax1)
sns.histplot(placeholder_fake_values, label="Fake Data", alpha=0.5, ax=ax1)
ax1.legend(loc="upper left")
plt.show()

In [None]:
# Show the "Fare" entry of `numerical_feature_means` (presumably the column mean
# recorded before normalisation).
# NOTE(review): `numerical_feature_means` is not defined in any visible cell. Confirm its source.
numerical_feature_means["Fare"]

In [None]:
# Z-score `data` column-wise (axis=0): subtract each column's mean, then divide
# by that column's standard deviation. Both statistics are computed from the
# un-normalised values before `data` is rebound.
column_mean = np.mean(data, axis=0)
column_std = np.std(data, axis=0)
data = (data - column_mean) / column_std

In [None]:
# Map each numeric column of both frames back to its original scale:
# value * stddev + mean, using the same statistics for real and fake data.
# NOTE: this rewrites the columns in place, so running the cell twice applies
# the scaling twice.
# NOTE(review): `numerical_feature_stddev` and `numerical_feature_means` are
# not defined in any visible cell. Confirm their source.
for ft in NUM_COLUMNS:
    for frame in (real_df, fake_df):
        scale = numerical_feature_stddev[ft]
        offset = numerical_feature_means[ft]
        frame[ft] = (frame[ft] * scale) + offset

In [None]:
# Preview the first rows of `data`. Requires `data` to still be a DataFrame here.
data.head()

In [None]:
# Preview the first rows of the real-feature frame.
real_df.head()

In [None]:
# Summary statistics of the real features.
real_df.describe()

In [None]:
# Summary statistics of the generated ("fake") features, to set against real_df.describe().
fake_df.describe()