Statistical Learning HW3

Kasra Mojallal 110124782

# Generating data with GAN

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from sklearn.datasets import fetch_california_housing

2023-08-04 10:33:23.059287: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
data

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [2]:
# We use California Dataset as a regression problem
# It has 8 features
data = fetch_california_housing()
X, y = data.data, data.target

In [13]:
X

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [3]:
X.shape, y.shape

((20640, 8), (20640,))

In [4]:
# The Generator
# The Generator will generate 9 features, 8 relating to the X values
# the last one would be the y
generator = models.Sequential()
generator.add(layers.Dense(128, input_dim=50, activation='relu'))
generator.add(layers.Dense(256, activation='relu'))
generator.add(layers.Dense(9, activation='linear'))

# The Discriminator
discriminator = models.Sequential()
discriminator.add(layers.Dense(256, input_dim=9, activation='relu'))
discriminator.add(layers.Dense(128, activation='relu'))
discriminator.add(layers.Dense(1, activation='sigmoid'))

In [5]:
# Creating the GAN model
discriminator.trainable = False
gan_input = layers.Input(shape=(50,))
x = generator(gan_input)
gan_output = discriminator(x)
gan = models.Model(gan_input, gan_output)

In [6]:
def normalize_data(data):
    return (data - np.min(data, axis=0)) / (np.max(data, axis=0) - np.min(data, axis=0))

def preprocess_data(X, y):
    X_normalized = normalize_data(X)
    y_normalized = normalize_data(y.reshape(-1, 1))
    return X_normalized, y_normalized

In [7]:
# Compiling the Discriminator and the GAN
discriminator.compile(loss='binary_crossentropy',
                      optimizer=optimizers.Adam(learning_rate=0.0002))
gan.compile(loss='binary_crossentropy',
            optimizer=optimizers.Adam(learning_rate=0.0002))

In [8]:
epochs = 50
batch_size = 32

for epoch in range(epochs):
        idx = np.random.randint(0, X.shape[0], batch_size)
        real_samples_X = X[idx]
        real_samples_y = y[idx]
        real_samples_y = real_samples_y.reshape(-1, 1)

        noise = np.random.normal(0, 1, size=(batch_size, 50))
        generated_samples = generator.predict(noise)

        X_batch = np.concatenate([real_samples_X, generated_samples[:, :8]], axis=0)
        y_batch = np.concatenate([real_samples_y, generated_samples[:, 8:]], axis=0)
        
        Xy = np.concatenate([X_batch, y_batch], axis=1)

        y_discriminator = np.zeros(2 * batch_size)
        y_discriminator[:batch_size] = 0.9  # Label smoothing

        discriminator_loss = discriminator.train_on_batch(Xy, y_discriminator)

        noise = np.random.normal(0, 1, size=(batch_size, 50))
        y_gan = np.ones(batch_size)
        gan_loss = gan.train_on_batch(noise, y_gan)

        print(f"Epoch: {epoch}, Discriminator Loss: {discriminator_loss}, GAN Loss: {gan_loss}")

Epoch: 0, Discriminator Loss: 73.0049819946289, GAN Loss: 0.7152770757675171
Epoch: 1, Discriminator Loss: 87.5365219116211, GAN Loss: 0.7110215425491333
Epoch: 2, Discriminator Loss: 57.251102447509766, GAN Loss: 0.7050318717956543
Epoch: 3, Discriminator Loss: 72.19908142089844, GAN Loss: 0.7039123773574829
Epoch: 4, Discriminator Loss: 80.90995788574219, GAN Loss: 0.7036734223365784
Epoch: 5, Discriminator Loss: 67.80726623535156, GAN Loss: 0.7017752528190613
Epoch: 6, Discriminator Loss: 68.39640808105469, GAN Loss: 0.7003308534622192
Epoch: 7, Discriminator Loss: 90.03854370117188, GAN Loss: 0.6957268118858337
Epoch: 8, Discriminator Loss: 83.80489349365234, GAN Loss: 0.6969507932662964
Epoch: 9, Discriminator Loss: 73.7119369506836, GAN Loss: 0.6951723098754883
Epoch: 10, Discriminator Loss: 62.18650436401367, GAN Loss: 0.6901654601097107
Epoch: 11, Discriminator Loss: 65.37095642089844, GAN Loss: 0.6920673251152039
Epoch: 12, Discriminator Loss: 84.71754455566406, GAN Loss: 0.68

In [9]:
# Generating the new data with labels
num_synthetic_samples = 1000
noise = np.random.normal(0, 1, size=(num_synthetic_samples, 50))
synthetic_samples = generator.predict(noise)



In [12]:
synthetic_samples

array([[-0.32345915, -0.47504732,  1.70821   , ..., -2.2538948 ,
        -1.497531  , -2.2600029 ],
       [ 0.24176769, -0.17335233,  1.2272843 , ..., -2.0606976 ,
        -0.9735132 , -2.2675252 ],
       [ 0.02540219, -0.23192069,  0.9667895 , ..., -1.878648  ,
        -0.80007035, -1.8910781 ],
       ...,
       [ 0.31821173, -0.83303046,  1.1491485 , ..., -1.8884203 ,
        -1.018875  , -1.4994751 ],
       [-0.06323391, -0.25735375,  0.78762186, ..., -0.9178278 ,
        -0.8579634 , -1.3415406 ],
       [-0.23944673, -0.45586166,  1.0438039 , ..., -2.2090416 ,
        -1.2579383 , -1.8022774 ]], dtype=float32)

# Linear Regression Model on original data

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
X_small = X[:100, :]
y_small = y[:100]

print(X_small.shape, y_small.shape)

(100, 8) (100,)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_small,
                                                    y_small,
                                                    test_size=0.2,
                                                    random_state=42)

In [None]:
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

In [None]:
y_pred = regression_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.24338957249433815


# Linear Regression Model on (original+generated) data

In [None]:
generated_X = synthetic_samples[:100, :8]
generated_y = synthetic_samples[:100, 8:].reshape(-1)

In [None]:
combined_X = np.concatenate([X_small, generated_X], axis=0)
combined_y = np.concatenate([y_small, generated_y], axis=0)

In [None]:
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(combined_X,
                                                            combined_y,
                                                            test_size=0.2,
                                                            random_state=42)

In [None]:
new_regression_model = LinearRegression()
new_regression_model.fit(X_train_c, y_train_c)

In [None]:
y_pred_c = new_regression_model.predict(X_test_c)

mse = mean_squared_error(y_test_c, y_pred_c)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.12584670369745898


Results:
We can see that when we add the generated data from the GAN, it makes the loss better