In [2]:
import pandas as pd
# Load the raw dataset from the CSV that sits next to the notebook.
df = pd.read_csv('kaggle_real_data.csv')

# One-hot encode the categorical columns.
# The list must be passed as `columns=`: the second positional parameter of
# pd.get_dummies is `prefix`, so passing it positionally only works by accident
# (when the frame happens to contain exactly these object columns, in this order).
df = pd.get_dummies(df, columns=['Airline', 'Source', 'Destination', 'Journey_Day_of_Week'])

In [3]:
# Show the first three rows of the encoded frame.
df.head(3)

Unnamed: 0,Duration,Price,Journey_Date,Journey_Month,Journey_Year,Stops,Departure_Minutes,Arrival_Minutes,Reaching_Next_Day,Airline_Air Asia,...,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi,Journey_Day_of_Week_Friday,Journey_Day_of_Week_Monday,Journey_Day_of_Week_Saturday,Journey_Day_of_Week_Sunday,Journey_Day_of_Week_Thursday,Journey_Day_of_Week_Tuesday,Journey_Day_of_Week_Wednesday
0,170,3897,24,3,2019,0,1340,70,True,False,...,False,False,True,False,False,False,True,False,False,False
1,445,7662,1,5,2019,2,350,795,False,False,...,False,False,False,False,False,False,False,False,False,True
2,1140,13882,9,6,2019,2,565,265,True,False,...,False,False,False,False,False,False,True,False,False,False


In [3]:
import os
import logging
import numpy as np
import pandas as pd
import tensorflow as tf

# Only let TensorFlow's Python logger emit messages at ERROR level or above.
tf.get_logger().setLevel(logging.ERROR)

class GAN:
    """Small fully-connected GAN for tabular data.

    Parameters
    ----------
    data : numpy.ndarray or pandas.DataFrame
        Two-dimensional training data, one row per sample. It is stored
        unchanged on ``self.data``; a NumPy view of it is used for training.
    latent_dim : int
        Size of the noise vector fed to the generator.
    batch_size : int
        Number of real (and fake) rows used in every training step.
    n_epochs : int
        Number of training steps. Each "epoch" here is a single batch update
        of the discriminator followed by a single update of the generator.
    columns : sequence, optional
        Column labels for the frames returned by ``generate_synthetic_data``.
        Defaults to ``data.columns`` when ``data`` has that attribute,
        otherwise the generated frame gets pandas' default integer labels.
    """

    def __init__(self, data, latent_dim=64, batch_size=32, n_epochs=200, columns=None):
        self.data = data
        self.latent_dim = latent_dim
        self.batch_size = batch_size
        self.n_epochs = n_epochs

        # Row sampling in train() needs positional (NumPy) indexing, while
        # generate_synthetic_data() needs column labels. Keep both explicitly so
        # the class works for arrays (e.g. a scaler's output) and DataFrames alike.
        if columns is None:
            columns = getattr(data, "columns", None)
        self._columns = columns

        if isinstance(data, pd.DataFrame):
            self._values = data.to_numpy(dtype=float)
        else:
            self._values = np.asarray(data)

    # Generate random noise in the latent space
    def _noise(self, batch_size):
        """Return a ``(batch_size, latent_dim)`` array of standard-normal noise."""
        return np.random.normal(0, 1, (batch_size, self.latent_dim))

    def _generator(self):
        """Build the generator: latent vector -> one value per data column."""
        n_features = self._values.shape[1]
        model = tf.keras.Sequential(name="Generator_model")
        model.add(tf.keras.layers.Dense(128, activation='relu', kernel_initializer='he_uniform', input_dim=self.latent_dim))
        model.add(tf.keras.layers.Dense(256, activation='relu'))
        # Linear output with the same width as the training data.
        model.add(tf.keras.layers.Dense(n_features, activation='linear'))
        return model

    def _discriminator(self):
        """Build and compile the discriminator: data row -> probability of "real"."""
        n_features = self._values.shape[1]
        model = tf.keras.Sequential(name="Discriminator_model")
        model.add(tf.keras.layers.Dense(256, activation='relu', kernel_initializer='he_uniform', input_dim=n_features))
        model.add(tf.keras.layers.Dense(128, activation='relu'))
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))  # Binary classification (real or fake)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    # Define the combined generator and discriminator model, for updating the generator
    def _GAN(self, generator, discriminator):
        """Stack generator and discriminator into the model used to update the generator."""
        # The discriminator is flagged non-trainable before the stacked model is compiled.
        discriminator.trainable = False
        generator.trainable = True
        model = tf.keras.Sequential(name="GAN")
        model.add(generator)
        model.add(discriminator)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model

    # Train the generator and discriminator
    def train(self):
        """Run ``n_epochs`` alternating discriminator/generator updates.

        Returns
        -------
        The trained generator model.
        """
        generator = self._generator()
        discriminator = self._discriminator()
        gan = self._GAN(generator, discriminator)

        # Targets are the same for every step, so build them once.
        real_labels = np.ones((self.batch_size, 1))
        fake_labels = np.zeros((self.batch_size, 1))
        n_rows = self._values.shape[0]

        # Training loop
        for epoch in range(self.n_epochs):
            # Select a random batch of real data (sampled with replacement)
            idx = np.random.randint(0, n_rows, self.batch_size)
            real_data = self._values[idx]

            # Generate a batch of fake data; verbose=0 keeps predict() from
            # printing a progress bar on every single training step.
            noise = self._noise(self.batch_size)
            fake_data = generator.predict(noise, verbose=0)

            # Train the discriminator on real and fake data
            d_loss_real = discriminator.train_on_batch(real_data, real_labels)
            d_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train the generator (wants discriminator to label generated samples as real)
            noise = self._noise(self.batch_size)
            g_loss = gan.train_on_batch(noise, real_labels)

            # Print progress
            if (epoch + 1) % 100 == 0:
                print(f"Epoch: {epoch + 1}/{self.n_epochs}, D Loss: {d_loss[0]:.4f}, G Loss: {g_loss:.4f}")

        return generator

    # Generate synthetic data
    def generate_synthetic_data(self, generator, num_samples=1000):
        """Sample ``num_samples`` rows from ``generator`` and return them as a DataFrame.

        The frame uses the column labels captured at construction time (see
        ``columns`` in the class docstring). Values are returned exactly as the
        generator produced them.
        """
        noise = self._noise(num_samples)
        synthetic_data = generator.predict(noise)
        synthetic_df = pd.DataFrame(synthetic_data, columns=self._columns)
        return synthetic_df

2024-11-11 00:05:10.695506: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-11 00:05:10.807935: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
from sklearn.preprocessing import MinMaxScaler
# Fit a min-max scaler on the encoded frame. The fitted instance is kept in
# `scaler` because it is reused further down to undo the scaling.
scaler = MinMaxScaler()
data = scaler.fit(df).transform(df)

# Configure the GAN on the scaled array.
gan = GAN(data=data, latent_dim=64, batch_size=32, n_epochs=2000)


In [5]:
from tensorflow.keras.models import load_model

# Reuse a previously saved generator when one exists; otherwise train one and
# save it, so the notebook also runs from a fresh checkout / fresh kernel
# instead of failing on a missing model file.
GENERATOR_PATH = 'generator_model.h5'

if os.path.exists(GENERATOR_PATH):
    # Load the saved model
    generator = load_model(GENERATOR_PATH)
else:
    generator = gan.train()
    generator.save(GENERATOR_PATH)


2024-11-11 00:05:18.430019: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-11 00:05:18.430550: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [7]:
import numpy as np
import pandas as pd

# Function to generate synthetic data using a trained generator and restore original column names
def generate_synthetic_data(generator, latent_dim, original_columns, num_samples=1000):
    """Sample rows from a generator and label them with the given columns.

    Parameters
    ----------
    generator : object exposing ``predict(noise)``
        Called once with the whole noise batch.
    latent_dim : int
        Width of each noise vector.
    original_columns : sequence
        Column labels applied to the returned frame.
    num_samples : int, default 1000
        Number of noise vectors (and therefore rows) to draw.

    Returns
    -------
    pandas.DataFrame
        The generator's output, one row per noise vector.
    """
    # Standard-normal latent vectors, one per requested sample.
    latent_batch = np.random.normal(loc=0, scale=1, size=(num_samples, latent_dim))
    generated = generator.predict(latent_batch)
    return pd.DataFrame(generated, columns=original_columns)

# Example usage
# Assuming 'generator' is your trained generator model, 'latent_dim' is the size of the noise vector
# and 'original_columns' is a list of your original column names

# List of original columns from your dataset
# Column labels of the encoded frame, reused for the synthetic rows.
original_columns = df.columns

# Number of synthetic samples to draw from the generator.
num_samples = 2000

synthetic_data = generate_synthetic_data(
    generator,
    latent_dim=64,
    original_columns=original_columns,
    num_samples=num_samples,
)



[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step


In [9]:
# Undo the min-max scaling on the generated rows.
reverted_data = scaler.inverse_transform(synthetic_data)

# Wrap the result in a DataFrame again, labelled like the encoded frame.
reverted_df = pd.DataFrame(data=reverted_data, columns=df.columns)

In [10]:
# Scale the real data with the scaler that was already fitted on `df`.
# Use transform(), not fit_transform(): re-fitting here would silently replace
# the parameters of the shared `scaler` that inverse_transform() relies on.
scaled_array = scaler.transform(df)

# Convert the scaled array back to a DataFrame, keeping the same columns
df1 = pd.DataFrame(scaled_array, columns=df.columns)

In [26]:
# List the column labels of the generated frame.
reverted_df.columns

Index(['Duration', 'Price', 'Journey_Date', 'Journey_Month', 'Journey_Year',
       'Stops', 'Departure_Minutes', 'Arrival_Minutes', 'Reaching_Next_Day',
       'Airline_Air Asia', 'Airline_Air India', 'Airline_GoAir',
       'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Banglore', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata',
       'Source_Mumbai', 'Destination_Banglore', 'Destination_Cochin',
       'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata',
       'Destination_New Delhi', 'Journey_Day_of_Week_Friday',
       'Journey_Day_of_Week_Monday', 'Journey_Day_of_Week_Saturday',
       'Journey_Day_of_Week_Sunday', 'Journey_Day_of_Week_Thursday',
       'Journey_Day_of_Week_Tuesday', 'Journey_Day_of_Week_Wednesday'],
      dtype='object')

In [212]:
# Show the first rows of the generated frame.
reverted_df.head()

Unnamed: 0,Duration,Price,Journey_Date,Journey_Month,Journey_Year,Stops,Departure_Minutes,Arrival_Minutes,Reaching_Next_Day,Airline_Air Asia,...,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi,Journey_Day_of_Week_Friday,Journey_Day_of_Week_Monday,Journey_Day_of_Week_Saturday,Journey_Day_of_Week_Sunday,Journey_Day_of_Week_Thursday,Journey_Day_of_Week_Tuesday,Journey_Day_of_Week_Wednesday
0,999.0,3942.0,7.0,4.0,2019.0,1.0,696.0,664.0,True,False,...,False,False,False,False,False,True,False,False,False,False
1,775.0,5852.0,12.0,4.0,2019.0,1.0,656.0,846.0,False,False,...,False,False,False,False,False,False,False,True,False,False
2,845.0,5405.0,16.0,4.0,2019.0,2.0,453.0,688.0,False,False,...,False,False,False,False,True,False,False,False,False,False
3,734.0,4905.0,10.0,4.0,2019.0,1.0,486.0,829.0,False,False,...,False,False,False,False,True,False,False,False,False,False
4,532.0,3273.0,12.0,4.0,2019.0,2.0,511.0,1091.0,False,False,...,True,False,False,False,True,False,False,False,False,False


In [64]:


# Compare the extremes of 'Arrival_Minutes' in the generated frame with the
# real data. Each line is labelled with what it actually prints (max or min).
for label, frame in (("reverted_df", reverted_df), ("df", df)):
    column = frame['Arrival_Minutes']
    print(f"Max of Arrival_Minutes ({label}):", column.max())
    print(f"Min of Arrival_Minutes ({label}):", column.min())



Range of the column(reverted_df): 7283.674
Range of the column(reverted_df): 95.08223
Range of the column(df): 1435
Range of the column(df): 5


In [67]:
# The generated 'Arrival_Minutes' values are not confined to the range of the
# real data, so map them linearly onto [min, max] of the real column.
min_df = df['Arrival_Minutes'].min()
max_df = df['Arrival_Minutes'].max()

min_reverted = reverted_df['Arrival_Minutes'].min()
max_reverted = reverted_df['Arrival_Minutes'].max()

# A constant generated column would give 0/0 -> NaN and then fail with an
# obscure cast error below; fail here with a clear message instead.
reverted_span = max_reverted - min_reverted
if reverted_span == 0:
    raise ValueError(
        "Cannot rescale 'Arrival_Minutes': all generated values are identical."
    )

scaled = ((reverted_df['Arrival_Minutes'] - min_reverted) / reverted_span) * (max_df - min_df) + min_df

# Write the rescaled values back into 'Arrival_Minutes' as whole minutes
# (rounded to the nearest integer, then cast to int).
reverted_df['Arrival_Minutes'] = scaled.round().astype(int)

In [216]:
# Save the DataFrame to a CSV file (index=False leaves out the row index).
# NOTE(review): the file name says "real+generated", but in the cells visible
# here `reverted_df` is built only from generator output — confirm whether the
# real rows are meant to be appended before this export.
reverted_df.to_csv('real+generated_data.csv', index=False)
