Case Study 4:
Present your POV on how to generate synthetic data using GANs. You can assume a sample dataset from an IoT-enabled
machine where failure rates are minimal.

In [None]:
# gan_for_ncr_ride_bookings.py
# Run in Google Colab or local env after: pip install pandas scikit-learn tensorflow matplotlib

import os
import numpy as np
import pandas as pd

# ---- Step 0: location of the uploaded bookings CSV ----
CSV_PATH = "/content/ncr_ride_bookings (1).csv"  # leave unchanged when running in the same environment

# Fail fast with an actionable message rather than a pandas traceback later on.
csv_is_present = os.path.exists(CSV_PATH)
if not csv_is_present:
    raise FileNotFoundError(f"CSV not found at {CSV_PATH}. Upload it or adjust the path.")


In [None]:
# ---- Step 1: Read the bookings CSV and take a first look ----
df = pd.read_csv(CSV_PATH)

# Collect the pieces to display first, then print them in one place.
column_names = df.columns.tolist()
preview_text = df.head(8).to_string(index=False)

print("Shape:", df.shape)
print("Columns:", column_names)
print("\nFirst 8 rows:")
print(preview_text)

Shape: (150000, 21)
Columns: ['Date', 'Time', 'Booking ID', 'Booking Status', 'Customer ID', 'Vehicle Type', 'Pickup Location', 'Drop Location', 'Avg VTAT', 'Avg CTAT', 'Cancelled Rides by Customer', 'Reason for cancelling by Customer', 'Cancelled Rides by Driver', 'Driver Cancellation Reason', 'Incomplete Rides', 'Incomplete Rides Reason', 'Booking Value', 'Ride Distance', 'Driver Ratings', 'Customer Rating', 'Payment Method']

First 8 rows:
      Date     Time   Booking ID  Booking Status  Customer ID  Vehicle Type     Pickup Location     Drop Location  Avg VTAT  Avg CTAT  Cancelled Rides by Customer Reason for cancelling by Customer  Cancelled Rides by Driver Driver Cancellation Reason  Incomplete Rides Incomplete Rides Reason  Booking Value  Ride Distance  Driver Ratings  Customer Rating Payment Method
2024-03-23 12:29:38 "CNR5884300" No Driver Found "CID1982111"         eBike         Palam Vihar           Jhilmil       NaN       NaN                          NaN                    

In [None]:
# ---- Step 2: Derive the binary target 'is_cancelled' from 'Booking Status' ----
# A booking counts as cancelled when its status text contains "cancel", ignoring case.
status_column_present = "Booking Status" in df.columns
if not status_column_present:
    raise ValueError("Expected column 'Booking Status' not found. Please adjust column name in script.")

# Missing statuses become empty strings so they can never match "cancel".
status_text = df['Booking Status'].fillna("").astype(str)
cancel_mask = status_text.str.lower().str.contains("cancel")
df['is_cancelled'] = cancel_mask.astype(int)
print("\nis_cancelled value counts:\n", df['is_cancelled'].value_counts())

# Report the size of the cancelled class relative to the whole dataset.
n_total = len(df)
n_cancel = int(df['is_cancelled'].sum())
print(f"\nCancellations: {n_cancel}/{n_total} ({n_cancel/n_total:.4%})")



is_cancelled value counts:
 is_cancelled
0    112500
1     37500
Name: count, dtype: int64

Cancellations: 37500/150000 (25.0000%)


In [None]:
# ---- Step 3: Choose features ----
# Features = every numeric column (except the target) plus a few low-cardinality
# categorical columns, one-hot encoded. Edit the candidate list in the loop below
# to try different categoricals.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# remove the target from numeric columns if present
numeric_cols = [c for c in numeric_cols if c != 'is_cancelled']
# NOTE(review): numeric columns such as 'Cancelled Rides by Customer' and
# 'Cancelled Rides by Driver' look like direct encodings of the cancellation
# outcome; confirm whether they should be excluded before trusting any
# classifier metric computed on these features.

# pick small-cardinality categoricals to include
categorical_keep = []
for c in ["Vehicle Type", "Pickup Location", "Drop Location"]:
    if c in df.columns:
        # only keep if cardinality is not huge (to avoid too many one-hot columns)
        if df[c].nunique() <= 50:
            categorical_keep.append(c)

print("\nNumeric features chosen:", numeric_cols)
print("Categorical features to one-hot (if present & small cardinality):", categorical_keep)

# Build feature DataFrame
X_num = df[numeric_cols].fillna(0).copy()  # numeric features (fill missing with 0)
# dtype=float keeps the one-hot columns numeric (0.0/1.0) instead of bool, so
# X.values is a plain float matrix rather than an object array.
X_cat = (
    pd.get_dummies(df[categorical_keep].fillna("NA").astype(str), drop_first=True, dtype=float)
    if categorical_keep
    else pd.DataFrame()
)
X = pd.concat([X_num.reset_index(drop=True), X_cat.reset_index(drop=True)], axis=1)

print("\nFinal feature matrix shape:", X.shape)
print("Feature sample (first 6 cols):")
print(X.iloc[:5, :6].to_string(index=False))

# Save prepared dataset for review.
# This must NOT be the same file as CSV_PATH: writing the prepared matrix over
# the raw input destroys the source data and breaks a fresh re-run of Step 1.
prepared_path = "/content/ncr_ride_bookings_prepared.csv"
pd.concat([X, df['is_cancelled'].reset_index(drop=True)], axis=1).to_csv(prepared_path, index=False)
print(f"\nSaved prepared dataset to: {prepared_path}")


Numeric features chosen: ['Avg VTAT', 'Avg CTAT', 'Cancelled Rides by Customer', 'Cancelled Rides by Driver', 'Incomplete Rides', 'Booking Value', 'Ride Distance', 'Driver Ratings', 'Customer Rating']
Categorical features to one-hot (if present & small cardinality): ['Vehicle Type']

Final feature matrix shape: (150000, 15)
Feature sample (first 6 cols):
 Avg VTAT  Avg CTAT  Cancelled Rides by Customer  Cancelled Rides by Driver  Incomplete Rides  Booking Value
      0.0       0.0                          0.0                        0.0               0.0            0.0
      4.9      14.0                          0.0                        0.0               1.0          237.0
     13.4      25.8                          0.0                        0.0               0.0          627.0
     13.1      28.5                          0.0                        0.0               0.0          416.0
      5.3      19.6                          0.0                        0.0               0.0    

In [None]:
# ---- Step 4: Isolate minority class (cancelled) and scale ----
from sklearn.preprocessing import StandardScaler

features = list(X.columns)
X_arr = X.to_numpy().astype(np.float32)
y_arr = df['is_cancelled'].to_numpy().astype(np.int32)

# Standardise using statistics from the full dataset (fit on the cancelled
# rows only instead if that suits your use case better).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_arr)

# Only the cancelled rows (target == 1) are used to train the GAN.
cancelled_rows = y_arr == 1
X_minority_scaled = X_scaled[cancelled_rows]
print("\nMinority scaled shape (for GAN training):", X_minority_scaled.shape)
n_minority_rows = X_minority_scaled.shape[0]
if n_minority_rows == 0:
    raise ValueError("No cancelled rows found (is_cancelled==1). Adjust the mapping or confirm dataset contains cancellations.")



Minority scaled shape (for GAN training): (37500, 15)


In [None]:
# ---- Step 5: Build GAN (Keras/TensorFlow). If TF isn't available, this will error with guidance. ----
try:
    import tensorflow as tf
    from tensorflow.keras import layers, Model, Input
    print("\nTensorFlow version:", tf.__version__)
except Exception as tf_import_error:
    # Re-raise with installation guidance, keeping the original error chained.
    raise ImportError("TensorFlow is required to train the GAN. Run this script in Colab or install TF: pip install tensorflow") from tf_import_error

# GAN hyperparameters
feature_dim = X_minority_scaled.shape[1]  # width of one scaled feature row
latent_dim = 32                           # size of the generator's noise input
batch_size = 32
steps = 2000                              # raise this for better results
lr = 2e-4                                 # Adam learning rate used when compiling the models

def build_generator(latent_dim, feature_dim):
    """Build the generator network mapping a latent vector to a feature row.

    Args:
        latent_dim: Length of the noise vector fed to the network.
        feature_dim: Number of output units (one per feature column).

    Returns:
        An uncompiled Keras ``Model`` named "generator" with three ReLU hidden
        layers (128, 256, 256 units) and a linear output layer.
    """
    noise_in = Input(shape=(latent_dim,))
    hidden = noise_in
    for units in (128, 256, 256):
        hidden = layers.Dense(units, activation="relu")(hidden)
    features_out = layers.Dense(feature_dim, activation="linear")(hidden)
    return Model(noise_in, features_out, name="generator")

def build_discriminator(feature_dim):
    """Build the discriminator network scoring a feature row.

    Args:
        feature_dim: Number of input features per row.

    Returns:
        An uncompiled Keras ``Model`` named "discriminator" with two ReLU
        hidden layers (256, 128 units) and a single sigmoid output unit.
    """
    row_in = Input(shape=(feature_dim,))
    hidden = row_in
    for units in (256, 128):
        hidden = layers.Dense(units, activation="relu")(hidden)
    score_out = layers.Dense(1, activation="sigmoid")(hidden)
    return Model(row_in, score_out, name="discriminator")

# Instantiate both networks; only the discriminator is compiled on its own
# (Adam + binary cross-entropy, with accuracy reported as a metric).
generator = build_generator(latent_dim, feature_dim)
discriminator = build_discriminator(feature_dim)
discriminator.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss="binary_crossentropy", metrics=["accuracy"])

# Combined model: noise -> generator -> discriminator. It is compiled with its
# own Adam optimizer and binary cross-entropy. The discriminator is flagged
# non-trainable *after* its own compile() and *before* the combined compile();
# this statement order is deliberate.
# NOTE(review): this compile-then-freeze pattern relies on the trainable state
# being captured at compile() time. With newer Keras versions (this notebook
# printed TF 2.19.0) the flag may also freeze the discriminator's own
# train_on_batch updates; confirm the discriminator actually learns.
discriminator.trainable = False
z = Input(shape=(latent_dim,))
gen_samples = generator(z)
validity = discriminator(gen_samples)
combined = Model(z, validity)
combined.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss="binary_crossentropy")

# Print layer-by-layer summaries of both networks.
generator.summary()
discriminator.summary()



TensorFlow version: 2.19.0


In [None]:
# ---- Step 6: Train GAN on minority class samples ----
# Both updates are written as explicit tf.GradientTape steps with their own
# optimizers and variable lists, instead of train_on_batch on compiled models.
# Reasons:
#   * The discriminator update no longer depends on when the `trainable` flag
#     was toggled relative to compile(). That dependency differs between Keras
#     versions and can leave the discriminator silently frozen: d_loss climbs
#     steadily while g_loss collapses towards 0.
#   * The reported losses are the true per-step values.
#   * Fakes come from a direct generator call rather than generator.predict(),
#     which is far heavier when invoked once per step.

# If minority class is very small, replicate to reach reasonable batch size
X_train = X_minority_scaled.astype(np.float32)
if X_train.shape[0] < batch_size:
    reps = int(np.ceil(batch_size / X_train.shape[0]))
    X_train = np.tile(X_train, (reps, 1))

half_batch = batch_size // 2
d_losses, g_losses = [], []

# Make sure the discriminator's weights are visible in trainable_variables;
# freezing for the generator step is handled by choosing the variable list.
discriminator.trainable = True
bce = tf.keras.losses.BinaryCrossentropy()
d_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
g_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)


def _discriminator_step(real_batch, noise_batch):
    """Run one discriminator update on real rows and freshly generated fakes.

    Returns the mean of the real-batch and fake-batch BCE losses as a float.
    """
    # Fakes are produced outside the tape: only discriminator weights are updated here.
    fake_batch = generator(noise_batch, training=True)
    with tf.GradientTape() as tape:
        real_pred = discriminator(real_batch, training=True)
        fake_pred = discriminator(fake_batch, training=True)
        loss = 0.5 * (
            bce(tf.ones_like(real_pred), real_pred)
            + bce(tf.zeros_like(fake_pred), fake_pred)
        )
    d_vars = discriminator.trainable_variables
    d_optimizer.apply_gradients(zip(tape.gradient(loss, d_vars), d_vars))
    return float(loss)


def _generator_step(noise_batch):
    """Run one generator update that pushes the discriminator towards 'real'.

    Gradients are taken with respect to generator weights only, so the
    discriminator is left unchanged. Returns the BCE loss as a float.
    """
    with tf.GradientTape() as tape:
        pred = discriminator(generator(noise_batch, training=True), training=True)
        loss = bce(tf.ones_like(pred), pred)
    g_vars = generator.trainable_variables
    g_optimizer.apply_gradients(zip(tape.gradient(loss, g_vars), g_vars))
    return float(loss)


for step in range(steps):
    # Train discriminator on half a batch of real rows and half a batch of fakes
    idx = np.random.randint(0, X_train.shape[0], half_batch)
    real = tf.convert_to_tensor(X_train[idx])
    noise = tf.convert_to_tensor(np.random.normal(0, 1, (half_batch, latent_dim)).astype(np.float32))
    d_loss = _discriminator_step(real, noise)

    # Train generator on a full batch of fresh noise
    noise = tf.convert_to_tensor(np.random.normal(0, 1, (batch_size, latent_dim)).astype(np.float32))
    g_loss = _generator_step(noise)

    d_losses.append(d_loss)
    g_losses.append(g_loss)

    if (step + 1) % 200 == 0 or step == 0:
        print(f"Step {step+1}/{steps} — d_loss: {d_loss:.4f}, g_loss: {g_loss:.4f}")



Step 1/2000 — d_loss: 0.5963, g_loss: 0.6536
Step 200/2000 — d_loss: 2.7467, g_loss: 0.0772
Step 400/2000 — d_loss: 3.5895, g_loss: 0.0389
Step 600/2000 — d_loss: 4.0833, g_loss: 0.0260
Step 800/2000 — d_loss: 4.4393, g_loss: 0.0195
Step 1000/2000 — d_loss: 4.7127, g_loss: 0.0156
Step 1200/2000 — d_loss: 4.9329, g_loss: 0.0130
Step 1400/2000 — d_loss: 5.1251, g_loss: 0.0112
Step 1600/2000 — d_loss: 5.2962, g_loss: 0.0098
Step 1800/2000 — d_loss: 5.4475, g_loss: 0.0087
Step 2000/2000 — d_loss: 5.5851, g_loss: 0.0078


In [None]:
# ---- Step 7: Generate synthetic cancelled-ride samples ----
n_gen = 1000
noise = np.random.normal(0, 1, (n_gen, latent_dim)).astype(np.float32)
gen_scaled = generator.predict(noise)
gen_original = scaler.inverse_transform(gen_scaled)  # convert back to original feature space
df_gen = pd.DataFrame(gen_original, columns=features)
# NOTE(review): the generator has a linear output layer, so one-hot columns
# come back as continuous values; round or argmax them before treating the
# rows as categorical records.
df_gen['is_cancelled'] = 1
# Write to a dedicated file. Reusing the raw input CSV path here would replace
# the source dataset with synthetic rows.
gen_out_path = "/content/ncr_ride_bookings_synthetic_cancelled.csv"
df_gen.to_csv(gen_out_path, index=False)
print(f"\nSaved {n_gen} synthetic cancelled-ride rows to: {gen_out_path}")

# ---- Step 8: Quick comparisons (stats and small classifier demo) ----
# Map the scaled real cancelled rows back to original units so they are
# directly comparable with the generated rows.
real_cancelled = pd.DataFrame(
    scaler.inverse_transform(X_minority_scaled),
    columns=features,
)
synthetic_cancelled = df_gen[features]

print("\nReal-cancelled summary (first 5 rows):")
print(real_cancelled.head().to_string(index=False))
print("\nSynthetic-cancelled summary (first 5 rows):")
print(synthetic_cancelled.head().to_string(index=False))

# Per-feature mean and standard deviation, real vs synthetic, side by side.
moment_table = pd.DataFrame({
    'real_mean': real_cancelled.mean(),
    'real_std' : real_cancelled.std(),
    'syn_mean' : synthetic_cancelled.mean(),
    'syn_std'  : synthetic_cancelled.std()
})
print("\nDescriptive comparison (real vs synthetic) — means and std:")
print(moment_table.round(4))

# Optional: train a small classifier to observe effect of augmentation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

# Assemble features + target in one frame. X.values is read without rebinding
# X_arr, so the float32 array created in Step 4 is not silently replaced.
X_all = pd.concat([pd.DataFrame(X.values, columns=features), pd.DataFrame(y_arr, columns=['is_cancelled'])], axis=1)
# Cast explicitly so the classifier always receives a plain float matrix, even
# if some feature columns are stored as bool/object.
X_vals = X_all[features].to_numpy(dtype=np.float64)
y_vals = X_all['is_cancelled'].values
X_train_o, X_test_o, y_train_o, y_test_o = train_test_split(X_vals, y_vals, test_size=0.2, random_state=42, stratify=y_vals)

# Augment training data by adding generated samples
# NOTE(review): the GAN was trained on every cancelled row, including rows that
# land in this test split, so the augmented score is indicative only. For an
# unbiased comparison, train the GAN on the training split alone.
X_train_aug = np.vstack([X_train_o, df_gen[features].values])
y_train_aug = np.hstack([y_train_o, np.ones(len(df_gen), dtype=int)])

# Baseline: classifier trained on the original training rows only.
clf_o = RandomForestClassifier(n_estimators=100, random_state=42)
clf_o.fit(X_train_o, y_train_o)
pred_o = clf_o.predict(X_test_o)
f1_o = f1_score(y_test_o, pred_o, zero_division=0)

# Same model and seed, trained on original + synthetic rows, scored on the same test set.
clf_aug = RandomForestClassifier(n_estimators=100, random_state=42)
clf_aug.fit(X_train_aug, y_train_aug)
pred_aug = clf_aug.predict(X_test_o)
f1_aug = f1_score(y_test_o, pred_aug, zero_division=0)

print(f"\nF1-score (original training): {f1_o:.4f}")
print(f"F1-score (augmented training): {f1_aug:.4f}")
print("\nClassification report (augmented model):")
print(classification_report(y_test_o, pred_aug, digits=4))

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Saved 1000 synthetic cancelled-ride rows to: /content/ncr_ride_bookings (1).csv

Real-cancelled summary (first 5 rows):
 Avg VTAT     Avg CTAT  Cancelled Rides by Customer  Cancelled Rides by Driver  Incomplete Rides  Booking Value  Ride Distance  Driver Ratings  Customer Rating  Vehicle Type_Bike  Vehicle Type_Go Mini  Vehicle Type_Go Sedan  Vehicle Type_Premier Sedan  Vehicle Type_Uber XL  Vehicle Type_eBike
      4.6 1.196670e-07                -2.980232e-10               9.999999e-01     -2.384186e-09       0.000015  -1.725197e-07   -1.079893e-07    -1.122824e-07       4.041990e-09         -4.928907e-09          -2.202988e-09                3.566742e-09          4.655123e-10        1.000000e+00
      6.0 1.196670e-07                -2.980232e-10               9.999999e-01     -2.384186e-09       0.000015  -1.725197e-07   -1.079893e-07    -1.122824e-07       4.041990e-09         -4.928907e-09           9.99999