# Implement a basic GAN architecture (generator + discriminator) and train it on MNIST images

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

# Load the MNIST training images (labels and the test split are discarded),
# divide by 255 to scale pixel values, and reshape to (N, 28, 28, 1) so each
# image carries an explicit channel axis.
(x_train, _), _ = tf.keras.datasets.mnist.load_data()
x_train = (x_train.astype("float32") / 255.0).reshape((-1, 28, 28, 1))

def build_generator(latent_dim):
    """Build the generator network mapping a latent vector to a 28x28x1 image.

    Args:
        latent_dim: Length of the input noise vector.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose final layer uses a
        sigmoid activation.
    """
    net = tf.keras.Sequential()
    # Project the latent vector and fold it into a 7x7 map with 128 channels.
    net.add(layers.Dense(7 * 7 * 128, input_shape=(latent_dim,)))
    net.add(layers.Reshape((7, 7, 128)))
    net.add(layers.BatchNormalization())
    # Each stride-2 transposed convolution doubles the spatial size: 7 -> 14 -> 28.
    net.add(layers.Conv2DTranspose(64, 4, strides=2, padding='same', activation='relu'))
    net.add(layers.BatchNormalization())
    net.add(layers.Conv2DTranspose(1, 4, strides=2, padding='same', activation='sigmoid'))
    return net

def build_discriminator():
    """Build the discriminator network scoring 28x28x1 images.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model that ends in a single
        sigmoid unit.
    """
    net = tf.keras.Sequential()
    # Two stride-2 convolutions, each followed by a LeakyReLU(0.2).
    net.add(layers.Conv2D(64, 4, strides=2, padding='same', input_shape=(28, 28, 1)))
    net.add(layers.LeakyReLU(0.2))
    net.add(layers.Conv2D(128, 4, strides=2, padding='same'))
    net.add(layers.LeakyReLU(0.2))
    # Flatten the feature map and reduce it to one sigmoid output.
    net.add(layers.Flatten())
    net.add(layers.Dense(1, activation='sigmoid'))
    return net

# Length of the noise vector fed to the generator.
latent_dim = 100
generator = build_generator(latent_dim)
discriminator = build_discriminator()

# Shared binary cross-entropy loss (default settings) and one Adam optimizer
# per network, both with learning rate 1e-4. These module-level names are read
# by train_step.
cross_entropy = tf.keras.losses.BinaryCrossentropy()
g_optimizer = tf.keras.optimizers.Adam(1e-4)
d_optimizer = tf.keras.optimizers.Adam(1e-4)

@tf.function
def train_step(real_images):
    """Run one discriminator update followed by one generator update.

    Uses the module-level ``generator``, ``discriminator``, ``cross_entropy``,
    ``g_optimizer``, ``d_optimizer`` and ``latent_dim``.

    Args:
        real_images: A batch of real images; its leading dimension sets how
            many noise vectors are drawn.

    Returns:
        Tuple ``(d_loss, g_loss)`` of the two losses computed in this step.
    """
    n_samples = tf.shape(real_images)[0]

    # Discriminator update: real images are labelled 1, generated images 0.
    z = tf.random.normal([n_samples, latent_dim])
    with tf.GradientTape() as disc_tape:
        generated = generator(z, training=True)
        real_scores = discriminator(real_images, training=True)
        fake_scores = discriminator(generated, training=True)
        loss_on_real = cross_entropy(tf.ones_like(real_scores), real_scores)
        loss_on_fake = cross_entropy(tf.zeros_like(fake_scores), fake_scores)
        d_loss = loss_on_real + loss_on_fake
    d_vars = discriminator.trainable_variables
    d_grads = disc_tape.gradient(d_loss, d_vars)
    d_optimizer.apply_gradients(zip(d_grads, d_vars))

    # Generator update: fresh noise, and the generated images are labelled 1.
    z = tf.random.normal([n_samples, latent_dim])
    with tf.GradientTape() as gen_tape:
        generated = generator(z, training=True)
        fake_scores = discriminator(generated, training=True)
        g_loss = cross_entropy(tf.ones_like(fake_scores), fake_scores)
    g_vars = generator.trainable_variables
    g_grads = gen_tape.gradient(g_loss, g_vars)
    g_optimizer.apply_gradients(zip(g_grads, g_vars))

    return d_loss, g_loss

def train(dataset, epochs, batch_size=128):
    """Train the GAN for ``epochs`` passes over ``dataset``.

    After each epoch the losses of the last batch are printed, and every fifth
    epoch a grid of generated samples is plotted.

    Args:
        dataset: Iterable of image batches; each batch is passed to train_step.
        epochs: Number of passes over ``dataset``.
        batch_size: Not referenced in the body; batching is whatever
            ``dataset`` yields.
    """
    for epoch_number in range(1, epochs + 1):
        for batch in dataset:
            d_loss, g_loss = train_step(batch)
        print(f"Epoch {epoch_number}: D Loss: {d_loss.numpy():.4f}, G Loss: {g_loss.numpy():.4f}")
        if epoch_number % 5 == 0:
            generate_and_plot_images(generator, latent_dim)

# Input pipeline: slice the image array, shuffle with a 1024-element buffer,
# then group into batches of `batch_size`.
batch_size = 128
train_dataset = tf.data.Dataset.from_tensor_slices(x_train)
train_dataset = train_dataset.shuffle(1024)
train_dataset = train_dataset.batch(batch_size)

def generate_and_plot_images(generator, latent_dim, n=16):
    """Sample ``n`` images from ``generator`` and show them in a square grid.

    Args:
        generator: Model called as ``generator(noise, training=False)``.
        latent_dim: Length of each noise vector.
        n: Number of images to generate. The grid is sized to fit ``n``
            (4x4 for the default of 16).
    """
    noise = tf.random.normal([n, latent_dim])
    fake_images = generator(noise, training=False).numpy()
    # Drop only a trailing single-channel axis. A bare squeeze() would also
    # remove the batch axis when n == 1 and break the indexing below.
    if fake_images.ndim == 4 and fake_images.shape[-1] == 1:
        fake_images = fake_images[..., 0]
    # Smallest square grid that holds n images, so n > 16 no longer overflows
    # a hard-coded 4x4 layout.
    grid = max(1, int(np.ceil(np.sqrt(n))))
    plt.figure(figsize=(grid, grid))
    for i in range(n):
        plt.subplot(grid, grid, i + 1)
        plt.imshow(fake_images[i], cmap='gray')
        plt.axis('off')
    plt.tight_layout()
    plt.show()

# Start training on the MNIST pipeline for 50 epochs.
train(train_dataset, epochs=50)


# Experiment with DCGAN architecture on the CIFAR-10 dataset

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

# Load the CIFAR-10 training images (labels and the test split are discarded)
# and rescale them with (x - 127.5) / 127.5.
(x_train, _), (_, _) = tf.keras.datasets.cifar10.load_data()
x_train = (x_train.astype("float32") - 127.5) / 127.5

BUFFER_SIZE = 50000
BATCH_SIZE = 128

# Input pipeline: slice, shuffle with a BUFFER_SIZE-element buffer, then batch.
train_dataset = tf.data.Dataset.from_tensor_slices(x_train)
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

def make_generator_model():
    """Build the DCGAN generator: 100-d noise vector -> 32x32x3 image.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose final layer uses a
        tanh activation.
    """
    return tf.keras.Sequential([
        # Project the noise vector and fold it into a 4x4 map with 512 channels.
        layers.Dense(4 * 4 * 512, use_bias=False, input_shape=(100,)),
        layers.BatchNormalization(),
        layers.ReLU(),
        layers.Reshape((4, 4, 512)),
        # Stride-2 transposed convolutions upsample 4 -> 8 -> 16 -> 32.
        layers.Conv2DTranspose(256, 4, strides=2, padding='same', use_bias=False),
        layers.BatchNormalization(),
        layers.ReLU(),
        layers.Conv2DTranspose(128, 4, strides=2, padding='same', use_bias=False),
        layers.BatchNormalization(),
        layers.ReLU(),
        layers.Conv2DTranspose(3, 4, strides=2, padding='same', use_bias=False, activation='tanh'),
    ])

def make_discriminator_model():
    """Build the DCGAN discriminator scoring 32x32x3 images.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model that ends in a single
        sigmoid unit.
    """
    return tf.keras.Sequential([
        # First stride-2 convolution has no batch normalisation.
        layers.Conv2D(64, 4, strides=2, padding='same', input_shape=[32, 32, 3]),
        layers.LeakyReLU(0.2),
        layers.Conv2D(128, 4, strides=2, padding='same'),
        layers.BatchNormalization(),
        layers.LeakyReLU(0.2),
        layers.Conv2D(256, 4, strides=2, padding='same'),
        layers.BatchNormalization(),
        layers.LeakyReLU(0.2),
        # Flatten the feature map and reduce it to one sigmoid output.
        layers.Flatten(),
        layers.Dense(1, activation='sigmoid'),
    ])

# Shared binary cross-entropy loss object (default settings), used by
# discriminator_loss and generator_loss.
cross_entropy = tf.keras.losses.BinaryCrossentropy()

def discriminator_loss(real_output, fake_output):
    """Return the discriminator loss for one batch.

    Args:
        real_output: Discriminator outputs for real images (target 1).
        fake_output: Discriminator outputs for generated images (target 0).

    Returns:
        Sum of the cross-entropy on the real and on the fake outputs.
    """
    loss_on_real = cross_entropy(tf.ones_like(real_output), real_output)
    loss_on_fake = cross_entropy(tf.zeros_like(fake_output), fake_output)
    total = loss_on_real + loss_on_fake
    return total

def generator_loss(fake_output):
    """Return the generator loss: cross-entropy of ``fake_output`` against all-ones targets.

    Args:
        fake_output: Discriminator outputs for generated images.
    """
    wanted = tf.ones_like(fake_output)
    return cross_entropy(wanted, fake_output)

# Instantiate both networks and give each its own Adam optimizer
# (learning rate 1e-4, beta_1 = 0.5).
generator = make_generator_model()
discriminator = make_discriminator_model()
generator_optimizer = tf.keras.optimizers.Adam(1e-4, beta_1=0.5)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4, beta_1=0.5)

EPOCHS = 50
# Length of each noise vector.
noise_dim = 100
num_examples_to_generate = 16
# Fixed batch of noise vectors; generate_and_plot_images feeds it to the
# generator on every call, so the plots use the same latent inputs each time.
seed = tf.random.normal([num_examples_to_generate, noise_dim])

@tf.function
def train_step(images):
    """Run one simultaneous generator/discriminator update on a batch.

    Uses the module-level ``generator``, ``discriminator``, their optimizers,
    ``noise_dim`` and the loss helpers.

    Args:
        images: A batch of real images.

    Returns:
        Tuple ``(d_loss, g_loss)`` of the two losses computed in this step.
    """
    # Draw as many noise vectors as there are real images. The final batch of
    # an epoch can be smaller than BATCH_SIZE, and using the constant there
    # would pair it with a larger fake batch.
    current_batch = tf.shape(images)[0]
    noise = tf.random.normal([current_batch, noise_dim])
    with tf.GradientTape() as disc_tape, tf.GradientTape() as gen_tape:
        generated_images = generator(noise, training=True)
        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)
        d_loss = discriminator_loss(real_output, fake_output)
        g_loss = generator_loss(fake_output)
    # Both gradients are taken from the same forward pass before either
    # optimizer changes any weights.
    gradients_of_generator = gen_tape.gradient(g_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(d_loss, discriminator.trainable_variables)
    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
    return d_loss, g_loss

def generate_and_plot_images(model, epoch):
    """Plot the images ``model`` produces for the module-level ``seed`` noise.

    Args:
        model: Model called as ``model(seed, training=False)``.
        epoch: Epoch number shown in the figure title.
    """
    # Apply (x + 1) / 2 to the model output before plotting it.
    samples = (model(seed, training=False) + 1) / 2.0
    plt.figure(figsize=(4, 4))
    for idx in range(samples.shape[0]):
        plt.subplot(4, 4, idx + 1)
        plt.imshow(samples[idx])
        plt.axis('off')
    plt.suptitle(f"Epoch {epoch}")
    plt.show()

def train(dataset, epochs):
    """Train the DCGAN for ``epochs`` passes over ``dataset``.

    After each epoch the losses of the last batch are printed, and every fifth
    epoch the generator's samples are plotted.

    Args:
        dataset: Iterable of image batches; each batch is passed to train_step.
        epochs: Number of passes over ``dataset``.
    """
    for epoch_number in range(1, epochs + 1):
        for image_batch in dataset:
            d_loss, g_loss = train_step(image_batch)
        print(f"Epoch {epoch_number}, D Loss: {d_loss.numpy():.4f}, G Loss: {g_loss.numpy():.4f}")
        if epoch_number % 5 == 0:
            generate_and_plot_images(generator, epoch_number)

# Start training on the CIFAR-10 pipeline for EPOCHS epochs.
train(train_dataset, EPOCHS)


# TEXT TO IMAGE AND IMAGE TO TEXT 

In [None]:
"""
!pip install diffusers transformers accelerate safetensors pillow
!pip install torch torchvision torchaudio
!pip install --upgrade diffusers

install the above packages if required.
"""

import os
import json
from PIL import Image
import torch
from diffusers import StableDiffusionPipeline
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

def text_to_image(prompt, output_dir="outputs", num_images=1, steps=30, guidance_scale=7.5, seed=None):
    """Generate images from a text prompt with Stable Diffusion v1.5 and save them.

    Images are written to ``output_dir`` as ``text2img_<i>.png`` and a
    ``text2img_metadata.json`` file records the settings and output paths.
    Failures are reported with ``print`` rather than raised: a model-loading
    error aborts the call, a generation error skips that image.

    Args:
        prompt: Text prompt passed to the pipeline.
        output_dir: Directory for the outputs; created if missing.
        num_images: Number of images to generate.
        steps: Number of inference steps per image.
        guidance_scale: Guidance scale passed to the pipeline.
        seed: Integer seed for the torch generator, or ``None`` for an
            unseeded run. ``0`` is a valid seed.

    Returns:
        None.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    os.makedirs(output_dir, exist_ok=True)
    print(f"Loading Stable Diffusion model on {device}...")
    try:
        pipe = StableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5",
            # Half precision only on GPU; CPU runs in float32.
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            safety_checker=None
        ).to(device)
    except Exception as e:
        print("Error loading Stable Diffusion pipeline:", e)
        return
    # Compare against None explicitly: a plain truthiness test would treat an
    # explicit seed of 0 as "no seed" while the metadata still records seed 0.
    generator = torch.Generator(device=device).manual_seed(seed) if seed is not None else None
    results = []
    for i in range(num_images):
        print(f"Generating image {i + 1} of {num_images}...")
        try:
            image = pipe(
                prompt=prompt,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
                generator=generator
            ).images[0]
        except Exception as e:
            # Best effort: report the failure and move on to the next image.
            print(f"Error during image generation: {e}")
            continue
        file_path = os.path.join(output_dir, f"text2img_{i + 1}.png")
        image.save(file_path)
        results.append(file_path)
    metadata = {
        "prompt": prompt,
        "num_images": num_images,
        "steps": steps,
        "guidance_scale": guidance_scale,
        "seed": seed,
        "device": device,
        "outputs": results
    }
    metadata_path = os.path.join(output_dir, "text2img_metadata.json")
    try:
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)
    except Exception as e:
        print("Error saving metadata:", e)
    print("Images saved to:", results)

def image_to_text(image_path, output_dir="outputs"):
    """Caption an image with the ``nlpconnect/vit-gpt2-image-captioning`` model.

    The caption is printed, written to ``img2text_caption.txt`` and recorded
    together with the image path and device in ``img2text_metadata.json``,
    both inside ``output_dir``. Failures while loading the model, opening the
    image or generating the caption are printed and abort the call; failures
    while saving are printed and do not abort it.

    Args:
        image_path: Path of the image to caption.
        output_dir: Directory for the outputs; created if missing.

    Returns:
        None.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    os.makedirs(output_dir, exist_ok=True)
    print(f"Loading image captioning model on {device}...")
    model_name = "nlpconnect/vit-gpt2-image-captioning"
    try:
        model = VisionEncoderDecoderModel.from_pretrained(model_name).to(device)
        processor = ViTImageProcessor.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        print("Error loading image captioning model:", e)
        return
    try:
        # Convert to RGB before handing the image to the processor.
        image = Image.open(image_path).convert("RGB")
    except Exception as e:
        print(f"Error opening image: {e}")
        return
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
    try:
        output_ids = model.generate(pixel_values, max_length=32, num_beams=4)
        caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during caption generation: {e}")
        return
    caption_file = os.path.join(output_dir, "img2text_caption.txt")
    caption_saved = False
    try:
        # Explicit encoding so the file does not depend on the platform default.
        with open(caption_file, "w", encoding="utf-8") as f:
            f.write(caption)
        caption_saved = True
    except Exception as e:
        print(f"Error saving caption: {e}")
    metadata = {
        "image_path": image_path,
        "caption": caption,
        "device": device
    }
    metadata_path = os.path.join(output_dir, "img2text_metadata.json")
    try:
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)
    except Exception as e:
        print(f"Error saving metadata: {e}")
    print("Caption:", caption)
    # Only claim the caption file exists when the write actually succeeded.
    if caption_saved:
        print("Caption saved to:", caption_file)

if __name__ == "__main__":
    # Interactive entry point: pick a mode, collect its inputs, run it.
    choice = input("Enter choice (1 for Text-to-Image, 2 for Image-to-Text): ").strip()
    if choice not in ("1", "2"):
        print("Invalid choice.")
    elif choice == "1":
        print("\n--- Text-to-Image ---")
        prompt = input("Enter your text prompt: ").strip()
        # An empty answer falls back to the default after `or`.
        num_images = int(input("Number of images to generate (default 1): ") or 1)
        steps = int(input("Number of diffusion steps (default 30): ") or 30)
        guidance_scale = float(input("Guidance scale (default 7.5): ") or 7.5)
        seed_input = input("Random seed (leave blank for random): ").strip()
        seed = int(seed_input) if seed_input else None
        text_to_image(
            prompt,
            num_images=num_images,
            steps=steps,
            guidance_scale=guidance_scale,
            seed=seed,
        )
    else:
        print("\n--- Image-to-Text ---")
        image_path = input("Enter path to the image: ").strip()
        if os.path.exists(image_path):
            image_to_text(image_path)
        else:
            print("Error: File not found.")


# SPEECH TO TEXT

In [None]:
"""
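!pip install openai-whisper
!pip install soundfile
!pip install librosa

install these packages if required.
Note: do not run `pip install whisper` -- that is a different PyPI package
that shadows the `whisper` module provided by openai-whisper.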
"""

import whisper
import numpy as np
import soundfile as sf

# Name of the Whisper model passed to whisper.load_model in transcribe_file.
MODEL_SIZE = "base"

def transcribe_file(audio_path):
    """Transcribe an audio file with Whisper and print the English text.

    The file is read with soundfile, mixed down to mono, resampled to 16 kHz
    if needed and passed to the model as a float32 array.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        None; the transcription is printed.
    """
    print("\nLoading Whisper model...")
    model = whisper.load_model(MODEL_SIZE)
    print("Model loaded. Transcribing audio file...\n")
    audio, sr = sf.read(audio_path)
    # soundfile returns (frames, channels) for multi-channel files. Average
    # the channels first: otherwise the resampler below would run along the
    # channel axis and Whisper would receive a 2-D array instead of mono.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    if sr != 16000:
        # Imported lazily so librosa is only required when resampling.
        import librosa
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    audio = audio.astype(np.float32)
    result = model.transcribe(audio, fp16=False, language="en")
    print("Transcription:", result["text"].strip())

if __name__ == "__main__":
    # Ask for a file path on stdin and transcribe that file.
    audio_file = input("Enter path to the recorded audio file: ").strip()
    transcribe_file(audio_file)


# Implement the forward diffusion process on images (gradually adding noise)

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Load the MNIST training images, divide by 255, add a trailing channel axis,
# and keep the first image as the example to diffuse.
(x_train, _), _ = tf.keras.datasets.mnist.load_data()
x_train = np.expand_dims(x_train.astype("float32") / 255.0, -1)
image = x_train[0]

# Noise schedule: T linearly spaced beta values from 1e-4 to 0.02,
# alpha = 1 - beta, and alpha_hat as the running product of alpha.
T = 1000
beta = np.linspace(1e-4, 0.02, T)
alpha = 1.0 - beta
alpha_hat = np.cumprod(alpha)

def forward_diffusion_sample(x_0, t):
    """Noise ``x_0`` to timestep ``t`` in a single step.

    Computes ``sqrt(alpha_hat[t]) * x_0 + sqrt(1 - alpha_hat[t]) * noise``
    with ``noise`` drawn from ``np.random.randn``.

    Args:
        x_0: Array holding the clean input.
        t: Index into the module-level ``alpha_hat`` schedule.

    Returns:
        Tuple ``(x_t, noise)``: the noised array and the noise that was added,
        both with the shape of ``x_0``.
    """
    kept_fraction = alpha_hat[t]
    signal_scale = np.sqrt(kept_fraction)
    noise_scale = np.sqrt(1 - kept_fraction)
    noise = np.random.randn(*x_0.shape)
    return signal_scale * x_0 + noise_scale * noise, noise

def show_forward_diffusion(x_0, steps=(0, 100, 200, 500, 999)):
    """Plot ``x_0`` noised to each timestep in ``steps``, side by side.

    Args:
        x_0: Clean image array passed to forward_diffusion_sample.
        steps: Sequence of timesteps to show, one subplot each. The default
            is an immutable tuple, so it cannot be shared and mutated across
            calls the way a list default could.
    """
    plt.figure(figsize=(15, 3))
    for i, t in enumerate(steps):
        # Each panel draws its own noise, independently of the others.
        x_t, _ = forward_diffusion_sample(x_0, t)
        plt.subplot(1, len(steps), i + 1)
        plt.imshow(x_t.squeeze(), cmap='gray')
        plt.title(f"t = {t}")
        plt.axis('off')
    plt.suptitle("Forward Diffusion Process")
    plt.show()

# Visualise the forward process on the example image with the default timesteps.
show_forward_diffusion(image)


# Reverse diffusion process: train a noise predictor, then sample by iterative denoising

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

# Noise schedule, built in NumPy: T linearly spaced beta values from 1e-4 to
# 0.02, alpha = 1 - beta, and alpha_hat as the running product of alpha.
T = 1000
beta = np.linspace(1e-4, 0.02, T)
alpha = 1.0 - beta
alpha_hat = np.cumprod(alpha)

# Rebind the three schedules as float32 TensorFlow constants.
beta, alpha, alpha_hat = (
    tf.constant(schedule, dtype=tf.float32) for schedule in (beta, alpha, alpha_hat)
)

def get_simple_model():
    """Build a small convolutional noise predictor.

    The input is 28x28 with two channels and the output has one channel; all
    three convolutions use 3x3 kernels with 'same' padding.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    net = tf.keras.Sequential()
    net.add(layers.Input(shape=(28, 28, 2)))
    net.add(layers.Conv2D(64, 3, padding="same", activation="relu"))
    net.add(layers.Conv2D(64, 3, padding="same", activation="relu"))
    # Final layer has no activation.
    net.add(layers.Conv2D(1, 3, padding="same"))
    return net

# Module-level model used by train_step and as the default model of reverse_diffusion.
noise_predictor = get_simple_model()

def get_timestep_tensor(t, shape):
    """Return a tensor of ``shape`` plus a trailing axis, filled with ``t / T``.

    Args:
        t: Timestep; cast to float32 and divided by the module-level ``T``.
        shape: Shape of the filled tensor before the trailing axis is added.
    """
    scaled_t = tf.cast(t, tf.float32) / T
    plane = tf.fill(shape, scaled_t)
    return tf.expand_dims(plane, axis=-1)

# Adam optimizer (learning rate 1e-4) used by train_step for the noise predictor.
optimizer = tf.keras.optimizers.Adam(1e-4)

@tf.function
def train_step(x_0):
    """Run one noise-prediction training step on a batch of clean images.

    A single integer timestep in ``[1, T)`` is drawn for the whole batch. The
    batch is noised to that timestep and ``noise_predictor`` is trained to
    predict the added noise with a mean-squared-error loss.

    Args:
        x_0: Batch of clean images.

    Returns:
        The scalar loss for this step.
    """
    t = tf.random.uniform([], minval=1, maxval=T, dtype=tf.int32)
    noise = tf.random.normal(tf.shape(x_0))

    # Noise the batch: sqrt(alpha_hat_t) * x_0 + sqrt(1 - alpha_hat_t) * noise.
    kept_fraction = tf.gather(alpha_hat, t)
    signal_scale = tf.sqrt(kept_fraction)
    noise_scale = tf.sqrt(1 - kept_fraction)
    x_t = signal_scale * x_0 + noise_scale * noise

    # Append the scaled timestep as an extra channel.
    t_channel = get_timestep_tensor(t, tf.shape(x_0)[:3])
    model_input = tf.concat([x_t, t_channel], axis=-1)

    with tf.GradientTape() as tape:
        predicted = noise_predictor(model_input)
        loss = tf.reduce_mean(tf.square(noise - predicted))
    weights = noise_predictor.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, weights), weights))
    return loss

def reverse_diffusion(x_T, steps=T, model=noise_predictor):
    """Iteratively denoise ``x_T`` from timestep ``steps - 1`` down to 1.

    Args:
        x_T: Starting tensor.
        steps: The loop visits timesteps ``steps - 1, ..., 1``.
        model: Noise predictor called on the input concatenated with its
            timestep channel.

    Returns:
        The tensor after the final denoising step.
    """
    x_t = x_T
    for t in range(steps - 1, 0, -1):
        t_channel = get_timestep_tensor(t, tf.shape(x_t)[:3])
        predicted = model(tf.concat([x_t, t_channel], axis=-1))
        # Remove the scaled predicted noise, then rescale by 1 / sqrt(alpha_t).
        rescale = 1 / tf.sqrt(alpha[t])
        noise_weight = (1 - alpha[t]) / tf.sqrt(1 - alpha_hat[t])
        x_t = rescale * (x_t - noise_weight * predicted)
        # Fresh noise is added on every step except the last one (t == 1).
        if t > 1:
            fresh_noise = tf.random.normal(tf.shape(x_t))
            x_t += tf.sqrt(beta[t]) * fresh_noise
    return x_t

# Load the MNIST training images, divide by 255, add a trailing channel axis,
# and convert the array to a TensorFlow tensor.
(x_train, _), _ = tf.keras.datasets.mnist.load_data()
x_train = np.expand_dims(x_train.astype("float32") / 255.0, -1)
x_train = tf.convert_to_tensor(x_train)

# Train for 1000 epochs. Each epoch walks the first 1024 images in 32 slices
# of 32 images; the rest of x_train is not used.
for epoch in range(1000):
    for i in range(0, 1024, 32):
        batch = x_train[i:i+32]
        loss = train_step(batch)
    # Report the loss of the last batch of the epoch.
    print(f"Epoch {epoch+1}, Loss: {loss.numpy():.4f}")

# Draw one random-normal starting tensor and run the reverse process on it.
x_T = tf.random.normal((1, 28, 28, 1))
x_denoised = reverse_diffusion(x_T, steps=1000)

# Left panel: the starting noise.
plt.subplot(1, 2, 1)
plt.imshow(x_T[0, ..., 0], cmap="gray")
plt.title("x_T (Noise)")
plt.axis("off")

# Right panel: the result, clipped to [0, 1] for display.
plt.subplot(1, 2, 2)
plt.imshow(tf.clip_by_value(x_denoised[0, ..., 0], 0, 1), cmap="gray")
plt.title("x_0 (Denoised)")
plt.axis("off")

plt.show()


## fine-tune a pre-trained transformer model on a text dataset for text generation.

In [None]:
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
import tensorflow as tf

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 has no padding token, so the `padding=True` call below would raise
# "Asking to pad but the tokenizer does not have a padding token".
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
model = TFGPT2LMHeadModel.from_pretrained("gpt2")

texts = ["Here is some example text.", "Another example sentence."]
inputs = tokenizer(texts, return_tensors="tf", padding=True, truncation=True)
# Labels are the input ids, with padded positions replaced by -100 so the
# model's built-in language-modelling loss ignores them.
labels = tf.where(
    tf.cast(inputs.attention_mask, tf.bool),
    inputs.input_ids,
    tf.constant(-100, dtype=inputs.input_ids.dtype),
)

# Each dataset element is an (input_ids, labels) pair; batches hold 2 examples.
dataset = tf.data.Dataset.from_tensor_slices((inputs.input_ids, labels))
dataset = dataset.shuffle(100).batch(2)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# NOTE(review): this loss object is not referenced by the code visible in this
# cell (train_step reads the loss from the model output); kept for compatibility.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

@tf.function
def train_step(input_ids, labels):
    """Run one fine-tuning step of the module-level GPT-2 ``model``.

    Args:
        input_ids: Batch of token ids.
        labels: Target token ids, passed to the model as ``labels`` so that it
            computes its own language-modelling loss.

    Returns:
        Scalar loss tensor for the batch.
    """
    with tf.GradientTape() as tape:
        # training=True enables dropout; calling the model directly would
        # otherwise run it in inference mode.
        outputs = model(input_ids, labels=labels, training=True)
        # The TF model's loss is not guaranteed to be a scalar. Reduce it so
        # callers can format the returned value as a single float.
        loss_value = tf.reduce_mean(outputs.loss)
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss_value

# Fine-tune for 3 epochs; each batch is an (input_ids, labels) pair.
for epoch in range(3):
    for batch in dataset:
        loss_value = train_step(batch[0], batch[1])
    # Report the loss of the last batch of the epoch.
    print(f"Epoch {epoch+1} Loss: {loss_value.numpy():.4f}")
