# 🎭 DLA GAN Dialogue Project

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jawaharganesh24189/DLA/blob/main/DLA_GAN_Dialogue_Project.ipynb)

An adversarial dialogue generation system using GANs.

## 📦 Section 1: Setup & Dependencies

In [None]:
# Install required packages (uncomment if needed)
# !pip install tensorflow numpy matplotlib

In [None]:
import os
import re
import glob
import json
import csv
import pickle
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple
from collections import defaultdict, Counter
from io import StringIO

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt

# Report the TensorFlow version and the list of visible GPU devices
# (an empty list means training will run on CPU).
print(f'✅ TensorFlow version: {tf.__version__}')
print(f'✅ GPU available: {tf.config.list_physical_devices("GPU")}')

## 💾 Section 2: Google Drive & Data Loading

In [None]:
# Mount Google Drive at /content/drive so the dataset folder below is readable.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime - confirm before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Directory (on the mounted Drive) holding the raw dialogue .txt files.
DATASET_PATH = '/content/drive/MyDrive/DLA_Notebooks_Data_PGPM/Dataset/'
# Consolidated "Speaker: utterance" file; a relative path, so it is written
# to the current working directory.
OUTPUT_FILE = 'cleandata.txt'

print(f'📁 Dataset path: {DATASET_PATH}')
print(f'📄 Output file: {OUTPUT_FILE}')

## 🎬 Section 3: Dialogue Parsing

In [None]:
@dataclass
class DialogueTurn:
    """One parsed exchange: a response plus the context that preceded it."""
    # Text preceding the response; empty for the first turn parsed in
    # "Speaker: utterance" files.
    context: str
    # The utterance itself, whitespace-normalised by the parser.
    response: str
    # Optional parser details, e.g. {'format': ..., 'speaker': ...}.
    metadata: Optional[Dict] = None

In [None]:
class DialogueParser:
    """Parse raw dialogue text files into ``DialogueTurn`` records.

    Two layouts are recognised, tried in this order by ``parse_file``:

    * ``context: ... response: ...`` pairs (case-insensitive, may span lines);
    * one ``Speaker: utterance`` per line.
    """

    def __init__(self):
        # A pair ends right before a newline followed by "context:", or at
        # the end of the input.
        self.context_response_pattern = re.compile(
            r'context:\s*(.+?)\s*response:\s*(.+?)(?=\ncontext:|$)',
            re.DOTALL | re.IGNORECASE
        )
        # The speaker is everything up to the first colon on the line.
        self.dialogue_pattern = re.compile(r'^(.+?):\s*(.+?)$', re.MULTILINE)

    def parse_file(self, filepath: str) -> List[DialogueTurn]:
        """Parse one file, falling back to the line format if no pairs match.

        Undecodable bytes are dropped (``errors='ignore'``) rather than
        aborting the file.
        """
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        turns = self._parse_context_response(content)
        if not turns:
            turns = self._parse_dialogue_format(content)
        return turns

    def _parse_context_response(self, content: str) -> List[DialogueTurn]:
        """Extract every ``context:/response:`` pair whose parts are non-empty."""
        turns = []
        for raw_context, raw_response in self.context_response_pattern.findall(content):
            context = self._clean_text(raw_context)
            response = self._clean_text(raw_response)
            if context and response:
                turns.append(DialogueTurn(
                    context=context,
                    response=response,
                    metadata={'format': 'context_response'}
                ))
        return turns

    def _parse_dialogue_format(self, content: str) -> List[DialogueTurn]:
        """Parse ``Speaker: utterance`` lines.

        Each turn's context is the (up to) three preceding turns joined by
        spaces; lines that do not match the pattern are skipped.
        """
        turns = []
        context_buffer = []
        for raw_line in content.strip().split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            match = self.dialogue_pattern.match(line)
            if not match:
                continue
            speaker, dialogue = match.groups()
            # Strip once so the context text and the metadata agree on the name.
            speaker = speaker.strip()
            dialogue = self._clean_text(dialogue)
            if not dialogue:
                continue
            turns.append(DialogueTurn(
                context=' '.join(context_buffer[-3:]),
                response=dialogue,
                metadata={'format': 'dialogue', 'speaker': speaker}
            ))
            context_buffer.append(f'{speaker}: {dialogue}')
        return turns

    def _clean_text(self, text: str) -> str:
        """Replace backslashes with spaces and collapse whitespace runs.

        The replacement happens first so that a backslash surrounded by
        spaces does not leave a multi-space gap behind.
        """
        text = text.replace('\\', ' ')
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def parse_directory(self, directory: str, pattern: str = '*.txt',
                       auto_save_txt: bool = True,
                       output_file: str = 'cleandata.txt') -> List[DialogueTurn]:
        """Parse every file in ``directory`` matching ``pattern``.

        Files are processed in sorted order. A file that fails to parse is
        reported and skipped. When ``auto_save_txt`` is true and at least one
        turn was found, the result is also written via ``save_to_cleandata``.

        Returns:
            All turns from all successfully parsed files.
        """
        all_turns = []
        files = sorted(glob.glob(os.path.join(directory, pattern)))
        print(f'📁 Found {len(files)} files')
        for i, filepath in enumerate(files, 1):
            try:
                all_turns.extend(self.parse_file(filepath))
            except Exception as e:
                # Deliberately best-effort: one bad file must not stop the batch.
                print(f'⚠️  Error parsing {os.path.basename(filepath)}: {e}')
                continue
            if i % 100 == 0:
                print(f'   Processed {i}/{len(files)} files...')
        print(f'\n✅ Total dialogue turns: {len(all_turns)}')
        if auto_save_txt and all_turns:
            self.save_to_cleandata(all_turns, output_file)
        return all_turns

    def save_to_cleandata(self, turns: List[DialogueTurn], output_file: str = 'cleandata.txt'):
        """Write ``turns`` as ``Speaker: utterance`` lines and print a summary."""
        print(f'\n{"-"*70}')
        print(f'💾 Creating {output_file}')
        dialogue_text = self.to_dialogue_format(turns)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(dialogue_text)
        # ''.split('\n') yields [''], which would be reported as one line.
        lines = dialogue_text.split('\n') if dialogue_text else []
        # Only the number of distinct speakers is reported.
        speakers = {line.split(':', 1)[0].strip() for line in lines if ':' in line}
        print(f'✅ Saved: {output_file} ({len(lines)} lines, {len(speakers)} characters)')
        print("-"*70)

    def to_dialogue_format(self, turns: List[DialogueTurn]) -> str:
        """Render turns as newline-joined ``Speaker: response`` lines.

        Turns without a ``speaker`` in their metadata are attributed to
        ``Unknown``; turns with an empty speaker or response are omitted.
        """
        lines = []
        for turn in turns:
            speaker = (turn.metadata or {}).get('speaker', 'Unknown')
            if speaker and turn.response:
                lines.append(f'{speaker}: {turn.response}')
        return '\n'.join(lines)

print('✅ DialogueParser class defined!')

In [None]:
# Parse dialogue files
# Reads every *.txt file (the default pattern) under DATASET_PATH and, because
# auto_save_txt is True, writes the combined turns to OUTPUT_FILE.
dialogue_parser = DialogueParser()
turns = dialogue_parser.parse_directory(
    directory=DATASET_PATH,
    auto_save_txt=True,
    output_file=OUTPUT_FILE
)
print(f'\n✅ Parsing complete! {OUTPUT_FILE} ready.')

## 🔤 Section 4: Data Processing & Tokenization

In [None]:
class FlexibleDialogueDataProcessor:
    """Turn a ``Speaker: utterance`` text file into padded token-id sequences.

    Intended call order:
    ``load_and_parse`` -> ``build_vocabulary`` -> ``create_sequences``.
    """

    def __init__(self, file_path: str, seq_length: int = 50):
        """Store the settings; no file access happens here.

        Args:
            file_path: Path of the ``Speaker: utterance`` text file.
            seq_length: Fixed length every sequence is padded/truncated to.
        """
        self.file_path = file_path
        self.seq_length = seq_length
        self.dialogues = []
        self.characters = set()
        self.tokenizer = None
        self.vocab_size = 0

    def load_and_parse(self) -> List[str]:
        """Read the file and collect utterances and speaker names.

        A line is kept only if it contains a colon with non-empty text on
        both sides; the split is on the first colon, so utterances may
        themselves contain colons. Calling this again reloads from scratch
        instead of appending duplicates.

        Returns:
            The utterances, in file order.
        """
        print(f'📖 Loading: {self.file_path}')
        # Reset first so a repeated call does not duplicate every dialogue.
        self.dialogues = []
        self.characters = set()
        with open(self.file_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                character, separator, dialogue = raw_line.strip().partition(':')
                character = character.strip()
                dialogue = dialogue.strip()
                if separator and character and dialogue:
                    self.characters.add(character)
                    self.dialogues.append(dialogue)
        print(f'✅ Loaded {len(self.dialogues)} dialogues')
        print(f'✅ Found {len(self.characters)} characters')
        return self.dialogues

    def build_vocabulary(self, max_vocab_size: int = 5000):
        """Fit a Keras ``Tokenizer`` on the loaded utterances.

        ``vocab_size`` is the number of distinct words plus one, capped at
        ``max_vocab_size``.

        Returns:
            The fitted tokenizer.
        """
        print(f'\n🔤 Building vocabulary (max: {max_vocab_size})...')
        self.tokenizer = Tokenizer(num_words=max_vocab_size, oov_token='<UNK>')
        self.tokenizer.fit_on_texts(self.dialogues)
        self.vocab_size = min(len(self.tokenizer.word_index) + 1, max_vocab_size)
        print(f'✅ Vocabulary size: {self.vocab_size}')
        return self.tokenizer

    def create_sequences(self) -> np.ndarray:
        """Convert the utterances to id sequences left-padded to ``seq_length``.

        Raises:
            RuntimeError: If ``build_vocabulary`` has not been called yet.
        """
        if self.tokenizer is None:
            raise RuntimeError('build_vocabulary() must be called before create_sequences()')
        print(f'\n📊 Creating sequences (length: {self.seq_length})...')
        sequences = self.tokenizer.texts_to_sequences(self.dialogues)
        padded = pad_sequences(sequences, maxlen=self.seq_length, padding='pre')
        print(f'✅ Created {len(padded)} sequences')
        return padded

print('✅ FlexibleDialogueDataProcessor class defined!')

In [None]:
# Initialize and load data
# Pipeline: read OUTPUT_FILE -> fit tokenizer (capped at 5000 words) ->
# pad every utterance to 50 token ids.
data_processor = FlexibleDialogueDataProcessor(file_path=OUTPUT_FILE, seq_length=50)
dialogues = data_processor.load_and_parse()
tokenizer = data_processor.build_vocabulary(max_vocab_size=5000)
# vocab_size is read as a global by the model, training and generation cells.
vocab_size = data_processor.vocab_size
sequences = data_processor.create_sequences()
print(f'\n✅ Data ready! Vocab: {vocab_size}, Sequences: {len(sequences)}')

## 🧠 Section 5: Model Architecture

In [None]:
class Generator(keras.Model):
    """Two stacked LSTMs producing a softmax over the vocabulary per position."""

    def __init__(self, vocab_size, embedding_dim=128, lstm_units=256):
        super().__init__()
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        # Both LSTMs return full sequences so every position gets a prediction.
        self.lstm1 = layers.LSTM(lstm_units, return_sequences=True)
        self.lstm2 = layers.LSTM(lstm_units, return_sequences=True)
        self.dropout = layers.Dropout(0.3)
        self.dense = layers.Dense(vocab_size, activation='softmax')

    def call(self, inputs, training=False):
        """Map integer token ids to per-position vocabulary probabilities."""
        hidden = self.lstm2(self.lstm1(self.embedding(inputs)))
        if training:
            # Dropout is applied only while training.
            hidden = self.dropout(hidden, training=training)
        return self.dense(hidden)

print('✅ Generator defined!')

In [None]:
class Discriminator(keras.Model):
    """LSTM classifier scoring a token-id sequence with a sigmoid output."""

    def __init__(self, vocab_size, embedding_dim=128, lstm_units=256):
        super().__init__()
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        # Only the final LSTM state is kept: one summary vector per sequence.
        self.lstm = layers.LSTM(lstm_units)
        self.dropout = layers.Dropout(0.3)
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(1, activation='sigmoid')

    def call(self, inputs, training=False):
        """Return one score in (0, 1) per input sequence."""
        summary = self.lstm(self.embedding(inputs))
        if training:
            # Dropout is applied only while training.
            summary = self.dropout(summary, training=training)
        return self.dense2(self.dense1(summary))

print('✅ Discriminator defined!')

In [None]:
# Both networks use the same embedding and LSTM sizes and the tokenizer's
# vocabulary size.
generator = Generator(vocab_size=vocab_size, embedding_dim=128, lstm_units=256)
discriminator = Discriminator(vocab_size=vocab_size, embedding_dim=128, lstm_units=256)
# Separate Adam optimizers so each network is updated independently.
generator_optimizer = keras.optimizers.Adam(learning_rate=0.0001)
discriminator_optimizer = keras.optimizers.Adam(learning_rate=0.0001)
# Binary cross-entropy on the discriminator's sigmoid output.
loss_fn = keras.losses.BinaryCrossentropy()
print('✅ Models initialized!')

## 🏋️ Section 6: Model Training

In [None]:
# Training hyperparameters passed to train_gan below.
BATCH_SIZE = 64
EPOCHS = 10
print(f'⚙️  Batch size: {BATCH_SIZE}, Epochs: {EPOCHS}')

In [None]:
def _sample_token_ids(token_probs):
    """Draw one token id per position from per-position vocabulary probabilities.

    Args:
        token_probs: Float tensor whose last axis is a probability
            distribution over the vocabulary (the generator's softmax output).

    Returns:
        Integer tensor shaped like ``token_probs`` without its last axis.
    """
    vocab = token_probs.shape[-1]
    # tf.random.categorical expects 2-D log-probabilities; the epsilon keeps
    # log() finite for zero-probability tokens.
    flat_log_probs = tf.reshape(tf.math.log(token_probs + 1e-8), [-1, vocab])
    flat_ids = tf.random.categorical(flat_log_probs, num_samples=1)
    return tf.reshape(flat_ids, tf.shape(token_probs)[:-1])


def train_gan(generator, discriminator, sequences, epochs, batch_size):
    """Train the GAN.

    Each batch performs one discriminator update (real sequences vs. sampled
    generator output) followed by one generator update. Token sampling has no
    gradient, so the generator is updated with a policy-gradient (REINFORCE)
    objective: the discriminator's score is the reward for the sampled
    sequence, and the batch-mean reward is subtracted as a baseline.

    Relies on the module-level ``vocab_size``, ``loss_fn``,
    ``generator_optimizer`` and ``discriminator_optimizer``.

    Args:
        generator: Model mapping token ids to per-position vocabulary
            probabilities.
        discriminator: Model mapping token ids to one score per sequence.
        sequences: 2-D array of token ids, one row per sequence. It is not
            modified; batches are drawn through a shuffled index.
        epochs: Number of passes over ``sequences``.
        batch_size: Sequences per update; a trailing partial batch is dropped.

    Returns:
        Dict with keys ``'d_loss'``, ``'g_loss'`` and ``'d_accuracy'``, each a
        list holding one mean value per epoch. ``'g_loss'`` is the binary
        cross-entropy of the discriminator's scores on generated sequences
        against the "real" label.

    Raises:
        ValueError: If there are fewer than ``batch_size`` sequences.
    """
    sequences = np.asarray(sequences)
    num_sequences = len(sequences)
    num_batches = num_sequences // batch_size
    if num_batches == 0:
        raise ValueError(
            f'Need at least batch_size={batch_size} sequences to train, got {num_sequences}'
        )
    # Generated sequences use the same length as the real ones.
    seq_length = sequences.shape[1]
    history = {'d_loss': [], 'g_loss': [], 'd_accuracy': []}

    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')
        # Shuffle through an index so the caller's array is left untouched.
        order = np.random.permutation(num_sequences)
        epoch_d_loss, epoch_g_loss, epoch_d_acc = [], [], []

        for batch in range(num_batches):
            start_idx = batch * batch_size
            real_sequences = sequences[order[start_idx:start_idx + batch_size]]

            # Train Discriminator
            # Fakes are produced outside the tape: only discriminator
            # variables are updated in this step.
            noise = tf.random.uniform(
                shape=(batch_size, seq_length),
                minval=0, maxval=vocab_size, dtype=tf.int32
            )
            fake_sequences = _sample_token_ids(generator(noise, training=True))
            with tf.GradientTape() as tape:
                real_output = discriminator(real_sequences, training=True)
                d_fake_output = discriminator(fake_sequences, training=True)
                real_loss = loss_fn(tf.ones_like(real_output), real_output)
                fake_loss = loss_fn(tf.zeros_like(d_fake_output), d_fake_output)
                d_loss = real_loss + fake_loss
            d_gradients = tape.gradient(d_loss, discriminator.trainable_variables)
            discriminator_optimizer.apply_gradients(zip(d_gradients, discriminator.trainable_variables))

            # Train Generator
            with tf.GradientTape() as tape:
                noise = tf.random.uniform(
                    shape=(batch_size, seq_length),
                    minval=0, maxval=vocab_size, dtype=tf.int32
                )
                token_probs = generator(noise, training=True)
                fake_sequences = _sample_token_ids(token_probs)
                fake_output = discriminator(fake_sequences, training=False)
                # Reported metric only; no gradient reaches the generator
                # through the sampled ids.
                g_loss = loss_fn(tf.ones_like(fake_output), fake_output)
                # -log p(sampled token) for every position.
                token_nll = keras.losses.sparse_categorical_crossentropy(fake_sequences, token_probs)
                reward = tf.stop_gradient(tf.reduce_mean(fake_output, axis=-1))
                advantage = reward - tf.reduce_mean(reward)
                # Minimising advantage * NLL raises the probability of
                # sequences scored above the batch average.
                policy_loss = tf.reduce_mean(advantage * tf.reduce_mean(token_nll, axis=-1))
            g_gradients = tape.gradient(policy_loss, generator.trainable_variables)
            generator_optimizer.apply_gradients(zip(g_gradients, generator.trainable_variables))

            # Accuracy uses the real and fake scores from the same
            # discriminator step.
            d_accuracy = ((real_output > 0.5).numpy().mean() + (d_fake_output <= 0.5).numpy().mean()) / 2
            epoch_d_loss.append(d_loss.numpy())
            epoch_g_loss.append(g_loss.numpy())
            epoch_d_acc.append(d_accuracy)

            if (batch + 1) % 10 == 0:
                print(f'  Batch {batch + 1}/{num_batches} - D_loss: {d_loss.numpy():.4f}, G_loss: {g_loss.numpy():.4f}')

        history['d_loss'].append(np.mean(epoch_d_loss))
        history['g_loss'].append(np.mean(epoch_g_loss))
        history['d_accuracy'].append(np.mean(epoch_d_acc))
        print(f'Epoch {epoch + 1} - D_loss: {history["d_loss"][-1]:.4f}, G_loss: {history["g_loss"][-1]:.4f}')

    return history

print('✅ Training function defined!')

In [None]:
# Run adversarial training; the per-epoch metrics are kept for the plots in Section 8.
print('🚀 Starting training...\n')
training_history = train_gan(generator, discriminator, sequences, EPOCHS, BATCH_SIZE)
print('\n✅ Training complete!')

## 🎭 Section 7: Dialogue Generation

In [None]:
def generate_dialogue(generator, tokenizer, seed_text='', max_length=50):
    """Generate dialogue.

    Feeds one input sequence through the generator and decodes the most
    likely token at every position.

    Args:
        generator: Model returning per-position vocabulary probabilities.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seed_text: Optional prompt; tokenised and left-padded to
            ``max_length``. When empty, random ids in ``[1, vocab_size)`` are
            used instead (``vocab_size`` is read from module scope).
        max_length: Length of the input sequence.

    Returns:
        The decoded words joined by spaces. Predicted ids that have no entry
        in ``tokenizer.word_index`` (such as the padding id 0) are skipped.
    """
    if seed_text:
        sequence = tokenizer.texts_to_sequences([seed_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length, padding='pre')
    else:
        sequence = np.random.randint(1, vocab_size, size=(1, max_length))
    predictions = generator(sequence, training=False)
    predicted_ids = tf.argmax(predictions, axis=-1).numpy()[0]
    # Build the reverse lookup once instead of scanning word_index per token.
    index_to_word = {word_idx: word for word, word_idx in tokenizer.word_index.items()}
    words = [index_to_word[int(idx)] for idx in predicted_ids if int(idx) in index_to_word]
    return ' '.join(words)

print('✅ Generation function defined!')

In [None]:
# Print five samples; no seed_text is passed, so each call starts from random token ids.
print('🎭 Generating dialogues...\n')
for i in range(5):
    dialogue = generate_dialogue(generator, tokenizer, max_length=30)
    print(f'{i+1}. {dialogue}')
print('\n✅ Generation complete!')

## 📊 Section 8: Visualization

In [None]:
# Two side-by-side panels of the per-epoch means recorded by train_gan.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Left panel: discriminator and generator loss.
axes[0].plot(training_history['d_loss'], label='D Loss')
axes[0].plot(training_history['g_loss'], label='G Loss')
axes[0].set_title('Training Loss')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].legend()
axes[0].grid(True)
# Right panel: discriminator accuracy.
axes[1].plot(training_history['d_accuracy'], label='D Accuracy', color='green')
axes[1].set_title('Discriminator Accuracy')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy')
axes[1].legend()
axes[1].grid(True)
plt.tight_layout()
plt.show()

## 💾 Section 9: Model Persistence

In [None]:
# Save models
# Writes both networks and the fitted tokenizer into ./models (created if missing).
# NOTE(review): generator and discriminator are subclassed keras.Model
# instances; whole-model HDF5 (.h5) saving may be unsupported for subclassed
# models depending on the Keras version - confirm, and consider
# save_weights() if this raises.
os.makedirs('models', exist_ok=True)
generator.save('models/generator.h5')
discriminator.save('models/discriminator.h5')
# The tokenizer is pickled so the word/index mapping can be restored later;
# only unpickle this file from a trusted source.
with open('models/tokenizer.pickle', 'wb') as f:
    pickle.dump(tokenizer, f)
print('✅ Models saved!')

## 🎯 Section 10: Summary

### ✅ Complete!

**Built:**
1. DialogueParser - Multi-format parsing
2. FlexibleDialogueDataProcessor - Tokenization
3. GAN Architecture - Generator & Discriminator
4. Training Pipeline - Adversarial training
5. Generation System - Create new dialogues
6. Visualization - Training metrics
7. Persistence - Save models and tokenizer (loading is not implemented in this notebook)

**Next Steps:**
- Train longer for better quality
- Experiment with hyperparameters
- Add character-specific generation
- Implement evaluation metrics

🎉 **Congratulations!**