In [2]:
import numpy as np
import pandas as pd
import re
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader

class TextPreprocessor:
    def __init__(self, max_vocab_size=10000, max_seq_length=512):
        self.max_vocab_size = max_vocab_size
        self.max_seq_length = max_seq_length
        self.word2idx = {}
        self.idx2word = {}
        self.vocab_size = 0
        
        # print(f"Initialized TextPreprocessor with max_vocab_size={max_vocab_size}, max_seq_length={max_seq_length}")
        
    def clean_text(self, text):
        """Clean and normalize text"""
        if pd.isna(text):  # Handle NaN values
            return ""
            
        # print("\n--- Text Cleaning Steps ---")
        # print(f"Original text: {text}")
        
        # Convert to lowercase
        text = str(text).lower()
        # print(f"After lowercase: {text}")
        
        # Remove special characters and extra spaces
        text = re.sub(r'[^\w\s]', '', text)
        # print(f"After removing special chars: {text}")
        
        text = re.sub(r'\s+', ' ', text).strip()
        # print(f"After normalizing spaces: {text}")
        
        return text
    
    def build_vocabulary(self, texts):
        """Build vocabulary from list of texts"""
        print("\n=== Building Vocabulary ===")
        print(f"Processing {len(texts)} texts")
        
        word_counts = Counter()
        for i, text in enumerate(texts):
            cleaned_text = self.clean_text(text)
            words = cleaned_text.split()
            word_counts.update(words)
            # if i % 1000 == 0:
            #     print(f"Processed {i} texts...")
        
        print("\n--- Vocabulary Statistics ---")
        print(f"Total unique words: {len(word_counts)}")
        print("Most common 10 words:")
        for word, count in word_counts.most_common(10):
            print(f"{word}: {count}")
        
        # Keep most common words
        vocab_words = ['<PAD>', '<UNK>'] + [word for word, _ in 
                      word_counts.most_common(self.max_vocab_size - 2)]
        
        # Create word to index mappings
        self.word2idx = {word: idx for idx, word in enumerate(vocab_words)}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        self.vocab_size = len(self.word2idx)
        
        print(f"\nFinal vocabulary size: {self.vocab_size}")  # Keep this print statement
        print("Sample word indices:")
        for word in list(self.word2idx.keys())[:5]:
            print(f"{word}: {self.word2idx[word]}")
    
    def encode_text(self, text):
        """Convert text to sequence of indices"""
        cleaned_text = self.clean_text(text)
        words = cleaned_text.split()
        
        # print("\n--- Encoding Text ---")
        # print(f"Cleaned text: {cleaned_text}")
        # print(f"Number of words: {len(words)}")
        
        # Truncate or pad sequence
        if len(words) > self.max_seq_length:
            # print(f"Truncating sequence from {len(words)} to {self.max_seq_length}")
            words = words[:self.max_seq_length]
        else:
            padding_length = self.max_seq_length - len(words)
            # print(f"Adding {padding_length} padding tokens")
            words = words + ['<PAD>'] * padding_length
            
        # Convert words to indices
        indices = [self.word2idx.get(word, self.word2idx['<UNK>']) 
                  for word in words]
        
        # print("First 10 indices:", indices[:10])
        return indices

class TextDataset(Dataset):
    def __init__(self, texts, labels, preprocessor):
        self.texts = texts
        self.labels = labels
        self.preprocessor = preprocessor
        
        print(f"Created dataset with {len(texts)} samples")  # Keep this print statement
        # print(f"Label distribution: {Counter(labels)}")
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # print(f"\n--- Processing item {idx} ---")
        # print(f"Original text: {text[:50]}...")
        
        # Convert text to tensor
        encoded_text = self.preprocessor.encode_text(text)
        text_tensor = torch.tensor(encoded_text, dtype=torch.long)
        
        # Convert label to tensor
        label_tensor = torch.tensor(label, dtype=torch.long)
        
        # print(f"Text tensor shape: {text_tensor.shape}")
        # print(f"Label: {label}")
        
        return text_tensor, label_tensor

def prepare_data(file_path, preprocessor, batch_size=32, text_column='text', label_column='label'):
    """Prepare data for training using pandas"""
    print("\n=== Data Preparation Started ===")  # Keep this print statement
    
    # Read data using pandas
    try:
        # First, try reading as CSV
        df = pd.read_csv(file_path)
        print("Successfully read CSV file")
    except:
        try:
            # If CSV fails, try reading as Excel
            df = pd.read_excel(file_path)
            print("Successfully read Excel file")
        except Exception as e:
            print(f"Error reading file: {e}")
            return None
    
    # Display basic dataset information
    print("\n=== Dataset Information ===")
    print(df.info())
    print("\n=== Value Counts for Labels ===")
    print(df[label_column].value_counts())
    
    # Basic data cleaning
    # Remove any rows with NaN in text or label columns
    df = df.dropna(subset=[text_column, label_column])
    
    # Convert text and labels to lists
    texts = df[text_column].tolist()
    labels = df[label_column].tolist()
    
    print(f"Loaded {len(texts)} samples")  # Keep this print statement
    
    # Build vocabulary
    preprocessor.build_vocabulary(texts)
    
    # Create dataset
    dataset = TextDataset(texts, labels, preprocessor)
    
    # Create dataloader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    print(f"Created DataLoader with batch size {batch_size}")  # Keep this print statement
    
    # Optional: Display sample of processed data
    print("\n=== Sample of Processed Data ===")
    sample_idx = np.random.randint(0, len(dataset), 3)
    for idx in sample_idx:
        text_tensor, label_tensor = dataset[idx]
        print(f"\nSample {idx}:")
        print(f"Original text: {texts[idx][:100]}...")
        print(f"Label: {labels[idx]}")
        print(f"Tensor shape: {text_tensor.shape}")
    
    return dataloader, preprocessor.vocab_size, df

# Usage example:
if __name__ == "__main__":
    # Initialize preprocessor
    preprocessor = TextPreprocessor()
    
    # Prepare data
    print("\n=== Starting Data Processing ===")  # Keep this print statement
    
    # Assuming the CSV has 'text' and 'label' columns - adjust as needed
    dataloader, vocab_size, df = prepare_data('train.csv', 
                                            preprocessor,
                                            text_column='text',  # Change to your text column name
                                            label_column='label')  # Change to your label column name
    
    if dataloader is not None:
        # Print some statistics
        print(f"Vocabulary size: {vocab_size}")  # Keep this print statement
        print(f"Number of batches: {len(dataloader)}")  # Keep this print statement
        
        # Example of accessing a batch
        for batch_texts, batch_labels in dataloader:
            print(f"Batch shape: {batch_texts.shape}")  # Keep this print statement
            print(f"Labels shape: {batch_labels.shape}")  # Keep this print statement
            # print("Sample sequence:", batch_texts[0][:10])
            # print("Sample label:", batch_labels[0])
            break

    # #Optional: Data distribution visualization
    # import matplotlib.pyplot as plt
    # plt.figure(figsize=(10, 5))
    # df['label'].value_counts().plot(kind='bar')
    # plt.title('Label Distribution')
    # plt.xlabel('Label')
    # plt.ylabel('Count')
    # plt.show()


=== Starting Data Processing ===

=== Data Preparation Started ===
Successfully read CSV file

=== Dataset Information ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   label   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB
None

=== Value Counts for Labels ===
label
0    144277
1     15294
Name: count, dtype: int64
Loaded 159571 samples

=== Building Vocabulary ===
Processing 159571 texts

--- Vocabulary Statistics ---
Total unique words: 251359
Most common 10 words:
the: 495403
to: 296836
of: 224012
and: 222337
a: 214182
you: 204486
i: 200497
is: 175943
that: 154272
in: 144156

Final vocabulary size: 10000
Sample word indices:
<PAD>: 0
<UNK>: 1
the: 2
to: 3
of: 4
Created dataset with 159571 samples
Created DataLoader with batch size 32

=== Sample of Processed Data ===

Sample 85

In [3]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    """
    Implementation of Multi-Head Attention as described in Section 3.2.2 of "Attention Is All You Need"
    Multi-head attention allows the model to jointly attend to information from different representation 
    subspaces at different positions.
    """
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model  # Model's dimension
        self.num_heads = num_heads  # Number of attention heads
        self.d_k = d_model // num_heads  # Dimension of each head's key/query
        
        # Linear projections for Q, K, V, and output as per Section 3.2.2
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout)
        
    def attention(self, q, k, v, mask=None):
        """
        Scaled Dot-Product Attention as defined in Section 3.2.1
        attention(Q,K,V) = softmax(QK^T/sqrt(d_k))V
        The scaling factor of sqrt(d_k) prevents softmax from having extremely small gradients
        """
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        attn_probs = F.softmax(scores, dim=-1)
        attn_probs = self.dropout(attn_probs)
        
        return torch.matmul(attn_probs, v), attn_probs

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)
        
        # Linear projections and split into heads
        q = self.q_linear(q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        
        # Apply attention on all the projected vectors in batch
        x, attn_probs = self.attention(q, k, v, mask)
        
        # Concatenate and apply final linear layer
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        
        return self.out_linear(x)

class PositionalEncoding(nn.Module):
    """
    Implements positional encoding as described in Section 3.5
    Adds positional information to the input embeddings to provide sequence order information,
    since the attention mechanism itself is permutation-invariant
    """
    def __init__(self, d_model, max_seq_length=512, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        
        # Create positional encoding matrix
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        return self.dropout(x + self.pe[:, :x.size(1)])

class TransformerEncoderLayer(nn.Module):
    """
    Single layer of the Transformer encoder as described in Section 3.1
    Combines multi-head self-attention with position-wise feed-forward networks
    """
    def __init__(self, d_model, num_heads, d_ff=2048, dropout=0.1):
        super().__init__()
        
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        
        # Position-wise Feed-Forward Networks
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model)
        )
        
        # Layer Normalization and Dropout
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None):
        # Self-attention with residual connection and layer norm
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        # Feed-forward with residual connection and layer norm
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        
        return x

class SelfAttentionClassifier(nn.Module):
    """
    Complete self-attention model for text classification
    Combines token embeddings, positional encoding, transformer encoder layers,
    and a classification head
    """
    def __init__(self, vocab_size, d_model=512, num_heads=8, num_layers=6, 
                 num_classes=2, max_seq_length=512, dropout=0.1):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_length, dropout)
        
        # Stack of Transformer Encoder layers
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, num_heads, d_model * 4, dropout)
            for _ in range(num_layers)
        ])
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, num_classes)
        )
        
        # Initialize parameters with Xavier/Glorot initialization
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    
    def forward(self, x, mask=None):
        # Token embeddings and positional encoding
        x = self.embedding(x)
        x = self.pos_encoding(x)
        
        # Apply transformer encoder layers
        for layer in self.encoder_layers:
            x = layer(x, mask)
        
        # Global average pooling over sequence length
        x = x.mean(dim=1)
        
        # Classification
        return self.classifier(x)

def create_attention_mask(seq_len):
    """
    Creates attention mask for self-attention as described in Section 3.2.3
    Prevents positions from attending to subsequent positions
    """
    mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
    return ~mask

# Training utilities
def train_epoch(model, dataloader, criterion, optimizer, device):
    """
    Training loop for one epoch
    Implements the training procedure with attention mechanisms
    """
    model.train()
    total_loss = 0
    
    for batch_idx, (texts, labels) in enumerate(dataloader):
        texts, labels = texts.to(device), labels.to(device)
        
        # Create attention mask
        mask = create_attention_mask(texts.size(1)).to(device)
        
        # Forward pass
        optimizer.zero_grad()
        outputs = model(texts, mask)
        loss = criterion(outputs, labels)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        if batch_idx % 100 == 0:
            print(f'Batch {batch_idx}, Loss: {loss.item():.4f}')
    
    return total_loss / len(dataloader)

# Example usage:
if __name__ == "__main__":
    # Model parameters
    VOCAB_SIZE = 10000
    D_MODEL = 512
    NUM_HEADS = 8
    NUM_LAYERS = 6
    NUM_CLASSES = 2
    MAX_SEQ_LENGTH = 512
    DROPOUT = 0.1
    
    # Initialize model
    model = SelfAttentionClassifier(
        vocab_size=VOCAB_SIZE,
        d_model=D_MODEL,
        num_heads=NUM_HEADS,
        num_layers=NUM_LAYERS,
        num_classes=NUM_CLASSES,
        max_seq_length=MAX_SEQ_LENGTH,
        dropout=DROPOUT
    )
    
    print("Model architecture:")
    print(model)
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"\nTotal parameters: {total_params:,}")

Model architecture:
SelfAttentionClassifier(
  (embedding): Embedding(10000, 512)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layers): ModuleList(
    (0-5): 6 x TransformerEncoderLayer(
      (self_attn): MultiHeadAttention(
        (q_linear): Linear(in_features=512, out_features=512, bias=True)
        (k_linear): Linear(in_features=512, out_features=512, bias=True)
        (v_linear): Linear(in_features=512, out_features=512, bias=True)
        (out_linear): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=2048, out_features=512, bias=True)
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affin

In [4]:
import torch
import numpy as np
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

class WordEmbeddingVisualizer:
    """
    Creates interactive visualizations of word embeddings in latent space
    with clustering and dynamic exploration capabilities.
    """
    def __init__(self, model, preprocessor):
        self.model = model
        self.preprocessor = preprocessor
        self.device = next(model.parameters()).device
        self.embeddings = None
        self.words = None
        
    def extract_embeddings(self, n_words=1000):
        """Extract embeddings from the model for visualization"""
        # Get most common words
        words = list(self.preprocessor.word2idx.keys())[:n_words]
        word_indices = [self.preprocessor.word2idx[word] for word in words]
        
        # Convert to tensor and get embeddings
        indices_tensor = torch.tensor(word_indices).to(self.device)
        with torch.no_grad():
            embeddings = self.model.embedding(indices_tensor).cpu().numpy()
            
        self.embeddings = embeddings
        self.words = words
        return embeddings, words
    
    def reduce_dimensions(self, method='tsne', n_components=2):
        """Reduce embedding dimensions for visualization"""
        if self.embeddings is None:
            self.extract_embeddings()
            
        if method.lower() == 'tsne':
            reducer = TSNE(n_components=n_components, 
                         random_state=42,
                         perplexity=min(30, len(self.embeddings)-1))
            reduced_embeddings = reducer.fit_transform(self.embeddings)
        else:
            try:
                from umap.umap_ import UMAP
                reducer = UMAP(n_components=n_components, random_state=42)
                reduced_embeddings = reducer.fit_transform(self.embeddings)
            except ImportError:
                print("UMAP not available. Falling back to t-SNE...")
                reducer = TSNE(n_components=n_components, 
                             random_state=42,
                             perplexity=min(30, len(self.embeddings)-1))
                reduced_embeddings = reducer.fit_transform(self.embeddings)
                
        return reduced_embeddings
    
    def cluster_words(self, n_clusters=5):
        """Cluster words based on their embeddings"""
        if self.embeddings is None:
            self.extract_embeddings()
            
        kmeans = KMeans(n_clusters=min(n_clusters, len(self.embeddings)), 
                       random_state=42)
        clusters = kmeans.fit_predict(self.embeddings)
        return clusters
    
    def create_interactive_plot(self, reduction_method='tsne', n_clusters=5):
        """Create an interactive plot of word embeddings"""
        # Reduce dimensions
        reduced_embeddings = self.reduce_dimensions(method=reduction_method)
        
        # Get clusters
        clusters = self.cluster_words(n_clusters=n_clusters)
        
        # Create DataFrame for plotting
        df = pd.DataFrame({
            'x': reduced_embeddings[:, 0],
            'y': reduced_embeddings[:, 1],
            'word': self.words,
            'cluster': clusters
        })
        
        # Create interactive plot
        fig = px.scatter(
            df,
            x='x',
            y='y',
            color='cluster',
            hover_data=['word'],
            text='word',
            title=f'Word Embeddings ({reduction_method.upper()}) with {n_clusters} clusters'
        )
        
        # Update layout
        fig.update_traces(
            textposition='top center',
            marker=dict(size=8),
            textfont=dict(size=10)
        )
        fig.update_layout(
            height=800,
            showlegend=True,
            title_x=0.5,
            title_y=0.95
        )
        
        return fig
    
    def create_3d_plot(self, n_clusters=5):
        """Create 3D interactive plot of word embeddings"""
        # Get 3D embeddings using t-SNE
        reduced_embeddings = self.reduce_dimensions(method='tsne', n_components=3)
        clusters = self.cluster_words(n_clusters=n_clusters)
        
        # Create DataFrame
        df = pd.DataFrame({
            'x': reduced_embeddings[:, 0],
            'y': reduced_embeddings[:, 1],
            'z': reduced_embeddings[:, 2],
            'word': self.words,
            'cluster': clusters
        })
        
        # Create 3D scatter plot
        fig = go.Figure(data=[go.Scatter3d(
            x=df['x'],
            y=df['y'],
            z=df['z'],
            mode='markers+text',
            text=df['word'],
            hoverinfo='text',
            marker=dict(
                size=5,
                color=df['cluster'],
                colorscale='Viridis',
                opacity=0.8
            ),
            textposition='top center'
        )])
        
        # Update layout
        fig.update_layout(
            title=dict(
                text='3D Word Embeddings Visualization',
                y=0.95
            ),
            scene=dict(
                xaxis_title='Dimension 1',
                yaxis_title='Dimension 2',
                zaxis_title='Dimension 3'
            ),
            height=800
        )
        
        return fig
    
    def find_similar_words(self, word, n=5):
        """Find most similar words based on embedding distance"""
        if word not in self.words:
            return []
            
        word_idx = self.words.index(word)
        word_embedding = self.embeddings[word_idx]
        
        # Calculate distances
        distances = np.linalg.norm(self.embeddings - word_embedding, axis=1)
        closest_indices = np.argsort(distances)[1:n+1]  # Exclude the word itself
        
        similar_words = [(self.words[idx], distances[idx]) for idx in closest_indices]
        return similar_words

def visualize_embeddings(model, preprocessor, n_words=500, n_clusters=8):
    """
    Creates both 2D and 3D visualizations of word embeddings
    
    Parameters:
    - model: trained model with embedding layer
    - preprocessor: text preprocessor with vocabulary
    - n_words: number of words to visualize
    - n_clusters: number of clusters for visualization
    
    Returns:
    - fig_2d: 2D interactive plot
    - fig_3d: 3D interactive plot
    - visualizer: WordEmbeddingVisualizer instance
    """
    visualizer = WordEmbeddingVisualizer(model, preprocessor)
    
    # Extract embeddings
    print("Extracting embeddings...")
    visualizer.extract_embeddings(n_words=n_words)
    
    # Create 2D visualization
    print("Creating 2D visualization...")
    fig_2d = visualizer.create_interactive_plot(reduction_method='tsne', 
                                              n_clusters=n_clusters)
    
    # Create 3D visualization
    print("Creating 3D visualization...")
    fig_3d = visualizer.create_3d_plot(n_clusters=n_clusters)
    
    # Find similar words example
    sample_word = visualizer.words[0]
    similar_words = visualizer.find_similar_words(sample_word, n=5)
    print(f"\nWords most similar to '{sample_word}':")
    for word, distance in similar_words:
        print(f"{word}: {distance:.3f}")
    
    return fig_2d, fig_3d, visualizer

if __name__ == "__main__":
    # Example usage
    # Assuming you have your trained model and preprocessor
    fig_2d, fig_3d, visualizer = visualize_embeddings(model, preprocessor)
    
    # Display the plots
    fig_2d.show()
    fig_3d.show()

Extracting embeddings...
Creating 2D visualization...
Creating 3D visualization...

Words most similar to '<PAD>':
has: 0.414
life: 0.417
links: 0.417
four: 0.419
sure: 0.419


In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
import torch
import torch.nn as nn
import math
import torch.nn.functional as F
import numpy as np
from typing import List, Dict, Any
import os
from openai import OpenAI

# [Previous Self-Attention Model Classes: MultiHeadAttention, PositionalEncoding, TransformerEncoderLayer, SelfAttentionClassifier]

def create_attention_mask(seq_len):
    """Creates attention mask for self-attention"""
    mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
    return ~mask

# [Previous imports remain the same...]

class IntegratedToxicityFilter:
    """Direct integration of self-attention model for toxicity filtering"""
    def __init__(self, vocab_size=10000, device='cuda' if torch.cuda.is_available() else 'cpu', threshold=0.5):
        self.device = device
        self.threshold = threshold
        
        # Initialize self-attention model directly
        self.model = SelfAttentionClassifier(
            vocab_size=vocab_size,
            d_model=512,
            num_heads=8,
            num_layers=6,
            num_classes=2,
            max_seq_length=512,
            dropout=0.1
        ).to(device)
        
        self.model.eval()
        print(f"Initialized toxicity filter with threshold {threshold}")
    
    def text_to_tensor(self, text: str) -> torch.Tensor:
        """Convert text to tensor using basic tokenization"""
        tokens = text.lower().split()
        if not tokens:  # Handle empty text
            tokens = ['<pad>']
            
        indices = [hash(token) % (self.model.embedding.num_embeddings - 1) + 1 for token in tokens]
        if len(indices) > self.model.pos_encoding.pe.size(1):
            indices = indices[:self.model.pos_encoding.pe.size(1)]
        else:
            indices += [0] * (self.model.pos_encoding.pe.size(1) - len(indices))
        return torch.tensor(indices, dtype=torch.long)
    
    def is_toxic(self, text: str) -> bool:
        """Determines if text contains toxic content"""
        if not text.strip():  # Handle empty text
            return False
            
        try:
            with torch.no_grad():
                text_tensor = self.text_to_tensor(text).unsqueeze(0).to(self.device)
                mask = create_attention_mask(text_tensor.size(1)).to(self.device)
                output = self.model(text_tensor, mask)
                probabilities = torch.softmax(output, dim=1)
                toxic_prob = probabilities[0][1].item()
                return toxic_prob > self.threshold
        except Exception as e:
            print(f"Error in toxicity detection: {str(e)}")
            return False  # Default to non-toxic on error

    def filter_text(self, text: str, debug=False) -> str:
        """Filters toxic content by analyzing sentences"""
        if not text.strip():
            return ""
            
        sentences = text.split('.')
        clean_sentences = []
        toxic_sentences = []
        
        for sent in sentences:
            sent = sent.strip()
            if not sent:
                continue
                
            if self.is_toxic(sent):
                toxic_sentences.append(sent)
            else:
                clean_sentences.append(sent)
        
        if debug:
            print(f"Total sentences: {len(sentences)}")
            print(f"Clean sentences: {len(clean_sentences)}")
            print(f"Toxic sentences: {len(toxic_sentences)}")
        
        return '. '.join(clean_sentences)

class OpenAIDocumentQA:
    """Document QA system combining OpenAI with self-attention toxicity filtering"""
    def __init__(self, 
                 api_key: str,
                 model_name: str = "gpt-3.5-turbo",
                 temperature: float = 0.7,
                 max_tokens: int = 500,
                 toxic_threshold: float = 0.7):
        
        # Initialize toxic filter with adjusted threshold
        self.toxic_filter = IntegratedToxicityFilter(threshold=toxic_threshold)
        
        # Initialize OpenAI components with API key
        self.embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        self.llm = ChatOpenAI(
            openai_api_key=api_key,
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens
        )
        
        # Initialize text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len
        )
        
        self.vectorstore = None
        self.qa_chain = None
        self.client = OpenAI(api_key=api_key)
        
        print("Initialized OpenAI Document QA system with Self-Attention toxicity filtering")
    
    def load_document(self, pdf_path: str):
        """Loads and processes PDF document"""
        try:
            print(f"Loading document: {pdf_path}")
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()
            
            print("Splitting document into chunks...")
            texts = self.text_splitter.split_documents(pages)
            print(f"Created {len(texts)} text chunks")
            
            # Filter toxic content with debugging
            clean_texts = []
            for i, text in enumerate(texts):
                print(f"\nProcessing chunk {i+1}/{len(texts)}")
                print(f"Original chunk length: {len(text.page_content)}")
                
                clean_content = self.toxic_filter.filter_text(text.page_content, debug=True)
                
                if clean_content.strip():
                    text.page_content = clean_content
                    clean_texts.append(text)
                    print(f"Clean chunk length: {len(clean_content)}")
                else:
                    print("Chunk filtered out completely")
            
            print(f"\nRetained {len(clean_texts)} clean text chunks")
            
            if not clean_texts:
                print("Warning: No clean text chunks remained after filtering!")
                print("Using original chunks with increased toxicity threshold...")
                self.toxic_filter.threshold = 0.9
                clean_texts = []
                for text in texts:
                    clean_content = self.toxic_filter.filter_text(text.page_content)
                    if clean_content.strip():
                        text.page_content = clean_content
                        clean_texts.append(text)
            
            if not clean_texts:
                raise ValueError("No usable text chunks after filtering. Please check document content.")
            
            # Create vector store
            print("Creating vector store with OpenAI embeddings...")
            self.vectorstore = FAISS.from_documents(clean_texts, self.embeddings)
            print("Document processing complete")
            
        except Exception as e:
            print(f"Error processing document: {str(e)}")
            raise
    
    def setup_retrieval_qa(self):
        """Sets up the retrieval QA chain"""
        if not self.vectorstore:
            raise ValueError("Please load a document first")
            
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vectorstore.as_retriever(
                search_kwargs={"k": 3}
            ),
            return_source_documents=True
        )
    
    def answer_question(self, question: str) -> Dict[str, Any]:
        """Answers questions with toxic content filtered"""
        try:
            if not self.qa_chain:
                self.setup_retrieval_qa()
            
            print(f"Processing question: {question}")
            
            result = self.qa_chain({"query": question})
            clean_answer = self.toxic_filter.filter_text(result['result'])
            
            sources = []
            for doc in result['source_documents']:
                sources.append({
                    'content': doc.page_content[:200] + "...",
                    'metadata': doc.metadata
                })
            
            return {
                'question': question,
                'answer': clean_answer,
                'sources': sources
            }
            
        except Exception as e:
            print(f"Error answering question: {str(e)}")
            raise

# Example usage
if __name__ == "__main__":
    try:
        # Get API key from user
        api_key = input("Please enter your OpenAI API key: ")
        
        print("Initializing Document QA system...")
        doc_qa = OpenAIDocumentQA(
            api_key=api_key,
            toxic_threshold=0.7
        )
        
        # Get PDF path
        pdf_path = 'Character Conversations.pdf'
        doc_qa.load_document(pdf_path)
        
        # Interactive QA
        while True:
            question = input("\nEnter your question (or 'quit' to exit): ")
            if question.lower() == 'quit':
                break
                
            try:
                result = doc_qa.answer_question(question)
                print(f"\nAnswer: {result['answer']}")
                print("\nSources:")
                for source in result['sources']:
                    print(f"- {source['content']}")
            except Exception as e:
                print(f"Error processing question: {str(e)}")
                continue
                
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        print("\nRequired packages:")
        print("pip install langchain langchain-openai openai pypdf faiss-cpu torch")

Initializing Document QA system...
Initialized toxicity filter with threshold 0.7
Initialized OpenAI Document QA system with Self-Attention toxicity filtering
Loading document: Character Conversations.pdf
Splitting document into chunks...
Created 9 text chunks

Processing chunk 1/9
Original chunk length: 464
Total sentences: 6
Clean sentences: 0
Toxic sentences: 5
Chunk filtered out completely

Processing chunk 2/9
Original chunk length: 492
Total sentences: 9
Clean sentences: 0
Toxic sentences: 8
Chunk filtered out completely

Processing chunk 3/9
Original chunk length: 487
Total sentences: 8
Clean sentences: 0
Toxic sentences: 7
Chunk filtered out completely

Processing chunk 4/9
Original chunk length: 90
Total sentences: 1
Clean sentences: 0
Toxic sentences: 1
Chunk filtered out completely

Processing chunk 5/9
Original chunk length: 477
Total sentences: 10
Clean sentences: 0
Toxic sentences: 9
Chunk filtered out completely

Processing chunk 6/9
Original chunk length: 486
Total sent