In [None]:
pip install torch torchtext transformers rouge-score pandas numpy matplotlib

Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=3e187dd94b5fd88d8f62b1bafd1e21016cd2ab60993db32953797278e35209b6
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score, torchtext
Successfully installed rouge-score-0.1.2 torchtext-0.18.0


## **V1**

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import matplotlib.pyplot as plt
from transformers import BertTokenizer
import math
from tqdm import tqdm

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
class ProductDescriptionDataset(Dataset):
    """Map-style dataset of (product, description) text pairs, tokenized on access.

    Args:
        products: Indexable collection of product texts.
        descriptions: Indexable collection of description texts; must have the
            same length as ``products``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``.
            Both tensors are assumed to carry a leading batch dimension of 1.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, products, descriptions, tokenizer, max_length=128):
        assert len(products) == len(descriptions), "Products and descriptions must have same length"
        self.products = products
        self.descriptions = descriptions
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (product, description) pairs."""
        return len(self.products)

    def _encode(self, text):
        """Tokenize one string to a fixed-length encoding."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the tokenized pair at ``idx`` as a dict of 1-D tensors.

        Keys: ``product_ids``, ``product_mask``, ``description_ids``,
        ``description_mask``.

        Raises:
            IndexError: (or whatever the underlying containers raise) when
                ``idx`` is out of range.
            Exception: any tokenizer failure is reported on stdout and re-raised.
        """
        # Look the raw items up outside the try-block: an out-of-range index
        # should propagate directly instead of failing a second time inside
        # the error-reporting handler below.
        raw_product = self.products[idx]
        raw_description = self.descriptions[idx]

        try:
            product_encoding = self._encode(str(raw_product))
            description_encoding = self._encode(str(raw_description))

            # squeeze(0) drops only the batch dimension. A bare squeeze() would
            # also collapse the sequence dimension when max_length == 1.
            return {
                'product_ids': product_encoding['input_ids'].squeeze(0),
                'product_mask': product_encoding['attention_mask'].squeeze(0),
                'description_ids': description_encoding['input_ids'].squeeze(0),
                'description_mask': description_encoding['attention_mask'].squeeze(0)
            }
        except Exception as e:
            print(f"Error processing item at index {idx}: {e}")
            print(f"Product: {raw_product}")
            print(f"Description: {raw_description}")
            raise
# Positional Encoding
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch-first embedding.

    Args:
        d_model: Embedding width. Both even and odd widths are supported.
        max_len: Longest sequence length for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model the cosine (odd-index) half has one column fewer than
        # the sine half, so trim div_term to match. No-op for even d_model.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, d_model] so it broadcasts over the batch
        # Registered as a buffer: follows .to(device) and is stored in the state dict.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as [batch, seq_len, d_model].
        """
        return x + self.pe[:, :x.size(1)]

# Transformer Model
class ProductTransformer(nn.Module):
    """Encoder-decoder Transformer mapping product token ids to description logits.

    Source and target share one embedding table and one positional encoder.

    Args:
        vocab_size: Size of the token vocabulary (embedding rows / output logits).
        d_model: Embedding and hidden width.
        nhead: Number of attention heads.
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        dim_feedforward: Width of each layer's feed-forward sub-block.
        dropout: Dropout probability inside the Transformer layers.
        pad_token_id: Token id treated as padding when building the key-padding
            masks. Defaults to 0, the value that was previously hard-coded.
    """

    def __init__(self, vocab_size, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 pad_token_id=0):
        super().__init__()

        self.d_model = d_model
        self.pad_token_id = pad_token_id
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers)

        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers)

        self.output_layer = nn.Linear(d_model, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Uniformly initialise the embedding and output projection in [-0.1, 0.1]."""
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.output_layer.bias.data.zero_()
        self.output_layer.weight.data.uniform_(-initrange, initrange)

    def generate_square_subsequent_mask(self, sz):
        """Return a [sz, sz] float mask: 0.0 on/below the diagonal, -inf above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    @staticmethod
    def _causal_mask(sz, device):
        """Boolean [sz, sz] mask, True where a position may NOT be attended to.

        Built directly on ``device`` and boolean so that it has the same type as
        the key-padding masks (PyTorch deprecates mixing float and bool masks).
        """
        return torch.ones(sz, sz, device=device).triu(diagonal=1).bool()

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Compute next-token logits for ``tgt`` conditioned on ``src``.

        Args:
            src: Source token ids, indexed as [batch, src_len].
            tgt: Target (decoder input) token ids, indexed as [batch, tgt_len].
            src_mask: Accepted for call-site compatibility; not used. Padding
                masks are derived from ``src`` itself.
            tgt_mask: Accepted for call-site compatibility; not used. The causal
                mask is always built internally.

        Returns:
            Logits indexed as [batch, tgt_len, vocab_size].
        """
        # True marks padding positions that attention must ignore.
        src_padding_mask = src == self.pad_token_id
        tgt_padding_mask = tgt == self.pad_token_id

        # Each target position may only attend to itself and earlier positions.
        tgt_causal_mask = self._causal_mask(tgt.size(1), tgt.device)

        # Embed, scale, add positions, then switch to the sequence-first layout
        # expected by the (batch_first=False) Transformer layers.
        scale = math.sqrt(self.d_model)
        src_emb = self.pos_encoder(self.embedding(src) * scale).transpose(0, 1)
        tgt_emb = self.pos_encoder(self.embedding(tgt) * scale).transpose(0, 1)

        memory = self.transformer_encoder(src_emb, src_key_padding_mask=src_padding_mask)
        output = self.transformer_decoder(tgt_emb, memory,
                                          tgt_mask=tgt_causal_mask,
                                          tgt_key_padding_mask=tgt_padding_mask,
                                          memory_key_padding_mask=src_padding_mask)

        output = output.transpose(0, 1)  # back to [batch, tgt_len, d_model]
        return self.output_layer(output)
# Training function
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one teacher-forced training pass over ``dataloader``.

    Args:
        model: Module called as ``model(src, tgt_input)`` returning logits
            indexed as [batch, tgt_len, vocab].
        dataloader: Iterable of dict batches with ``'product_ids'`` and
            ``'description_ids'`` tensors.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss taking ([N, vocab] logits, [N] target ids).
        device: Device the batch tensors are moved to.

    Returns:
        Mean loss over the batches that completed successfully, or ``nan`` when
        no batch completed (empty loader, or every batch raised).

    Batches that raise ``RuntimeError`` are reported and skipped (best-effort
    training). They are excluded from the average so that skipped batches do
    not drag the reported loss toward zero.
    """
    model.train()
    total_loss = 0.0
    completed_batches = 0

    progress_bar = tqdm(dataloader, desc='Training')
    for batch in progress_bar:
        optimizer.zero_grad()

        try:
            src = batch['product_ids'].to(device)
            tgt = batch['description_ids'].to(device)

            # Teacher forcing: the decoder sees tokens [0, n-1) and must
            # predict tokens [1, n).
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            output = model(src, tgt_input)

            loss = criterion(output.reshape(-1, output.size(-1)),
                             tgt_output.reshape(-1))
            loss.backward()

            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            completed_batches += 1
            progress_bar.set_postfix({'loss': f'{batch_loss:.4f}'})

        except RuntimeError as e:
            print(f"Error in batch: {e}")
            continue

    if completed_batches == 0:
        return float('nan')
    return total_loss / completed_batches
# Evaluation function
def evaluate(model, dataloader, criterion, device):
    """Compute the mean teacher-forced loss of ``model`` over ``dataloader``.

    Args:
        model: Module called as ``model(src, tgt_input)`` returning logits
            indexed as [batch, tgt_len, vocab].
        dataloader: Iterable of dict batches with ``'product_ids'`` and
            ``'description_ids'`` tensors. No other keys are required.
        criterion: Loss taking ([N, vocab] logits, [N] target ids).
        device: Device the batch tensors are moved to.

    Returns:
        Mean per-batch loss as a float.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch['product_ids'].to(device)
            tgt = batch['description_ids'].to(device)

            # Same input/target shift as in training.
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            output = model(src, tgt_input)

            loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("evaluate() received an empty dataloader; cannot compute a mean loss")
    return total_loss / num_batches

# Main training loop
def train_model(model, train_dataloader, val_dataloader, optimizer, criterion, device,
                num_epochs=10, patience=3, checkpoint_path='best_model.pt'):
    """Train with per-epoch validation, best-model checkpointing and early stopping.

    Args:
        model: Model to train (updated in place).
        train_dataloader: Batches passed to ``train_epoch``.
        val_dataloader: Batches passed to ``evaluate``.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss function shared by training and validation.
        device: Device the batches are moved to.
        num_epochs: Maximum number of epochs.
        patience: Stop after this many consecutive epochs without a new best
            validation loss.
        checkpoint_path: File the best ``state_dict`` is saved to. Defaults to
            ``'best_model.pt'``, the previously hard-coded location.

    Returns:
        ``(train_losses, val_losses)``: per-epoch loss lists.
    """
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device)
        val_loss = evaluate(model, val_dataloader, criterion, device)

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Train Loss: {train_loss:.4f}')
        print(f'Val Loss: {val_loss:.4f}')

        if val_loss < best_val_loss:
            # New best: reset the patience counter and checkpoint the weights.
            best_val_loss = val_loss
            epochs_without_improvement = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print('Early stopping triggered')
                break

    return train_losses, val_losses

# Calculate BLEU and ROUGE scores
# def calculate_metrics(model, test_dataloader, tokenizer, device):
#     model.eval()
#     scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
#     bleu_scores = []
#     rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

#     with torch.no_grad():
#         for batch in test_dataloader:
#             src = batch['product_ids'].to(device)
#             tgt = batch['description_ids'].to(device)

#             # Generate description
#             output = model(src, tgt[:, :-1])
#             predicted_ids = torch.argmax(output, dim=-1)

#             # Convert ids to text
#             predicted_texts = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
#             target_texts = tokenizer.batch_decode(tgt, skip_special_tokens=True)

#             # Calculate BLEU
#             for pred, target in zip(predicted_texts, target_texts):
#                 bleu = bleu_score([pred.split()], [[target.split()]])
#                 bleu_scores.append(bleu)

#                 # Calculate ROUGE
#                 scores = scorer.score(pred, target)
#                 for key in rouge_scores:
#                     rouge_scores[key].append(scores[key].fmeasure)

#     avg_bleu = sum(bleu_scores) / len(bleu_scores)
#     avg_rouge = {key: sum(scores) / len(scores) for key, scores in rouge_scores.items()}

#     return avg_bleu, avg_rouge

# Plot training curves
def plot_training_curves(train_losses, val_losses):
    """Draw the per-epoch training and validation losses on a single figure."""
    plt.figure(figsize=(10, 6))

    curves = (
        (train_losses, 'Training Loss'),
        (val_losses, 'Validation Loss'),
    )
    for series, label in curves:
        plt.plot(series, label=label)

    axes = plt.gca()
    axes.set_xlabel('Epoch')
    axes.set_ylabel('Loss')
    axes.set_title('Training and Validation Loss Curves')
    axes.legend()
    axes.grid(True)

    plt.show()

# Reproducibility: one seed drives the data splits as well as torch's global RNG
# (weight initialisation, dropout, DataLoader shuffling).
SEED = 42
torch.manual_seed(SEED)

# Hyperparameters
hyperparams = {
    'd_model': 512,
    'nhead': 8,
    'num_encoder_layers': 6,
    'num_decoder_layers': 6,
    'dim_feedforward': 2048,
    'dropout': 0.1,
    'batch_size': 32,
    'learning_rate': 0.0001,
    'num_epochs': 2,
    'patience': 3
}

# Load data. The path lives on Google Drive, so the drive has to be mounted at
# /content/drive before this cell runs.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/GenAI/Project/clean.csv'
df = pd.read_csv(DATA_PATH)

# Make sure the required columns exist
if 'product' not in df.columns or 'description' not in df.columns:
    raise ValueError("DataFrame must contain 'product' and 'description' columns")

# Clean the data
df = df.dropna(subset=['product', 'description'])  # Remove rows with NaN values
df = df.reset_index(drop=True)  # Reset index after dropping NaN values

# Split data: 70% train, 15% validation, 15% test
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)

# Reset indices for all splits
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Verify data
print("Dataset sizes:")
print(f"Training set: {len(train_df)}")
print(f"Validation set: {len(val_df)}")
print(f"Test set: {len(test_df)}")

# Create datasets
train_dataset = ProductDescriptionDataset(train_df['product'].values,
                                          train_df['description'].values,
                                          tokenizer)
val_dataset = ProductDescriptionDataset(val_df['product'].values,
                                        val_df['description'].values,
                                        tokenizer)
test_dataset = ProductDescriptionDataset(test_df['product'].values,
                                         test_df['description'].values,
                                         tokenizer)

# Create dataloaders (only the training loader is shuffled)
train_dataloader = DataLoader(train_dataset,
                              batch_size=hyperparams['batch_size'],
                              shuffle=True)
val_dataloader = DataLoader(val_dataset,
                            batch_size=hyperparams['batch_size'])
test_dataloader = DataLoader(test_dataset,
                             batch_size=hyperparams['batch_size'])

# Print sample data
print("\nSample data from training set:")
print(train_df.head())

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ProductTransformer(
    vocab_size=tokenizer.vocab_size,
    d_model=hyperparams['d_model'],
    nhead=hyperparams['nhead'],
    num_encoder_layers=hyperparams['num_encoder_layers'],
    num_decoder_layers=hyperparams['num_decoder_layers'],
    dim_feedforward=hyperparams['dim_feedforward'],
    dropout=hyperparams['dropout']
).to(device)

# Initialize optimizer and criterion; padding positions do not contribute to the loss
optimizer = torch.optim.Adam(model.parameters(), lr=hyperparams['learning_rate'])
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Train model
train_losses, val_losses = train_model(
    model, train_dataloader, val_dataloader, optimizer, criterion, device,
    num_epochs=hyperparams['num_epochs'],
    patience=hyperparams['patience']
)

# Plot training curves
plot_training_curves(train_losses, val_losses)

# # Calculate metrics
# avg_bleu, avg_rouge = calculate_metrics(model, test_dataloader, tokenizer, device)
# print(f'Average BLEU score: {avg_bleu:.4f}')
# print('Average ROUGE scores:')
# for key, value in avg_rouge.items():
#     print(f'{key}: {value:.4f}')



Dataset sizes:
Training set: 941
Validation set: 202
Test set: 202


NameError: name 'tokenizer' is not defined

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. The training cell reads its CSV from a
# path under /content/drive, so this cell has to run before that one.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
from transformers import BertTokenizer
import numpy as np

class ProductDescriptionGenerator:
    """Loads a trained ``ProductTransformer`` checkpoint and samples descriptions.

    Args:
        model_path: Path to a ``state_dict`` saved with ``torch.save``.
        tokenizer: Tokenizer providing ``vocab_size``, ``cls_token_id``,
            ``sep_token_id``, ``__call__`` and ``decode``.
        device: Device to run on; defaults to CUDA when available, else CPU.
        model_config: Optional dict of ``ProductTransformer`` keyword arguments
            overriding the defaults below. It must match the architecture the
            checkpoint was trained with.
    """

    # Architecture used when no model_config is given (same values as training).
    DEFAULT_MODEL_CONFIG = {
        'd_model': 512,
        'nhead': 8,
        'num_encoder_layers': 6,
        'num_decoder_layers': 6,
        'dim_feedforward': 2048,
        'dropout': 0.1,
    }

    def __init__(self, model_path, tokenizer, device=None, model_config=None):
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = tokenizer
        self.model_config = {**self.DEFAULT_MODEL_CONFIG, **(model_config or {})}
        self.model = self.load_model(model_path)
        self.model.eval()

    def load_model(self, model_path):
        """Build the model, load the checkpoint at ``model_path`` and return it."""
        model = ProductTransformer(
            vocab_size=self.tokenizer.vocab_size,
            **self.model_config
        ).to(self.device)

        # weights_only=True restricts unpickling to tensors / plain containers,
        # which is all a state_dict holds. It avoids executing arbitrary pickled
        # code and silences torch.load's FutureWarning.
        state_dict = torch.load(model_path, map_location=self.device, weights_only=True)
        model.load_state_dict(state_dict)
        return model

    def generate_description(self, product_name, max_length=128, temperature=0.7, top_k=50):
        """
        Generate a description for a given product name.

        Args:
            product_name (str): Name of the product
            max_length (int): Padded length of the encoded product name and
                maximum number of generated tokens
            temperature (float): Sampling temperature (higher = more creative,
                lower = more focused); must be > 0
            top_k (int): Number of top tokens to sample from; must be >= 1 and
                is capped at the vocabulary size

        Returns:
            str: Generated description

        Raises:
            ValueError: if ``temperature`` is not positive or ``top_k`` < 1.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")
        if top_k < 1:
            raise ValueError(f"top_k must be >= 1, got {top_k}")

        self.model.eval()
        with torch.no_grad():
            # Tokenize input
            product_encoding = self.tokenizer(
                product_name,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            src = product_encoding['input_ids'].to(self.device)

            # The decoder input starts with the [CLS] token as start-of-sequence.
            tgt = torch.tensor([[self.tokenizer.cls_token_id]], device=self.device)

            # Generate tokens one by one
            for _ in range(max_length):
                output = self.model(src, tgt)

                # Temperature-scaled logits of the last position: [1, vocab]
                next_token_logits = output[:, -1, :] / temperature

                # Top-k sampling; never request more candidates than exist.
                k = min(top_k, next_token_logits.size(-1))
                top_k_logits, top_k_indices = torch.topk(next_token_logits, k, dim=-1)
                prob_distribution = torch.softmax(top_k_logits, dim=-1)

                # Sample a position among the k candidates and map it back to a
                # vocabulary id; both tensors are [1, 1].
                sampled_position = torch.multinomial(prob_distribution, num_samples=1)
                next_token = top_k_indices.gather(-1, sampled_position)

                # Stop if we predict the end token ([SEP]); it is not appended.
                if next_token.item() == self.tokenizer.sep_token_id:
                    break

                tgt = torch.cat([tgt, next_token], dim=1)

            # squeeze(0) keeps a list even when only the start token was produced.
            generated_description = self.tokenizer.decode(tgt.squeeze(0).tolist(),
                                                          skip_special_tokens=True)

            return generated_description

    def generate_batch_descriptions(self, product_names, batch_size=32, **kwargs):
        """
        Generate descriptions for a list of products in batches.

        Products are processed in chunks of ``batch_size``, but each description
        is still generated one at a time via ``generate_description``.

        Args:
            product_names (list): List of product names
            batch_size (int): Chunk size used to walk ``product_names``
            **kwargs: Additional arguments for generate_description

        Returns:
            list: List of generated descriptions, in input order
        """
        descriptions = []

        for i in range(0, len(product_names), batch_size):
            batch = product_names[i:i + batch_size]
            descriptions.extend(self.generate_description(name, **kwargs)
                                for name in batch)

        return descriptions

# Example usage
def main():
    """Demo: load the saved checkpoint and print single and batched generations."""
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    generator = ProductDescriptionGenerator(model_path='best_model.pt',
                                            tokenizer=bert_tokenizer)

    # Sampling settings shared by both demonstrations below.
    sampling_options = dict(max_length=128, temperature=0.7, top_k=50)

    # --- Single product ---
    product_name = "Wireless Bluetooth Headphones"
    description = generator.generate_description(product_name, **sampling_options)
    print(f"\nProduct: {product_name}")
    print(f"Generated Description: {description}")

    # --- Several products ---
    product_names = [
        "Smart Watch with Heart Rate Monitor",
        "Ultra HD 4K TV 55-inch",
        "Professional Coffee Maker",
    ]
    descriptions = generator.generate_batch_descriptions(product_names,
                                                         batch_size=2,
                                                         **sampling_options)

    print("\nBatch Generation Results:")
    for product, desc in zip(product_names, descriptions):
        print(f"\nProduct: {product}")
        print(f"Generated Description: {desc}")


if __name__ == "__main__":
    main()

  model.load_state_dict(torch.load(model_path, map_location=self.device))



Product: Wireless Bluetooth Headphones
Generated Description: unleash yoursh with the with this features for kids, - quality, this fun, and, and easy to your, and alea, these. a must - quality, and a fun!

Batch Generation Results:

Product: Smart Watch with Heart Rate Monitor
Generated Description: unleash yoursh your with this -'s the with the inner a orange! this adorable's a must - quality, this set! this and a, this a touch of all for kids and a must - a must - quality, or a must - the perfect for kids. perfect for kids and # kids, and a must - quality, this # kids # your # kids.

Product: Ultra HD 4K TV 55-inch
Generated Description: adorable yoursh's with the 3 - for kids! perfect for kids, and a must - quality, and a must - quality, or plush for kids, and. shop now!

Product: Professional Coffee Maker
Generated Description: unleash yoursh's with the features! this action with this set! this - quality, this fun, a must - a must - quality, and with this adorable and of all, thes