# In [None]:
import torch
import torch.nn as nn
from transformers import T5EncoderModel
import keyword
import tokenize
from io import StringIO
import random
import numpy as np

def mask_tokens(code_snippet, language='python', mask_rate=0.15):
    """Randomly replace maskable tokens of a code snippet with ``<MASK>``.

    For ``language='python'`` the snippet is split with the standard
    ``tokenize`` module, and every NAME token (keyword or identifier) that
    does not start with a double underscore is masked with probability
    ``mask_rate``.  For ``language='bash'`` the snippet is split on
    whitespace and only words from a small fixed keyword list are candidates.

    Args:
        code_snippet: Source text to mask.
        language: ``'python'`` or ``'bash'``.
        mask_rate: Probability of masking each candidate token.

    Returns:
        The tokens joined by single spaces, with masked tokens replaced by
        ``'<MASK>'``.  The original layout is not preserved.  Any other
        ``language`` value yields an empty string.

    Raises:
        tokenize.TokenError, IndentationError: If a Python snippet cannot be
            tokenized (e.g. unbalanced brackets or inconsistent indentation).
    """
    masked_code = []
    if language == 'python':
        # generate_tokens() works on text; tokenize.tokenize() requires a
        # readline that returns bytes and fails on a StringIO.
        for tok in tokenize.generate_tokens(StringIO(code_snippet).readline):
            # ENDMARKER, DEDENT and synthetic NEWLINE tokens carry no text and
            # would only add stray spaces to the joined output.
            if not tok.string:
                continue
            # Keywords never start with '__', so the dunder test alone decides
            # which NAME tokens are candidates.
            is_candidate = tok.type == tokenize.NAME and not tok.string.startswith('__')
            if is_candidate and random.random() < mask_rate:
                masked_code.append('<MASK>')
            else:
                masked_code.append(tok.string)
    elif language == 'bash':
        # Placeholder heuristic: whitespace split plus a fixed keyword list.
        bash_keywords = {'if', 'else', 'fi', 'do', 'done', 'for', 'in', 'while',
                         'case', 'esac', 'echo', 'printf', 'export'}
        for word in code_snippet.split():
            if word in bash_keywords and random.random() < mask_rate:
                masked_code.append('<MASK>')
            else:
                masked_code.append(word)
    return ' '.join(masked_code)


def add_gaussian_noise(embeddings, mean=0.0, std=0.1):
    """Return ``embeddings`` perturbed by element-wise Gaussian noise.

    A standard-normal sample with the same shape, dtype and device as
    ``embeddings`` is scaled by ``std``, shifted by ``mean`` and added to the
    input.  The input tensor itself is not modified.
    """
    standard_normal = torch.randn_like(embeddings)
    perturbation = standard_normal * std + mean
    return embeddings + perturbation
    
class Denoiser(nn.Module):
    """Transformer denoiser that predicts and removes noise from embeddings.

    The input passes through a stack of self-attention encoder layers,
    optionally attends to an encoded utterance through one cross-attention
    layer, and is then fed to a linear layer that predicts the noise.  The
    predicted noise is subtracted from the attended representation to give
    the denoised embeddings.

    Args:
        d_model: Feature size of the embeddings.
        nhead: Number of attention heads.
        num_layers: Number of self-attention encoder layers.
        dim_feedforward: Hidden size of each layer's feed-forward network.
        dropout: Dropout probability used inside the attention layers.
        batch_first: If True, tensors are laid out ``(batch, seq, d_model)``;
            otherwise (the default, PyTorch's convention) they are
            ``(seq, batch, d_model)``.
    """

    def __init__(self, d_model, nhead, num_layers, dim_feedforward=2048, dropout=0.1,
                 batch_first=False):
        super().__init__()
        self.self_attn_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead,
                                       dim_feedforward=dim_feedforward, dropout=dropout,
                                       batch_first=batch_first)
            for _ in range(num_layers)
        ])
        self.cross_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout,
                                                batch_first=batch_first)

        # Noise prediction layer
        self.noise_prediction_layer = nn.Linear(d_model, d_model)

    @staticmethod
    def _timestep_embedding(t, reference, max_period=10000.0):
        """Return a sinusoidal encoding of ``t`` sized to ``reference``'s last dim."""
        d_model = reference.size(-1)
        half = d_model // 2
        steps = torch.as_tensor(t, dtype=torch.float32, device=reference.device)
        exponents = torch.arange(half, dtype=torch.float32, device=reference.device) / max(half, 1)
        frequencies = torch.pow(max_period, -exponents)
        angles = steps.unsqueeze(-1) * frequencies
        encoding = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)
        if d_model % 2:
            # Odd feature sizes get one zero column so the widths match.
            padding = encoding.new_zeros(*encoding.shape[:-1], 1)
            encoding = torch.cat([encoding, padding], dim=-1)
        return encoding.to(reference.dtype)

    def forward(self, noisy_embeddings, encoded_utterance=None, t=None, src_key_padding_mask=None):
        """Denoise ``noisy_embeddings``.

        Args:
            noisy_embeddings: Noisy input embeddings whose last dimension is
                ``d_model``.
            encoded_utterance: Optional memory tensor used as key and value of
                the cross-attention layer.
            t: Optional diffusion timestep.  A scalar, or a tensor whose shape
                broadcasts against ``noisy_embeddings.shape[:-1]``.  When
                given, a parameter-free sinusoidal encoding of ``t`` is added
                to the input so the network can tell noise levels apart; when
                ``None`` no conditioning is applied.
            src_key_padding_mask: Optional padding mask forwarded to every
                self-attention layer.

        Returns:
            Tuple ``(denoised_embeddings, predicted_noise)``.
        """
        hidden = noisy_embeddings
        if t is not None:
            hidden = hidden + self._timestep_embedding(t, hidden)

        # Self-attention layers
        for layer in self.self_attn_layers:
            hidden = layer(hidden, src_key_padding_mask=src_key_padding_mask)

        # If encoded_utterance is provided, perform cross-attention
        if encoded_utterance is not None:
            attn_output, _ = self.cross_attn(hidden, encoded_utterance, encoded_utterance)
        else:
            attn_output = hidden

        # Predict noise, then remove it to obtain the denoised embeddings
        predicted_noise = self.noise_prediction_layer(attn_output)
        denoised_embeddings = attn_output - predicted_noise

        return denoised_embeddings, predicted_noise

class DecoderWithCrossAttention(nn.Module):
    """Transformer decoder applied to denoised embeddings.

    Args:
        d_model: Feature size of the embeddings.
        nhead: Number of attention heads.
        num_layers: Number of decoder layers.
        dim_feedforward: Hidden size of each layer's feed-forward network.
        dropout: Dropout probability inside the decoder layers.
        batch_first: If True, tensors are laid out ``(batch, seq, d_model)``;
            otherwise (the default, PyTorch's convention) they are
            ``(seq, batch, d_model)``.
    """

    def __init__(self, d_model, nhead, num_layers=1, dim_feedforward=2048, dropout=0.1,
                 batch_first=False):
        super().__init__()
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead,
                                                   dim_feedforward=dim_feedforward, dropout=dropout,
                                                   batch_first=batch_first)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

    def forward(self, denoised_embeddings, encoded_utterance=None):
        """Decode ``denoised_embeddings``, cross-attending to ``encoded_utterance``.

        Args:
            denoised_embeddings: Target sequence fed to the decoder.
            encoded_utterance: Optional memory tensor.  When omitted, an
                all-zero tensor shaped like ``denoised_embeddings`` is used
                as memory so the decoder can still be run unconditionally.

        Returns:
            Decoder output with the same shape as ``denoised_embeddings``.
        """
        memory = encoded_utterance
        if memory is None:
            # No conditioning available: use a dummy all-zero memory tensor.
            memory = torch.zeros_like(denoised_embeddings)
        return self.transformer_decoder(denoised_embeddings, memory=memory)

class CodeEmbeddingLayer(nn.Module):
    """Lookup table that maps integer token ids to dense vectors."""

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    def forward(self, code_tokens):
        """Return the embedding vector of every id in ``code_tokens``."""
        token_vectors = self.embedding(code_tokens)
        return token_vectors

class T5EncoderBlock(nn.Module):
    """Thin wrapper around a pretrained Hugging Face T5 encoder.

    Args:
        model_name: Checkpoint identifier passed to
            ``T5EncoderModel.from_pretrained``.  The default is a published
            checkpoint; published T5 sizes are small, base, large, 3b and
            11b, and there is no "medium" variant.
    """

    def __init__(self, model_name='google-t5/t5-small'):
        super().__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask=None):
        """Encode ``input_ids`` and return the encoder's last hidden state."""
        encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        return encoder_outputs.last_hidden_state

class ClassificationHead(nn.Module):
    """Linear projection from embedding space to vocabulary logits.

    No softmax is applied: the raw logits are meant for a loss such as
    ``nn.CrossEntropyLoss`` that normalizes internally.
    """

    def __init__(self, input_dim, vocab_size):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=vocab_size)

    def forward(self, input_embeddings):
        """Return one logit per vocabulary entry for each input vector."""
        return self.linear(input_embeddings)

class CODEFUSIONModel(nn.Module):
    """Utterance-conditioned model: T5 encoder -> denoiser -> decoder -> logits.

    Args:
        embedding_dim: Input width of the classification head.  It must equal
            ``d_model`` because the head consumes the decoder output.
        vocab_size: Number of output classes (code tokens).
        t5_model_name: Checkpoint name for the utterance encoder.  Its hidden
            size has to match ``d_model`` for cross-attention to work
            (presumably 512 for the default small checkpoint - confirm when
            switching checkpoints).
        d_model: Feature size used by the denoiser and decoder.
        nhead: Number of attention heads.
        num_layers: Number of layers in both the denoiser and the decoder.

    Raises:
        ValueError: If ``embedding_dim`` differs from ``d_model``.
    """

    def __init__(self, embedding_dim, vocab_size, t5_model_name='google-t5/t5-small', d_model=512, nhead=8, num_layers=6):
        super().__init__()
        # Fail early: the head is applied to decoder outputs of width d_model,
        # so any other embedding_dim would only surface as a shape error later.
        if embedding_dim != d_model:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must equal d_model ({d_model}): "
                "the classification head consumes the decoder output"
            )
        self.t5_encoder = T5EncoderBlock(t5_model_name)
        self.denoiser = Denoiser(d_model, nhead, num_layers)
        self.decoder = DecoderWithCrossAttention(d_model, nhead, num_layers)
        self.classification_head = ClassificationHead(embedding_dim, vocab_size)

    def forward(self, noisy_code_embeddings, input_ids, attention_mask=None):
        """Return vocabulary logits for every code-token position.

        Args:
            noisy_code_embeddings: Noisy code embeddings of width ``d_model``.
            input_ids: Token ids of the natural-language utterance.
            attention_mask: Optional attention mask for the utterance encoder.
        """
        # Encode natural language utterance
        # NOTE(review): Hugging Face encoders return (batch, seq, hidden),
        # while the denoiser/decoder are built here with PyTorch's default
        # sequence-first layout - confirm the layouts agree before training.
        encoded_utterance = self.t5_encoder(input_ids, attention_mask)

        # The denoiser returns (denoised_embeddings, predicted_noise); only the
        # denoised embeddings go on to the decoder.  A bare tensor is accepted
        # too, in case a denoiser with a single return value is in use.
        denoiser_output = self.denoiser(noisy_code_embeddings, encoded_utterance)
        if isinstance(denoiser_output, tuple):
            denoised_embeddings = denoiser_output[0]
        else:
            denoised_embeddings = denoiser_output

        # Process denoised embeddings through decoder with cross-attention to the encoded utterance
        decoded_embeddings = self.decoder(denoised_embeddings, encoded_utterance)

        # Generate logits for each code token position
        return self.classification_head(decoded_embeddings)



def compute_loss(predicted_noise, actual_noise, denoised_embeddings, original_embeddings, logits, target_tokens, pad_token_id=None):
    """Return the sum of noise, embedding-fidelity and token-prediction losses.

    Args:
        predicted_noise: Noise predicted by the denoiser.
        actual_noise: Noise that was actually added.
        denoised_embeddings: Denoiser output.
        original_embeddings: Clean embeddings the output is compared to.
        logits: Token logits; the last dimension is the vocabulary size.
        target_tokens: Integer target ids with one entry per logit row.
        pad_token_id: Target id ignored by the cross-entropy term.  When
            ``None`` it is read from the module-level ``t5_tokenizer``, which
            must then be defined before this function is called.

    Returns:
        Scalar tensor ``noise_loss + embedding_loss + ce_loss``.

    Raises:
        ValueError: If a target id is not smaller than the vocabulary size.
    """
    # Noise Prediction Loss (L2 norm over all elements)
    noise_loss = torch.norm(predicted_noise - actual_noise, p=2)
    print(f"Noise loss computed: {noise_loss.item()}")

    # Embedding Fidelity Loss (L2 norm over all elements)
    embedding_loss = torch.norm(denoised_embeddings - original_embeddings, p=2)
    print(f"Embedding loss computed: {embedding_loss.item()}")

    # Token Prediction Loss with Padding Ignored
    if pad_token_id is None:
        pad_token_id = t5_tokenizer.pad_token_id
    ce_loss_fn = nn.CrossEntropyLoss(ignore_index=pad_token_id)

    # Debug: Print shapes and pad token ID
    print(f"Logits shape: {logits.shape}, Target tokens shape: {target_tokens.shape}, Pad token ID: {pad_token_id}")

    # Flatten to (N, vocab) / (N,).  reshape() also handles non-contiguous
    # inputs (e.g. after a transpose), where view() would raise.
    logits_reshaped = logits.reshape(-1, logits.size(-1))
    target_tokens_reshaped = target_tokens.reshape(-1)

    # Debug: Print reshaped logits and target tokens shapes
    print(f"Reshaped logits shape: {logits_reshaped.shape}, Reshaped target tokens shape: {target_tokens_reshaped.shape}")

    # Debug: Check if any target token is out of bounds
    max_logit_index = logits.size(-1) - 1
    max_target_token = target_tokens_reshaped.max().item()
    print(f"Max logit index (vocab size - 1): {max_logit_index}, Max target token index: {max_target_token}")

    # Stop with a clear message instead of letting the loss fail later with
    # an opaque index error.
    if max_target_token > max_logit_index:
        raise ValueError(
            f"Target token index {max_target_token} exceeds the model's "
            f"vocabulary size ({max_logit_index + 1})."
        )

    # Compute the CrossEntropyLoss
    ce_loss = ce_loss_fn(logits_reshaped, target_tokens_reshaped)
    print(f"CrossEntropy loss computed: {ce_loss.item()}")

    # Combine losses
    total_loss = noise_loss + embedding_loss + ce_loss
    print(f"Total loss computed: {total_loss.item()}")

    return total_loss

def square_root_noise_schedule(t, total_steps=1200, max_noise=0.1):
    """
    Square-root noise schedule: the noise level at diffusion step ``t``.

    Parameters:
    - t: Current diffusion step (0 <= t < total_steps).
    - total_steps: Total number of diffusion steps.
    - max_noise: Noise level the schedule approaches at the final step.

    Returns:
    - max_noise * sqrt(t / total_steps)
    """
    # Fraction of the diffusion process completed so far, in [0, 1).
    progress = t / total_steps
    return max_noise * np.sqrt(progress)

# In [ ]:
import torch
import torch.optim as optim
from transformers import T5Tokenizer, T5EncoderModel

# Initialize the T5 tokenizer and encoder.
# NOTE(review): `t5_model` is not referenced by the training loop below.
t5_tokenizer = T5Tokenizer.from_pretrained('google-t5/t5-small')
t5_model = T5EncoderModel.from_pretrained('google-t5/t5-small')

# Initialize Denoiser and Decoder with specified parameters
hidden_dim = 512  # Hidden dimension size
denoiser = Denoiser(d_model=hidden_dim, nhead=8, num_layers=10)
decoder = DecoderWithCrossAttention(d_model=hidden_dim, nhead=8, num_layers=6)
code_embedding_layer = CodeEmbeddingLayer(vocab_size=t5_tokenizer.vocab_size, embedding_dim=hidden_dim)

# Only the denoiser and decoder are optimized; the embedding table keeps its
# initial weights.
# NOTE(review): confirm that freezing the code embeddings is intended.
optimizer = optim.AdamW(list(denoiser.parameters()) + list(decoder.parameters()), lr=5e-4, weight_decay=0)

# Define the path to your dataset
dataset_path = '/kaggle/input/pretraining-codefusion/PythonSnippets.txt'  # Update this path

# Training loop
num_diffusion_steps = 1200
num_epochs = 10  # Specify the number of epochs
for epoch in range(num_epochs):
    with open(dataset_path, 'r', encoding='utf-8') as file:
        for code_snippet in file:
            code_snippet = code_snippet.rstrip('\n')
            if not code_snippet.strip():
                # A blank line would be nothing but padding tokens.
                continue

            optimizer.zero_grad()
            tokens = t5_tokenizer.encode(code_snippet, return_tensors="pt", add_special_tokens=True, max_length=128, truncation=True, padding="max_length")

            # The embedding table is not optimized, so no graph is needed for
            # it.  The tokenizer yields (batch, seq) ids; the denoiser and
            # decoder are built with PyTorch's default sequence-first layout,
            # so the embeddings are moved to (seq, batch, dim).  Without the
            # transpose attention would run over length-1 sequences.
            with torch.no_grad():
                embeddings = code_embedding_layer(tokens.to(torch.int64)).transpose(0, 1).contiguous()

            total_loss = 0.0

            for t in range(num_diffusion_steps):
                noise_level = float(square_root_noise_schedule(t, total_steps=num_diffusion_steps))
                noise = torch.randn_like(embeddings) * noise_level
                noisy_embeddings = embeddings + noise

                # Pass the current timestep `t` to the Denoiser
                denoised_embeddings, predicted_noise = denoiser(noisy_embeddings, t=t)

                decoded_embeddings = decoder(denoised_embeddings)

                noise_loss = torch.norm(predicted_noise - noise, p=2)
                embedding_loss = torch.norm(decoded_embeddings - embeddings, p=2)

                # Back-propagate each step right away: gradients accumulate to
                # the same average-loss gradient, but only one step's autograd
                # graph is alive at a time instead of all of them.
                step_loss = (noise_loss + embedding_loss) / num_diffusion_steps
                step_loss.backward()
                total_loss += step_loss.item()

            optimizer.step()

            print(f"Epoch {epoch}, Avg Loss: {total_loss}")

# In [ ]:
class Denoiser(nn.Module):
    def __init__(self, d_model, nhead, num_layers, dim_feedforward=2048, dropout=0.1):
        super(Denoiser, self).__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, 
                                                   dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        # src shape: [seq_length, batch_size, d_model]
        # src_mask and src_key_padding_mask are optional and can be used to mask out certain parts of the input
        output = self.transformer_encoder(src, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        return output
    
class Decoder(nn.Module):
    """Transformer decoder that maps denoised embeddings back to code space."""

    def __init__(self, d_model, nhead, num_layers=1, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        # The template layer is kept as an attribute, so it is registered as
        # a sub-module alongside the decoder stack built from it.
        self.decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout
        )
        self.transformer_decoder = nn.TransformerDecoder(
            self.decoder_layer,
            num_layers=num_layers,
        )

    def forward(self, tgt, memory):
        """
        Decode ``tgt`` while attending to ``memory``.

        Args:
            tgt: The sequence to the decoder (denoised embeddings).
            memory: The sequence the decoder cross-attends to (for
                pre-training this could be Gaussian noise or another form of
                representation).

        Both tensors are expected as [seq_length, batch_size, d_model].
        """
        decoded = self.transformer_decoder(tgt, memory)
        return decoded
    
# Draft multi-task pre-training loop: each iteration randomly picks either
# unsupervised code generation (denoise Gaussian-noised embeddings) or CPD
# (mask tokens first, then denoise).
# NOTE(review): this cell relies on `num_epochs`, `code_embedding_layer`,
# `denoiser`, `decoder` and `optimizer` already existing, and on
# `code_corpus`, which is not defined in the visible part of this file.
tasks = ['unsupervised_code_generation', 'cpd']

for epoch in range(num_epochs):
    for code_snippet in code_corpus:
        # Randomly select a task for this iteration
        task = random.choice(tasks)

        # Convert code_snippet to embeddings
        # Note: Assuming code_embedding_layer can handle raw code snippets directly
        # NOTE(review): the CodeEmbeddingLayer in this file wraps nn.Embedding,
        # which looks up integer id tensors - the snippet presumably has to be
        # tokenized to ids first; confirm.
        code_embeddings = code_embedding_layer(code_snippet)

        if task == 'unsupervised_code_generation':
            # For unsupervised code generation, start with Gaussian noise
            # Assuming the shape of code_embeddings is suitable for adding Gaussian noise directly
            noisy_embeddings = add_gaussian_noise(code_embeddings)
        elif task == 'cpd':
            # For the CPD task, mask tokens in the code snippet and then convert to embeddings
            masked_code_snippet = mask_tokens(code_snippet, language='python')  # Specify the language as needed
            # Convert masked code snippet to embeddings and add Gaussian noise
            noisy_embeddings = add_gaussian_noise(code_embedding_layer(masked_code_snippet))

        # Denoising step
        # NOTE(review): if `denoiser` is an instance of the first Denoiser
        # class in this file, this call returns a (denoised, predicted_noise)
        # tuple rather than a single tensor - confirm which class is in use.
        denoised_embeddings = denoiser(noisy_embeddings)

        # Decoding step
        decoded_embeddings = decoder(denoised_embeddings)

        # Calculate loss
        # You'll need to define compute_loss based on the task and expected output
        # For CPD, the loss might involve comparing decoded embeddings to original (unmasked) embeddings
        # For unsupervised code generation, the loss might involve the fidelity of generated code to valid code structures
        #TODO: Make the compute loss as per the paper
        # NOTE(review): the compute_loss defined earlier in this file requires
        # six positional arguments and accepts no `task` keyword, so this call
        # does not match it yet.
        loss = compute_loss(decoded_embeddings, code_embeddings, task=task)

        # Backpropagate and update model parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
