In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/

Mounted at /content/drive
/content/drive/MyDrive


In [None]:
!pip install transformers --upgrade
!pip install torch --upgrade
!pip install torchvision --upgrade

Collecting transformers
  Downloading transformers-4.50.1-py3-none-any.whl.metadata (39 kB)
Downloading transformers-4.50.1-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.50.0
    Uninstalling transformers-4.50.0:
      Successfully uninstalled transformers-4.50.0
Successfully installed transformers-4.50.1
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadat

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
from typing import Dict

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class InputEmbeddings(nn.Module):
    """Token-id lookup table whose output is scaled by sqrt(d_model).

    Args:
        d_model: size of each embedding vector.
        vocab_size: number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embeddings of the token ids in `x`, multiplied by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embeddings(x)
        return looked_up * scale


class PositionEncoding(nn.Module):
    def __init__(self, d_model: int, seq_len: int, dropout: float | None):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = None
        if dropout is not None:
            self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len, d_model)  # positional encoding blueprint matrix
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)  # numerator in the formula, shape=(seq_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()*(-math.log(10000.0)/d_model)) # for numerical stability using log space
        pe[:, 0::2] = torch.sin(position*div_term)  # at 0, 2, ..., 2k, ...
        pe[:, 1::2] = torch.cos(position*div_term)  # at 1, 3, ..., 2k+1, ...
        self.pe = pe.unsqueeze(0).to(device)  # shape=(1, seq_len, d_model)

        #self.register_buffer('pe', pe.unsqueeze(0))  # save the positional encoding in the module along with the saved model (NOT as a learned param)

    def forward(self, x):
        if self.dropout is not None:
            return self.dropout(x+(self.pe[:, :x.shape[1], :].to(x.device)).requires_grad_(False))  # setting requries_grad_ False ensures pe is not learned
        return x+(self.pe[:, :x.shape[1], :].to(x.device)).requires_grad_(False)

# attention is all you need!!
class MultiHeadAttention(nn.Module):
    """Multi-head attention built from four d_model x d_model linear maps.

    Args:
        d_model: model width; must be divisible by `h`.
        h: number of attention heads.
        dropout: dropout probability handed to the attention core, or None.
    """

    def __init__(self, d_model: int, h: int, dropout: float | None):
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model%h == 0, "d_model has to be divisible by h"

        self.d_k = d_model//h  # width of a single head
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout) if dropout is not None else None

        self.attention = ScaledDotProductAttention(self.d_k, self.dropout)

    def _split_heads(self, projected):
        """(batch, seq_len, d_model) -> (batch, h, seq_len, d_k)."""
        batch, length = projected.shape[0], projected.shape[1]
        return projected.view(batch, length, self.h, self.d_k).transpose(1, 2)

    def forward(self, Q, K, V, mask=None):
        """Project Q/K/V, attend per head, merge the heads and apply the output projection."""
        q = self._split_heads(self.w_q(Q))
        k = self._split_heads(self.w_k(K))
        v = self._split_heads(self.w_v(V))

        per_head, _ = self.attention(q, k, v, mask)
        batch = per_head.shape[0]
        # (batch, h, seq_len, d_k) -> (batch, seq_len, h, d_k) -> (batch, seq_len, d_model);
        # .contiguous() is needed before .view() because transpose returns a strided view.
        merged = per_head.transpose(1, 2).contiguous().view(batch, -1, self.h*self.d_k)
        return self.w_o(merged)  # -> (batch, seq_len, d_model)


class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k, dropout: nn.Dropout | None):
        super().__init__()
        self.d_k = d_k
        self.dropout = dropout
    def forward(self, Q, K, V, mask=None):
        scores = torch.matmul(Q, K.transpose(-2, -1))/math.sqrt(self.d_k)  # (batch_size, h, seq_len, d_k)
        if mask is not None:
            scores = scores.masked_fill(mask==0, -1e-9)
        if self.dropout is not None:
            scores = self.dropout(scores)
        attn = F.softmax(scores, dim=-1)  # -> (batch_size, h, seq_len, seq_len)
        return torch.matmul(attn, V), attn  # -> (batch_size, h, seq_len, d_k)


class LayerNormalization(nn.Module):
    """Normalises the last dimension, then applies one scalar gain and one scalar bias.

    Args:
        epsilon: constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, epsilon=1e-6):
        super().__init__()
        self.epsilon = epsilon
        self.gamma = nn.Parameter(torch.ones(1))  # multiplier
        self.beta = nn.Parameter(torch.zeros(1))  # bias

    def forward(self, x):
        """Return gamma * (x - mean) / (std + epsilon) + beta over the last dimension."""
        as_float = x.float()
        mu = as_float.mean(dim=-1, keepdim=True)
        sigma = as_float.std(dim=-1, keepdim=True)
        centered = x - mu
        return self.gamma * centered / (sigma + self.epsilon) + self.beta


class FeedForward(nn.Module):
    # formula: max(0, xW1+b1)W2+b2, where W1,2 are linear layers, b1,b2 are biases, and max(0, z) is done by a relu layer
    def __init__(self, d_model: int, d_ff: int, dropout: float | None):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff, bias=True)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_ff, d_model, bias=True)
        self.dropout = None
        if dropout is not None:
            self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        if self.dropout is not None:
            return self.linear2(self.dropout(self.relu(self.linear1(x))))
        return self.linear2(self.relu(self.linear1(x)))


# skip connection norm->norm aside from norm->feedforward
class ResidualConnection(nn.Module):
    def __init__(self, dropout: float | None):
        super().__init__()
        self.dropout = None
        if dropout is not None:
            self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):  # sublayer is prev layer
        if self.dropout is not None:
            return self.dropout(sublayer(self.norm(x)))
        return sublayer(self.norm(x))


# for the customizability (of dropout), pass each block to stack up instead of parameters to create blocks from scratch
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention followed by feed-forward, each wrapped in a ResidualConnection.

    Args:
        self_attention: attention module called as (x, x, x, source_mask).
        feed_forward: position-wise feed-forward module.
        dropout: dropout probability for both ResidualConnection wrappers, or None.
    """

    def __init__(self, self_attention: MultiHeadAttention, feed_forward: FeedForward, dropout: float | None):
        super().__init__()
        self.self_attention = self_attention
        self.feed_forward = feed_forward
        wrappers = [ResidualConnection(dropout) for _ in range(2)]
        self.residual_connections = nn.ModuleList(wrappers)

    def forward(self, x, source_mask):
        """Run the attention sub-layer, then the feed-forward sub-layer."""
        attention_wrapper = self.residual_connections[0]
        feed_forward_wrapper = self.residual_connections[1]

        def attend(normed):
            # Queries, keys and values all come from the same (normalised) input.
            return self.self_attention(normed, normed, normed, source_mask)

        hidden = attention_wrapper(x, attend)
        return feed_forward_wrapper(hidden, self.feed_forward)


class Encoder(nn.Module):
    """Runs a stack of encoder layers in order and normalises the final output.

    Args:
        layers: modules, each called as layer(x, mask).
    """

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        """Feed `x` through every layer with the same `mask`, then apply the final norm."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        normalised = self.norm(hidden)
        return normalised


class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention over the encoder output, feed-forward.

    Each of the three sub-layers is wrapped in its own ResidualConnection.

    Args:
        self_attention: attention module called as (x, x, x, target_mask).
        cross_atention: attention module called as (x, encoder_output, encoder_output, source_mask).
            (The parameter name keeps its original spelling for keyword-argument callers.)
        feed_forward: position-wise feed-forward module.
        dropout: dropout probability for the three ResidualConnection wrappers, or None.
    """

    def __init__(self, self_attention: MultiHeadAttention, cross_atention: MultiHeadAttention, feed_forward: FeedForward, dropout: float | None):
        super().__init__()
        self.self_attention = self_attention
        self.cross_attention = cross_atention
        self.feed_forward = feed_forward
        wrappers = [ResidualConnection(dropout) for _ in range(3)]
        self.residual_connections = nn.ModuleList(wrappers)

    def forward(self, x, encoder_output, source_mask, target_mask):
        """Apply the three sub-layers in order and return the result."""
        def attend_to_self(normed):
            return self.self_attention(normed, normed, normed, target_mask)

        def attend_to_encoder(normed):
            # Queries come from the decoder; keys and values come from the encoder output.
            return self.cross_attention(normed, encoder_output, encoder_output, source_mask)

        hidden = self.residual_connections[0](x, attend_to_self)
        hidden = self.residual_connections[1](hidden, attend_to_encoder)
        return self.residual_connections[2](hidden, self.feed_forward)


class Decoder(nn.Module):
    """Runs a stack of decoder layers in order and normalises the final output.

    Args:
        layers: modules, each called as layer(x, encoder_output, source_mask, target_mask).
    """

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, source_mask, target_mask):
        """Feed `x` through every layer with the same encoder output and masks, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, source_mask, target_mask)
        normalised = self.norm(hidden)
        return normalised


class ProjectionLayer(nn.Module):
    """Linear map from d_model to vocab_size followed by log-softmax.

    Args:
        d_model: width of the incoming features.
        vocab_size: number of output classes.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return log-probabilities over the last dimension: (..., d_model) -> (..., vocab_size)."""
        logits = self.proj(x)
        return logits.log_softmax(dim=-1)


# only encoder - useful for classification task
class TransformerEncoder(nn.Module):
    """Encoder-only model: embed -> add position encodings -> encoder stack -> projection.

    Args:
        encoder: the encoder stack, called as encoder(x, mask).
        embed: token embedding module.
        pe: position-encoding module.
        proj: output head applied to the encoder output.
    """

    def __init__(self, encoder: Encoder, embed: InputEmbeddings, pe: PositionEncoding, proj):
        super().__init__()
        self.encoder = encoder
        self.embed = embed
        self.pe = pe
        self.proj = proj

    def forward(self, x, mask):
        """Return proj(encoder(pe(embed(x)), mask))."""
        positioned = self.pe(self.embed(x))
        encoded = self.encoder(positioned, mask)
        return self.proj(encoded)


# full transformer
class Transformer(nn.Module):
    """Encoder-decoder model assembled from pre-built components.

    Args:
        encoder / decoder: the two layer stacks.
        source_embed / target_embed: token embedding modules for each side.
        source_pe / target_pe: position-encoding modules for each side.
        proj: output head applied to the decoder output.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, source_embed: InputEmbeddings, target_embed: InputEmbeddings, source_pe: PositionEncoding, target_pe: PositionEncoding, proj: ProjectionLayer):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.source_embed = source_embed
        self.target_embed = target_embed
        self.source_pe = source_pe
        self.target_pe = target_pe
        self.proj = proj

    def encode(self, x, source_mask):
        """Embed the source ids, add position encodings and run the encoder stack."""
        positioned = self.source_pe(self.source_embed(x))
        return self.encoder(positioned, source_mask)

    def decode(self, x, encoder_output, source_mask, target_mask):
        """Embed the target ids, add position encodings and run the decoder stack."""
        positioned = self.target_pe(self.target_embed(x))
        return self.decoder(positioned, encoder_output, source_mask, target_mask)

    def forward(self, source_input, target_input, source_mask, target_mask):
        """Encode the source, decode the target against it, and project the decoder output."""
        memory = self.encode(source_input, source_mask)
        decoded = self.decode(target_input, memory, source_mask, target_mask)
        return self.proj(decoded)


# building functions
# for encoder-only transformer
def build_encoder_transformer(vocab_size: int, seq_len: int, dropouts: Dict[str, float | None] | None, d_model: int=512, d_ff: int=2048,  N: int=6, h: int=8):
    """Build an encoder-only transformer with a linear output head.

    Args:
        vocab_size: size of the token vocabulary (embedding rows and output classes).
        seq_len: maximum sequence length for the position encodings.
        dropouts: optional mapping of component name to dropout probability. Recognised
            keys: 'encoder_pe', 'encoder_self_attetion' (historical spelling; the correctly
            spelled 'encoder_self_attention' is accepted as a fallback),
            'encoder_feed_forward', 'encoder_block'. Missing keys mean no dropout.
            The mapping is never modified.
        d_model: model width.
        d_ff: feed-forward hidden width.
        N: number of encoder layers.
        h: number of attention heads.

    Returns:
        A TransformerEncoder whose parameters with more than one dimension are
        Xavier-uniform initialised.
    """
    # Copy so the caller's dict is not mutated with default entries.
    rates = dict(dropouts) if dropouts is not None else {}
    if 'encoder_self_attetion' not in rates and 'encoder_self_attention' in rates:
        rates['encoder_self_attetion'] = rates['encoder_self_attention']

    embed = InputEmbeddings(d_model, vocab_size)
    pos = PositionEncoding(d_model, seq_len, rates.get('encoder_pe'))
    layers = []
    for _ in range(N):
        self_attention = MultiHeadAttention(d_model, h, rates.get('encoder_self_attetion'))
        feed_forward = FeedForward(d_model, d_ff, rates.get('encoder_feed_forward'))
        layers.append(EncoderBlock(self_attention, feed_forward, rates.get('encoder_block')))
    encoder = Encoder(nn.ModuleList(layers))
    proj = nn.Linear(d_model, vocab_size)

    transformer_encoder = TransformerEncoder(encoder, embed, pos, proj)

    # Xavier uniform distribution for parameter initialization (weight matrices only;
    # 1-D parameters such as biases keep their default initialisation).
    for param in transformer_encoder.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)

    return transformer_encoder


# for full transformer
def build_transformer(source_vocab_size: int, target_vocab_size: int, source_seq_len: int, target_seq_len: int, dropouts: Dict[str, float | None] | None, d_model: int=512, d_ff: int=2048,  N: int=6, h: int=8):
    """Build a full encoder-decoder transformer.

    Args:
        source_vocab_size / target_vocab_size: vocabulary sizes for each side.
        source_seq_len / target_seq_len: maximum sequence lengths for the position encodings.
        dropouts: optional mapping of component name to dropout probability. Recognised
            keys: 'encoder_pe', 'encoder_self_attetion' (historical spelling; the correctly
            spelled 'encoder_self_attention' is accepted as a fallback),
            'encoder_feed_forward', 'encoder_block', 'decoder_pe', 'decoder_self_attention',
            'decoder_cross_attention', 'decoder_feed_forward', 'decoder_block'.
            Missing keys mean no dropout. The mapping is never modified.
        d_model: model width.
        d_ff: feed-forward hidden width.
        N: number of layers in each stack.
        h: number of attention heads.

    Returns:
        A Transformer whose parameters with more than one dimension are
        Xavier-uniform initialised.
    """
    # Copy so the caller's dict is not mutated with default entries.
    rates = dict(dropouts) if dropouts is not None else {}
    if 'encoder_self_attetion' not in rates and 'encoder_self_attention' in rates:
        rates['encoder_self_attetion'] = rates['encoder_self_attention']

    # encoder
    source_embed = InputEmbeddings(d_model, source_vocab_size)
    source_pe = PositionEncoding(d_model, source_seq_len, rates.get('encoder_pe'))
    encoder_layers = []
    for _ in range(N):
        self_attention = MultiHeadAttention(d_model, h, rates.get('encoder_self_attetion'))
        feed_forward = FeedForward(d_model, d_ff, rates.get('encoder_feed_forward'))
        encoder_layers.append(EncoderBlock(self_attention, feed_forward, rates.get('encoder_block')))
    encoder = Encoder(nn.ModuleList(encoder_layers))

    # decoder
    target_embed = InputEmbeddings(d_model, target_vocab_size)
    target_pe = PositionEncoding(d_model, target_seq_len, rates.get('decoder_pe'))
    decoder_layers = []
    for _ in range(N):
        self_attention = MultiHeadAttention(d_model, h, rates.get('decoder_self_attention'))
        cross_attention = MultiHeadAttention(d_model, h, rates.get('decoder_cross_attention'))
        feed_forward = FeedForward(d_model, d_ff, rates.get('decoder_feed_forward'))
        decoder_layers.append(DecoderBlock(self_attention, cross_attention, feed_forward, rates.get('decoder_block')))
    decoder = Decoder(nn.ModuleList(decoder_layers))

    proj = ProjectionLayer(d_model, target_vocab_size)

    transformer = Transformer(encoder, decoder, source_embed, target_embed, source_pe, target_pe, proj)

    # Xavier uniform distribution for parameter initialization (weight matrices only;
    # 1-D parameters such as biases keep their default initialisation).
    for param in transformer.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)

    return transformer


In [31]:
# Device passed as map_location by the checkpoint loaders defined in this cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def load_encoder_model():
    """Rebuild the encoder-only transformer and load its weights from "trained_encoder_model.pth".

    The architecture uses a 30522-token vocabulary, 512 positions and dropout 0.1 on every
    encoder component. Tensors are mapped onto the module-level `device` while loading.

    Returns:
        The model in eval mode.
    """
    vocab_size, seq_len = 30522, 512
    dropout_keys = ['encoder_pe', 'encoder_self_attetion', 'encoder_feed_forward', 'encoder_block']
    model = build_encoder_transformer(vocab_size, seq_len, dict.fromkeys(dropout_keys, 0.1))

    state = torch.load("trained_encoder_model.pth", map_location=device)
    model.load_state_dict(state)
    model.eval()

    return model


def load_custom_codegen_model(tokenizer, seq_len=512):
    """Rebuild the encoder-decoder transformer and load its weights from "custom_codegen_model.pth".

    Args:
        tokenizer: object exposing `vocab_size`; used for both source and target vocabularies.
        seq_len: maximum sequence length for both source and target position encodings.

    Returns:
        The model in eval mode, with tensors mapped onto the module-level `device` while loading.
    """
    vocab = tokenizer.vocab_size
    dropout_keys = ['encoder_pe', 'encoder_self_attetion', 'encoder_feed_forward', 'encoder_block', 'decoder_pe', 'decoder_self_attention', 'decoder_cross_attention', 'decoder_feed_forward', 'decoder_block']
    # Same dropout probability for every component.
    dropouts = dict.fromkeys(dropout_keys, 0.1)
    model = build_transformer(vocab, vocab, seq_len, seq_len, dropouts)

    state = torch.load("custom_codegen_model.pth", map_location=device)
    model.load_state_dict(state)
    model.eval()

    return model


def load_finetuned_codegen_model():
    """Load the tokenizer and causal LM stored under "./codegen-finetuned".

    The tokenizer's pad token is set to its EOS token.

    Returns:
        (tokenizer, model), with the model in eval mode.
    """
    checkpoint_dir = "./codegen-finetuned"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)
    model.eval()

    return tokenizer, model

In [40]:
# custom model (transformer)
# The custom transformer is paired with the BERT uncased tokenizer; the loader sizes
# both of its vocabularies from this tokenizer's vocab_size.
custom_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
custom_model = load_custom_codegen_model(custom_tokenizer)
print("Custom Model has been loaded.")

# pretrained model
pretrained_tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
pretrained_model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-350M-mono")
# Reuse EOS as the padding token.
pretrained_tokenizer.pad_token = pretrained_tokenizer.eos_token
print("Pretrained model has been loaded.")

# fine-tuned pretrained model
finetuned_tokenizer, finetuned_model = load_finetuned_codegen_model()
print("Finetuned model has been loaded.")

# Move all three models to the GPU when one is available, otherwise keep them on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

custom_model.to(device)
pretrained_model.to(device)
# Last expression of the cell, so the notebook displays the returned module.
finetuned_model.to(device)

Custom Model has been loaded.


Some weights of the model checkpoint at Salesforce/codegen-350M-mono were not used when initializing CodeGenForCausalLM: ['transformer.h.0.attn.causal_mask', 'transformer.h.1.attn.causal_mask', 'transformer.h.10.attn.causal_mask', 'transformer.h.11.attn.causal_mask', 'transformer.h.12.attn.causal_mask', 'transformer.h.13.attn.causal_mask', 'transformer.h.14.attn.causal_mask', 'transformer.h.15.attn.causal_mask', 'transformer.h.16.attn.causal_mask', 'transformer.h.17.attn.causal_mask', 'transformer.h.18.attn.causal_mask', 'transformer.h.19.attn.causal_mask', 'transformer.h.2.attn.causal_mask', 'transformer.h.3.attn.causal_mask', 'transformer.h.4.attn.causal_mask', 'transformer.h.5.attn.causal_mask', 'transformer.h.6.attn.causal_mask', 'transformer.h.7.attn.causal_mask', 'transformer.h.8.attn.causal_mask', 'transformer.h.9.attn.causal_mask']
- This IS expected if you are initializing CodeGenForCausalLM from the checkpoint of a model trained on another task or with another architecture (e

Pretrained model has been loaded.
Finetuned model has been loaded.


CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-19): 20 x CodeGenBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear(in_features=1024, out_features=4096, bias=True)
          (fc_out): Linear(in_features=4096, out_features=1024, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=51200, bias=True)
)

In [61]:
def gencode_custom_model(prompt, model, tokenizer, max_len=512, temperature=0.85):
    """Sample text from the custom encoder-decoder model, one token at a time.

    Decoding starts from the tokenizer's CLS token and stops when the SEP token is
    sampled or after `max_len` steps. The whole procedure runs under torch.no_grad(),
    so no autograd graph accumulates across decoding steps.

    Args:
        prompt: source text (or list of texts accepted by the tokenizer).
        model: object exposing encode(ids, mask), decode(ids, memory, src_mask, tgt_mask)
            and proj(decoder_output); must have at least one parameter.
        tokenizer: callable tokenizer exposing pad_token_id, cls_token_id, sep_token_id
            and decode().
        max_len: maximum number of decoding steps.
        temperature: softmax temperature; 0.85 was chosen by hand as giving the most
            "readable" output.

    Returns:
        Whatever tokenizer.decode returns for the sampled token ids (SEP excluded).

    Raises:
        ValueError: if `temperature` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()
    device = next(model.parameters()).device

    source_ids = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).input_ids.to(device)
    # (batch, 1, 1, src_len): True where the source token is not padding.
    source_mask = (source_ids != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)

    output_tokens = []
    with torch.no_grad():
        encoder_output = model.encode(source_ids, source_mask)
        decoder_input = torch.tensor([[tokenizer.cls_token_id]], device=device)

        for _ in range(max_len):
            seq_len = decoder_input.size(1)
            padding_mask = (decoder_input != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)
            # Lower-triangular mask: position i may only attend to positions <= i.
            no_lookahead_mask = torch.tril(torch.ones((seq_len, seq_len), device=device)).unsqueeze(0).unsqueeze(0)
            target_mask = padding_mask.to(torch.bool) & no_lookahead_mask.to(torch.bool)

            decoder_output = model.decode(decoder_input, encoder_output, source_mask, target_mask)
            logits = model.proj(decoder_output)  # (batch_size, seq_len, vocab_size)
            next_token_logits = logits[:, -1, :]
            probs = torch.softmax(next_token_logits / temperature, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            token_id = next_token.item()
            if token_id == tokenizer.sep_token_id:
                break

            output_tokens.append(token_id)
            decoder_input = torch.cat([decoder_input, next_token], dim=-1)

    return tokenizer.decode(output_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)


def gencode_pretrained_model(prompt, model, tokenizer, max_len=1024):
    """Generate a continuation of `prompt` with a Hugging Face causal language model.

    Sampling uses nucleus sampling (top_p=0.95) at temperature 0.8.

    Args:
        prompt: the input text.
        model: causal LM exposing `.device`, `.eval()` and `.generate(...)`.
        tokenizer: matching tokenizer exposing `eos_token_id` and `decode()`.
        max_len: maximum number of NEW tokens to generate.

    Returns:
        The decoded text of the newly generated tokens only (prompt excluded).
    """
    model.eval()
    encoded = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)
    input_ids = encoded.input_ids.to(model.device)
    attention_mask = encoded.attention_mask.to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_len,
            do_sample=True,
            top_p=0.95,
            temperature=0.8,
            pad_token_id=tokenizer.eos_token_id
        )

    # Drop the prompt by token count rather than by character count: the decoded prompt
    # is not guaranteed to have the same length as the raw prompt string (truncation,
    # tokenizer normalisation), which would cut into or leak prompt text.
    prompt_token_count = input_ids.shape[1]
    generated_ids = output_ids[0][prompt_token_count:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

In [42]:
# Twenty machine-learning-themed prompts, each prefixed with "Generate Python code: ",
# that are fed to every model for the side-by-side comparison.
ml_prompt_list = [
    "Generate Python code: Implement a PyTorch neural network for binary classification.",
    "Generate Python code: Write a function to compute cross-entropy loss using logits.",
    "Generate Python code: Define a custom Dataset class for image classification using torchvision.",
    "Generate Python code: Implement a training loop using PyTorch Lightning for MNIST.",
    "Generate Python code: Write code to apply data augmentation using torchvision transforms.",
    "Generate Python code: Create a function to compute F1 score given predictions and labels.",
    "Generate Python code: Implement a Transformer encoder block using PyTorch.",
    "Generate Python code: Define a function to tokenize text using Hugging Face tokenizer.",
    "Generate Python code: Write a function to compute BLEU score for machine translation.",
    "Generate Python code: Implement early stopping during training based on validation loss.",
    "Generate Python code: Build a function to load the IMDB dataset using Hugging Face Datasets.",
    "Generate Python code: Define a CNN for CIFAR-10 classification in PyTorch.",
    "Generate Python code: Write code to train a BERT model for text classification.",
    "Generate Python code: Implement a function to compute accuracy in multi-class classification.",
    "Generate Python code: Write code to freeze layers of a pretrained model before fine-tuning.",
    "Generate Python code: Implement a learning rate scheduler that warms up then decays.",
    "Generate Python code: Define a function to visualize a confusion matrix with matplotlib.",
    "Generate Python code: Create a generator that yields batches of tokenized data.",
    "Generate Python code: Write code to save and load model checkpoints in PyTorch.",
    "Generate Python code: Implement a function to evaluate a model on a validation set.",
]

In [65]:
# One output list per model; index i in each list corresponds to ml_prompt_list[i].
custom_model_outputs = []
pretrained_model_outputs = []
finetuned_model_outputs = []

for i, prompt in enumerate(ml_prompt_list):
    custom_model_outputs.append(gencode_custom_model(prompt, custom_model, custom_tokenizer))
    pretrained_model_outputs.append(gencode_pretrained_model(prompt, pretrained_model, pretrained_tokenizer))
    # The fine-tuned model goes through the same generation helper as the pretrained one.
    finetuned_model_outputs.append(gencode_pretrained_model(prompt, finetuned_model, finetuned_tokenizer))
    print(f"Prompt {i+1} done.")
print("Done.")

Prompt 1 done.
Prompt 2 done.
Prompt 3 done.
Prompt 4 done.
Prompt 5 done.
Prompt 6 done.
Prompt 7 done.
Prompt 8 done.
Prompt 9 done.
Prompt 10 done.
Prompt 11 done.
Prompt 12 done.
Prompt 13 done.
Prompt 14 done.
Prompt 15 done.
Prompt 16 done.
Prompt 17 done.
Prompt 18 done.
Prompt 19 done.
Prompt 20 done.
Done.


In [79]:
!pip install black



In [104]:
import re

def clean(raw_code: str):
    """Tidy generated text for display.

    Applies three regex substitutions in order, then strips both ends:
      1. a literal backslash followed by 'n' becomes a real newline,
      2. a literal backslash followed by 't' becomes a real tab,
      3. any run of whitespace that ends in a newline becomes a single newline
         (this removes trailing spaces and also collapses blank lines).
    """
    substitutions = (
        (r'\\n', '\n'),
        (r'\\t', '\t'),
        (r'\s+\n', '\n'),
    )
    text = raw_code
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# One row per prompt and one column per model; every output is passed through clean().
# The column names record the sampling temperature each model was run with.
df = pd.DataFrame({
    "Prompt": ml_prompt_list,
    "Custom (temperature=0.85)": [clean(custom_model_output) for custom_model_output in custom_model_outputs],
    "Pretrained (temperature=0.8)": [clean(pretrained_model_output) for pretrained_model_output in pretrained_model_outputs],
    "Finetuned (temperature=0.8)": [clean(finetuned_model_output) for finetuned_model_output in finetuned_model_outputs]
    })

# Print each prompt followed by the three models' outputs under '+' banners.
for i, row in df.iterrows():
    print(f"Prompt {i+1}: {row['Prompt']}")
    print('-'*100)
    print(f"{'+'*10} Custom (temperature=0.85) {'+'*10}\n{row['Custom (temperature=0.85)']}\n")
    print(f"{'+'*10} Pretrained (temperature=0.8) {'+'*10}\n{row['Pretrained (temperature=0.8)']}\n")
    print(f"{'+'*10} Finetuned (temperature=0.8) {'+'*10}\n{row['Finetuned (temperature=0.8)']}\n")
    print()

Prompt 1: Generate Python code: Implement a PyTorch neural network for binary classification.
----------------------------------------------------------------------------------------------------
++++++++++ Custom (temperature=0.85) ++++++++++
select ) _ = return. " values. range ) _ get " default ashs = ]fi is ( "da id 'con ] _ panel not days - po (s ), in ( : data _ highlight " float angle.'_,, result ( false,to if. _, self "r. = _ " ] ) self ) ] : *t - supports _, : false " ) 1 z " size ( *'is'value '. '.gs c,, in _. = ]'[ ] - properties _. _ ', _ ) int self ( _. =, _dt none + ] ) [ _ " "'_ ( all _ (. = raw. )'=. : ( -bar vt _ )s,. _ *. ( ( # ='( labels. ),'(ue ". ) :0 ( ( _ def = ) _rf /. symbol _ # nu from len') if ( ) gather = ). generate number _'in _ ] ( method _ = 1 self _m ( classes (ara ) ) max if.. ) module, ( scaled "t ) max':'self : ) n _ara') ). _ ifum if ) ) v _ [ { )y res'_ ( ". ) header ) " it.. ). )ids _ transform points weights, next. ) return : _ :e. dry _ =, ='' ( 

### Summary
- The custom transformer model, unsurprisingly, generates essentially random output because of the lack of training data (it is impractical to train a generative model built from scratch locally). However, it does at least produce some Python-like tokens such as 'self' and 'return' at an appropriate temperature (in this case, 0.85).
- The pretrained model without fine-tuning does a decent job. The resulting code is readable and accurate. However, it sometimes generates overly long code or, even worse, gets stuck repeating commented/uncommented sentences. This model also takes longer to produce an output.
- The fine-tuned pretrained model is overall the best of the three models. It generates mostly function-style Python code with decent accuracy. However, since the dataset used for fine-tuning did not include source information (copyright notices) in its outputs, most generated outputs may omit this important part. In terms of task completion, it does very good work.
- Overall, the fine-tuned model is selected, since this project requires more accurate, concise, and faster code generation.