In [1]:
import json
import tqdm
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# pretrained
pretrained_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
pretrained_tokenizer.add_special_tokens({"pad_token": "<PAD>", 
                                "bos_token": "<SOS>",
                                "eos_token": "<EOS>"})
pretrained_model = GPT2LMHeadModel.from_pretrained("gpt2")
pretrained_model = pretrained_model.to(device)

# Fine Tuned
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<PAD>", 
                                "bos_token": "<SOS>",
                                "eos_token": "<EOS>"})
tokenizer.add_tokens(["<INPUT>:", "<OUTPUT>:"])
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

model.load_state_dict(torch.load("model_150_50.pt"))

model = model.to(device)

In [5]:
from torch.utils.data import Dataset
import json

class CodeData(Dataset):
    def __init__(self, path:str, tokenizer):
        self.data = json.load(open(path, "r"))

        self.X = []
        for i in self.data:
          sample = i
          input = '<INPUT>: ' + sample['input'] if sample['input'] else ''
          sample = "<SOS> " + sample['instruction']  + input + ' <OUTPUT>: ' + sample['output'] + " <EOS>" 
          if len(sample) < 150:
            self.X.append(sample)
        
        # self.X = self.X[:2000]
        print(len(self.X))

        self.X_encoded = tokenizer(self.X,max_length=150, padding='max_length', truncation=True, return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [8]:
codeData = CodeData("code_20k.json", tokenizer)
codeData =  DataLoader(codeData, batch_size=25, shuffle=True)

model.train()
optim = Adam(model.parameters(), lr=1e-3)

2936


In [9]:
codeData.dataset.X

['<SOS> Create an array of length 5 which contains all even numbers between 1 and 10. <OUTPUT>: arr = [2, 4, 6, 8, 10] <EOS>',
 '<SOS> Write code to find the sum of all numbers between 1 and 10. <OUTPUT>: sum = 0\nfor i in range(1,11):\n    sum += i\nprint(sum) <EOS>',
 '<SOS> Create a function to calculate the area of a given circle. <OUTPUT>: def area_circle(radius):\n    return 3.14 * (radius**2) <EOS>',
 '<SOS> Change the variable `x` to the data type desired.<INPUT>: x = \'Hello World!\' <OUTPUT>: x = "Hello World!" <EOS>',
 '<SOS> Sort the array in ascending order.<INPUT>: arr = [3, 2, 1, 5, 4] <OUTPUT>: arr.sort() # [1, 2, 3, 4, 5] <EOS>',
 '<SOS> Generate a random integer between 4 and 8 (inclusively). <OUTPUT>: import random\nx = random.randint(4, 8) <EOS>',
 '<SOS> Write a SQL query to return the sum of all salaries. <OUTPUT>: SELECT SUM(salary)\nFROM employees; <EOS>',
 '<SOS> Create a function that takes in two numbers as arguments and returns the product of the two. <OUTPU

In [6]:
import numpy as np

def train(chatData, model, optim):
    epochs = 5
    for epoch in tqdm.tqdm(range(epochs), desc="Epoch"):
        running_loss = 0.0
        num_batches = 0
        
        for batch_idx, (X, a) in enumerate(chatData, start=1):
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
            
            running_loss += loss.item()
            num_batches += 1

            # Print batch information
            if batch_idx % 100 == 0:  # Change the number to control how often the information is printed
                print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx}: Loss = {running_loss / num_batches:.4f}")

        # Save model and print epoch information
        torch.save(model.state_dict(), f"model_state_epoch_{epoch+1}.pt")
        print(f"Completed epoch {epoch+1}/{epochs}: Average Loss = {running_loss / num_batches:.4f}")


In [7]:
print("training .... ")
train(codeData, model, optim)

training .... 


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5, Batch 100: Loss = 2.8860
Epoch 1/5, Batch 200: Loss = 1.9817


Epoch:  20%|██        | 1/5 [06:13<24:53, 373.47s/it]

Completed epoch 1/5: Average Loss = 1.6945
Epoch 2/5, Batch 100: Loss = 0.5097
Epoch 2/5, Batch 200: Loss = 0.4797


Epoch:  40%|████      | 2/5 [12:27<18:40, 373.58s/it]

Completed epoch 2/5: Average Loss = 0.4619
Epoch 3/5, Batch 100: Loss = 0.3498
Epoch 3/5, Batch 200: Loss = 0.3486


Epoch:  60%|██████    | 3/5 [18:41<12:27, 373.94s/it]

Completed epoch 3/5: Average Loss = 0.3437
Epoch 4/5, Batch 100: Loss = 0.2947
Epoch 4/5, Batch 200: Loss = 0.2947


Epoch:  80%|████████  | 4/5 [24:55<06:13, 373.80s/it]

Completed epoch 4/5: Average Loss = 0.2921
Epoch 5/5, Batch 100: Loss = 0.2584
Epoch 5/5, Batch 200: Loss = 0.2590


Epoch: 100%|██████████| 5/5 [31:09<00:00, 373.83s/it]

Completed epoch 5/5: Average Loss = 0.2572





In [3]:
def generate_inference(prompt, model, tokenizer, max_length=50, num_return_sequences=1, temperature=0.8):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output_sequences = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            temperature=temperature,
            no_repeat_ngram_size=2,
            top_k=10,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    generated_sequences = []
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        generated_sequence = generated_sequence.tolist()
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
        generated_sequences.append(text)

    return generated_sequences

In [5]:
prompts = [
    "<SOS> Generate a regular expression to match a valid email address. <OUTPUT>:",
    "<SOS> Write a JavaScript code to get the current browser URL. <OUTPUT>:",
    "<SOS> Write a SQL query to select the name and address from a given table.<INPUT>: Table: Employee <OUTPUT>:",
    "<SOS> Generate a random integer between 4 and 8 (inclusively). <OUTPUT>:",
    "<SOS> Create an array with 10 elements and populate it with zeros. <OUTPUT>:",
    "<SOS> Create a function in Python which takes two parameters and returns their product. <OUTPUT>:",
    "<SOS> Create a list comprehension to generate a list of multiples of 3 from 0 to 23. <OUTPUT>:",
]

# Generate inferences
for idx, prompt in enumerate(prompts):
    pretrained_prompt = prompt.replace("<SOS> ", "")
    pretrained_prompt = pretrained_prompt.replace("<OUTPUT>: ", "")
    pretrained_inf =  generate_inference(pretrained_prompt, pretrained_model, pretrained_tokenizer)
    pretrained_text = pretrained_inf[0]
    print(f"Example {idx + 1} (pretrained):")
    print(pretrained_text)
    print()
    
    inference_result = generate_inference(prompt, model, tokenizer, temperature=0.1)
    generated_text = inference_result[0].replace("<OUTPUT>:", "<OUTPUT>:\n")
    print(f"Example {idx + 1} (fine-tuned):")
    print(generated_text)
    print()

Example 1 (pretrained):
Generate a regular expression to match a valid email address. <OUTPUT>:

$ curl -X POST -d '{"email":"@example.com"}' -H 'Content-Type: application/json'
.


Example 1 (fine-tuned):
<SOS> Generate a regular expression to match a valid email address. <OUTPUT>:
 /^([a-zA-Z0-9_\-\.]+)@([\w.-]+) <EOS>

Example 2 (pretrained):
Write a JavaScript code to get the current browser URL. <OUTPUT>: <script src="https://www.google.com/analytics/js/query?query=http://google-analytic.org/"> <!DO

Example 2 (fine-tuned):
<SOS> Write a JavaScript code to get the current browser URL. <OUTPUT>:
 var currentURL = window.location.href;
console.log(currentURL); <EOS>

Example 3 (pretrained):
Write a SQL query to select the name and address from a given table.<INPUT>: Table: Employee <OUTPUT>: <NAME> <ID>

<NAME>: The name of the table.
. The table name. <

Example 3 (fine-tuned):
<SOS> Write a SQL query to select the name and address from a given table. <INPUT>: Table: Employee <OUTP