# Load Dataset



In [4]:
import pandas as pd

df = pd.read_csv("/view/extended_programming_code_snippets.csv")
df

Unnamed: 0,Query,Code_Snippet,Language,Tags
0,How to create a list comprehension in Python?,[x**2 for x in range(10)],Python,tutorial
1,How to handle missing data in pandas?,"df.fillna(0, inplace=True)",Python,example
2,How to use a lambda function in Python?,lambda x: x + 2,Python,advanced
3,How to create a REST API in Flask?,from flask import Flask\r\napp = Flask(__name_...,Python,tutorial
4,How to perform matrix multiplication in numpy?,"import numpy as np\r\nnp.dot(A, B)",Python,advanced
...,...,...,...,...
3015,How to implement a class in C++?,class MyClass {\r\npublic:\r\n void myMetho...,C++,tutorial
3016,How to use pointers in C++?,int x = 10;\r\nint* ptr = &x;,C++,advanced
3017,How to read a file in C++?,#include <fstream>\r\nstd::ifstream file('file...,C++,common-issues
3018,How to create a vector in C++?,"#include <vector>\r\nstd::vector<int> v = {1, ...",C++,tutorial


In [None]:
df['Language'].value_counts()

Unnamed: 0_level_0,count
Language,Unnamed: 1_level_1
SQL,627
Java,618
Shell,601
JavaScript,595
Python,574
C++,5


In [5]:
df['Query'] = df['Query'].str.lower()
df['Code_Snippet'] = df['Code_Snippet'].str.lower()
df['Tags'] = df['Tags'].str.lower()

# Tokenization

In [13]:
from transformers import T5Tokenizer

# Load the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenize the dataset
def preprocess_data(data):
    inputs = ["generate code: " + query for query in data["Query"]]
    targets = data["Code_Snippet"].tolist()
    input_encodings = tokenizer(inputs, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    target_encodings = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(df)


# Dataset Preparation

In [12]:
import torch

class CodeSnippetDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": self.targets["input_ids"][idx],
        }

dataset = CodeSnippetDataset(input_encodings, target_encodings)


# fine Tune Model

In [8]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

training_args = TrainingArguments(
    output_dir="./t5_finetuned",
    evaluation_strategy="epoch",  
    save_strategy="epoch",        
    learning_rate=5e-5,
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,   
    num_train_epochs=5,            
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,            
    save_total_limit=1,        
    load_best_model_at_end=True,
    fp16=True,                     
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
)

trainer.train()


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 Â·Â·Â·Â·Â·Â·Â·Â·Â·Â·


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.2088,0.071258
2,0.0787,0.011764
3,0.0454,0.009069
4,0.026,0.008584
5,0.023,0.008444


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3775, training_loss=0.21656152434696424, metrics={'train_runtime': 620.3747, 'train_samples_per_second': 24.34, 'train_steps_per_second': 6.085, 'total_flos': 510915300556800.0, 'train_loss': 0.21656152434696424, 'epoch': 5.0})

# Save Fine Tuned Model

In [10]:
model.save_pretrained("/content/drive/MyDrive")
tokenizer.save_pretrained("/content/drive/MyDrive")

('/content/drive/MyDrive/tokenizer_config.json',
 '/content/drive/MyDrive/special_tokens_map.json',
 '/content/drive/MyDrive/spiece.model',
 '/content/drive/MyDrive/added_tokens.json')

# Searh Engine (Code Snipet Generation System)

In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/t5_finetuned_model")
tokenizer = T5Tokenizer.from_pretrained("/content/drive/MyDrive/t5_finetuned_model")

def generate_code(query):
    query = query.lower()
    input_text = "generate code: " + query
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
    outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

while True:
  query = input("\n\nYour Query or type (exit) to leave : \n  ")
  if query == "exit":
    print("\nGood Bye...")
    break
  code = generate_code(query)
  print("\n Code Snipet : \n", code)



Your Query or type (exit) to leave : 
  How to create a list in Python?

 Code Snipet : 
 [x**2 for x in range(10)]


Your Query or type (exit) to leave : 
  exit

Good Bye...
