In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]

In [55]:
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5EncoderModel, T5ForConditionalGeneration, GPT2LMHeadModel, AutoModel, AutoTokenizer, EncoderDecoderModel,AutoConfig
import pandas as pd
import torch.nn as nn
import torch
from sklearn.model_selection import train_test_split
from datasets import load_dataset

In [64]:
def load_dataset_from_csv(file_path, tokenizer, max_length=512):
    dataset = load_dataset('csv', data_files=file_path, split='train')
    dataset = dataset.filter(lambda example: all(value is not None for value in example.values()))
    dataset = dataset.train_test_split(test_size=0.2)

    def tokenize_function(examples):
        inputs = tokenizer(examples['lang1'], padding='max_length', max_length=max_length, truncation=True, return_tensors="pt")
        targets = tokenizer(examples['lang2'], padding='max_length', max_length=max_length, truncation=True, return_tensors="pt")
        return {'input_ids': inputs.input_ids, 'labels': targets.input_ids}


    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
    return tokenized_datasets

In [66]:
#checkpoint = "Salesforce/codet5p-110m-embedding"
#device = "cuda"
tokenizer = AutoTokenizer.from_pretrained('google-t5/t5-small')
tokenized_dataset = load_dataset_from_csv("./drive/MyDrive/preprocessed.csv", tokenizer)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/1133 [00:00<?, ? examples/s]

Map:   0%|          | 0/284 [00:00<?, ? examples/s]

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
config = AutoConfig.from_pretrained("gpt2", add_cross_attention=True, vocab_size=len(tokenizer), n_ctx=128,
                                                bos_token_id=tokenizer.bos_token_id,
                                                eos_token_id=tokenizer.eos_token_id)

#encoder_model = AutoModel.from_pretrained("Salesforce/codet5p-110m-embedding", trust_remote_code=True)
encoder_model = T5EncoderModel.from_pretrained("google-t5/t5-small")

decoder_model = GPT2LMHeadModel(config)

decoder_model.resize_token_embeddings(len(tokenizer))
encoder_model.resize_token_embeddings(len(tokenizer))

model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)

model.config.decoder_start_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size

model.to(device)

In [91]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/custom_transformer",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=600,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=200,
    logging_dir="./logs",
    logging_first_step=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_steps=0,
    gradient_accumulation_steps=1,
    logging_strategy="steps",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test']
)

In [92]:
trainer.train()

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Step,Training Loss,Validation Loss
200,3.0444,2.716439
400,2.7374,2.446071
600,2.4124,2.309741
800,2.3716,2.19378
1000,2.13,2.130766
1200,2.0812,2.066429
1400,2.102,2.011552
1600,2.0393,1.983619


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
There were missing keys in the checkpoint model loaded: ['encoder.encoder.embed_tokens.weight', 'decoder.lm_head.weight'].


TrainOutput(global_step=1701, training_loss=2.4208521915841703, metrics={'train_runtime': 267.1459, 'train_samples_per_second': 12.723, 'train_steps_per_second': 6.367, 'total_flos': 1385594918535168.0, 'train_loss': 2.4208521915841703, 'epoch': 3.0})

In [93]:
evaluation = trainer.evaluate()

print("Evaluation results:", evaluation)

Evaluation results: {'eval_loss': 2.066429376602173, 'eval_runtime': 4.5743, 'eval_samples_per_second': 62.086, 'eval_steps_per_second': 7.87, 'epoch': 3.0}


In [112]:
def generate_python_code(java_code, model, tokenizer):
    device = next(model.parameters()).device  # Get the device of the model
    input_ids = tokenizer(java_code, padding='max_length', max_length=512, truncation=True, return_tensors="pt").input_ids.to(device)
    output = model.generate(input_ids=input_ids, max_length=100, num_return_sequences=1, early_stopping=True, eos_token_id = tokenizer.sep_token_id,
                        pad_token_id = tokenizer.pad_token_id,
                        decoder_start_token_id=tokenizer.pad_token_id )
    python_code = tokenizer.decode(output[0], skip_special_tokens=True)
    return python_code

# Example usage
java_code = """
class MyClass {
    public static void main(String[] args) {
        System.out.println("Hello, World!");
    }
}
"""
python_code = generate_python_code(java_code, model, tokenizer)
print("Generated Python code:\n", python_code)



Generated Python code:
 '''Python3 program to find the number of the array''''''Function to find the array''' def find(arr, n): ''''''''' for '''' for if arr[i in range(arr[i]): '''''''''''' if (arr[i])
