In [None]:
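# Uncomment on Google Colab to mount Google Drive; later cells read from and
# write to paths under drive/MyDrive.
#from google.colab import drive
#drive.mount('/content/drive')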

In [None]:
# Install the libraries this notebook uses into the running kernel's environment.
# NOTE(review): no versions are pinned, so a fresh run may pull different
# releases than the ones this notebook was written against — consider pinning.
%pip install transformers
%pip install datasets
%pip install accelerate -U
%pip install transformers[torch]
%pip install huggingface_hub

In [None]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset

class CustomLoggingCallback(TrainerCallback):
    """Print the training loss every ``args.logging_steps`` optimizer steps.

    The Trainer invokes ``on_log`` for every log record it emits, not only for
    training-loss records: evaluation records carry keys such as ``eval_loss``
    and the end-of-training summary carries ``train_loss``. Those records have
    no ``"loss"`` key, so they are skipped here instead of raising ``KeyError``.
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print step and loss for training-loss log records.

        Args:
            args: The ``TrainingArguments`` in use; ``logging_steps`` is read.
            state: The ``TrainerState``; ``global_step`` is read.
            control: The ``TrainerControl`` object (unused).
            logs: The dict of logged values, or ``None``.
            **kwargs: Extra objects passed by the Trainer (unused).
        """
        # Ignore empty records and records that are not training-loss logs.
        if not logs or "loss" not in logs:
            return
        if state.global_step % args.logging_steps == 0:
            print(f"Step: {state.global_step}, Loss: {logs['loss']}")



def load_dataset_from_csv(file_path, tokenizer, max_length=512, test_size=0.2, seed=42):
    """Load a parallel-text CSV and return tokenized train/test splits.

    The CSV is expected to contain the columns ``lang1`` (source text) and
    ``lang2`` (target text). Rows in which any column is missing are dropped
    before the split.

    Sequences are truncated to ``max_length`` but deliberately NOT padded here:
    padding is left to a padding-aware collator (e.g. ``DataCollatorForSeq2Seq``)
    so that label padding becomes ``-100`` and is ignored by the loss, and so
    that an ``attention_mask`` matching the real tokens is used. Padding labels
    with the tokenizer's pad id at this stage would make the model train on
    padding positions.

    Args:
        file_path: Path (or ``data_files`` spec) of the CSV file to load.
        tokenizer: Tokenizer applied to both the source and the target column.
        max_length: Maximum number of tokens kept per sequence.
        test_size: Fraction (or absolute size) of rows held out as the test split.
        seed: Seed for the train/test shuffle so the split is reproducible.

    Returns:
        A dataset dict with ``"train"`` and ``"test"`` splits whose rows hold
        ``input_ids``, ``attention_mask`` and ``labels`` as variable-length lists.
    """
    dataset = load_dataset('csv', data_files=file_path, split='train')
    dataset = dataset.filter(lambda example: all(value is not None for value in example.values()))
    dataset = dataset.train_test_split(test_size=test_size, seed=seed)

    def tokenize_function(examples):
        # Plain Python lists are returned; `datasets.map` stores lists anyway,
        # so requesting tensors here would only add a conversion round-trip.
        inputs = tokenizer(examples['lang1'], max_length=max_length, truncation=True)
        targets = tokenizer(examples['lang2'], max_length=max_length, truncation=True)
        return {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'labels': targets['input_ids'],
        }

    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
    )
    return tokenized_datasets

# --- Tokenizer and data ------------------------------------------------------
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-base")
tokenized_dataset = load_dataset_from_csv("./drive/MyDrive/preprocessed.csv", tokenizer)

# --- Model, batching and logging helpers -------------------------------------
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-base")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=-100)
callback = CustomLoggingCallback()

# --- Training configuration --------------------------------------------------
training_args = TrainingArguments(
    # Checkpointing: checkpoints are written every 600 steps, which is a
    # multiple of the 200-step evaluation interval below; the checkpoint with
    # the lowest eval_loss is reloaded when training finishes.
    output_dir="/content/drive/MyDrive/codet5-finetuned",
    save_steps=600,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Optimization
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_steps=0,
    # Evaluation
    evaluation_strategy="steps",
    eval_steps=200,
    prediction_loss_only=True,
    # Logging
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=100,
    logging_first_step=True,
)

# --- Trainer -----------------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    callbacks=[callback],
)

trainer.train()

In [None]:
def generate_python_code(java_code, model, tokenizer, max_length=100):
    """Translate a snippet of source code with a seq2seq model.

    Args:
        java_code: The source text to translate.
        model: A generation-capable model exposing ``parameters()``,
            ``generate()``, ``eval()`` and ``train()``.
        tokenizer: The tokenizer matching ``model``; used to encode the input
            and decode the generated ids.
        max_length: Maximum length of the generated sequence, forwarded to
            ``model.generate``.

    Returns:
        The decoded generated text with special tokens removed.
    """
    device = next(model.parameters()).device  # run on whatever device the model lives on

    # A single sequence needs no padding; the attention mask is still passed
    # explicitly so generation never has to guess which positions are real.
    encoded = tokenizer(java_code, max_length=512, truncation=True, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    # After training the model is still in train mode (dropout active), which
    # makes generation noisy. Switch to eval mode and restore the prior mode.
    was_training = getattr(model, "training", False)
    model.eval()
    try:
        # `early_stopping` only affects beam search, so it is not passed for
        # this single-beam call.
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
        )
    finally:
        model.train(was_training)

    python_code = tokenizer.decode(output[0], skip_special_tokens=True)
    return python_code

# Example usage: translate a small Java program with the `model` and
# `tokenizer` created in the training cell, then print the result.
java_code = """
class GFG {
    public static void main(String[] args)
    {
        // Declaring and initializing integer variable
        int num = 10;
        // Checking if number is even or odd number
        if (num % 2 == 0) {
            System.out.println("Entered Number is Even");
        }

        else {
            System.out.println("Entered Number is Odd");

        }
    }
}
"""
python_code = generate_python_code(java_code, model, tokenizer)
print("Generated Python code:\n", python_code)

In [None]:
# Run evaluation with the Trainer built in the training cell; no dataset is
# passed here, so the Trainer's own eval_dataset (tokenized_dataset['test'])
# is presumably the one used.
evaluation = trainer.evaluate()

print("Evaluation results:", evaluation)