In [1]:
pip install torch transformers datasets scikit-learn numpy accelerate torch transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
# Install Dependencies
# pip install torch transformers datasets scikit-learn numpy accelerate

import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module='transformers.generation.utils')

import os
import json
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Sample training data (bidirectional)
sample_data = {
    "examples": [
        {
            "python": "import os\n\ntry:\n    with open('file.txt', 'r') as file:\n        data = file.read()\n        print(data)\nexcept Exception as e:\n    print('Error:', e)",
            "nodejs": "const fs = require('fs');\n\nfs.readFile('file.txt', 'utf8', (err, data) => {\n    if (err) {\n        console.error('Error:', err);\n        return;\n    }\n    console.log(data);\n});"
        }
    ]
}

# Save sample training data
with open('data/sample.json', 'w') as f:
    json.dump(sample_data, f, indent=4)

In [4]:
def load_dataset(data_dir):
    """Load and prepare the dataset from JSON files for both translation directions."""
    inputs = []
    outputs = []

    for filename in os.listdir(data_dir):
        if filename.endswith('.json'):
            filepath = os.path.join(data_dir, filename)
            print(f"Loading data from {filepath}")
            with open(filepath, 'r') as f:
                try:
                    data = json.load(f)
                    if "examples" in data:
                        for example in data['examples']:
                            if "python" in example and "nodejs" in example:
                                # Python to Node.js
                                inputs.append(f"Translate Python to Node.js: {example['python']}")
                                outputs.append(example['nodejs'])
                                # Node.js to Python
                                inputs.append(f"Translate Node.js to Python: {example['nodejs']}")
                                outputs.append(example['python'])
                            else:
                                print(f"Warning: Example in {filepath} is missing 'python' or 'nodejs' key.")
                    else:
                        print(f"Warning: {filepath} does not contain an 'examples' key.")
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON from {filepath}: {e}")
                except KeyError as e:
                    print(f"Error processing data from {filepath}: {e}")

    print(f"Loaded {len(inputs)} examples (both directions).")
    return inputs, outputs

def prepare_dataset(inputs, outputs, tokenizer, max_length=512):
    """Prepare the dataset for training."""
    if not inputs or not outputs or len(inputs) != len(outputs):
        print("Error: No data loaded or input/output mismatch.")
        return None, None

    if len(inputs) < 2:
         print("Error: Not enough data to create train/validation split. Need at least 2 examples.")
         return None, None

    train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
        inputs, outputs, test_size=0.1, random_state=42
    )

    def tokenize_function(examples):
        model_inputs = tokenizer(
            examples['input'],
            max_length=max_length,
            padding='max_length',
            truncation=True
        )

        with tokenizer.as_target_tokenizer():
            labels = tokenizer(
                examples['output'],
                max_length=max_length,
                padding='max_length',
                truncation=True
            )

        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    train_dataset = Dataset.from_dict({
        'input': train_inputs,
        'output': train_outputs
    })
    val_dataset = Dataset.from_dict({
        'input': val_inputs,
        'output': val_outputs
    })

    print("Tokenizing training dataset...")
    train_dataset = train_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=train_dataset.column_names
    )
    print("Tokenizing validation dataset...")
    val_dataset = val_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=val_dataset.column_names
    )

    return train_dataset, val_dataset

In [7]:
# Create model directory
os.makedirs('model', exist_ok=True)

# Initialize tokenizer and model
model_name = "Salesforce/codet5p-220m"
print(f"Loading model and tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Load and prepare dataset
print("Loading and preparing dataset...")
inputs, outputs = load_dataset('data')

Loading model and tokenizer: Salesforce/codet5p-220m
Loading and preparing dataset...
Loading data from data/sample.json
Loaded 7322 examples (both directions).


In [None]:
if inputs and outputs:
    train_dataset, val_dataset = prepare_dataset(inputs, outputs, tokenizer)

    if train_dataset and val_dataset:
        # Training arguments
        training_args = Seq2SeqTrainingArguments(
            output_dir='model',
            num_train_epochs=5,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=100,
            eval_strategy="steps",
            eval_steps=500,
            save_strategy="steps",
            save_steps=500,
            load_best_model_at_end=True,
            predict_with_generate=True,
            generation_max_length=512,
            generation_num_beams=4,
        )

        # Data collator
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            padding=True,
        )

        # Initialize trainer
        print("Initializing trainer...")
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator
        )

        # Start training
        print("Starting training...")
        trainer.train()

        # Save the model
        print("Saving the final model...")
        trainer.save_model('model/final')
        tokenizer.save_pretrained('model/final')
        print("Training complete. Model saved to 'model/final'.")

Tokenizing training dataset...


Map:   0%|          | 0/6589 [00:00<?, ? examples/s]



Tokenizing validation dataset...


Map:   0%|          | 0/733 [00:00<?, ? examples/s]

Initializing trainer...


  trainer = Seq2SeqTrainer(


Starting training...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhazily2024[0m ([33mhazily2024-ny-develops[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,1.3768,0.619612
1000,0.0693,0.05861
1500,0.0532,0.045901
2000,0.0366,0.040626
2500,0.0321,0.038385
3000,0.0311,0.035819
3500,0.0209,0.035656
4000,0.0187,0.034297
4500,0.0205,0.033343
5000,0.0139,0.03412


In [None]:
def translate_code(python_code, model, tokenizer, max_length=512):
    """Translate Python code to Node.js."""
    input_text = f"Translate Python to Node.js: {python_code}"

    inputs = tokenizer(
        input_text,
        max_length=max_length,
        padding="longest",
        truncation=True,
        return_tensors="pt"
    )

    if torch.cuda.is_available():
        inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )

    nodejs_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return nodejs_code

In [None]:
def translate_code(code, model, tokenizer, direction="Python to Node.js", max_length=512):
    """Translate code between Python and Node.js based on direction."""
    if direction == "Python to Node.js":
        input_text = f"Translate Python to Node.js: {code}"
    elif direction == "Node.js to Python":
        input_text = f"Translate Node.js to Python: {code}"
    else:
        raise ValueError("Invalid direction. Use 'Python to Node.js' or 'Node.js to Python'.")

    inputs = tokenizer(
        input_text,
        max_length=max_length,
        padding="longest",
        truncation=True,
        return_tensors="pt"
    )

    if torch.cuda.is_available():
        inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )

    translated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_code

In [None]:
if 'model' in locals() and 'tokenizer' in locals():
    print("\n--- Testing the Model ---")
    test_python = '''
import json
data = {'name': 'Alice', 'age': 25}
with open('data.json', 'w') as f:
    json.dump(data, f)
with open('data.json') as f:
    loaded = json.load(f)
    print(loaded)
'''
    test_nodejs = '''
const fs = require('fs');
const data = { name: 'Alice', age: 25 };
fs.writeFileSync('data.json', JSON.stringify(data));
const loaded = JSON.parse(fs.readFileSync('data.json', 'utf8'));
console.log(loaded);
'''
    print("Python to Node.js Translation:")
    try:
        if 'trainer' in locals() and trainer.state.is_world_process_zero:
            translated = translate_code(test_python, model, tokenizer, direction="Python to Node.js")
            print(translated)
        elif not 'trainer' in locals():
            print("Model not loaded or trained. Cannot perform test translation.")
        else:
            print("Testing skipped on non-main process.")
    except Exception as e:
        print(f"An error occurred during translation: {e}")
    print("\nNode.js to Python Translation:")
    try:
        if 'trainer' in locals() and trainer.state.is_world_process_zero:
            translated = translate_code(test_nodejs, model, tokenizer, direction="Node.js to Python")
            print(translated)
        elif not 'trainer' in locals():
            print("Model not loaded or trained. Cannot perform test translation.")
        else:
            print("Testing skipped on non-main process.")
    except Exception as e:
        print(f"An error occurred during translation: {e}")


In [None]:
# prompt: it should download the folder where model will be  saved, final folder

from google.colab import files

# Download the 'model/final' directory
# This will zip the directory and download it to your local machine
!zip -r /content/final_model.zip /content/model/final
files.download('/content/final_model.zip')