In [1]:
!pip install transformers[torch] datasets evaluate sacrebleu

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.4.1-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [41]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [42]:
from datasets import load_dataset


def load_dataset_from_csv(file_path, tokenizer, max_length=512):
    dataset = load_dataset("csv", data_files=file_path, split="train")
    dataset = dataset.filter(
        lambda example: all(value is not None for value in example.values())
    )
    dataset = dataset.train_test_split(test_size=0.2)

    def tokenize_function(examples):
        inputs = tokenizer(
            examples["lang1"],
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        targets = tokenizer(
            examples["lang2"],
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        attention_mask = inputs.attention_mask
        return {
            "input_ids": inputs.input_ids,
            "attention_mask": attention_mask,
            "labels": targets.input_ids,
        }

    tokenized_datasets = dataset.map(
        tokenize_function, batched=True, remove_columns=dataset["train"].column_names
    )
    return tokenized_datasets
dataset = load_dataset_from_csv("/content/drive/MyDrive/preprocessed.csv", tokenizer)

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1558 [00:00<?, ? examples/s]

Map:   0%|          | 0/1133 [00:00<?, ? examples/s]

Map:   0%|          | 0/284 [00:00<?, ? examples/s]

In [None]:
"""from datasets import load_dataset

books = load_dataset("opus_books", "de-en")
books = books["train"].train_test_split(test_size=0.2)

from transformers import AutoTokenizer

checkpoint = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

source_lang = "en"
target_lang = "de"

def preprocess_function(examples):
    inputs = [example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

tokenized_books = books.map(preprocess_function, batched=True)
"""

In [43]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

import evaluate

metric = evaluate.load("sacrebleu")

import numpy as np

# Not sure of this
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [5]:
from transformers import (
    EncoderDecoderModel,
    PreTrainedModel,
    BertConfig,
    BertLMHeadModel,
    RobertaConfig,
    EncoderDecoderConfig,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
from typing import Optional, Tuple, Union
import torch.nn as nn
import inspect
import torch

class TestModel(EncoderDecoderModel):
    def __init__(
        self,
        config,
        encoder: Optional[PreTrainedModel] = None,
        decoder: Optional[PreTrainedModel] = None,
        pad_token_id: Optional[int] = None,
        decoder_start_token_id: Optional[int] = None,
    ):
        super().__init__(config)
        if encoder is None:
            from transformers import AutoModel

            encoder = AutoModel.from_config(config.encoder)

        if decoder is None:
            from transformers import AutoModelForCausalLM

            decoder = AutoModelForCausalLM.from_config(config.decoder)

        self.encoder = encoder
        self.decoder = decoder

        self.encoder.config = self.config.encoder
        self.decoder.config = self.config.decoder

        if (
            self.encoder.config.hidden_size != self.decoder.config.hidden_size
            and self.decoder.config.cross_attention_hidden_size is None
        ):
            self.enc_to_dec_proj = nn.Linear(
                self.encoder.config.hidden_size, self.decoder.config.hidden_size
            )

        if self.encoder.get_output_embeddings() is not None:
            raise ValueError(
                f"The encoder {self.encoder} should not have a LM Head. Please use a model without LM Head"
            )

        decoder_signature = set(
            inspect.signature(self.decoder.forward).parameters.keys()
        )
        if "encoder_hidden_states" not in decoder_signature:
            raise ValueError(
                "The selected decoder is not prepared for the encoder hidden states to be passed. Please see the "
                "following discussion on GitHub: https://github.com/huggingface/transformers/issues/23350"
            )
        self.config.pad_token_id = pad_token_id
        self.config.decoder_start_token_id = decoder_start_token_id
        #self.pad_token_id = pad_token_id
        #self.decoder_start_token_id = decoder_start_token_id
        # tie encoder, decoder weights if config set accordingly
        self.tie_weights()

    def tie_weights(self):
        if self.config.tie_encoder_decoder:
            decoder_base_model_prefix = self.decoder.base_model_prefix
            self._tie_encoder_decoder_weights(
                self.encoder,
                self.decoder._modules[decoder_base_model_prefix],
                self.decoder.base_model_prefix,
            )

    def shift_tokens_right(self, input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
        """
        Shift input ids one token to the right.
        """
        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
        shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
        if decoder_start_token_id is None:
            raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
        shifted_input_ids[:, 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ):
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids, attention_mask=attention_mask
            )

        encoder_hidden_states = encoder_outputs[0]

        if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
            decoder_input_ids = self.shift_tokens_right(
                labels, self.config.pad_token_id, self.config.decoder_start_token_id
            )
            if decoder_attention_mask is None:
                decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)

        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=attention_mask,
        )

        loss = None
        if labels is not None:
            logits = decoder_outputs[0]
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.reshape(-1, self.decoder.config.vocab_size), labels.view(-1)
            )

        return Seq2SeqLMOutput(
            loss=loss,
            logits=decoder_outputs.logits,
            #past_key_values=decoder_outputs.past_key_values,
            #decoder_hidden_states=decoder_outputs.hidden_states,
            #decoder_attentions=decoder_outputs.attentions,
            #cross_attentions=decoder_outputs.cross_attentions,
            #encoder_last_hidden_state=encoder_outs.last_hidden_state,
            #encoder_hidden_states=encoder_outs.hidden_states,
            #encoder_attentions=encoder_outs.attentions,
        )


In [6]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModel
from transformers import BertConfig, BertLMHeadModel



# Encoder
encoder = AutoModel.from_pretrained("google-bert/bert-base-cased")
encoder.resize_token_embeddings(len(tokenizer))


# Decoder
# google-bert/bert-base-german-cased
configuration = BertConfig(vocab_size=len(tokenizer),
                           hidden_size=768, num_hidden_layers = 4, num_attention_heads = 4,
                           is_decoder=True, add_cross_attention=True)

decoder = BertLMHeadModel(configuration)

decoder.resize_token_embeddings(len(tokenizer))

config = EncoderDecoderConfig.from_encoder_decoder_configs(
    encoder.config, configuration
)


# Model


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [44]:
model = TestModel(config,encoder=encoder, decoder=decoder)

model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.generation_config.decoder_start_token_id = tokenizer.cls_token_id

training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_opus_books_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,4.844588,0.0,20.0
2,No log,3.525865,0.0,10.0176
3,No log,2.787217,0.0,11.0
4,4.214000,2.444461,0.0,18.2077
5,4.214000,2.275674,0.0,19.9366
6,4.214000,2.175143,0.0,20.0
7,4.214000,2.105136,0.0,20.0
8,2.356200,2.055676,0.0,20.0
9,2.356200,2.014085,0.0,20.0
10,2.356200,1.979862,0.0,20.0


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


TrainOutput(global_step=2840, training_loss=2.387797804281745, metrics={'train_runtime': 2025.1294, 'train_samples_per_second': 11.189, 'train_steps_per_second': 1.402, 'total_flos': 8637143737221120.0, 'train_loss': 2.387797804281745, 'epoch': 20.0})

In [52]:
text = """
Turn this java code into python code
import java.util.*;

public class GFG {

    // Function to convert LinkedList to Array
    public static <T> Object[] convertLinkedListToArray(LinkedList<T> linkedList)
    {

        // Converting LinkedList to Array
        Object[] array = linkedList.toArray();

        return array;
    }

    public static void main(String args[])
    {
        // Creating linked list
        LinkedList<String>
            linkedList = new LinkedList<String>();

        // Adding elements to the linked list
        linkedList.add("G");
        linkedList.add("e");
        linkedList.add("e");
        linkedList.add("k");
        linkedList.add("s");

        // Print the LinkedList
        System.out.println("Linked list: "
                        + linkedList);

        // Converting LinkedList to Object Array
        Object[] objArray = convertLinkedListToArray(linkedList);

        // Convert Object[] to String[]
        String[] array = Arrays.copyOf(objArray,
                                    objArray.length,
                                    String[].class);
        // Print the String Array
        System.out.println("Array: "
                        + Arrays.toString(array));
    }
}
"""
inputs = tokenizer(text, return_tensors="pt").input_ids
inputs = inputs.to(model.device)
outputs = model.generate(inputs, max_new_tokens=1000, do_sample=True, top_k=30, top_p=0.95)
out = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(out)

''' Python3 program for the array'''''''class node to find the tree node'' def printArray ( ) : str1 [ ] > 0'''Recursive function to make the node'''' If the current node in range (') :'''Function to str [ ] = 0 )'str _ _ _'' Driver Code''': for i in range ( ) ;''') in the str [ ]'''if _ _ main _ _'' _ name _ _ main _ = None : self.'''' = [ ] for i'''Driver code'''''' : sys. append ( ) :'return 0 return'def print ( [ 0 ] = " ) print ( " ) if ( )'If temp )'''Driver program to print ( temp. pop ( ) )'''if _ _ _ _'_ ='' _ main _ _ = = 0 : print ( ", 1 ) st ( head _ _ _ " ) print ( ", " ) print " " ) print ( " )
