In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset
import torch
import json
import re



In [2]:
# Kaggle input directory holding the Mistral 7B v0.1 checkpoint (HF layout).
mistral_checkpoint = "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1"

# Tokenizer and weights are both read from the same checkpoint directory.
mistral_tokenizer = AutoTokenizer.from_pretrained(mistral_checkpoint)

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run at load time -- confirm this checkpoint actually requires it.
mistral_model = AutoModelForCausalLM.from_pretrained(
    mistral_checkpoint,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Device of the model's first parameter; passed to get_gen_text as the target
# device for the encoded prompt.
mistral_device = next(iter(mistral_model.parameters())).device

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Model identifier shared by the T5 tokenizer and the translation pipeline.
t5_checkpoint = "t5-base"

# Tokenizer matching the T5 checkpoint above.
t5_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=t5_checkpoint,
)

In [None]:
# English -> German translation pipeline backed by the T5 checkpoint.
t5_translator = pipeline(
    task="translation_en_to_de",
    model=t5_checkpoint,
    clean_up_tokenization_spaces=True,
)

In [4]:
def get_gen_text(prompt, model, tokenizer, device):
    """Sample a continuation of ``prompt`` and return it split into sentences.

    Args:
        prompt: Text the model is asked to continue.
        model: Causal (decoder-only) language model exposing ``generate``; its
            output is expected to start with the prompt tokens, which are
            sliced off before decoding.
        tokenizer: Tokenizer matching ``model``. It is called on ``[prompt]``
            and used to decode the generated ids; its ``eos_token_id`` is used
            as the padding id during generation.
        device: Device the encoded prompt is moved to before generation.

    Returns:
        list[str]: Whitespace-stripped, non-empty fragments of the continuation
        obtained by splitting on ``.``, ``!`` and ``?``. Whatever follows the
        last terminator (an unfinished sentence) is discarded, so a
        continuation without any terminator yields ``[]``.
    """
    model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
    prompt_len = model_inputs["input_ids"].shape[1]

    generated_ids = model.generate(
        **model_inputs,
        max_length=1000,  # counts prompt tokens plus newly generated tokens
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )

    # Drop the prompt at token level instead of string-replacing it in the
    # decoded text: decoding is not guaranteed to reproduce the prompt
    # verbatim, and skip_special_tokens removes BOS/EOS for any tokenizer
    # rather than relying on a hard-coded "<s>".
    continuation_ids = generated_ids[:, prompt_len:]
    continuation = tokenizer.batch_decode(
        continuation_ids, skip_special_tokens=True
    )[0]

    # Turn line breaks into spaces so words on adjacent lines are not fused.
    continuation = continuation.replace("\n", " ")

    # The last piece is whatever follows the final terminator; drop it.
    fragments = re.split(r'[.!?]', continuation)[:-1]

    # Runs of terminators ("...", "?!") produce empty pieces; skip them.
    stripped = (fragment.strip() for fragment in fragments)
    return [sentence for sentence in stripped if sentence]

In [None]:
# Generate English sentences with Mistral, translate them to German with T5,
# and append one JSON record per sentence pair to `output_path`.
output_path = "output.jsonl"
batch_size = 50
num_iters = 100
data_list = []

# The file is opened in append mode below, so continue numbering after any
# records already present; restarting at 0 on a re-run would duplicate ids.
try:
    with open(output_path) as existing_file:
        last_index = sum(1 for _ in existing_file)
except FileNotFoundError:
    last_index = 0

# The prompt does not change between iterations, so build it once.
prompt = f'''
                Write {batch_size} different short sentences.
                '''

for i in range(num_iters):
    # Skip empty fragments so blank records are never translated or written.
    gen_text = [
        sentence
        for sentence in get_gen_text(
            prompt, mistral_model, mistral_tokenizer, mistral_device
        )
        if sentence
    ]

    # NOTE(review): the "translation_en_to_de" pipeline may already prepend
    # T5's task prefix from the model config -- confirm the explicit prefix
    # here does not end up duplicated.
    translation = [
        t5_translator(f"translate English to German: {sentence}")[0]['translation_text']
        for sentence in gen_text
    ]

    # Re-open per iteration so completed batches are on disk even if a later
    # iteration fails.
    with open(output_path, "a") as f:
        for offset, (en_text, de_text) in enumerate(zip(gen_text, translation)):
            record = {
                'id': last_index + offset,
                'translation': {'en': en_text, 'de': de_text},
            }
            f.write(json.dumps(record) + "\n")

    last_index += len(gen_text)
    print(f"iteration: {i+1}/{num_iters} | completed: {last_index}")


In [None]:
# JSON Lines file to load -- presumably the one written by the generation
# cell, assuming the notebook's working directory is /kaggle/working.
file_path = "/kaggle/working/output.jsonl"

# Builder name passed to `load_dataset` for JSON / JSON Lines input.
dataset_config = "json"

# Read the records into a dataset object.
my_dataset = load_dataset(
    path=dataset_config,
    data_files=file_path,
)