# BLOOMZ & mT0 - Text Generation

## Fine Tuning Dataset

In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import os
import json
import random
import torch

In [2]:
# Absolute path of the `data` directory that sits next to the current working directory's parent.
DATA_DIR = os.path.abspath(os.path.join(os.pardir, "data"))

In [3]:
# Display the contents of the data directory (the dataset file opened below lives here).
os.listdir(DATA_DIR)

['ob-loose-jun28-sm.jsonl', 'offload_folder', 'huggingface_cache']

In [4]:
# Full path of the JSONL dataset file inside DATA_DIR.
file_path = os.path.join(DATA_DIR, 'ob-loose-jun28-sm.jsonl')

In [5]:
# Load the JSONL dataset: each non-empty line is parsed as one JSON value.
# Only dict records are kept, because later cells index records with ['prompt'].
results = []

# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as json_file:
    # Iterate the file directly instead of copying every line into a list first.
    for json_str in json_file:
        # A blank line (e.g. trailing newline at end of file) is not a record.
        if not json_str.strip():
            continue
        result = json.loads(json_str)
        if not isinstance(result, dict):
            # Report and skip: a non-dict record would break the ['prompt'] lookup later.
            print('JSON line could not be parsed as a dict')
            continue
        results.append(result)

In [6]:
# Pick one record at random and keep its 'prompt' field as the example input.
# NOTE(review): no random seed is set in the visible cells, so the chosen prompt
# differs between runs.
example_prompt = random.choice(results)['prompt']

In [7]:
# Show the sampled prompt as plain text (print keeps its line breaks readable).
print(example_prompt)

Rephrase and join the sentences to remove repetition and sound more human without changing the wording and semantics.
The only exception is that you are allowed to rephrase from the user query the <|NOT_SURE|> parts.

###

Question: I have an inquiry about my purchase transactions, where can I send my message?
Robotic Answer: <|NOT_SURE|> <|CryptoWallet does not offer support via telephone so for inquiries, users must send an email at support@cryptos.com.|>
Human Answer:


## mt0-small - 300M

In [8]:
# Hub identifier of the mt0-small checkpoint used by the next cells.
checkpoint = "bigscience/mt0-small"

In [9]:
# Tokenizer matching the mt0-small checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
# Seq2seq model for the mt0-small checkpoint; dtype and device placement are both left on "auto".
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")

In [11]:
# Run the example prompt through mt0-small and print the decoded first output sequence.
# The model was loaded with device_map="auto", so send the inputs to whatever device
# the model ended up on instead of assuming a CUDA device is present.
inputs = tokenizer.encode(example_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(inputs)
# Decoded without skipping special tokens, so markers such as <pad> and </s> stay visible.
print(tokenizer.decode(outputs[0]))

<pad> cryptos.com</s>




## mt0-base 

In [12]:
# Hub identifier of the mt0-base checkpoint used by the next cells.
checkpoint_base = "bigscience/mt0-base"

In [13]:
# Tokenizer matching the mt0-base checkpoint.
tokenizer_base = AutoTokenizer.from_pretrained(checkpoint_base)

In [14]:
# Seq2seq model for the mt0-base checkpoint; dtype and device placement are both left on "auto".
model_base = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_base, torch_dtype="auto", device_map="auto")

In [20]:
# Replace the prompt with a hand-written instruction.
# NOTE(review): this overwrites the prompt sampled from the dataset earlier, so every
# later cell that reads `example_prompt` sees this string instead.
example_prompt = "Explain in a sentence in Spanish what is backpropagation in neural networks."

In [22]:
# Run the example prompt through mt0-base and print the decoded first output sequence.
# The model was loaded with device_map="auto", so send the inputs to whatever device
# the model ended up on instead of assuming a CUDA device is present.
inputs = tokenizer_base.encode(example_prompt, truncation=False, return_tensors="pt").to(model_base.device)
outputs = model_base.generate(inputs)
# Decoded without skipping special tokens, so markers such as <pad> and </s> stay visible.
print(tokenizer_base.decode(outputs[0]))

<pad> Qué es la propagación de datos en las redes nocturnas?</s>


## mt0-large

In [None]:
# Hub identifier of the mt0-large checkpoint used by the next cells.
checkpoint_large = "bigscience/mt0-large"

In [None]:
# Tokenizer matching the mt0-large checkpoint.
tokenizer_large = AutoTokenizer.from_pretrained(checkpoint_large)

In [None]:
# Seq2seq model for the mt0-large checkpoint; dtype and device placement are both left on "auto".
model_large = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_large, torch_dtype="auto", device_map="auto")

In [None]:
# Run the example prompt through mt0-large and print the decoded first output sequence.
# Uses the mt0-large tokenizer and model loaded just above; the previous version of this
# cell called the mt0-base objects, so the large checkpoint was never exercised.
# Inputs follow the model's device because it was loaded with device_map="auto".
inputs = tokenizer_large.encode(example_prompt, return_tensors="pt").to(model_large.device)
# max_length=100 caps the length of the generated sequence.
outputs = model_large.generate(inputs, max_length=100)
print(tokenizer_large.decode(outputs[0]))

## mt0-xxl - 13B 

In [None]:
# Directory passed as `cache_dir` when loading the mt0-xxl model.
CACHE_DIR = os.path.join(DATA_DIR, 'huggingface_cache')
# exist_ok=True makes the cell safe to re-run; unlike os.mkdir this also creates
# missing parent directories, and it still raises if the path exists as a regular file.
os.makedirs(CACHE_DIR, exist_ok=True)

In [None]:
# Directory passed as `offload_folder` when loading the mt0-xxl model.
OFFLOAD_FOLDER = os.path.join(DATA_DIR, 'offload_folder')
# exist_ok=True makes the cell safe to re-run; unlike os.mkdir this also creates
# missing parent directories, and it still raises if the path exists as a regular file.
os.makedirs(OFFLOAD_FOLDER, exist_ok=True)

In [None]:
# NOTE(review): neither `psutil` nor `MAX_MEMORY` is referenced by any other visible cell;
# the memory cap actually passed to `from_pretrained` is the separate lowercase
# `max_memory` dict. The unit of 1000 is not stated anywhere — confirm or remove.
import psutil
MAX_MEMORY = 1000

In [None]:
# Hub identifier of the mt0-xxl checkpoint and its matching tokenizer.
checkpoint_xxl = "bigscience/mt0-xxl"

tokenizer_xxl = AutoTokenizer.from_pretrained(checkpoint_xxl)

In [None]:
# Display CUDA memory information before choosing the per-device memory cap.
torch.cuda.mem_get_info()

In [None]:
# Memory cap passed as `max_memory` when loading mt0-xxl.
# NOTE(review): key 0 is presumably the GPU index and "7GIB" the cap for it — confirm
# against the value reported by torch.cuda.mem_get_info().
max_memory = {0: "7GIB"}

In [None]:
# Load mt0-xxl with "auto" dtype and device placement, using the notebook's own
# cache directory, offload folder and per-device memory cap.
model_xxl = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint_xxl,
    torch_dtype="auto",
    device_map="auto",
    cache_dir=CACHE_DIR,
    offload_folder=OFFLOAD_FOLDER,
    max_memory=max_memory,
)

In [None]:
# Run a translation prompt through mt0-xxl and print the decoded first output sequence.
# Uses the mt0-xxl tokenizer and model loaded above; the previous version of this cell
# called `tokenizer` / `model`, which are the mt0-small objects.
# Inputs stay on "cuda": the model is loaded with a max_memory entry for device 0.
inputs = tokenizer_xxl.encode("Translate to English: Je t’aime.", return_tensors="pt").to("cuda")
outputs = model_xxl.generate(inputs)
print(tokenizer_xxl.decode(outputs[0]))