In [1]:
from pyarrow import parquet
import pyarrow
import datasets

import transformers
import numpy as np

from tqdm.notebook import tqdm


# Locations used throughout the notebook: the processed dataset directory
# and the tokenizer checkpoint directory.
DATASET = "../../dataset_processed_orca/openchat3.1_orca_train"
TOKENIZER_NAME = "/data/one/LLaMA_13B_with_EOT_token"

# Build the tokenizer from the checkpoint directory configured above.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    TOKENIZER_NAME,
)

In [2]:
# Load the "train" split from the processed dataset directory, asking the
# `datasets` library to keep it in memory.
dataset = datasets.load_dataset(
    DATASET,
    keep_in_memory=True,
    split="train",
)


Downloading and preparing dataset parquet/openchat3.1_orca_train to /home/one/.cache/huggingface/datasets/parquet/openchat3.1_orca_train-8a8b488621d0ce71/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /home/one/.cache/huggingface/datasets/parquet/openchat3.1_orca_train-8a8b488621d0ce71/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


In [2]:
# Read the same data directly as a pyarrow table from the parquet file
# inside the dataset directory.
ds_parquet = parquet.read_table(f"{DATASET}/data.parquet")

In [5]:
# Number of rows in the parquet table.
ds_parquet.num_rows

3225785

In [23]:
# Per-row sum of the "masks" column, one value per table row.
# NOTE(review): presumably this is the number of supervised tokens in each
# sample -- confirm against the dataset builder.
mask_column = ds_parquet.column("masks")
supervise_len = np.array(
    [pyarrow.compute.sum(mask_row.values).as_py() for mask_row in tqdm(mask_column)]
)

  0%|          | 0/4058285 [00:00<?, ?it/s]

In [32]:
# Pull the "length" column out of the parquet table into a NumPy array.
length_values = ds_parquet.column("length").to_pylist()
lengths = np.array(length_values)

In [72]:
# Count rows whose length equals 2048 (the maximum length printed by the
# length-statistics cell) while their mask sum is below 10.
at_max_length = lengths == 2048
barely_supervised = supervise_len < 10
np.logical_and(at_max_length, barely_supervised).sum()

459

In [71]:
# Decode the token ids of one hand-picked row back into text.
# NOTE(review): the row index looks like one of the rows flagged by the
# length/supervision check -- confirm.
row_token_ids = ds_parquet.column("tokens")[1439305].as_py()
tokenizer.decode(row_token_ids)

'<s> User: Please answer the following question: Information:  - Shawntae Harris (born April 14, 1974), better known by her stage name Da Brat, is an American rapper and actress from Joliet, Illinois. Beginning her career in 1991, Harris debut album "Funkdafied" (1994) sold one million copies, making her the first female solo rap act to receive a platinum certification, and the second overall female rap act (solo or group) after Salt N Pepa. She has received two Grammy Award nominations.  - Tamia Marilyn Hill (born May 9, 1975), known professionally as Tamia, is a Canadian singer-songwriter. She is best known for her first Top 40 hit on the R&B charts "You Put a Move on My Heart", her 2001 hit "Stranger in My House", Fabolous\' 2003 hit "Into You" (which samples her 1998 song "So into You"), her 2006 hit "Me", and her 2012 hit "Beautiful Surprise". With a career spanning over two decades, she has garnered six career Grammy nominations for her musical work.  - Christopher Brian "Chris" 

In [36]:
# Ratio of the total mask sum to the total length across all rows.
supervise_len.sum() / lengths.sum()

0.31643916311707815

In [21]:
# Length statistics computed from the `datasets` object.
# Note: this rebinds `lengths`, which another cell builds from the parquet table.
lengths = np.array(dataset["length"])

# Minimum, mean and maximum sample length.
print(lengths.min(), lengths.mean(), lengths.max())

# Indices of the 500 longest samples (ascending by length).
print(lengths.argsort()[-500:])

26 418.277155026512 2048
[4054500 2155684 1536589 3592824 1231999  328651 2835248 3489625 3046371
 1965088  886623 2924761  886617 2835270 3726799 3827710 1739044 2407680
 1016836 4003094 1536786  275018 3947613  667172   36209 3827622  886499
 2407810  886491 3592698 3046158 2835410 2155465 2835411 3046151  206122
 1232122 1232117   91533 1337040 2602593 1016842  558564 2835342 1536697
 3046272 1337078 1536705 2835360 1232110 1232088  392724 1337066 3254874
 1739016 3046253 2407714 3318635 4054410 4054428 2155704 4054507 3046411
 2602870 4054622 3947429 3254645  886730 2835036 2407403 2602849 1964919
 3254656 1739358 1536375 3046646 2155892 2407414 1739350 3784567 2155870
 4054604  206229  392622 3592948 2407296 2155980 3947398 3254598 1337300
 2407314 3489507 1964901  392619 1964905  450636 2155950  274912 3254624
  786675 3827792 3592950  450626  621214  558310 1536392   36099 3726720
 1536489 2602792 3489574 2835170 1965033 1337178 2602764 4054551 2407525
 2155721 3046452  786759 1

In [13]:
# Print one sample three ways: the full decoded text, the tokens where the
# mask is NOT set, a separator, and then the tokens where the mask IS set
# (the supervised part, per this cell's purpose).
sample_index = 1001

item = dataset[sample_index]

token_ids = np.array(item["1_tokens"])
# Force a boolean dtype. If the column is ever stored as 0/1 integers,
# `~masks` would become bitwise NOT (-1/-2) and `token_ids[masks]` would do
# positional indexing, silently selecting the wrong tokens.
masks = np.array(item["1_masks"], dtype=bool)


def _decode_turns(selected_ids):
    """Decode `selected_ids` and put a blank line after every end-of-turn marker."""
    text = tokenizer.decode(selected_ids.tolist(), spaces_between_special_tokens=False)
    return text.replace("<|end_of_turn|>", "<|end_of_turn|>\n\n")


print(tokenizer.decode(token_ids))
print(_decode_turns(token_ids[~masks]))
print("=================")
print(_decode_turns(token_ids[masks]))

<s> Assistant is GPT4<|end_of_turn|> You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.<|end_of_turn|> User: Which is an appropriate title for this article?

Deutsche Bank AG and Thomas Weisel Partners LLC will pay a total of \$100 million to settle allegations they published misleading stock research to win investment-banking business, becoming <|end_of_turn|> Assistant: "Big Banks Pay Money to Make Problem Go Away"<|end_of_turn|>
<s> Assistant is GPT4<|end_of_turn|>

 You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.<|end_of_turn|>

 User: Which is an appropriate title for this article?

Deutsche Bank AG and Thomas Weisel Partners LLC will pay a total of \$100 million to settle allegations they published misleading stock research to win investment-banking business, becoming <|end_of_turn|>

 Assistant:
"Big Banks Pay Money to Make Problem Go Away"<|end_of_turn|>




In [15]:
# Decode the "tokens" field of the sample at index 3 back into text.
example_token_ids = dataset[3]["tokens"]
tokenizer.decode(example_token_ids)

'<s> You are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|end_of_turn|> User: How is ""Thank you, Lady." said in Czech?<|end_of_turn|> Assistant GPT3: In Czech, the phrase "Thank you, Lady" can be translated as "Děkuji, paní." \n\nThe word "Děkuji" means "Thank you" and is a polite way of expressing gratitude. It is commonly used in everyday conversations and interactions.\n\nThe word "paní" means "Lady" and is used to address a woman in a polite and respectful manner. It is commonly used in formal settings or when addressing someone who is older or in a position of authority.\n\nWhen combined, "Děkuji, paní" is a polite and respectful way of expressing gratitude to a woman. It can be used in a variety of situations, such as thanking a waitress for her service or expressing appreciation to a female colleague for her help.<|end_of_turn|>'