In [7]:
from datasets import load_dataset, load_from_disk
dataset = load_from_disk("voice-part1")
dataset

Dataset({
    features: ['prompt', 'answer', 'length', 'index', 'audio', 'tokens'],
    num_rows: 2
})

In [11]:
token_ids = dataset[0]['tokens']

In [12]:
tokens = [f"<|sound_{num:04}|>" for num in token_ids]
tokens = ["<|sound_start|>"] + tokens + ["<|sound_end|>"]

In [13]:
tokens

['<|sound_start|>',
 '<|sound_0121|>',
 '<|sound_0913|>',
 '<|sound_0738|>',
 '<|sound_0913|>',
 '<|sound_0408|>',
 '<|sound_0544|>',
 '<|sound_0835|>',
 '<|sound_0913|>',
 '<|sound_0835|>',
 '<|sound_0518|>',
 '<|sound_0170|>',
 '<|sound_1001|>',
 '<|sound_0145|>',
 '<|sound_0025|>',
 '<|sound_0038|>',
 '<|sound_0408|>',
 '<|sound_0906|>',
 '<|sound_0296|>',
 '<|sound_1018|>',
 '<|sound_0081|>',
 '<|sound_0431|>',
 '<|sound_0252|>',
 '<|sound_0694|>',
 '<|sound_0347|>',
 '<|sound_0926|>',
 '<|sound_0347|>',
 '<|sound_0868|>',
 '<|sound_0347|>',
 '<|sound_0432|>',
 '<|sound_0651|>',
 '<|sound_0491|>',
 '<|sound_0564|>',
 '<|sound_0651|>',
 '<|sound_0920|>',
 '<|sound_0431|>',
 '<|sound_0791|>',
 '<|sound_0679|>',
 '<|sound_0245|>',
 '<|sound_0926|>',
 '<|sound_0164|>',
 '<|sound_0984|>',
 '<|sound_0406|>',
 '<|sound_0833|>',
 '<|sound_0655|>',
 '<|sound_0862|>',
 '<|sound_0743|>',
 '<|sound_0868|>',
 '<|sound_0782|>',
 '<|sound_0926|>',
 '<|sound_0634|>',
 '<|sound_0431|>',
 '<|sound_0

In [17]:
def transform_batch_tokens(batch):
    # Process the 'tokens' column for each batch
    batch_transformed_tokens = []
    for token_ids in batch['tokens']:
        # Convert each token ID to the desired string format
        tokens = [f"<|sound_{num:04}|>" for num in token_ids]
        # Optionally add empty strings at the beginning and end if needed
        tokens = ["<|sound_start|>"] + tokens + ["<|sound_end|>"]
        batch_transformed_tokens.append(tokens)
    return {"sound_tokens": batch_transformed_tokens}

transformed_dataset = dataset.map(
    transform_batch_tokens,
    batched=False,
    num_proc=1
)
transformed_dataset

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

TypeError: 'int' object is not iterable

In [10]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3-mini-4k-instruct", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    use_gradient_checkpointing= "unsloth"
)
tokenizer.add_tokens("<|sound_start|>",special_tokens=True)
tokenizer.add_tokens("<|sound_end|>",special_tokens=True)
for sound_token in range(0, 1024):
    sound_added_token = f"<|sound_{sound_token:04}|>"
    tokenizer.add_tokens(sound_added_token)

==((====))==  Unsloth: Fast Mistral patching release 2024.6
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.318 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Tokenize the sound tokens
sound_tokens = transformed_dataset[0]['sound_tokens']

tokenized_output = tokenizer(sound_tokens, return_tensors='pt', truncation=True, max_length=4096)
print(tokenized_output)


{'input_ids': tensor([[    1, 32011],
        [    1, 32878],
        [    1, 32437],
        ...,
        [    1, 32075],
        [    1, 32531],
        [    1, 32012]]), 'attention_mask': tensor([[1, 1],
        [1, 1],
        [1, 1],
        ...,
        [1, 1],
        [1, 1],
        [1, 1]])}


In [14]:
transformed_dataset

Dataset({
    features: ['prompt', 'answer', 'length', 'index', 'audio', 'tokens', 'sound_tokens'],
    num_rows: 111169
})

In [15]:
def create_conversations(batch):
    # Initialize the list to hold the formatted conversation data
    conversations = []
    
    # Iterate through the batch
    for sound_token, answer in zip(batch['sound_tokens'], batch['answer']):
        # Create the user part using the sound_tokens
        user_part = {"role": "user", "content": " ".join(sound_token)}
        
        # Create the assistant part using the answer
        assistant_part = {"role": "assistant", "content": answer}
        
        # Combine both parts into one conversation object
        conversation = [user_part, assistant_part]
        
        # Append the conversation object to the list
        conversations.append(conversation)
        
    return {"conversations": conversations}

# Apply the transformation to create a new 'conversations' column
final_dataset = transformed_dataset.map(
    create_conversations,
    batched=True,
    num_proc=64
)
final_dataset

Map (num_proc=64):   0%|          | 0/111169 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'answer', 'length', 'index', 'audio', 'tokens', 'sound_tokens', 'conversations'],
    num_rows: 111169
})

In [16]:
final_dataset[0]["conversations"]

[{'content': '<|sound_start|> <|sound_0865|> <|sound_0424|> <|sound_0121|> <|sound_0424|> <|sound_0408|> <|sound_0913|> <|sound_0475|> <|sound_0519|> <|sound_0151|> <|sound_0463|> <|sound_0388|> <|sound_0670|> <|sound_0407|> <|sound_0399|> <|sound_0906|> <|sound_0171|> <|sound_0465|> <|sound_0231|> <|sound_0465|> <|sound_0246|> <|sound_0585|> <|sound_0446|> <|sound_0489|> <|sound_0182|> <|sound_0764|> <|sound_0948|> <|sound_0373|> <|sound_0355|> <|sound_0373|> <|sound_0815|> <|sound_0676|> <|sound_0973|> <|sound_0731|> <|sound_1010|> <|sound_0764|> <|sound_1022|> <|sound_0991|> <|sound_0980|> <|sound_0945|> <|sound_0716|> <|sound_0945|> <|sound_0628|> <|sound_0407|> <|sound_0712|> <|sound_0407|> <|sound_0777|> <|sound_0906|> <|sound_0820|> <|sound_0465|> <|sound_0095|> <|sound_0876|> <|sound_0477|> <|sound_0876|> <|sound_0841|> <|sound_1017|> <|sound_0363|> <|sound_1019|> <|sound_0363|> <|sound_0511|> <|sound_0806|> <|sound_0502|> <|sound_0541|> <|sound_0913|> <|sound_0477|> <|sound_09

In [1]:
import glob
from datasets import Dataset, concatenate_datasets

arrow_files = glob.glob('/home/alandao/voice_data_process/audio/instruction-speech-v1/data/data*.arrow')

dataset = concatenate_datasets([Dataset.from_file(arrow_file) for arrow_file in arrow_files])
dataset = dataset.select_columns(['prompt', 'answer', 'tokens'])
print(dataset)

Dataset({
    features: ['prompt', 'answer', 'tokens'],
    num_rows: 444678
})


In [2]:
def transform_batch_tokens(batch):
    # Process the 'tokens' column for each batch
    batch_transformed_tokens = []
    for token_ids in batch['tokens']:
        # Convert each token ID to the desired string format
        tokens = [f"<|sound_{num:04}|>" for num in token_ids]
        # Optionally add empty strings at the beginning and end if needed
        tokens = ["<|sound_start|>"] + tokens + ["<|sound_end|>"]
        batch_transformed_tokens.append(tokens)
    return {"sound_tokens": batch_transformed_tokens}

transformed_dataset = dataset.map(
    transform_batch_tokens,
    batched=True,
    num_proc=56,
    batch_size=10000,
)

Map (num_proc=56):   0%|          | 0/444678 [00:00<?, ? examples/s]

In [4]:
transformed_dataset

Dataset({
    features: ['prompt', 'answer', 'tokens', 'sound_tokens'],
    num_rows: 444678
})

In [5]:
transformed_dataset[0]

{'prompt': 'Write a sentence that about [Loch Fyne eatType restaurant; Loch Fyne food Indian; Loch Fyne familyFriendly yes].',
 'answer': 'Step 1: Identify the key elements of the information.\n\n- Loch Fyne is an eatType restaurant.\n- Loch Fyne serves Indian food.\n- Loch Fyne is family-friendly.\n\nStep 2: Choose a sentence structure to convey the information.\n\nTo create a clear and concise sentence, I need to include all three pieces of information. The most natural-sounding sentence structure would be to start with the name of the restaurant, follow with the type of food and finish with the family-friendly aspect.\n\nStep 3: Construct the sentence.\n\nLoch Fyne, an eatType restaurant, specializes in serving delicious Indian cuisine and is welcoming for families with its family-friendly atmosphere.',
 'tokens': [62,
  913,
  62,
  424,
  62,
  424,
  62,
  424,
  408,
  544,
  408,
  544,
  408,
  913,
  408,
  518,
  408,
  518,
  408,
  518,
  432,
  723,
  80,
  303,
  143,
  

In [3]:
transformed_dataset.push_to_hub("jan-hq/instruction-speech-no-audio")

Uploading the dataset shards:   0%|          | 0/33 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jan-hq/instruction-speech-no-audio/commit/5c6dc4401f83cd743c0d648c766e8321dbe17d0f', commit_message='Upload dataset', commit_description='', oid='5c6dc4401f83cd743c0d648c766e8321dbe17d0f', pr_url=None, pr_revision=None, pr_num=None)

In [3]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, get_cosine_schedule_with_warmup
import time
from datasets import Dataset, interleave_datasets
from trl import SFTTrainer
import multiprocessing
from datasets import load_dataset

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

# import glob
# from datasets import Dataset, concatenate_datasets
# arrow_files = glob.glob('/home/alandao/voice_data_process/audio/instruction-speech-v1/data/data*.arrow')

# dataset = concatenate_datasets([Dataset.from_file(arrow_file) for arrow_file in arrow_files])
dataset = load_dataset("jan-hq/instruction-speech-no-audio", num_proc=64, split="train")

dataset = dataset.select_columns(['prompt', 'answer', 'tokens'])
print(dataset)

def count_tokens(example):
    example['token_count'] = len(example['tokens'])
    return example

dataset = dataset.map(count_tokens, num_proc=64)
print(dataset)

[2024-07-03 05:46:01,161] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Resolving data files:   0%|          | 0/33 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/33 [00:00<?, ?it/s]

Setting num_proc from 64 to 33 for the train split as it only contains 33 shards.


Generating train split:   0%|          | 0/444678 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/45 [00:00<?, ?it/s]

Dataset({
    features: ['prompt', 'answer', 'tokens'],
    num_rows: 444678
})


Map (num_proc=64):   0%|          | 0/444678 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'answer', 'tokens', 'token_count'],
    num_rows: 444678
})


In [9]:
def transform_batch_tokens(batch):
    # Process the 'tokens' column for each batch
    batch_transformed_tokens = []
    for token_ids in batch['tokens']:
        # Convert each token ID to the desired string format
        tokens = [f"<|sound_{num:04}|>" for num in token_ids]
        # Optionally add empty strings at the beginning and end if needed
        tokens = ["<|sound_start|>"] + tokens + ["<|sound_end|>"]
        batch_transformed_tokens.append(tokens)
    return {"sound_tokens": batch_transformed_tokens}

transformed_dataset = dataset.map(
    transform_batch_tokens,
    batched=True,
    num_proc=56,
    batch_size=10000,
)
# Dataset 1
def create_conversations_sound(batch):
    # Initialize the list to hold the formatted conversation data
    conversations = []
    
    # Iterate through the batch
    for sound_token, answer in zip(batch['sound_tokens'], batch['answer']):
        # Create the user part using the sound_tokens
        user_part = {"role": "user", "content": "".join(sound_token)}
        assistant_part = {"role": "assistant", "content": answer}
        conversation = [user_part, assistant_part]
        conversations.append(conversation)
        
    return {"sound_convo": conversations}


# Apply the transformation to create a new 'conversations' column
transformed_dataset_sound = transformed_dataset.map(
    create_conversations_sound,
    batched=True,
    num_proc=56,
    batch_size=10000,
)

def create_conversations(batch):
    # Initialize the list to hold the formatted conversation data
    conversations = []
    
    # Iterate through the batch
    for question, answer in zip(batch['prompt'], batch['answer']):
        # Create the user part using the sound_tokens
        user_part = {"role": "user", "content": question}
        assistant_part = {"role": "assistant", "content": answer}
        conversation = [user_part, assistant_part]
        conversations.append(conversation)
        
    return {"text_convo": conversations}

# Apply the transformation to create a new 'conversations' column
transformed_dataset_sound = transformed_dataset_sound.map(
    create_conversations,
    batched=True,
    num_proc=56,
    batch_size=10000,
)

# Dataset 3
def create_conversations_transcribe(batch):
    # Initialize the list to hold the formatted conversation data
    conversations = []
    
    # Iterate through the batch
    for sound_token, question in zip(batch['sound_tokens'], batch['prompt']):
        # Create the user part using the sound_tokens
        user_part = {"role": "user", "content": f"Transcribe this given sound: {''.join(sound_token)}"}
        # Create the assistant part using the answer
        assistant_part = {"role": "assistant", "content": f"This is a transcription: {question}"}
        conversation = [user_part, assistant_part]
        conversations.append(conversation)
        
    return {"sound_transcribe": conversations}


# Apply the transformation to create a new 'conversations' column
transformed_dataset_sound = transformed_dataset_sound.map(
    create_conversations_transcribe,
    batched=True,
    num_proc=56,
    batch_size=10000,
)


transformed_dataset_sound = transformed_dataset_sound.remove_columns([col for col in transformed_dataset_sound.column_names if col not in ['text_convo','sound_convo','prompt','answer','sound_transcribe']])
print(transformed_dataset_sound)

Dataset({
    features: ['prompt', 'answer', 'sound_convo', 'text_convo', 'sound_transcribe'],
    num_rows: 444678
})


In [10]:
transformed_dataset_sound[0]["sound_transcribe"]

[{'content': 'Transcribe this given sound: <|sound_start|><|sound_0062|><|sound_0913|><|sound_0062|><|sound_0424|><|sound_0062|><|sound_0424|><|sound_0062|><|sound_0424|><|sound_0408|><|sound_0544|><|sound_0408|><|sound_0544|><|sound_0408|><|sound_0913|><|sound_0408|><|sound_0518|><|sound_0408|><|sound_0518|><|sound_0408|><|sound_0518|><|sound_0432|><|sound_0723|><|sound_0080|><|sound_0303|><|sound_0143|><|sound_0350|><|sound_0868|><|sound_0741|><|sound_0862|><|sound_0213|><|sound_0385|><|sound_0722|><|sound_0573|><|sound_0722|><|sound_0788|><|sound_0458|><|sound_0679|><|sound_0502|><|sound_0533|><|sound_0357|><|sound_0465|><|sound_0812|><|sound_0502|><|sound_0943|><|sound_0136|><|sound_0920|><|sound_0432|><|sound_0708|><|sound_0604|><|sound_0409|><|sound_0433|><|sound_0201|><|sound_0907|><|sound_0071|><|sound_0255|><|sound_0504|><|sound_0373|><|sound_0229|><|sound_0160|><|sound_0973|><|sound_0160|><|sound_0973|><|sound_0160|><|sound_0973|><|sound_0709|><|sound_0857|><|sound_0160|><|so

In [11]:
transformed_dataset_sound.push_to_hub("jan-hq/instruction-speech-conversation")

Uploading the dataset shards:   0%|          | 0/38 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/539 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jan-hq/instruction-speech-conversation/commit/237143d9b16ce8a65d482c49053a4053f6029243', commit_message='Upload dataset', commit_description='', oid='237143d9b16ce8a65d482c49053a4053f6029243', pr_url=None, pr_revision=None, pr_num=None)