### Install Dependencies

In [None]:
# Install the packages this notebook imports.
# transformers, peft and accelerate are installed from their GitHub repositories rather than from PyPI releases.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops gradio sentencepiece
# NOTE(review): asyncio is part of the Python 3 standard library; installing the PyPI package of the same name is probably unnecessary - confirm.
!pip install asyncio aiohttp

### Setup Main settings and Imports

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, wandb, platform, gradio, warnings
from datasets import Dataset
from trl import SFTTrainer

# --- Model locations ---------------------------------------------------------
# Base checkpoint directory and the name used for the newly trained model.
base_model = "../base_models/HuggingFaceH4_zephyr-7b-beta/"
new_model = "output/Zephyr_HUSH_Assistant_v0.03"

# --- Prompting ---------------------------------------------------------------
# System message placed at the start of every training sample.
system_message = "You are a musicians consultant that specializes in audience growth."

# Special tokens assigned to the tokenizer in the model-loading cell.
bos_token, eos_token, pad_token = "<s>", "</s>", "<pad>"

# --- Dataset generation ------------------------------------------------------
# Topic passed to the transcript processor.
# Alternative: topic = "Tiktok Algorithm and Content Creation"
topic = "Social media growth and content"

# Free-form guidance passed to the transcript processor. Earlier variant, kept for reference:
# notes = "This transcript is from a youtube video, it contains information about how to survive as a musician in the modern world.  Please use this information and your own internal data to help me create a training data for this topic.  The point of this training data is to help me grow my musician presence online, come up with creative ideas for different niches and create schedules and plans for growth, connect with my audience, come up with ideas and general AI assistant information that could help me grow my career with great financial success.  Give very thorough and expert insights in your question answer pairs and really dive into logical detail about how things will work.  Do not use his catchphrase 'attention is the number one asset' as context for this it will be at the beginning of some transcripts"
notes = (
    "\n"
    "This transcript is from a youtube video,\n"
    "Its about how tipping points build up and how to take advantage of them.\n"
    "Please use this information and your own internal data to help me create robust training data for this topic.\n"
)

# YouTube videos to download.
videos = [
    # ---- REESE BASS SOUND DESIGN ----
    'https://www.youtube.com/watch?v=NxnFnbG6Nb8',
    'https://www.youtube.com/watch?v=u3YEU_kQ5y0',
]


In [None]:
# Log this notebook session in to the Hugging Face Hub.
from huggingface_hub import notebook_login
notebook_login()

### Remove any old data from the audio folder

In [None]:
import os
import shutil

# Directories
audio_dir = './audio'
transcripts_dir = './transcripts'
backup_dir = os.path.join(audio_dir, 'sound-design-backup')


def _file_names(directory):
    """Return the names of the regular files directly inside *directory*."""
    return [
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]


def move_unmatched_audio(audio_dir, transcripts_dir, backup_dir):
    """Move audio files that have no same-named transcript into *backup_dir*.

    A file in *audio_dir* is "matched" when some file in *transcripts_dir*
    has the same name once the extension is removed. Files are moved under
    their real names, so any audio extension works (not only ``.mp3``).

    Args:
        audio_dir: Directory holding the audio files.
        transcripts_dir: Directory holding the transcript files.
        backup_dir: Destination directory; created when missing.

    Returns:
        The sorted list of file names that were moved.

    Raises:
        FileNotFoundError: If *audio_dir* or *transcripts_dir* does not exist.
        shutil.Error: If a file with the same name already exists in
            *backup_dir*.
    """
    transcript_stems = {
        os.path.splitext(name)[0] for name in _file_names(transcripts_dir)
    }
    os.makedirs(backup_dir, exist_ok=True)

    moved = []
    for name in sorted(_file_names(audio_dir)):
        if os.path.splitext(name)[0] in transcript_stems:
            continue
        shutil.move(os.path.join(audio_dir, name), backup_dir)
        moved.append(name)
    return moved


# Only touch the file system when both source folders exist, so a fresh
# checkout does not fail on this cell.
if os.path.isdir(audio_dir) and os.path.isdir(transcripts_dir):
    moved_files = move_unmatched_audio(audio_dir, transcripts_dir, backup_dir)
    print("Unmatched audio files have been moved to the backup directory.")
else:
    print("Audio or transcripts directory not found; nothing was moved.")


### Create & Format Dataset

In [None]:
# NOTE(review): presumably nest_asyncio is needed by the async transcript path imported in the next cell - confirm; asyncio itself ships with Python 3.
!pip install nest_asyncio asyncio

In [None]:
# Creating the Dataset
from transcriber import transcribe_youtube_videos, unload_model
from data_creator import process_transcripts, create_dataset_json, sanitize_filename, process_transcripts_async

# De-duplicate the URL list; dict keys keep first-seen order.
videos = [*dict.fromkeys(videos)]
print(f"Videos being processed: {len(videos)}")

# Disabled steps, kept for reference:
# transcribe_youtube_videos(videos, download=True, transcribe=True)
# datasetPath = process_transcripts_async("transcripts", topic=topic, skip_existing=True, notes=notes)
datasetPath = process_transcripts(
    "transcripts",
    topic=topic,
    skip_existing=True,
    notes=notes,
)

# Status line wrapped in ANSI green.
print("\033[92mDone Generating Dataset\033[0m")

data = create_dataset_json("data")

# Build the two text columns, then wrap them in a Dataset.
questions = [entry['question'] for entry in data]
answers = [entry['answer'] for entry in data]
dataset = Dataset.from_dict({'question': questions, 'answer': answers})

# Row formatter used with dataset.map
def format_samples(row):
    """Add the chat-formatted training text to *row* under the "data" key.

    Reads ``row["question"]`` and ``row["answer"]`` plus the notebook-level
    ``system_message``, joins the three turns, stores the result in
    ``row["data"]`` and returns the same row object.
    """
    question, answer = row["question"], row["answer"]
    turns = (
        f"<|system|>{system_message}</s>\n",
        f"<|user|>{question}</s>\n",
        f"<|assistant|>{answer}</s>",
    )
    row["data"] = "".join(turns)
    return row

# NOTE(review): presumably releases the transcription model held by the transcriber module - confirm in transcriber.py.
unload_model()

# Apply the function to the dataset
dataset = dataset.map(format_samples)
# Show the first formatted sample as the cell output.
dataset["data"][0]

### Load and Configure Model

In [None]:
# NOTE(review): unload_model() was already called in the dataset cell; this second call looks redundant - confirm.
unload_model()

# Load the base model from `base_model` with 4-bit NF4 quantization,
# bfloat16 compute dtype and double quantization disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)
# device_map={"": 0} places the whole model on device index 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0}
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Configure tokenizer with the special tokens chosen in the settings cell
tokenizer.bos_token = bos_token
tokenizer.eos_token = eos_token
tokenizer.pad_token = pad_token

# Ensure the token is in the tokenizer's vocabulary
# NOTE(review): only eos_token is tested, yet both eos_token and pad_token are
# added; when eos_token is already in the vocabulary, pad_token is never added.
# Adding tokens would presumably also require resizing the model embeddings - confirm.
if tokenizer.eos_token not in tokenizer.get_vocab():
    tokenizer.add_tokens([tokenizer.eos_token, tokenizer.pad_token])

tokenizer.padding_side = 'right'  # Set padding side


In [None]:
# Add the LoRA adapters to the quantized model.
# prepare_model_for_kbit_training runs first, then the model is wrapped with the LoRA configuration below.
model = prepare_model_for_kbit_training(model)
# LoRA settings: rank 16, alpha 16, 5% dropout, no bias terms, applied to the
# listed projection modules.
peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
    )
model = get_peft_model(model, peft_config)

### Configure Training

In [None]:
# W and B monitoring
# SECURITY: an API key used to be hard-coded on the login call below. Treat
# that key as leaked and revoke it in the W&B account settings. With no
# explicit key, wandb.login() uses the WANDB_API_KEY environment variable or
# previously stored credentials, and otherwise prompts for the key.
wandb.login()
run = wandb.init(project='Zephyr7B HU$H Assistant v0.021', job_type="training", anonymous="allow")

In [None]:
# Hyperparameters
# Checkpoints are saved under ./output every 250 steps; metrics are logged
# every 20 steps and reported to Weights & Biases.
# NOTE(review): warmup_ratio is set together with lr_scheduler_type="constant";
# confirm whether the plain constant schedule applies any warmup or whether
# "constant_with_warmup" was intended.
# NOTE(review): fp16 and bf16 are both False while the quantization config
# uses a bfloat16 compute dtype - confirm this combination is intended.
training_arguments = TrainingArguments(
    output_dir= "./output",
    num_train_epochs= 10,
    per_device_train_batch_size= 14,
    gradient_accumulation_steps= 2,
    optim = "paged_adamw_8bit",
    save_steps= 250,
    logging_steps= 20,
    learning_rate= 1e-4,
    weight_decay= 0.001,
    fp16= False,
    bf16= False,
    max_grad_norm= 0.3,
    max_steps= -1,
    warmup_ratio= 0.1,
    group_by_length= True,
    lr_scheduler_type= "constant",
    report_to="wandb"
)
# Setting sft parameters
# The trainer reads the training text from the "data" column produced by format_samples.
# NOTE(review): peft_config is passed although `model` was already wrapped by
# get_peft_model in the adapter cell - confirm SFTTrainer does not wrap it again.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="data",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

### TRAIN!

In [None]:
# Run the fine-tuning loop configured in the previous cell.
trainer.train()

In [None]:
# NOTE(review): this overrides the new_model value from the settings cell
# ("output/Zephyr_HUSH_Assistant_v0.03") - confirm which output path is intended.
new_model = "output/Zephyr7B_HUSH_Assistant_v0.02"
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)
# Close the Weights & Biases run.
wandb.finish()
# Turn use_cache back on (it was disabled in the model-loading cell) and
# switch the model to evaluation mode.
model.config.use_cache = True
model.eval()

In [None]:
def stream(user_prompt):
    """Stream a sampled reply from the fine-tuned model for *user_prompt*.

    Builds the prompt with the same turn layout that ``format_samples`` uses
    for the training data, prints it, then lets ``TextStreamer`` print the
    generated tokens as they are produced. Uses the notebook-level ``model``
    and ``tokenizer``.

    Args:
        user_prompt: Text of the user turn.

    Returns:
        None. The reply is written to standard output by the streamer.
    """
    runtimeFlag = "cuda:0"
    system_text = "You are a social media algorithm and growth hacking expert with a music industry focus.  You are the assistant of the musical artist HU$H. Your name is Dezi"

    # Each turn ends with "</s>\n", exactly as in the training samples; the
    # previous layout put the newline before "</s>" and none between turns.
    prompt = (
        f"<|system|>{system_text}</s>\n"
        f"<|user|>{user_prompt}</s>\n"
        "<|assistant|>"
    )
    print(prompt)
    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

    # skip_prompt=True keeps the echoed prompt out of the streamed output.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=2048, do_sample=True, temperature=0.4)

In [None]:
# Quick manual check of the fine-tuned model with a sample user prompt.
stream("whats wrong with me")

In [None]:
# Clear the memory footprint
import gc

# Deleting only `model` frees nothing while `trainer` still references the
# same model object, so drop both notebook-level names. pop() with a default
# also lets this cell be re-run without a NameError.
for _name in ("trainer", "model"):
    globals().pop(_name, None)

# Collect unreachable objects first so their GPU tensors are released, then
# hand the cached blocks back to the CUDA driver.
gc.collect()
torch.cuda.empty_cache()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, wandb, platform, gradio, warnings
from datasets import Dataset
from trl import SFTTrainer

# System message interpolated into the inference prompt of the next cell.
system_message = "You are an unreal engine development assistant that is proficient in explaining unreal engine to a beginner"

# Base checkpoint and the LoRA checkpoint that gets merged into it.
base_model = "../base_models/HuggingFaceH4_zephyr-7b-beta/"
fc_lora = "output/Zephyr7B_HUSH_Assistant_v0.011/checkpoint-500/"

# Load the base weights again, this time unquantized in bfloat16 on device index 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
)

tokenizer = AutoTokenizer.from_pretrained(fc_lora, trust_remote_code=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Attach the LoRA weights, keep everything on one device, then fold the
# adapter into the base weights.
model = PeftModel.from_pretrained(model, fc_lora)
model = model.to(device)
model = model.merge_and_unload()

In [None]:
# NOTE(review): this replaces the tokenizer loaded from fc_lora in the previous cell with the base model's tokenizer - confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
def stream(user_prompt):
    """Stream a sampled reply for *user_prompt* using a function-definition prompt.

    The prompt consists of the notebook-level ``system_message``, a
    ``<|definition|>`` section holding the JSON schema of a
    ``search_internet`` function, and the user turn. The prompt is printed,
    then ``TextStreamer`` prints the generated tokens. Uses the notebook-level
    ``model`` and ``tokenizer``; returns None.
    """
    runtimeFlag = "cuda:0"
    system_prompt = f"<|system|>{system_message}.\n"
    # Doubled braces are literal braces inside the f-string; only
    # {system_prompt} and {user_prompt} are substituted.
    PROMPT = f"""{system_prompt}
    <|definition|>
{{
    "name": "search_internet",
    "description": "Search the internet.  This can be to find information about anything.",
    "parameters": {{
        "type": "object",
        "properties": {{
            "search_term": {{
                "type": "string",
                "description": "A search phrase for doing research on the users topic"
            }}
        }},
        "required": [
            "search_term"
        ]
    }}
}}</s>
<|user|>
{user_prompt}</s>
"""
    prompt = f"{PROMPT}\n<|assistant|>"
    print(prompt)
    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

    # skip_prompt=True keeps the echoed prompt out of the streamed output.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=2048, do_sample=True, temperature=0.1)

In [None]:
# Manual check of the merged model with a request that should use the search_internet definition.
stream("Can you search the internet for the phrase 'pizza places in iowa city', Iowa for me?")

In [None]:
# Output directory for the merged model.
save_path = "output/zephyr/Zephyr7b-beta-Sound_Design_v0_01/model"
# Save the merged model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

### Create Synthetic Dataset For Validation

In [None]:
import json

from data_creator import is_sample_data, create_dataset_json

# `base_model_reload` is not defined in any cell of this notebook. Use it when
# an earlier session defined it; otherwise keep the model that is already loaded.
model = globals().get("base_model_reload", model)

# Get the dataset json - {question:string, answer:string}[]
data = create_dataset_json("data")

system_prompt = f"<|system|>You are an unreal engine lyra sample game and plugin dataset expert specializing in rewording questions / answers for creating large language model datasets.  These datasets are used to train the zephyr7b model\n"
user_prompt = f"<|user|>Please reword the following json question and answer pair into a new json question and answer pair in the same typings.  Your answers will be solid and complex, using only information in the question answer pair that I provide you with, while use your base knowledge of these things as reference but the data I provide as the ground truth.  You will reword the data I give in a new way that still maintains all the data in the question and answer so that it helps fine tune our large language models even more effectively.  You will only return the javascript objects in the exact format that I provide you with. Here is the q/a pair:"

# Stop once this many reworded samples have been accepted.
max_samples = 5
samples = []

# Ask the model to reword each question/answer pair; keep only replies that
# parse as JSON and pass is_sample_data.
for item in data:
    prompt = f"{system_prompt}{user_prompt}\n{item}\n<|assistant|>\n"
    print("\033[92mPrompt Created\033[0m")

    # Tokenize and move the tensors to the device the model lives on.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # max_new_tokens bounds the reply itself, so a long prompt cannot use up
    # the whole generation budget.
    outputs = model.generate(**inputs, max_new_tokens=1024, do_sample=True, top_p=0.95, top_k=60, temperature=0.9)

    # The returned sequence starts with the prompt tokens; decode only the
    # newly generated part so the JSON checks below see just the reply.
    decoded = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    print("\033[92mResponse Received\033[0m")
    print(decoded)

    # Unwrap a single-element array reply.
    if decoded.startswith("[") and decoded.endswith("]"):
        print("\033[93mRemoving Brackets\033[0m")
        decoded = decoded[1:-1].strip()

    if not (decoded.startswith("{") and decoded.endswith("}")):
        print("\033[91mInvalid JSON\033[0m")
        continue

    print("\033[92mDecoding JSON\033[0m")
    try:
        response = json.loads(decoded)
    except json.JSONDecodeError:
        print("\033[91mInvalid JSON\033[0m")
        continue
    print("\033[92mValid JSON\033[0m")

    # is_sample_data receives the reply text, as in the original call.
    if not is_sample_data(decoded):
        print("\033[91mInvalid Sample Data\033[0m")
        continue

    # Each accepted sample is stored exactly once.
    samples.append(response)
    print("\033[92mSuccessfully Added To Array\033[0m")
    if len(samples) >= max_samples:
        break

# Serialize the parsed objects so the file is always a valid JSON array.
if samples:
    with open("synthetic_data.json", "w", encoding="utf-8") as f:
        json.dump(samples, f, indent=2)
else:
    print("No valid samples were generated; synthetic_data.json was not written.")
    