In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch 


# 8-bit quantisation settings. Nothing in this notebook currently passes this
# object to a loader (the quantization_config argument further down is commented out).
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

model_id = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"

# Shared loading options: bfloat16 weights, automatic device placement.
model_kwargs = dict(torch_dtype=torch.bfloat16, device_map="auto")

# 1) Load the tokenizer that ships with the checkpoint above
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 2) Register two new special tokens and report the IDs they were assigned
new_specials = ["<|OTHER|>", "<|ME|>"]
num_added = tokenizer.add_special_tokens({"additional_special_tokens": new_specials})
new_special_ids = tokenizer.convert_tokens_to_ids(new_specials)
print(f"Added {num_added} tokens (at IDs {new_special_ids})")



Added 2 tokens (at IDs [128256, 128257])


In [13]:
# Reuse the shared loading options (bfloat16 + device_map="auto") defined next to
# the tokenizer. Without an explicit dtype the weights are materialised at the
# library's default precision (float32 in transformers 4.x), which roughly doubles
# the memory needed for this 8B checkpoint and can push accelerate into offloading
# every layer to disk.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=bnb_config,  # enable to load the weights in 8-bit instead
    **model_kwargs,
)

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

ValueError: You are trying to offload the whole model to the disk. Please use the `disk_offload` function instead.

In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# 1) Run the k-bit preparation only when the model was actually loaded quantized.
#    prepare_model_for_kbit_training freezes the base weights and upcasts the
#    remaining fp16/bf16 parameters to fp32; on a non-quantized model that only
#    inflates memory, and get_peft_model below already leaves just the LoRA
#    adapters trainable.
if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False):
    model = prepare_model_for_kbit_training(model)

# 2) Define your LoRA config
lora_config = LoraConfig(
    r=16,                        # bottleneck rank
    lora_alpha=32,               # scaling factor
    target_modules=["q_proj","v_proj"],  # typically attention proj layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# 3) Wrap your model and report how many parameters will be trained
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# This cell uses the ``transformers`` module itself; the notebook otherwise only
# imports individual names from it, so bring the module into scope here.
import transformers

# Use the end-of-sequence token for padding.
tokenizer.pad_token_id = tokenizer.eos_token_id

# NOTE(review): model=model_id presumably makes the pipeline load its own copy of
# the checkpoint rather than reusing the ``model`` object built above, and that
# copy has not been resized for the tokens added to ``tokenizer`` -- confirm this
# is intended.
pipeline = transformers.pipeline(
   "text-generation",
   model=model_id,
   tokenizer=tokenizer,
   max_new_tokens=32768,
   temperature=0.6,
   top_p=0.95,
   **model_kwargs
)


In [None]:
# Special tokens to register: two persona markers plus two optional time markers.
persona_tokens = ["<|ME|>", "<|OTHER|>"]
time_tokens = ["<|DT_SHORT|>", "<|DT_LONG|>"]  # optional
new_tokens = [*persona_tokens, *time_tokens]

# Register the tokens, then grow the embedding matrix to the new vocabulary size.
tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})
model.resize_token_embeddings(len(tokenizer))

In [None]:
emb = model.get_input_embeddings()
new_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in new_tokens]

# ``requires_grad`` belongs to the whole parameter: ``emb.weight[idx]`` is a
# temporary tensor, so setting the flag on that slice never reaches the embedding
# matrix. Keep the full matrix trainable instead and zero the gradient of every
# row that is not one of the new tokens.
emb.weight.requires_grad_(True)

row_mask = torch.zeros(
    emb.weight.shape[0], 1, dtype=emb.weight.dtype, device=emb.weight.device
)
row_mask[new_token_ids] = 1

# NOTE: the mask only zeroes gradients. An optimizer that applies weight decay
# will still shrink the masked rows, so disable weight decay for this parameter
# if the original rows must stay exactly fixed.
# The handle is kept so the hook can be removed with ``.remove()``.
embedding_grad_hook = emb.weight.register_hook(lambda grad, mask=row_mask: grad * mask)

In [None]:

# Load the same checkpoint the tokenizer came from. The repo id that was
# hard-coded here ("meta-llama/Llama-3-7b-hf") does not match any published
# Llama 3 size and would not pair with the tokenizer loaded from ``model_id``.
# NOTE(review): this rebinds ``model``, so any LoRA wrapping or embedding changes
# applied to the previous object are discarded -- confirm this cell is still needed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto"
)

# 3) Resize model embeddings so it can learn them
model.resize_token_embeddings(len(tokenizer))

# Now you can format examples using <|OTHER|> and <|ME|> and HF will handle them.

In [None]:
import pandas as pd
import json
from whatstk import WhatsAppChat
from transformers import AutoTokenizer
from tqdm.auto import tqdm


# Messages closer together than this (in seconds) stay in the same conversation.
SAME_CONVO_THRESHOLD_SECONDS = 3600
# Consecutive messages from one sender closer than this (in seconds) are merged.
SAME_USER_THRESHOLD_SECONDS = 600
# Upper bound on the token count of a single conversation.
HISTORY_MAX_TOKENS = 3000
# Conversations with this many tokens or fewer are dropped from the output.
CONVO_MIN_TOKENS = 100


# Tokenizer used only to measure text length.
# NOTE(review): this is a Mistral tokenizer while the model loaded earlier in the
# notebook is a Llama checkpoint, so the counts are presumably approximate -- confirm.
base_model_id = "alpindale/Mistral-7B-v0.2-hf"
encoder = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=False,
    add_special_tokens=True,
    trust_remote_code=True,
    use_fast=True,
    force_download=False,
)

# Merge runs of messages from the same sender that arrive within `delta_threshold`
# seconds of each other into a single newline-separated message.
def collapse_messages(df, delta_threshold=SAME_USER_THRESHOLD_SECONDS):
    """Collapse consecutive same-sender messages into one row.

    Args:
        df: DataFrame with a ``chat_message`` column holding ``(role, text)``
            tuples and a ``time_delta`` column (seconds since the previous row).
        delta_threshold: a row is merged into the pending one only when its
            ``time_delta`` is strictly below this value and the role matches.

    Returns:
        ``df`` itself when it has no rows; otherwise a new DataFrame with one row
        per merged run. Each merged row keeps every column of the run's first
        message, with ``chat_message`` replaced by the joined text.
    """
    if not len(df):
        return df

    rows = df.copy()
    collapsed = []

    # `pending` is the record being built up; it is flushed when the run ends.
    pending = rows.iloc[0].to_dict()

    for _, row in tqdm(rows[1:].iterrows(), total=len(rows) - 1):
        role, text = row["chat_message"][0], row["chat_message"][1]
        pending_role, pending_text = pending["chat_message"][0], pending["chat_message"][1]

        continues_run = role == pending_role and row["time_delta"] < delta_threshold
        if continues_run:
            pending["chat_message"] = (pending_role, pending_text + "\n" + text)
        else:
            collapsed.append(pending)
            pending = row.to_dict()

    # flush the final run
    collapsed.append(pending)

    return pd.DataFrame(collapsed)



def preprocess_convo(input_path, output_path, role="user", chat_owner="Watson"):
    """Turn a WhatsApp export into JSON-lines conversations for fine-tuning.

    Messages are labelled with a role, consecutive messages from one sender are
    merged via ``collapse_messages``, and the result is grouped into conversations.
    A new conversation starts when the gap to the previous message reaches
    ``SAME_CONVO_THRESHOLD_SECONDS`` or when adding the message would bring the
    running token count to ``HISTORY_MAX_TOKENS`` or more. Conversations with
    ``CONVO_MIN_TOKENS`` tokens or fewer are dropped.

    Args:
        input_path: path of the chat export, passed to ``WhatsAppChat.from_source``.
        output_path: file to write; one ``{"input": <conversation>}`` JSON object
            per line.
        role: role label for every sender other than ``chat_owner``.
        chat_owner: username whose messages are labelled ``"system"``.

    Returns:
        None. The result is written to ``output_path``.
    """
    media_placeholder = "<Media omitted>"

    chat = WhatsAppChat.from_source(filepath=input_path)
    df = chat.df

    # Calculate time passed since previous message
    df["date_previous"] = df["date"].shift(periods=1)
    df["time_delta"] = (df["date"] - df["date_previous"]).dt.total_seconds()
    df["chat_message"] = df.apply(lambda x: ("system" if x["username"] == chat_owner else role, x["message"]), axis=1)
    df = collapse_messages(df)

    query = []
    conversation = []
    token_len = 0

    for _, row in tqdm(df.iterrows(), total=len(df)):
        row_role = row["chat_message"][0]
        row_message = row["chat_message"][1]

        # Ignore media. Collapsing may have merged a media placeholder with real
        # text, so drop placeholder lines individually and skip the message only
        # when nothing else is left.
        kept_lines = [line for line in row_message.split("\n") if line != media_placeholder]
        if not kept_lines:
            continue
        row_message = "\n".join(kept_lines)

        # NOTE(review): the header markers are written without pipes while the
        # separator below is "<|eot_id|>" -- confirm this is the intended format.
        chat_message_formatted = "<start_header_id>{role}<end_header_id>{message}".format(role=row_role, message=row_message)
        chat_message_formatted_len = len(encoder.encode(chat_message_formatted))

        # The first row has a NaN time_delta, which compares False and therefore
        # opens the first conversation.
        within_gap = row["time_delta"] < SAME_CONVO_THRESHOLD_SECONDS
        fits_budget = token_len + chat_message_formatted_len < HISTORY_MAX_TOKENS

        if within_gap and fits_budget:
            conversation.append(chat_message_formatted)
            # Keep the running total up to date so the token budget is enforced.
            token_len += chat_message_formatted_len
        else:
            # Close the current conversation (if it holds anything) and start a
            # new one with this message.
            if conversation:
                query.append(conversation)
            conversation = [chat_message_formatted]
            token_len = chat_message_formatted_len

    # write out the last conversation
    if conversation:
        query.append(conversation)

    # Write one JSON object per line in the format {'input': formatted_message},
    # skipping conversations that are too short to be useful for finetuning.
    with open(output_path, "w", encoding="utf-8") as f:
        for messages in query:
            query_str = "<|eot_id|>".join(messages)
            if len(encoder.encode(query_str)) > CONVO_MIN_TOKENS:
                f.write(json.dumps({"input": query_str}) + "\n")