# Setup

Requirements:
* torchao-0.8.0
* torchtune-0.6.0
* torch-2.4.0

In [1]:
import json
import re

import pandas as pd
from datasets import Dataset
from tqdm import tqdm

# Transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

#import torch.nn.attention.flex_attention
#from torchtune.modules.tokenizers import ModelTokenizer
#from torchtune.models.llama3 import llama3_tokenizer
#from torchtune.data import Message

In [2]:
# API credentials: the Hugging Face token is read from Kaggle's secret store
# rather than being written into the notebook.
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient  # Kaggle secrets client (API logins)

user_secrets = UserSecretsClient()
Hugging_Face_token = user_secrets.get_secret("Hugging_Face_token")

# Log this session in to the Hugging Face Hub.
login(token=Hugging_Face_token)

# Load Data

In [3]:
# Local (off-Kaggle) path, kept for reference — presumably the same file:
#df_HoC_2000s_raw = pd.read_csv('H:/MA_Thesis/data/Rauh_Schwalbach_2020_ParlSpeech/df_HoC_2000s.csv')

# Read the speech table from the attached Kaggle input dataset.
df_HoC_2000s_raw = pd.read_csv('/kaggle/input/parlspeech/df_HoC_2000s.csv')
# Last expression of the cell, so the column index is displayed.
df_HoC_2000s_raw.columns

Index(['date', 'agenda', 'speechnumber', 'speaker', 'party', 'party.facts.id',
       'chair', 'terms', 'text'],
      dtype='object')

In [4]:
# Keep only the columns used further down; 'party.facts.id', 'chair' and
# 'terms' stay available on df_HoC_2000s_raw.
df_HoC_2000s = df_HoC_2000s_raw[['date', 'agenda', 'speechnumber', 'speaker', 'party', 'text']]
# Only the last expression of a cell is displayed, so show the preview here.
df_HoC_2000s.head(3)

Unnamed: 0,date,agenda,speechnumber,speaker,party,text
0,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,1,Andrew George,LibDem,What steps the Government are taking to ensure...
1,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,2,Hugh Bayley,Lab,Severe disablement allowance does not provide ...
2,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,3,Andrew George,LibDem,Those who warned the Government against abolit...


# Exploratory Data Analysis

In [5]:
# Ten most frequent values of 'speaker' (value_counts sorts by count, descending).
df_HoC_2000s['speaker'].value_counts().head(10)

speaker
CHAIR             59591
David Cameron     13642
Theresa May       12984
Tony Blair         8272
Jack Straw         8142
Chris Grayling     6599
Chris Bryant       6217
Andrew Lansley     5942
Philip Hammond     5606
Peter Hain         5422
Name: count, dtype: int64

In [None]:
# Sum of the 'terms' column over each speaker's rows in the raw table.
for speaker_name in ("David Cameron", "Boris Johnson"):
    is_speaker = df_HoC_2000s_raw['speaker'] == speaker_name
    term_total = df_HoC_2000s_raw[is_speaker]['terms'].sum()
    print(f"{speaker_name} has {term_total} terms")

In [6]:
# Yearly subsets: rows whose 'date' string contains the given year.
df_HoC_2005, df_HoC_2015 = (
    df_HoC_2000s[df_HoC_2000s['date'].str.contains(year)]
    for year in ('2005', '2015')
)

# Load Tokenizer

In [7]:
# Hugging Face Hub id of the model whose tokenizer is loaded in the next cell.
BASE_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

In [8]:
# Create the tokenizer to measure the length of the text
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, 
                                          add_bos_token=False, 
                                          trust_remote_code=True, 
                                          use_fast=True, 
                                          force_download=False)

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.model_max_length

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

131072

In [9]:
# Smallest and largest token id currently in the tokenizer's vocabulary.
min(tokenizer.vocab.values()), max(tokenizer.vocab.values()) 

(0, 128256)

In [13]:
# Show which special tokens (bos / eos / pad) the tokenizer is configured with.
print(tokenizer.special_tokens_map)

# Encode a short sentence to inspect the raw ids; return_tensors="pt" requests
# a PyTorch tensor instead of a plain list.
test_text = "Hello, how are you today?"
tokens = tokenizer.encode(test_text, return_tensors="pt")
print(tokens)

{'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>', 'pad_token': '[PAD]'}
tensor([[128000,   9906,     11,   1268,    527,    499,   3432,     30]])


# Prepare Data for Training

**References on Preprocessing Dataset for Fine-tuning**
- https://pytorch.org/torchtune/0.2/tutorials/chat.html
- https://medium.com/@alexandros_chariton/how-to-fine-tune-llama-3-2-instruct-on-your-own-data-a-detailed-guide-e5f522f397d7

In [23]:
# Constants
TOKEN_LENGTH_LIMIT = 3800
MIN_TOKEN_LENGTH = 512
CHAT_OWNER = "David Cameron"

In [33]:
# Preview the role labelling: CHAT_OWNER's rows become "assistant", all other
# speakers become "user".
df = df_HoC_2015.assign(
    role=df_HoC_2015["speaker"].apply(lambda x: "assistant" if x == CHAT_OWNER else "user")
)
# Display a slice of the labelled rows.
df.iloc[60:100]

Unnamed: 0,date,agenda,speechnumber,speaker,party,text,role
952408,2015-01-05,Crime Levels [Oral Answers to Questions > Home...,61,John Howell,Con,What assessment she has made of changes in the...,user
952409,2015-01-05,Crime Levels [Oral Answers to Questions > Home...,62,Lynne Featherstone,LibDem,Police reform is working and crime is down by ...,user
952410,2015-01-05,Crime Levels [Oral Answers to Questions > Home...,63,John Howell,Con,I thank the Minister for that answer. Will she...,user
952411,2015-01-05,Crime Levels [Oral Answers to Questions > Home...,64,Lynne Featherstone,LibDem,I am happy to do as my hon. Friend suggests an...,user
952412,2015-01-05,Crime Levels [Oral Answers to Questions > Home...,65,Steve Reed,Lab,The Minister does not have much to say about c...,user
952413,2015-01-05,Crime Levels [Oral Answers to Questions > Home...,66,Lynne Featherstone,LibDem,I welcome the hon. Gentleman to his place. Up ...,user
952414,2015-01-05,Crime Levels [Oral Answers to Questions > Home...,67,Simon Kirby,Con,The police in my constituency do an excellent ...,user
952415,2015-01-05,Crime Levels [Oral Answers to Questions > Home...,68,Lynne Featherstone,LibDem,I am more than happy to congratulate my hon. F...,user
952416,2015-01-05,Crime Levels [Oral Answers to Questions > Home...,69,Kevin Brennan,Lab,Does the Minister agree with Sir Tom Winsor th...,user
952417,2015-01-05,Crime Levels [Oral Answers to Questions > Home...,70,Lynne Featherstone,LibDem,"I might not have put it that way, but when one...",user


In [34]:
def preprocess_convo_1(df, output_path):
    """Pack debate speeches into chat-style conversations and write them as JSONL.

    Rows are labelled ``"assistant"`` when ``speaker == CHAT_OWNER`` and
    ``"user"`` otherwise, then grouped by ``(date, agenda)``. Each group is
    walked in row order and its speeches are appended to a conversation that
    starts with a system prompt. When the next speech would push the running
    token count above ``TOKEN_LENGTH_LIMIT`` the current conversation is closed
    and a new one is started. A closed conversation is kept only if its token
    count is at least ``MIN_TOKEN_LENGTH`` and it contains an assistant message.

    When a conversation is split, the most recent assistant message spoken in
    the closed chunk is carried into the new chunk as context, provided it
    still fits under the limit together with the incoming speech.

    Token counts come from ``tokenizer.encode`` on each message's raw text, so
    they are an approximation of the length of the final templated prompt.

    Relies on the module-level ``tokenizer``, ``tqdm``, ``CHAT_OWNER``,
    ``TOKEN_LENGTH_LIMIT`` and ``MIN_TOKEN_LENGTH``.

    Args:
        df: DataFrame with ``date``, ``agenda``, ``speaker`` and ``text`` columns.
        output_path: File to write; one ``{"conversation": [...]}`` JSON object
            per line.

    Returns:
        None. The result is written to ``output_path``.
    """
    # One prompt for every chunk. The text (including its embedded newline and
    # 13-space indentation) matches the prompt used for the first chunk of a
    # debate; the speaker name follows CHAT_OWNER.
    pad = " " * 13
    system_prompt = (
        "\n"
        f"{pad}You are {CHAT_OWNER}, a politician in the UK's House of Commons.\n"
        f"{pad}You are responding to Observations.\n"
        f"{pad}Respond exactly as {CHAT_OWNER} would speak, \n"
        f"{pad}staying fully in character and address the observation directly."
    )
    system_len = len(tokenizer.encode(system_prompt))

    def start_conversation():
        # Fresh list and dict per chunk so chunks never share mutable state.
        return [{"role": "system", "content": system_prompt}]

    labelled = df.assign(
        role=df["speaker"].eq(CHAT_OWNER).map({True: "assistant", False: "user"})
    )
    grouped = labelled.groupby(["date", "agenda"])

    conversations = []

    for _, group in tqdm(grouped):
        conversation = start_conversation()
        token_len = system_len
        has_assistant_message = False
        # (message, token length) of the latest assistant turn spoken in the
        # current chunk; a carried-over message is not tracked here, so the
        # same speech is carried forward at most once.
        last_assistant = None

        for role, message in zip(group["role"], group["text"]):
            if not isinstance(message, str):
                # Missing text (e.g. NaN) cannot be tokenized; skip the row.
                continue

            chat_message = {"role": role, "content": message}
            chat_message_len = len(tokenizer.encode(message))

            if token_len + chat_message_len > TOKEN_LENGTH_LIMIT:
                if token_len >= MIN_TOKEN_LENGTH and has_assistant_message:
                    conversations.append({"conversation": conversation})

                carried = last_assistant

                # Start a new conversation with the system prompt.
                conversation = start_conversation()
                token_len = system_len
                has_assistant_message = False
                last_assistant = None

                # Retain the last assistant message if available and if it
                # fits together with the incoming speech.
                if carried is not None:
                    carried_message, carried_len = carried
                    if system_len + carried_len + chat_message_len <= TOKEN_LENGTH_LIMIT:
                        conversation.append(carried_message)
                        token_len += carried_len
                        has_assistant_message = True

            conversation.append(chat_message)
            token_len += chat_message_len
            if role == "assistant":
                has_assistant_message = True
                last_assistant = (chat_message, chat_message_len)

        # Flush whatever remains of the debate once its rows are exhausted.
        if token_len >= MIN_TOKEN_LENGTH and has_assistant_message:
            conversations.append({"conversation": conversation})

    with open(output_path, 'w', encoding='utf-8') as f:
        for convo in conversations:
            f.write(json.dumps(convo) + '\n')

In [15]:
# Write the conversations built from the 2015 subset to a JSONL file in the
# Kaggle working directory; later cells read it back via `output_path`.
output_path = '/kaggle/working/preprocessed_DavidCameron.jsonl'
preprocess_convo_1(df_HoC_2015, output_path)

100%|██████████| 2660/2660 [01:27<00:00, 30.57it/s]


In [None]:
# Check token lengths in dataset
total_tokens = []
with open(output_path, 'r') as f:
    for line in f:
        data = json.loads(line)
        conversation = " ".join([msg["content"] for msg in data["conversation"]])
        token_count = len(tokenizer.encode(conversation))
        total_tokens.append(token_count)

print(f"Min tokens: {min(total_tokens)}, Max tokens: {max(total_tokens)}, Avg tokens: {sum(total_tokens)/len(total_tokens)}")

In [None]:
# Inspect a Few Samples
dialog_blocks = []
with open(output_path, 'r') as f:
    for line in f:
        dialog_blocks.append(json.loads(line))  # Parse each line as a JSON object

dialog_blocks[2]

# Prepare & Tokenize

In [None]:
# Load the preprocessed JSONL dataset
with open(output_path, "r") as f:
    raw_data = [json.loads(line) for line in f]

# Convert into a dataset format that follows the guide
formatted_data = []

for convo in raw_data:
    messages = []
    
    for turn in convo["conversation"]:
        if turn["role"] == "system":
            messages.append({"role": "system", "content": turn["content"]})
        elif turn["role"] == "user":
            messages.append({"role": "user", "content": turn["content"]})
        elif turn["role"] == "assistant":
            messages.append({"role": "assistant", "content": turn["content"]})
    
    # Apply chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    prompt = prompt.replace("Cutting Knowledge Date: December 2023\nToday Date: 10 Feb 2025\n\n", "")
    
    formatted_data.append({"prompt": prompt})

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(formatted_data)

In [None]:
def tokenize_function(example, max_length=4096):
    """Tokenize chat-templated prompts and build labels with padding masked out.

    Works for both a single example (``example['prompt']`` is a string) and a
    batch as passed by ``Dataset.map(..., batched=True)`` (``example['prompt']``
    is a list of strings, so ``input_ids`` is a list of id lists).

    Args:
        example: Mapping with a ``'prompt'`` entry holding one prompt or a list
            of prompts.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        The tokenizer output with an added ``'labels'`` entry that mirrors
        ``'input_ids'`` but has ``-100`` at every padding position.
    """
    tokens = tokenizer(
        example['prompt'],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        # The templated prompt text already contains the special tokens
        # (including <|begin_of_text|>); adding them again duplicates BOS.
        add_special_tokens=False,
    )

    pad_id = tokenizer.pad_token_id

    def mask_padding(ids):
        # -100 marks positions that should not contribute to the loss.
        return [-100 if token == pad_id else token for token in ids]

    input_ids = tokens['input_ids']
    if isinstance(example['prompt'], str):
        tokens['labels'] = mask_padding(input_ids)
    else:
        # Batched call: mask each sequence separately. Comparing a whole
        # sequence to the pad id would never match and leave padding unmasked.
        tokens['labels'] = [mask_padding(ids) for ids in input_ids]

    return tokens

# Apply tokenization. With batched=True the function is called on batches, so
# example['prompt'] is a list of prompts rather than a single string.
tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
# Display the dataset summary produced by the tokenization step.
tokenized_dataset

In [None]:
# Decode one tokenized example back to text to eyeball the template, special
# tokens and padding.
decoded_text = tokenizer.decode(tokenized_dataset[2]["input_ids"])
decoded_text

# What's Next?

* Fix missing "system prompt" at the start of a conversation list
* Fix double `<|begin_of_text|>` tokens