# Setup

Requirements:
* torchao-0.8.0
* torchtune-0.6.0
* torch-2.4.0

In [1]:
import torch
print(torch.__version__)

2.5.1+cu121


In [40]:
import pandas as pd
import json
from datasets import Dataset

# Transformers & 
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from tqdm import tqdm

#import torch.nn.attention.flex_attention
#from torchtune.modules.tokenizers import ModelTokenizer
#from torchtune.models.llama3 import llama3_tokenizer
#from torchtune.data import Message

In [2]:
# Set API Keys
from kaggle_secrets import UserSecretsClient # API Loggins
user_secrets = UserSecretsClient()

## Hugging Face
Hugging_Face_token = user_secrets.get_secret("Hugging_Face_token")

# Login to Hugging Face
from huggingface_hub import login

login(Hugging_Face_token)

# Load Data

In [3]:
#df_HoC_2000s_raw = pd.read_csv('H:/MA_Thesis/data/Rauh_Schwalbach_2020_ParlSpeech/df_HoC_2000s.csv')

df_HoC_2000s_raw = pd.read_csv('/kaggle/input/parlspeech/df_HoC_2000s.csv')
df_HoC_2000s_raw.columns

Index(['date', 'agenda', 'speechnumber', 'speaker', 'party', 'party.facts.id',
       'chair', 'terms', 'text'],
      dtype='object')

In [4]:
df_HoC_2000s = df_HoC_2000s_raw[['date', 'agenda', 'speechnumber', 'speaker', 'party','text']]
df_HoC_2000s.columns
df_HoC_2000s.head(3)

Unnamed: 0,date,agenda,speechnumber,speaker,party,text
0,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,1,Andrew George,LibDem,What steps the Government are taking to ensure...
1,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,2,Hugh Bayley,Lab,Severe disablement allowance does not provide ...
2,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,3,Andrew George,LibDem,Those who warned the Government against abolit...


# Exploratory Data Analaysis

In [10]:
df_HoC_2000s['speaker'].value_counts().head(10)

speaker
CHAIR             59591
David Cameron     13642
Theresa May       12984
Tony Blair         8272
Jack Straw         8142
Chris Grayling     6599
Chris Bryant       6217
Andrew Lansley     5942
Philip Hammond     5606
Peter Hain         5422
Name: count, dtype: int64

In [None]:
print(f"David Cameron has {df_HoC_2000s_raw[df_HoC_2000s_raw['speaker'] == 'David Cameron']['terms'].sum()} terms")
print(f"Boris Johnson has {df_HoC_2000s_raw[df_HoC_2000s_raw['speaker'] == 'Boris Johnson']['terms'].sum()} terms")

In [5]:
df_HoC_2005 = df_HoC_2000s[df_HoC_2000s['date'].str.contains('2005')]
df_HoC_2015 = df_HoC_2000s[df_HoC_2000s['date'].str.contains('2015')]

# Prepare Data for Training

In [8]:
# Constants
TOKEN_LENGTH_LIMIT = 4000
MIN_TOKEN_LENGTH = 200
CHAT_OWNER = "David Cameron"
BASE_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

In [99]:
# Create the tokenizer to measure the length of the text
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, 
                                          add_bos_token=False, 
                                          trust_remote_code=True, 
                                          use_fast=True, 
                                          force_download=False)

tokenizer.pad_token = tokenizer.eos_token

In [100]:
tokenizer.model_max_length

131072

In [101]:
print(tokenizer.special_tokens_map)

test_text = "Hello, how are you today?"
tokens = tokenizer.encode(test_text, return_tensors="pt")
print(tokens)


{'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>', 'pad_token': '<|eot_id|>'}
tensor([[128000,   9906,     11,   1268,    527,    499,   3432,     30]])


**References on Preprocessing Dataset for Fine-tuning**
- https://pytorch.org/torchtune/0.2/tutorials/chat.html
- https://medium.com/@alexandros_chariton/how-to-fine-tune-llama-3-2-instruct-on-your-own-data-a-detailed-guide-e5f522f397d7

In [102]:
def preprocess_convo_1(df, output_path):
    df = df.assign(role=df["speaker"].apply(lambda x: "assistant" if x == CHAT_OWNER else "user"))
    
    grouped = df.groupby(["date", "agenda"])
    conversations = []

    for (date, agenda), group in tqdm(grouped):
        conversation = [
            {"role": "system", 
             "content": """
             You are David Cameron, a politician in the UK's House of Commons.
             You are responding to Observations.
             Respond exactly as David Cameron would speak, 
             staying fully in character and address the observation directly."""}
        ]
        token_len = len(tokenizer.encode(conversation[0]["content"]))
        has_assistant_message = False

        for _, row in group.iterrows():
            role = row["role"]
            message = row["text"]

            chat_message = {"role": role, "content": message}  # Message(role=role, content=message)
            chat_message_len = len(tokenizer.encode(message))

            if token_len + chat_message_len > TOKEN_LENGTH_LIMIT:            # Limit within Token-limit
                if token_len >= MIN_TOKEN_LENGTH and has_assistant_message:
                    conversations.append({"conversation": conversation})

                conversation = [conversation[-1]] if has_assistant_message else []
                token_len = len(tokenizer.encode(conversation[-1]["content"])) if conversation else 0
                has_assistant_message = any(msg["role"] == "assistant" for msg in conversation)

            conversation.append(chat_message)
            token_len += chat_message_len
            if role == "assistant":
                has_assistant_message = True

        if token_len >= MIN_TOKEN_LENGTH and has_assistant_message:
            conversations.append({"conversation": conversation})

    with open(output_path, 'w') as f:
        for convo in conversations:
            f.write(json.dumps(convo) + '\n')

In [103]:
output_path = '/kaggle/working/preprocessed_DavidCameron.jsonl'
preprocess_convo_1(df_HoC_2015, output_path)

100%|██████████| 2660/2660 [00:52<00:00, 50.50it/s]


In [104]:
# Check token lengths in dataset
total_tokens = []
with open(output_path, 'r') as f:
    for line in f:
        data = json.loads(line)
        conversation = " ".join([msg["content"] for msg in data["conversation"]])
        token_count = len(tokenizer.encode(conversation))
        total_tokens.append(token_count)

print(f"Min tokens: {min(total_tokens)}, Max tokens: {max(total_tokens)}, Avg tokens: {sum(total_tokens)/len(total_tokens)}")

Min tokens: 272, Max tokens: 4394, Avg tokens: 3211.944


In [62]:
# Inspect a Few Samples
dialog_blocks = []
with open(output_path, 'r') as f:
    for line in f:
        dialog_blocks.append(json.loads(line))  # Parse each line as a JSON object

import random

# Print random samples
for _ in range(1):
    sample = random.choice(dialog_blocks)
    print(json.dumps(sample, indent=4))

{
    "conversation": [
        {
            "role": "user",
            "content": "Will the Prime Minister confirm that the Government will maintain their commitment in grants to the Aerospace Technology Institute?"
        },
        {
            "role": "assistant",
            "content": "The hon. Gentleman will have to wait for the outcome of the spending review-he only has to wait another 48 hours. The partnerships that we have put in place for the defence industry, the aerospace industry and other industries have been successful in generating growth, jobs and intellectual property."
        },
        {
            "role": "user",
            "content": "I welcome the Prime Minister's statement, particularly the reaffirmation of his personal commitment and our commitment as a country to the 0.7% spending target for aid. Will he reassure my constituents that their hard-earned cash will be spent only where it is squarely in our national interests to do so?"
        },
        {

# Prepare & Tokenize

In [105]:
# Load the preprocessed JSONL dataset
with open(output_path, "r") as f:
    raw_data = [json.loads(line) for line in f]

# Convert into a dataset format that follows the guide
formatted_data = []

for convo in raw_data:
    messages = []
    
    for turn in convo["conversation"]:
        if turn["role"] == "system":
            messages.append({"role": "system", "content": turn["content"]})
        elif turn["role"] == "user":
            messages.append({"role": "user", "content": turn["content"]})
        elif turn["role"] == "assistant":
            messages.append({"role": "assistant", "content": turn["content"]})
    
    # Apply chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    prompt = prompt.replace("Cutting Knowledge Date: December 2023\nToday Date: 10 Feb 2025\n\n", "")
    
    formatted_data.append({"prompt": prompt})

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(formatted_data)

In [118]:
tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 10 Feb 2025\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nThe Prime Minister promised during the election campaign that he would not restrict child benefits to two children. Since then, he has not only reneged on that but, as a result, brought in the rape clause for women in order for women to receive child benefits. Since July, I have asked a number of his Ministers a number of times, and nobody has been able to tell me how this will work. Will he now drop the two-child policy and the rape clause?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nFirst of all, we have made it absolutely clear, and let me make it clear again, that there is no question of someone who is raped and has a child losing their child tax credits or their child benefit-no question at all. But is it right for future claimants on universal credit to get payments for their first two

In [119]:
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    tokens = tokenizer(example['prompt'], padding=True, truncation=True, max_length=4608)
    
    #tokens['input_ids'] = [tokenizer.pad_token_id if token == tokenizer.eos_token_id else token for token in tokens['input_ids']    ]
    #tokens['labels'] = [-100 if token == tokenizer.pad_token_id else token for token in tokens['input_ids']    ]

    return tokens

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

In [120]:
tokenized_dataset

Dataset({
    features: ['prompt', 'input_ids', 'attention_mask'],
    num_rows: 125
})

In [121]:
decoded_text = tokenizer.decode(tokenized_dataset[2]["input_ids"])
decoded_text

"<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are David Cameron, a politician in the UK's House of Commons.\n             You are responding to Observations.\n             Respond exactly as David Cameron would speak, \n             staying fully in character and address the observation directly.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nIf he will list his official engagements for Wednesday 7 January.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am sure the whole House will want to join me in condemning the barbaric attack this morning on an office of a magazine in Paris, in which it is reported that 10 or more people may have been killed. While details are still unclear, I know that this House and this country stand united with the French people in our opposition to all forms of terrorism, and we stand squarely for free speech and democracy. These people will never be able to take us off those values. This morning I had

# What's Next?

* Why is there so many excessive `<|eot_id|>` tokens? This is not normal right?