# Setup

In [1]:
import pandas as pd
import json

# Transformers & 
from transformers import AutoTokenizer
from tqdm import tqdm
#from torchtune.models.llama3._tokenizer import Llama3Tokenizer
#from torchtune.data import Message

In [7]:
# Set API Keys
from kaggle_secrets import UserSecretsClient # API Loggins
user_secrets = UserSecretsClient()

## Hugging Face
Hugging_Face_token = user_secrets.get_secret("Hugging_Face_token")

# Login to Hugging Face
from huggingface_hub import login

login(Hugging_Face_token)

# Load Data

In [8]:
#df_HoC_2000s_raw = pd.read_csv('H:/MA_Thesis/data/Rauh_Schwalbach_2020_ParlSpeech/df_HoC_2000s.csv')

df_HoC_2000s_raw = pd.read_csv('/kaggle/input/parlspeech/df_HoC_2000s.csv')
df_HoC_2000s_raw.columns

Index(['date', 'agenda', 'speechnumber', 'speaker', 'party', 'party.facts.id',
       'chair', 'terms', 'text'],
      dtype='object')

In [9]:
df_HoC_2000s = df_HoC_2000s_raw[['date', 'agenda', 'speechnumber', 'speaker', 'party','text']]
df_HoC_2000s.columns
df_HoC_2000s.head(3)

Unnamed: 0,date,agenda,speechnumber,speaker,party,text
0,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,1,Andrew George,LibDem,What steps the Government are taking to ensure...
1,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,2,Hugh Bayley,Lab,Severe disablement allowance does not provide ...
2,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,3,Andrew George,LibDem,Those who warned the Government against abolit...


# Exploratory Data Analaysis

In [13]:
df_HoC_2000s['speaker'].value_counts().head(10)

speaker
CHAIR             59591
David Cameron     13642
Theresa May       12984
Tony Blair         8272
Jack Straw         8142
Chris Grayling     6599
Chris Bryant       6217
Andrew Lansley     5942
Philip Hammond     5606
Peter Hain         5422
Name: count, dtype: int64

In [10]:
print(f"David Cameron has {df_HoC_2000s_raw[df_HoC_2000s_raw['speaker'] == 'David Cameron']['terms'].sum()} terms")
print(f"Boris Johnson has {df_HoC_2000s_raw[df_HoC_2000s_raw['speaker'] == 'Boris Johnson']['terms'].sum()} terms")

David Cameron has 1677960 terms
Boris Johnson has 259544 terms


In [22]:
df_HoC_2005 = df_HoC_2000s[df_HoC_2000s['date'].str.contains('2005')]
df_HoC_2015 = df_HoC_2000s[df_HoC_2000s['date'].str.contains('2015')]

# Prepare Data for Training

In [82]:
# Constants
TOKEN_LENGTH_LIMIT = 4000
MIN_TOKEN_LENGTH = 200
CHAT_OWNER = "David Cameron"
BASE_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

In [133]:
# Create the tokenizer to measure the length of the text
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, 
                                          add_bos_token=False, 
                                          trust_remote_code=True, 
                                          use_fast=True, 
                                          force_download=False)

In [134]:
tokenizer.special_tokens

AttributeError: 'PreTrainedTokenizerFast' object has no attribute 'special_tokens'

**References on Preprocessing Dataset for Fine-tuning**
- https://pytorch.org/torchtune/0.2/tutorials/chat.html
- https://medium.com/@alexandros_chariton/how-to-fine-tune-llama-3-2-instruct-on-your-own-data-a-detailed-guide-e5f522f397d7

In [129]:
def preprocess_convo_1(df, output_path):
    df = df.assign(role=df["speaker"].apply(lambda x: "assistant" if x == CHAT_OWNER else "user"))
    
    grouped = df.groupby(["date", "agenda"])
    conversations = []

    for (date, agenda), group in tqdm(grouped):
        conversation = [
            {"role": "system", 
             "content": """
             You are David Cameron, a politician in the UK's House of Commons.
             You are responding to Observations.
             Respond exactly as David Cameron would speak, 
             staying fully in character and address the observation directly."""}
        ]
        token_len = len(tokenizer.encode(conversation[0]["content"]))
        has_assistant_message = False

        for _, row in group.iterrows():
            role = row["role"]
            message = row["text"]

            chat_message = {"role": role, "content": message}
            chat_message_len = len(tokenizer.encode(message))

            if token_len + chat_message_len > TOKEN_LENGTH_LIMIT:
                if token_len >= MIN_TOKEN_LENGTH and has_assistant_message:
                    conversations.append({"conversation": conversation})

                conversation = [conversation[-1]] if has_assistant_message else []
                token_len = len(tokenizer.encode(conversation[-1]["content"])) if conversation else 0
                has_assistant_message = any(msg["role"] == "assistant" for msg in conversation)

            conversation.append(chat_message)
            token_len += chat_message_len
            if role == "assistant":
                has_assistant_message = True

        if token_len >= MIN_TOKEN_LENGTH and has_assistant_message:
            conversations.append({"conversation": conversation})

    with open(output_path, 'w') as f:
        for convo in conversations:
            f.write(json.dumps(convo) + '\n')

In [131]:
output_path = '/kaggle/working/preprocessed_DavidCameron.jsonl'
preprocess_convo_1(df_HoC_2015, output_path)

100%|██████████| 2660/2660 [00:55<00:00, 48.14it/s]


In [21]:
dialog_blocks = []
with open(output_path, 'r') as f:
    for line in f:
        dialog_blocks.append(json.loads(line))  # Parse each line as a JSON object

# Display the number of dialog blocks
print(f"Total dialog blocks: {len(dialog_blocks)}")
print(json.dumps(dialog_blocks[0], indent=4))  # Pretty-print the first dialog block

Total dialog blocks: 125
{
    "conversation": [
        {
            "role": "system",
            "content": "You are Boris Johnson, a former UK Prime Minister. Answer in his style."
        },
        {
            "role": "user",
            "content": "What steps he is taking to devolve power to Bradford and other cities and large metropolitan areas."
        },
        {
            "role": "user",
            "content": "The Government have agreed a city deal and growth deal with the Leeds city region, of which Bradford is, of course, a part. The result is new transport, housing and regeneration schemes, such as the One City park, which will directly benefit Bradford. The city deal has already ensured more than 600 new apprenticeships, and 69% of 16 and 17-year-olds involved in the devolved youth contract pilot have been supported into education, employment or training. We are also in active negotiations on a devolution deal to give the area more control over key policy levers,

In [117]:
# Check token lengths in dataset
total_tokens = []
with open(output_path, 'r') as f:
    for line in f:
        data = json.loads(line)
        conversation = " ".join([msg["content"] for msg in data["conversation"]])
        token_count = len(tokenizer.encode(conversation))
        total_tokens.append(token_count)

print(f"Min tokens: {min(total_tokens)}, Max tokens: {max(total_tokens)}, Avg tokens: {sum(total_tokens)/len(total_tokens)}")

Min tokens: 242, Max tokens: 4394, Avg tokens: 3203.84


In [97]:
# Inspect a Few Samples
import random

# Print random samples
for _ in range(1):
    sample = random.choice(dialog_blocks)
    print(json.dumps(sample, indent=4))

{
    "conversation": [
        {
            "role": "system",
            "content": "You are Boris Johnson, a former UK Prime Minister. Answer in his style."
        },
        {
            "role": "user",
            "content": "The Prime Minister has been asked repeatedly about his plans to exclude Scottish MPs from decisions that will directly and indirectly impact on Scotland's budget and my constituents in Airdrie and Shotts. Will he finally tell the House and the people of Scotland whether it is right to create a second-class status for Scottish MPs through the back door, or is he content to press ahead with plans that will bring about the break-up of Britain?"
        },
        {
            "role": "assistant",
            "content": "I am quite baffled. I thought the whole point of the SNP is that SNP Members want to exclude themselves from the UK Parliament forever. I thought that was the whole point. What we are putting in place is a fair and balanced system that is fai

# Prepare & Tokenize

In [108]:
def load_preprocessed_data(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

data = load_preprocessed_data(output_path)

In [109]:
from datasets import Dataset

# Flatten the conversations into a list of dictionaries
flattened_data = []
for convo in data:
    for message in convo["conversation"]:
        flattened_data.append(message)

# Create a Dataset object
dataset = Dataset.from_list(flattened_data)

In [92]:
tokenizer.pad_token = tokenizer.eos_token
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize_function(examples):
    tokens = tokenizer(examples["content"], padding="max_length", truncation=True)
    tokens['labels'] = [-100 if token == tokenizer.pad_token_id else token for token in tokens['input_ids']    ]
    return tokens

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.save_to_disk("/kaggle/working/tokenized_dataset")

Map:   0%|          | 0/3354 [00:00<?, ? examples/s]