# Setup

In [1]:
import pandas as pd
import json

# Transformers
from transformers import AutoTokenizer
from tqdm.auto import tqdm

In [2]:
# API credentials: fetch the Hugging Face token from the Kaggle secrets client and log in
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient  # API logins

user_secrets = UserSecretsClient()

## Hugging Face
Hugging_Face_token = user_secrets.get_secret("Hugging_Face_token")

# Authenticate this session with Hugging Face using the stored token
login(Hugging_Face_token)

# Load Data

In [3]:
# Read the raw speech table from the Kaggle input dataset.
# (Local, off-Kaggle copy: 'H:/MA_Thesis/data/Rauh_Schwalbach_2020_ParlSpeech/df_HoC_2000s.csv')
df_HoC_2000s_raw = pd.read_csv(
    filepath_or_buffer='/kaggle/input/parlspeech/df_HoC_2000s.csv',
)

In [4]:
# Keep only the columns used downstream, then preview the first three rows
df_HoC_2000s = df_HoC_2000s_raw.loc[
    :, ['date', 'agenda', 'speechnumber', 'speaker', 'party', 'text']
]
df_HoC_2000s.head(3)

Unnamed: 0,date,agenda,speechnumber,speaker,party,text
0,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,1,Andrew George,LibDem,What steps the Government are taking to ensure...
1,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,2,Hugh Bayley,Lab,Severe disablement allowance does not provide ...
2,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,3,Andrew George,LibDem,Those who warned the Government against abolit...


# Prepare Data for Training

In [5]:
# Configuration constants for building the training conversations

# Token budget per conversation block: a new block is started once adding
# the next message would push the running total above this value.
TOKEN_LENGTH_LIMIT = 4_000

# Blocks whose total token count is below this value are discarded.
MIN_TOKEN_LENGTH = 200

# Speaker whose speeches are labelled with the "assistant" role;
# every other speaker is labelled "user".
CHAT_OWNER = "Boris Johnson"

# Hugging Face model id whose tokenizer is loaded to count tokens.
BASE_MODEL_ID = "meta-llama/Llama-3.2-3B"

In [6]:
# Load the base model's tokenizer; it is used to measure message length in tokens
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_ID,
    use_fast=True,
    add_bos_token=False,
    force_download=False,
    # NOTE(review): trust_remote_code=True permits running code shipped with the
    # model repository at load time — confirm it is actually needed for this model.
    trust_remote_code=True,
)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

For the special chat template, prompt format, and tags used below, see https://pytorch.org/torchtune/0.2/tutorials/chat.html

In [7]:
def preprocess_convo(df, output_path, count_tokens=None, chat_owner=None,
                     token_length_limit=None, min_token_length=None):
    """Split debates into chat-style conversation blocks and write them as JSON Lines.

    Speeches are grouped by (date, agenda). Within each group, messages are
    appended to the current block in row order until adding the next message
    would push the block's token count above the limit; the block is then
    closed and a new one is started. A block is kept only if its token count
    reaches the minimum and it contains at least one "assistant" message.

    Each kept block becomes one line of ``output_path``: a JSON object
    ``{"input": ...}`` whose value is the block's messages joined by
    ``<|eot_id|>``.

    Args:
        df: DataFrame with at least the columns "date", "agenda", "speaker"
            and "text". It is not modified.
        output_path: Path of the JSON Lines file to (over)write.
        count_tokens: Optional callable mapping a formatted message string to
            its token count. Defaults to ``len(tokenizer.encode(text))`` using
            the notebook-level ``tokenizer``.
        chat_owner: Speaker labelled "assistant"; all others are "user".
            Defaults to the notebook-level ``CHAT_OWNER``.
        token_length_limit: Token budget per block. Defaults to
            ``TOKEN_LENGTH_LIMIT``.
        min_token_length: Minimum token count for a block to be kept.
            Defaults to ``MIN_TOKEN_LENGTH``.

    Returns:
        None. The result is the file written to ``output_path``.

    Notes:
        * Rows whose "text" is missing are skipped; otherwise they would be
          written into the training data as the literal string "nan".
        * A single message that is longer than the limit on its own is still
          placed in a block by itself, so such a block can exceed the limit.
        * NOTE(review): the header tags are written as ``<start_header_id>`` /
          ``<end_header_id>`` (no ``|``), while the separator is ``<|eot_id|>``.
          Confirm against the Llama 3 prompt format whether the pipe-less
          form is intended; it is left unchanged here.
    """
    # Resolve overrides. The notebook-level settings stay the defaults so
    # existing two-argument calls behave exactly as before.
    if count_tokens is None:
        def count_tokens(text):
            return len(tokenizer.encode(text))
    owner = CHAT_OWNER if chat_owner is None else chat_owner
    max_tokens = TOKEN_LENGTH_LIMIT if token_length_limit is None else token_length_limit
    min_tokens = MIN_TOKEN_LENGTH if min_token_length is None else min_token_length

    # Assign roles (on a copy; the caller's frame is left untouched)
    roles = df["speaker"].map(lambda name: "assistant" if name == owner else "user")
    df = df.assign(role=roles)

    conversations = []

    def keep_if_valid(block, block_tokens, block_has_assistant):
        # A block is only useful if it is long enough and includes the owner.
        if block_tokens >= min_tokens and block_has_assistant:
            conversations.append(block)

    # Group by date and agenda: one debate per group
    for _, group in tqdm(df.groupby(["date", "agenda"])):
        conversation = []
        token_len = 0
        has_assistant_message = False

        # Iterate the two needed columns directly instead of materialising
        # a Series per row with iterrows().
        for role, message in zip(group["role"], group["text"]):
            if pd.isna(message):
                # Missing speech text: nothing meaningful to train on.
                continue

            # Format the message
            chat_message_formatted = f"<start_header_id>{role}<end_header_id>{message}"
            chat_message_formatted_len = count_tokens(chat_message_formatted)

            # Close the current block if this message would exceed the budget
            if token_len + chat_message_formatted_len > max_tokens:
                keep_if_valid(conversation, token_len, has_assistant_message)
                conversation = []
                token_len = 0
                has_assistant_message = False

            # Add the message to the (possibly fresh) block
            conversation.append(chat_message_formatted)
            token_len += chat_message_formatted_len
            if role == "assistant":
                has_assistant_message = True

        # Close the last block of the debate
        keep_if_valid(conversation, token_len, has_assistant_message)

    # Save the conversations to a JSON Lines file, one block per line
    with open(output_path, "w", encoding="utf-8") as f:
        for conversation in conversations:
            query_str = "<|eot_id|>".join(conversation)
            f.write(json.dumps({'input': query_str}) + '\n')

In [8]:
# Build the conversation blocks and write them to the Kaggle working directory
output_path = '/kaggle/working/HoC_boris_johnson.jsonl'
preprocess_convo(df=df_HoC_2000s, output_path=output_path)

  0%|          | 0/51318 [00:00<?, ?it/s]

In [9]:
# Read the JSON Lines file back: each line is parsed into one dict
with open(output_path, 'r') as f:
    dialog_blocks = [json.loads(line) for line in f]

# Report how many blocks were written and pretty-print the first one
print(f"Total dialog blocks: {len(dialog_blocks)}")
print(json.dumps(dialog_blocks[0], indent=4))

Total dialog blocks: 374
{
    "input": "<start_header_id>assistant<end_header_id>It is a great pleasure to follow the right hon. Member for Gateshead, East and Washington, West (Joyce Quin) and my hon. Friend the Member for South Suffolk (Mr. Yeo), and to speak in a debate briefly attended by my old comrade the hon. Member for Clwyd, South (Mr. Jones), who has now left the Chamber. He defeated me soundly in 1997, so living up to his nickname of \" Jones the Vote\", and it is a great honour to share the Chamber with him now. As is conventional in maiden speeches, I pay tribute to my predecessor. As many in south Oxfordshire and elsewhere have not hesitated to point out, Michael Heseltine is a hard act to follow, so I approach this moment with much the same sense of self-doubt as Simba in \" The Lion King\". For the benefit of those who have not seen Walt Disney's film, there is a poignant moment when Simba, following Mufasa across the veld, compares his own paws with the vast pawprints

In [10]:
# Preview the first 5 blocks, each followed by a separator line
for i, block in enumerate(dialog_blocks[:5]):
    print(
        f"Dialog Block {i + 1}:",
        json.dumps(block, indent=4),
        "-" * 40,
        sep="\n",
    )


Dialog Block 1:
{
    "input": "<start_header_id>assistant<end_header_id>It is a great pleasure to follow the right hon. Member for Gateshead, East and Washington, West (Joyce Quin) and my hon. Friend the Member for South Suffolk (Mr. Yeo), and to speak in a debate briefly attended by my old comrade the hon. Member for Clwyd, South (Mr. Jones), who has now left the Chamber. He defeated me soundly in 1997, so living up to his nickname of \" Jones the Vote\", and it is a great honour to share the Chamber with him now. As is conventional in maiden speeches, I pay tribute to my predecessor. As many in south Oxfordshire and elsewhere have not hesitated to point out, Michael Heseltine is a hard act to follow, so I approach this moment with much the same sense of self-doubt as Simba in \" The Lion King\". For the benefit of those who have not seen Walt Disney's film, there is a poignant moment when Simba, following Mufasa across the veld, compares his own paws with the vast pawprints left by 