# Setup

In [2]:
import pandas as pd
import numpy as np

import json
import matplotlib.pyplot as plt
import datetime
from datasets import Dataset

# Transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

In [3]:
# Set API Keys
from kaggle_secrets import UserSecretsClient # API Loggins
user_secrets = UserSecretsClient()

## Hugging Face
Hugging_Face_token = user_secrets.get_secret("Hugging_Face_token")

# Login to Hugging Face
from huggingface_hub import login

login(Hugging_Face_token)

# Load Data

In [6]:
df_HoC_2000s_raw = pd.read_csv('/kaggle/input/parlspeech/df_HoC_2000s.csv')
print(df_HoC_2000s_raw.columns)

df_HoC_2000s = df_HoC_2000s_raw[['date', 'agenda', 'speechnumber', 'speaker', 'party','text']]
print(df_HoC_2000s.columns)

Index(['date', 'agenda', 'speechnumber', 'speaker', 'party', 'party.facts.id',
       'chair', 'terms', 'text'],
      dtype='object')


In [9]:
df_HoC_2000s.head(3)

Unnamed: 0,date,agenda,speechnumber,speaker,party,text
0,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,1,Andrew George,LibDem,What steps the Government are taking to ensure...
1,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,2,Hugh Bayley,Lab,Severe disablement allowance does not provide ...
2,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,3,Andrew George,LibDem,Those who warned the Government against abolit...


# Exploratory Data Analaysis

In [17]:
miniDebate_speakers = df_HoC_2000s[df_HoC_2000s['agenda'].str.contains('Free Movement of EU Nationals', case=False, na=False)]['speaker'].unique()#.iloc[0]['text']

print('The number of rows each speaker has in the main dataframe: df_HoC_2000s ')
for speaker in miniDebate_speakers:
    print(speaker, df_HoC_2000s[df_HoC_2000s['speaker'] == speaker].shape[0])

print('The number of rows each speaker has in the main dataframe: df_HoC_2000s (exclude "Free Movement of EU Nationals" agenda)')
for speaker in miniDebate_speakers:
    print(speaker, df_HoC_2000s[(df_HoC_2000s['speaker'] == speaker) & (df_HoC_2000s['agenda'] != 'Free Movement of EU Nationals')].shape[0])

The number of rows each speaker has in the main dataframe: df_HoC_2000s 
Christine Jardine 365
Kit Malthouse 732
Steve Double 622
Tim Farron 767
Jo Stevens 419
Rachael Maskell 1029
The number of rows each speaker has in the main dataframe: df_HoC_2000s (exclude "Free Movement of EU Nationals" agenda)
Christine Jardine 356
Kit Malthouse 726
Steve Double 620
Tim Farron 765
Jo Stevens 417
Rachael Maskell 1027


# Load Tokenizer

In [18]:
BASE_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

In [19]:
# Create the tokenizer to measure the length of the text
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, 
                                          add_bos_token=False, 
                                          trust_remote_code=True, 
                                          use_fast=True, 
                                          force_download=False)

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.model_max_length

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

131072

In [11]:
min(tokenizer.vocab.values()), max(tokenizer.vocab.values()) 

(0, 128256)

In [12]:
print(tokenizer.special_tokens_map)

test_text = "Hello, how are you today?"
tokens = tokenizer.encode(test_text, return_tensors="pt")
print(tokens)

{'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>', 'pad_token': '[PAD]'}
tensor([[128000,   9906,     11,   1268,    527,    499,   3432,     30]])


# Prepare Data for Training

**References on Preprocessing Dataset for Fine-tuning**
- https://pytorch.org/torchtune/0.2/tutorials/chat.html
- https://medium.com/@alexandros_chariton/how-to-fine-tune-llama-3-2-instruct-on-your-own-data-a-detailed-guide-e5f522f397d7

Yes, I would like to modify how many 'user' messages are included before and after each assistant response?

In [21]:
# Constants
TOKEN_LENGTH_LIMIT = 3500
MIN_TOKEN_LENGTH = 512

MAX_NO_ASSISTANT_THRESHOLD = 10       # Reset conversation if assistant is absent for too long
CHAT_OWNER = "Christine Jardine"

SYSTEM_PROMPT = """You are Christine Jardine, a politician in the UK's House of Commons.
    You are responding to statements in the parliament.
    Respond exactly as Christine Jardine would speak, 
    staying fully in character and address the observation directly."""

SYSTEM_PROMPT

"You are Christine Jardine, a politician in the UK's House of Commons.\n    You are responding to Observations.\n    Respond exactly as Christine Jardine would speak, \n    staying fully in character and address the observation directly."

In [22]:
def preprocess_convo_1(df, output_path):

    SYSTEM_PROMPT_TOKEN_LEN = len(tokenizer.encode(SYSTEM_PROMPT))

    df = df.assign(role=df["speaker"].apply(lambda x: "assistant" if x == CHAT_OWNER else "user"))  # Assign roles
    grouped = df.groupby(["date", "agenda"])

    conversations = []

    for (date, agenda), group in tqdm(grouped):
        conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
        token_len = SYSTEM_PROMPT_TOKEN_LEN
        has_assistant_message = False
        user_message_count = 0  

        for _, row in group.iterrows():
            role, message = row["role"], row["text"]
            chat_message = {"role": role, "content": message}

            # Simulate applying chat template before checking length
            temp_messages = conversation + [chat_message]
            temp_prompt = tokenizer.apply_chat_template(temp_messages, tokenize=False, add_generation_prompt=True)
            temp_token_len = len(tokenizer.encode(temp_prompt))

            # Check if adding this message exceeds the token limit
            if temp_token_len > TOKEN_LENGTH_LIMIT or user_message_count >= MAX_NO_ASSISTANT_THRESHOLD:
                if token_len >= MIN_TOKEN_LENGTH and has_assistant_message:
                    conversations.append({"conversation": conversation})

                # Restart conversation
                conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
                token_len = SYSTEM_PROMPT_TOKEN_LEN
                has_assistant_message = False
                user_message_count = 0  

            # Append message to conversation
            conversation.append(chat_message)
            token_len = temp_token_len  # Update token count

            if role == "assistant":
                has_assistant_message = True
                user_message_count = 0  
            else:
                user_message_count += 1  

        # Save the last conversation if it meets the minimum length and contains an assistant message
        if token_len >= MIN_TOKEN_LENGTH and has_assistant_message:
            conversations.append({"conversation": conversation})

    # Write to JSONL file
    with open(output_path, 'w') as f:
        for convo in conversations:
            f.write(json.dumps(convo) + '\n')

In [23]:
preprocess_convo_1(df_HoC_2000s, output_path = '/kaggle/working/preprocessed_ChristineJardine.jsonl')

  1%|          | 627/51318 [01:37<2:11:31,  6.42it/s]


KeyboardInterrupt: 

In [None]:
# Check token lengths in dataset (with proper estimation)
total_tokens = []
with open(output_path, 'r') as f:
    for line in f:
        data = json.loads(line)
        
        # Apply chat template before estimating token count
        prompt = tokenizer.apply_chat_template(data["conversation"], tokenize=False, add_generation_prompt=True)
        token_count = len(tokenizer.encode(prompt))  # Tokenize after applying template

        total_tokens.append(token_count)

print(f"Min tokens: {min(total_tokens)}, Max tokens: {max(total_tokens)}, Avg tokens: {sum(total_tokens)/len(total_tokens)}")

In [None]:
over_limit = [tokens for tokens in total_tokens if tokens > 4096]
print(f"Number of conversations exceeding 4096 tokens: {len(over_limit)}")

In [None]:
# Plot the histogram of 'Token Lengths'
plt.figure(figsize=(10, 6))
plt.hist(total_tokens, bins=20, edgecolor='black', alpha=0.7)
plt.xlabel("Token Count")
plt.ylabel("Frequency")
plt.title("Distribution of Token Counts in Preprocessed Conversations")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
# Inspect a Few Samples
dialog_blocks = []
with open(output_path, 'r') as f:
    for line in f:
        dialog_blocks.append(json.loads(line))  # Parse each line as a JSON object

# Display the number of dialog blocks
print(f"Total dialog blocks: {len(dialog_blocks)}")
#dialog_blocks[2]