# Setup

In [9]:
import pandas as pd
import json

# Transformers & 
from transformers import AutoTokenizer
from tqdm import tqdm
from torchtune.models.llama3 import llama3_tokenizer
from torchtune.data import Message

In [10]:
# Set API Keys
from kaggle_secrets import UserSecretsClient # API Loggins
user_secrets = UserSecretsClient()

## Hugging Face
Hugging_Face_token = user_secrets.get_secret("Hugging_Face_token")

# Login to Hugging Face
from huggingface_hub import login

login(Hugging_Face_token)

# Load Data

In [11]:
#df_HoC_2000s_raw = pd.read_csv('H:/MA_Thesis/data/Rauh_Schwalbach_2020_ParlSpeech/df_HoC_2000s.csv')

df_HoC_2000s_raw = pd.read_csv('/kaggle/input/parlspeech/df_HoC_2000s.csv')
df_HoC_2000s_raw.columns

Index(['date', 'agenda', 'speechnumber', 'speaker', 'party', 'party.facts.id',
       'chair', 'terms', 'text'],
      dtype='object')

In [12]:
df_HoC_2000s = df_HoC_2000s_raw[['date', 'agenda', 'speechnumber', 'speaker', 'party','text']]
df_HoC_2000s.columns
df_HoC_2000s.head(3)

Unnamed: 0,date,agenda,speechnumber,speaker,party,text
0,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,1,Andrew George,LibDem,What steps the Government are taking to ensure...
1,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,2,Hugh Bayley,Lab,Severe disablement allowance does not provide ...
2,2000-01-10,Severe Disablement Allowance [Oral Answers To ...,3,Andrew George,LibDem,Those who warned the Government against abolit...


# Exploratory Data Analaysis

In [13]:
df_HoC_2000s['speaker'].value_counts().head(10)

speaker
CHAIR             59591
David Cameron     13642
Theresa May       12984
Tony Blair         8272
Jack Straw         8142
Chris Grayling     6599
Chris Bryant       6217
Andrew Lansley     5942
Philip Hammond     5606
Peter Hain         5422
Name: count, dtype: int64

In [29]:
df_HoC_2000s[df_HoC_2000s['speaker'] == 'Boris Johnson'].shape[0]

2213

In [30]:
print(f"David Cameron has {df_HoC_2000s_raw[df_HoC_2000s_raw['speaker'] == 'David Cameron']['terms'].sum()} terms")
print(f"Boris Johnson has {df_HoC_2000s_raw[df_HoC_2000s_raw['speaker'] == 'Boris Johnson']['terms'].sum()} terms")

David Cameron has 1677960 terms
Boris Johnson has 259544 terms


In [29]:
df_HoC_2000s['date'].str.extract(r'(\d{4})').value_counts()

0   
2018    88249
2011    80410
2013    77636
2016    77454
2019    76361
2012    71467
2017    71025
2014    70128
2010    69573
2015    66874
2000    63890
2008    63503
2003    61369
2004    60348
2007    58510
2006    58452
2009    58004
2002    55128
2001    52577
2005    51353
Name: count, dtype: int64

In [31]:
df_HoC_2005 = df_HoC_2000s[df_HoC_2000s['date'].str.contains('2005')]
df_HoC_2005

Unnamed: 0,date,agenda,speechnumber,speaker,party,text
293312,2005-01-10,,1,Peter Luff,Con,If he will make a statement on the future size...
293313,2005-01-10,,2,Adam Ingram,Lab,We are adapting and modernising the Royal Navy...
293314,2005-01-10,,3,Peter Luff,Con,I was glad to hear the Minister refer to the n...
293315,2005-01-10,,4,Adam Ingram,Lab,"The programme, as has been explained time and ..."
293316,2005-01-10,,5,Kevan Jones,Lab,My right hon. Friend will be aware of the stro...
...,...,...,...,...,...,...
344660,2005-12-20,,203,Bob Spink,Con,So that we may all rush off to our Christmas e...
344661,2005-12-20,,204,David Amess,Con,I have the honour to present a petition signed...
344662,2005-12-20,,205,Jeremy Browne,LibDem,I am grateful for the opportunity to raise thi...
344663,2005-12-20,,206,Karen Buck,Lab,I congratulate the hon. Member for Taunton (Mr...


# Prepare Data for Training

In [36]:
# Constants
TOKEN_LENGTH_LIMIT = 4000
MIN_TOKEN_LENGTH = 200
CHAT_OWNER = "David Cameron"
BASE_MODEL_ID = "meta-llama/Llama-3.2-3B"

In [37]:
# Create the tokenizer to measure the length of the text
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, 
                                          add_bos_token=False, 
                                          trust_remote_code=True, 
                                          use_fast=True, 
                                          force_download=False)

Special template, format, and tags https://pytorch.org/torchtune/0.2/tutorials/chat.html

In [38]:
def preprocess_convo(df, output_path):
    df = df.assign(role=df["speaker"].apply(lambda x: "assistant" if x == CHAT_OWNER else "user"))
    
    grouped = df.groupby(["date", "agenda"])
    conversations = []

    for (date, agenda), group in tqdm(grouped):
        conversation = [
            {"role": "system", "content": "You are Boris Johnson, a former UK Prime Minister. Answer in his style."}
        ]
        token_len = len(tokenizer.encode(conversation[0]["content"]))
        has_assistant_message = False

        for _, row in group.iterrows():
            role = row["role"]
            message = row["text"]

            chat_message = {"role": role, "content": message}
            chat_message_len = len(tokenizer.encode(message))

            if token_len + chat_message_len > TOKEN_LENGTH_LIMIT:
                if token_len >= MIN_TOKEN_LENGTH and has_assistant_message:
                    conversations.append({"conversation": conversation})

                conversation = [conversation[-1]] if has_assistant_message else []
                token_len = len(tokenizer.encode(conversation[-1]["content"])) if conversation else 0
                has_assistant_message = any(msg["role"] == "assistant" for msg in conversation)

            conversation.append(chat_message)
            token_len += chat_message_len
            if role == "assistant":
                has_assistant_message = True

        if token_len >= MIN_TOKEN_LENGTH and has_assistant_message:
            conversations.append({"conversation": conversation})

    with open(output_path, 'w') as f:
        for convo in conversations:
            f.write(json.dumps(convo) + '\n')

In [26]:
def preprocess_convo(df, output_path):
    """
    Converts parliamentary transcripts into the correct format for fine-tuning Llama 3 Instruct.
    Uses Llama3's tokenizer for correct tokenization.
    """
    df = df.assign(role=df["speaker"].apply(lambda x: "assistant" if x == CHAT_OWNER else "user"))

    grouped = df.groupby(["date", "agenda"])
    conversations = []

    for (date, agenda), group in tqdm(grouped):
        # Initialize conversation with a system message
        conversation = [
            Message(role="system", content="You are Boris Johnson, a former UK Prime Minister. Answer in his style.")
        ]
        
        token_len = len(tokenizer.encode(conversation[0].content))
        has_assistant_message = False

        for _, row in group.iterrows():
            role = row["role"]
            message = row["text"]

            chat_message = Message(role=role, content=message)
            chat_message_len = len(tokenizer.encode(message))

            # Ensure the conversation does not exceed token limit
            if token_len + chat_message_len > TOKEN_LENGTH_LIMIT:
                if token_len >= MIN_TOKEN_LENGTH and has_assistant_message:
                    conversations.append(conversation)

                # Start a new conversation, keeping the last assistant message for continuity
                conversation = [conversation[-1]] if has_assistant_message else []
                token_len = len(tokenizer.encode(conversation[-1].content)) if conversation else 0
                has_assistant_message = any(msg.role == "assistant" for msg in conversation)

            conversation.append(chat_message)
            token_len += chat_message_len
            if role == "assistant":
                has_assistant_message = True

        if token_len >= MIN_TOKEN_LENGTH and has_assistant_message:
            conversations.append(conversation)

    # Save in Llama3 fine-tuning format
    with open(output_path, 'w') as f:
        for convo in conversations:
            tokenized_messages, _ = tokenizer.tokenize_messages(convo)  # Correct tokenized format
            f.write(json.dumps({'messages': convo}) + '\n')

In [41]:
output_path = '/kaggle/working/preprocessed_DavidCameron.jsonl'
preprocess_convo(df_HoC_2005, output_path)

  4%|▎         | 5/137 [00:00<00:04, 30.37it/s]


In [40]:
dialog_blocks = []
with open(output_path, 'r') as f:
    for line in f:
        dialog_blocks.append(json.loads(line))  # Parse each line as a JSON object

# Display the number of dialog blocks
print(f"Total dialog blocks: {len(dialog_blocks)}")
print(json.dumps(dialog_blocks[0], indent=4))  # Pretty-print the first dialog block

Total dialog blocks: 0


IndexError: list index out of range

In [None]:
for i, block in enumerate(dialog_blocks[:5]):  # Preview first 3 blocks
    print(f"Dialog Block {i + 1}:")
    print(json.dumps(block, indent=4))
    print("-" * 40)
