# Setup

In [None]:
import pandas as pd
import json

# Transformers
from transformers import AutoTokenizer
from tqdm.auto import tqdm

In [None]:
# Authenticate with the Hugging Face Hub using a token kept in Kaggle Secrets.
from kaggle_secrets import UserSecretsClient  # API logins
from huggingface_hub import login

# Read the token stored under the secret label "Hugging_Face_token".
user_secrets = UserSecretsClient()
Hugging_Face_token = user_secrets.get_secret("Hugging_Face_token")

# Log this session in to Hugging Face.
login(Hugging_Face_token)

# Load Data

In [None]:
# Load the HoC 2000s speeches CSV from the Kaggle input dataset.
# (Local copy: 'H:/MA_Thesis/data/Rauh_Schwalbach_2020_ParlSpeech/df_HoC_2000s.csv')
hoc_csv_path = '/kaggle/input/parlspeech-2000s/df_HoC_2000s.csv'
df_HoC_2000s_raw = pd.read_csv(hoc_csv_path)

In [None]:
# Restrict the raw table to the columns kept for the rest of the notebook.
kept_columns = ['date', 'agenda', 'speechnumber', 'speaker', 'party', 'text']
df_HoC_2000s = df_HoC_2000s_raw[kept_columns]
df_HoC_2000s.columns  # cell output: show the retained column names

# Clean Data

In [None]:
# Collect every 'text' entry that is not a purely alphanumeric string, then print them.
# Non-string entries (e.g. NaN for a missing speech) are flagged as well, instead of
# raising AttributeError when `.isalnum()` is called on them.
# NOTE(review): str.isalnum() is False for any text containing spaces or
# punctuation, so ordinary multi-word speeches are all flagged - confirm that
# this is the intended notion of "weird".
weird = [
    speech
    for speech in df_HoC_2000s['text']
    if not (isinstance(speech, str) and speech.isalnum())
]

print(weird)


# Prepare Data for Training

In [None]:
# Constants
BASE_MODEL_ID = "meta-llama/Llama-3.2-3B"  # model whose tokenizer is loaded to count tokens
CHAT_OWNER = "Boris Johnson"               # speaker whose messages get the "system" role
MIN_TOKEN_LENGTH = 200                     # blocks with fewer tokens than this are discarded
TOKEN_LENGTH_LIMIT = 4_000                 # a block is closed before a message would push it past this

In [None]:
# Load the base model's tokenizer; it is used below to measure message length in tokens.
tokenizer_options = dict(
    add_bos_token=False,
    # NOTE(review): trust_remote_code=True permits code shipped with the model
    # repository to run - confirm it is actually needed for this model.
    trust_remote_code=True,
    use_fast=True,
    force_download=False,
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, **tokenizer_options)

In [None]:
def _worth_keeping(token_len, has_system_message):
    """Return True if a block is long enough and contains a CHAT_OWNER message."""
    return token_len >= MIN_TOKEN_LENGTH and has_system_message


def preprocess_convo(df, output_path):
    """Split each debate into token-limited blocks and write them as JSON lines.

    Speeches are grouped by ``(date, agenda)``. Within a group, messages are
    appended in row order to a running block; the block is closed just before
    the next message would push it past ``TOKEN_LENGTH_LIMIT`` tokens. A block
    is kept only if it holds at least ``MIN_TOKEN_LENGTH`` tokens and contains
    at least one message spoken by ``CHAT_OWNER``.

    Args:
        df: DataFrame with the columns ``speaker``, ``date``, ``agenda`` and
            ``text``. The caller's frame is not modified.
        output_path: Destination file. Each line is one JSON object with a
            single ``"input"`` key whose value is the block's messages joined
            by ``<|eot_id|>``.

    Returns:
        None. The result is written to ``output_path``.
    """
    # CHAT_OWNER speaks as "system"; every other speaker is "user".
    df = df.assign(role=df["speaker"].apply(lambda x: "system" if x == CHAT_OWNER else "user"))

    conversations = []

    for _, group in tqdm(df.groupby(["date", "agenda"])):
        conversation = []
        token_len = 0
        has_system_message = False

        # Walk only the two columns that are needed; iterrows() would build a
        # full Series object for every row.
        for role, message in zip(group["role"], group["text"]):
            # NOTE(review): the header markers are written without the "|" that
            # the "<|eot_id|>" separator carries - confirm whether the model's
            # special header tokens were intended here.
            chat_message_formatted = f"<start_header_id>{role}<end_header_id>{message}"
            chat_message_formatted_len = len(tokenizer.encode(chat_message_formatted))

            # Close the running block before this message would exceed the limit.
            if token_len + chat_message_formatted_len > TOKEN_LENGTH_LIMIT:
                if _worth_keeping(token_len, has_system_message):
                    conversations.append(conversation)
                conversation = []
                token_len = 0
                has_system_message = False

            # NOTE(review): a single message that is longer than
            # TOKEN_LENGTH_LIMIT on its own is still added here, producing a
            # block above the limit - confirm whether it should be truncated
            # or skipped.
            conversation.append(chat_message_formatted)
            token_len += chat_message_formatted_len
            if role == "system":
                has_system_message = True

        # The trailing block of the debate is subject to the same filter.
        if _worth_keeping(token_len, has_system_message):
            conversations.append(conversation)

    # Written only after all groups are processed, so a failure above does not
    # leave a truncated file. Explicit encoding keeps the output independent
    # of the platform's default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps({'input': "<|eot_id|>".join(conversation)}) + '\n'
            for conversation in conversations
        )

In [None]:
# Build the conversation file in the Kaggle working directory.
output_path = '/kaggle/working/HoC_boris_johnson.jsonl'
preprocess_convo(df=df_HoC_2000s, output_path=output_path)

In [None]:
# Read the generated JSON-lines file back: each line is parsed into one dialog block.
with open(output_path, 'r', encoding='utf-8') as f:
    dialog_blocks = [json.loads(line) for line in f]

# Display the number of dialog blocks
print(f"Total dialog blocks: {len(dialog_blocks)}")
# Pretty-print the first dialog block. Skipped when the file holds no blocks,
# where indexing [0] would raise IndexError.
if dialog_blocks:
    print(json.dumps(dialog_blocks[0], indent=4))

In [None]:
# Preview up to the first three dialog blocks, each followed by a dashed rule.
for number, block in enumerate(dialog_blocks[:3], start=1):
    print(f"Dialog Block {number}:")
    print(json.dumps(block, indent=4))
    print("-" * 40)
