In [1]:
# Identity cue injection - CORRECT: Each document processed independently
import json
import csv
import os
import random
import re

random.seed(42)

# ----------------------------
# Config
# ----------------------------
# --- Input transcripts ------------------------------------------------------
# Each path is handed to one loader in the MAIN PROCESSING section below.
# Loaders return an empty list when the file does not exist, so a missing
# input is reported as "No data" rather than raising.

# Structured sources (JSONL / CSV / JSON)
jsonl_file = "elon_musk_4_transcript_clean.jsonl"
csv_file = "joe-rogan-experience-1169-elon-musk.csv"
qa_jsonl_file = "output.jsonl"
qr_pairs_file = "QR_Pairs (2).jsonl"
elon_vc_podcast_file = "ElonVCpodcast2010.jsonl"
elon_trump_file = "Elon_Trump_Transcript.json"
# Plain-text transcripts in "Speaker N: ..." form
lara_trump_file = "Elon Musk With Lara Trump (FULL INTERVIEW).txt"
don_lemon_file = "Elon Musk on Racism, Bailing Out Trump, Hate Speech, and More - The Don Lemon Show (Full Interview) [hhsfjBpKiTw].txt"
ten_x_file = "Elon Musk_ “10X Every 6 Months” [FPpPTp7FIHY].txt"
cpac_file = "Full Elon Musk Interview at CPAC 2025 _ Unfiltered, Unscripted, Unmissable [L2hkqolW168].txt"
ben_shapiro_file = "Elon Musk & Ben Shapiro in Passionate Interview [JGdbFGANapk].txt"
x_takeover_file = "Elon Musk Interview _ The Future, Engineered _ X Takeover 2025 [YqDehngsBHw].txt"
doge_optimus_file = "Elon Musk on DOGE, Optimus, Starlink Smartphones, Evolving with AI, Why the West is Imploding [qeZqZBRA-6Q].txt"
tesla_politics_file = "Elon Musk Talks Tesla, Politics and Putin Relationship (Full Interview) [gPGZRJDVXcU].txt"
verdict_file = "ELON UNCUT - THE COMPLETE INTERVIEW _ Verdict Ep. 216 [aMcmuKTfr54].txt"
grok_4_file = "Elon Musk introducing Grok 4 (FULL VIDEO) [QbNODZwQQuw].txt"
ben_shapiro_2_file = "Elon Musk Interview With Ben Shapiro (FULL INTERVIEW) [S_vpv4I27hs].txt"
ted_2022_file = "Elon Musk talks Twitter, Tesla and how his brain works — live at TED2022 [cdZZpaB2kDM].txt"
rishi_sunak_file = "Rishi Sunak & Elon Musk_ Talk AI, Tech & the Future [R2meHtrO1n8].txt"
superintelligence_file = "Elon Musk_ Digital Superintelligence, Multiplanetary Life, How to Be Useful [cFIlta1GkiE].txt"
trump_2_file = "Trump, Musk pull curtain back behind relationship, media's divide and conquer mission [hMbcMO5JgEo].txt"
trump_doge_file = "Musk and Trump explain why it's essential to cut funding, 'weed out corruption' [BN6xgxRNZeI].txt"
tucker_carlson_file = "tucker_carlson.txt"

# --- Output and tunables ----------------------------------------------------
# Destination of the final json.dump of all instruction/response pairs.
output_file = "instruction_response_context22.json"
# Maximum number of preceding (merged) blocks included as history per pair.
max_context_turns = 10
# Speaker label whose turns become responses; compared case-insensitively.
response_speaker = "elon musk"
# Probability that a finished pair is passed to inject_identity_naturally().
identity_injection_rate = 0.30

# ----------------------------
# System message
# ----------------------------
# Baseline system prompt; also the fallback for every conversation type that
# has no entry of its own in conversation_system_messages.
identity_system_message = "You are Elon Musk."

# Conversation-specific system messages
# Looked up with .get(conversation_type, identity_system_message), so a type
# that is absent here (e.g. "cpac", "ten_x") resolves to the baseline prompt.
# Only the Trump-related keys currently carry a distinct prompt.
conversation_system_messages = {
    "trump": "You are Elon Musk in a conversation with Donald Trump.",
    "trump_2": "You are Elon Musk in an interview with Donald Trump.",
    "trump_doge": "You are Elon Musk in an interview with Donald Trump about the Department of Government Efficiency (DOGE).",
    "lara_trump": "You are Elon Musk in an interview with Lara Trump.",
    "joe_rogan": identity_system_message,
    "don_lemon": identity_system_message,
    "default": identity_system_message
}

# Control what percentage of each conversation type gets enhanced context
# This prevents large datasets from biasing the model toward one context
# Each value is the probability (0.0-1.0) that a pair of that type receives
# its entry from conversation_system_messages instead of the baseline prompt.
# For types whose message equals identity_system_message (or that have no
# message entry at all) the rate has no visible effect on the output.
context_injection_rates = {
    "trump": 1.0,           # 100% - small dataset, needs all the help
    "trump_2": 1.0,         # 100% - small dataset
    "trump_doge": 1.0,      # 100% - small dataset
    "lara_trump": 1.0,      # 100% - small dataset
    "don_lemon": 1.0,       # 100% - small dataset
    "ten_x": 1.0,           # 100% - small dataset
    "cpac": 1.0,            # 100% - small dataset
    "ben_shapiro": 1.0,     # 100% - small dataset
    "x_takeover": 1.0,      # 100% - small dataset
    "doge_optimus": 1.0,    # 100% - small dataset
    "tesla_politics": 1.0,  # 100% - small dataset
    "verdict": 1.0,         # 100% - small dataset
    "grok_4": 1.0,          # 100% - small dataset
    "ben_shapiro_2": 1.0,   # 100% - small dataset
    "ted_2022": 1.0,        # 100% - small dataset
    "rishi_sunak": 1.0,     # 100% - small dataset
    "superintelligence": 1.0,  # 100% - small dataset
    "tucker_carlson": 1.0,  # 100% - small dataset
    # NOTE(review): this entry was annotated "30% - large dataset, don't
    # over-emphasize" but the value is 1.0 (100%). Harmless today because the
    # joe_rogan message is the baseline prompt; confirm the intended rate.
    "joe_rogan": 1.0,       # 100% as written
    "default": 0.0          # 0% - keep general
}

# ----------------------------
# Universal merging function
# ----------------------------
def merge_consecutive_speakers(blocks):
    """
    Collapse each run of adjacent same-speaker blocks into a single block.

    Texts inside a run are joined, in order, with one space. The input dicts
    are left untouched; fresh {"speaker", "text"} dicts are returned.
    A falsy `blocks` (empty list, None) yields an empty list.
    """
    if not blocks:
        return []

    merged = []
    # State of the run being accumulated; speaker None means "nothing yet".
    run = {"speaker": None, "text": ""}

    def flush():
        # A run whose speaker is None is never emitted.
        if run["speaker"] is not None:
            merged.append({"speaker": run["speaker"], "text": run["text"]})

    for entry in blocks:
        who, said = entry["speaker"], entry["text"]
        if who == run["speaker"]:
            # Same voice keeps talking: extend the current run.
            run["text"] += " " + said
        else:
            # Voice changed: emit the finished run and start a new one.
            flush()
            run["speaker"], run["text"] = who, said

    # Emit the trailing run.
    flush()
    return merged

# ----------------------------
# Create pairs from merged blocks
# ----------------------------
def create_pairs_from_blocks(blocks, source_name, conversation_type="default"):
    """
    Turn one conversation's blocks into instruction/response training pairs.

    A pair is produced for every block spoken by `response_speaker`
    (case-insensitive match). Its instruction is a system message followed by
    up to `max_context_turns` immediately preceding blocks of the same
    conversation, mapped to "assistant" (response speaker) or "user" (anyone
    else). Its response is the text of the block itself.

    The system message is the conversation-specific one with probability
    `context_injection_rates[conversation_type]` (0.0 if the type is unknown),
    otherwise `identity_system_message`. Exactly one draw from the module RNG
    is made per produced pair.

    Prints a one-line count and returns the list of pairs.
    """
    rate = context_injection_rates.get(conversation_type, 0.0)
    special_prompt = conversation_system_messages.get(conversation_type, identity_system_message)

    def spoken_by_target(block):
        return block["speaker"].lower() == response_speaker.lower()

    pairs = []
    for idx, current in enumerate(blocks):
        if not spoken_by_target(current):
            continue

        # History window: the blocks just before this one, oldest first.
        history = blocks[max(0, idx - max_context_turns):idx]

        # One RNG draw decides which system prompt this pair receives.
        prompt = special_prompt if random.random() < rate else identity_system_message

        instruction = [{"role": "system", "content": prompt}]
        instruction.extend(
            {
                "role": "assistant" if spoken_by_target(earlier) else "user",
                "content": earlier["text"],
            }
            for earlier in history
        )

        pairs.append({"instruction": instruction, "response": current["text"]})

    print(f"  → Created {len(pairs)} pairs from {source_name}")
    return pairs

# ----------------------------
# Process single document
# ----------------------------
def process_document(raw_blocks, source_name, conversation_type="default"):
    """
    Run the full per-document pipeline and return its training pairs.

    Steps: merge adjacent same-speaker blocks, then build instruction/response
    pairs whose context comes only from this document. Progress is printed
    along the way. A falsy `raw_blocks` prints a "No data" line and returns [].
    """
    if raw_blocks:
        print(f"\n✓ {source_name}:")
        print(f"  → Loaded {len(raw_blocks)} raw blocks")

        # Step 1: collapse runs of the same speaker.
        merged_blocks = merge_consecutive_speakers(raw_blocks)
        print(f"  → Merged to {len(merged_blocks)} blocks")

        # Step 2: build pairs using this conversation's own history.
        return create_pairs_from_blocks(merged_blocks, source_name, conversation_type)

    print(f"✓ {source_name}: No data")
    return []

# ----------------------------
# Data loading functions
# ----------------------------
def load_jsonl_standard(file_path):
    """
    Load standard JSONL with 'speaker' and 'text' fields.

    Returns a list of {"speaker", "text"} dicts in file order, or an empty
    list when `file_path` does not exist. Lines that are blank, are not valid
    JSON, are not JSON objects, or lack either field are skipped.
    """
    blocks = []
    if not os.path.exists(file_path):
        return blocks
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data = json.loads(line.strip())
                block = {"speaker": data["speaker"], "text": data["text"]}
            except (ValueError, KeyError, TypeError):
                # ValueError: blank/invalid JSON (json.JSONDecodeError).
                # KeyError: object without 'speaker'/'text'.
                # TypeError: JSON value that is not an object.
                # Anything else (e.g. KeyboardInterrupt) is left to propagate.
                continue
            blocks.append(block)
    return blocks

def load_csv_standard(file_path):
    """
    Load CSV with Speaker and Text columns.

    Returns a list of {"speaker", "text"} dicts in row order, or an empty list
    when `file_path` does not exist. Rows too short to supply both columns
    are skipped. Raises KeyError if the header lacks 'Speaker' or 'Text'.
    """
    blocks = []
    if not os.path.exists(file_path):
        return blocks
    # newline="" hands raw line endings to the csv module, as its docs
    # require; otherwise newlines embedded in quoted fields get rewritten.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            speaker, text = row["Speaker"], row["Text"]
            # DictReader fills missing trailing fields with None; such rows
            # cannot form a usable block, so drop them here.
            if speaker is None or text is None:
                continue
            blocks.append({"speaker": speaker, "text": text})
    return blocks

def load_qa_jsonl(file_path):
    """
    Load Q&A format JSONL (output.jsonl style).

    Each valid line must be a JSON object with 'question' and 'response'.
    It contributes two blocks: the question (speaker taken from
    'question_speaker', default "interviewer") and the response (speaker
    taken from 'response_speaker', default the module-level
    `response_speaker`).

    Returns the blocks in file order, or an empty list when `file_path` does
    not exist. A line that is blank, invalid JSON, not an object, or missing
    either required field is skipped as a whole; it never contributes a
    question without its response.
    """
    blocks = []
    if not os.path.exists(file_path):
        return blocks
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                qa = json.loads(line.strip())
                # Read both required fields before appending anything, so a
                # record missing 'response' cannot leave an orphan question.
                question, answer = qa["question"], qa["response"]
            except (ValueError, KeyError, TypeError):
                continue
            blocks.append({
                "speaker": qa.get("question_speaker", "interviewer"),
                "text": question
            })
            blocks.append({
                "speaker": qa.get("response_speaker", response_speaker),
                "text": answer
            })
    return blocks

def load_qr_pairs(file_path):
    """
    Load QR_Pairs.jsonl format (Q and A fields).

    Each valid line must be a JSON object with 'Q' and 'A'. It contributes an
    "interviewer" block holding Q followed by a `response_speaker` block
    holding A.

    Returns the blocks in file order, or an empty list when `file_path` does
    not exist. A line that is blank, invalid JSON, not an object, or missing
    either field is skipped as a whole; it never contributes a question
    without its answer.
    """
    blocks = []
    if not os.path.exists(file_path):
        return blocks
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                qr = json.loads(line.strip())
                # Read both fields before appending anything, so a record
                # missing 'A' cannot leave an orphan question behind.
                question, answer = qr["Q"], qr["A"]
            except (ValueError, KeyError, TypeError):
                continue
            blocks.append({
                "speaker": "interviewer",
                "text": question
            })
            blocks.append({
                "speaker": response_speaker,
                "text": answer
            })
    return blocks

def load_elon_vc_podcast(file_path):
    """
    Read the ElonVCpodcast2010.jsonl layout into speaker/text blocks.

    Every line is a JSON object with a 'speaker' value and a 'transcript'
    string. A speaker equal to 0 is labelled with `response_speaker`; any
    other value is labelled "interviewer".

    Returns the blocks in file order, or an empty list if the file is absent.
    A line that cannot be processed is reported with a warning and skipped.
    """
    if not os.path.exists(file_path):
        return []

    blocks = []
    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            try:
                record = json.loads(raw_line.strip())
                # Speaker 0 is the response speaker; everyone else asks.
                who = response_speaker if record["speaker"] == 0 else "interviewer"
                blocks.append({"speaker": who, "text": record["transcript"]})
            except Exception as e:
                print(f"Warning: Skipped malformed line: {e}")
                continue

    return blocks

def load_elon_trump_transcript(file_path):
    """
    Read the Elon_Trump_Transcript.json layout into speaker/text blocks.

    The file holds one JSON array whose entries carry 'speaker' and 'text'
    (other fields are ignored). Speaker labels are passed through unchanged.

    Returns the blocks in array order, or an empty list if the file is
    absent. If parsing or any entry fails, a warning is printed and whatever
    was collected before the failure is returned.
    """
    if not os.path.exists(file_path):
        return []

    blocks = []
    with open(file_path, "r", encoding="utf-8") as f:
        try:
            for record in json.load(f):
                blocks.append({"speaker": record["speaker"], "text": record["text"]})
        except Exception as e:
            print(f"Warning: Failed to load {file_path}: {e}")

    return blocks

def _load_speaker_transcript_with_mapping(file_path, elon_speaker_num):
    """
    Parse a plain-text "Speaker #:" transcript into speaker/text blocks.

    Utterances numbered `elon_speaker_num` are labelled with the module-level
    `response_speaker`; every other number is labelled "interviewer".
    Text before the first "Speaker #:" marker is ignored and utterances that
    are empty after stripping are dropped.

    Returns the blocks in transcript order, or an empty list if the file is
    absent.
    """
    if not os.path.exists(file_path):
        return []

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # With one capture group, re.split returns
    # [preamble, num, text, num, text, ...], so the numbers sit at odd
    # indices and each is followed by its utterance.
    parts = re.split(r'Speaker\s+(\d+):\s*', content)

    blocks = []
    for speaker_num, utterance in zip(parts[1::2], parts[2::2]):
        text = utterance.strip()
        if not text:  # Skip empty utterances
            continue

        if int(speaker_num) == elon_speaker_num:
            speaker_name = response_speaker
        else:
            speaker_name = "interviewer"

        blocks.append({"speaker": speaker_name, "text": text})

    return blocks


def load_speaker_transcript(file_path):
    """
    Load plain text transcript with "Speaker #:" format
    Speaker 0 = Elon Musk (assistant)
    Any other number = Interviewer (user)
    """
    return _load_speaker_transcript_with_mapping(file_path, 0)


def load_speaker_transcript_reversed(file_path):
    """
    Load plain text transcript with "Speaker #:" format (REVERSED MAPPING)
    Speaker 1 = Elon Musk (assistant)
    Speaker 0 or any other number = Interviewer (user)
    """
    return _load_speaker_transcript_with_mapping(file_path, 1)

# ----------------------------
# Identity injection
# ----------------------------
def inject_identity_naturally(messages):
    """
    Occasionally address the first user message to "Elon, ".

    Works on a copy: each message is rebuilt from its 'role' and 'content'
    only, and the caller's list and dicts are never modified. A falsy
    `messages` is returned as-is; a conversation without any user message is
    returned as an unchanged copy.

    Otherwise one RNG draw gates the injection (40% pass), and a second draw
    picks from a prefix list in which only one of five entries is non-empty.
    When a prefix is chosen, the first user message becomes the prefix plus
    its content, with surrounding whitespace stripped.
    """
    if not messages:
        return messages

    copied = [{"role": m["role"], "content": m["content"]} for m in messages]

    # Earliest user turn, if the conversation has one.
    first_user = next((m for m in copied if m["role"] == "user"), None)
    if first_user is None:
        return copied

    # Very subtle - mostly no prefix
    prefixes = ["Elon, ", "", "", "", ""]

    if not (random.random() < 0.4):
        return copied

    prefix = random.choice(prefixes)
    if prefix:
        first_user["content"] = f"{prefix}{first_user['content']}".strip()

    return copied

# ----------------------------
# MAIN PROCESSING
# ----------------------------
# Each source below is loaded, merged and paired on its own, so context never
# crosses document boundaries; the resulting pairs are appended to all_pairs.
#
# Order matters for reproducibility: create_pairs_from_blocks() draws from the
# module-level RNG once per produced pair, so adding, removing or reordering
# documents changes every later draw (including the identity-injection pass).
#
# Documents processed without conversation_type use "default". Only the
# "trump", "trump_2", "trump_doge" and "lara_trump" types map to a system
# message that differs from identity_system_message; for the other types the
# conversation_type label currently has no effect on the generated pairs.
print("=" * 60)
print("Processing each document independently...")
print("=" * 60)

# Accumulates the pairs of every document, in processing order.
all_pairs = []

# Document 1: Standard JSONL
raw_blocks_1 = load_jsonl_standard(jsonl_file)
pairs_1 = process_document(raw_blocks_1, jsonl_file)
all_pairs.extend(pairs_1)

# Document 2: CSV
raw_blocks_2 = load_csv_standard(csv_file)
pairs_2 = process_document(raw_blocks_2, csv_file, conversation_type="joe_rogan")
all_pairs.extend(pairs_2)

# Document 3: Q&A JSONL
raw_blocks_3 = load_qa_jsonl(qa_jsonl_file)
pairs_3 = process_document(raw_blocks_3, qa_jsonl_file)
all_pairs.extend(pairs_3)

# Document 4: QR Pairs
raw_blocks_4 = load_qr_pairs(qr_pairs_file)
pairs_4 = process_document(raw_blocks_4, qr_pairs_file)
all_pairs.extend(pairs_4)

# Document 5: Elon VC Podcast
raw_blocks_5 = load_elon_vc_podcast(elon_vc_podcast_file)
pairs_5 = process_document(raw_blocks_5, elon_vc_podcast_file)
all_pairs.extend(pairs_5)

# Document 6: Elon Trump Transcript
raw_blocks_6 = load_elon_trump_transcript(elon_trump_file)
pairs_6 = process_document(raw_blocks_6, elon_trump_file, conversation_type="trump")
all_pairs.extend(pairs_6)

# Document 7: Lara Trump Interview
raw_blocks_7 = load_speaker_transcript(lara_trump_file)
pairs_7 = process_document(raw_blocks_7, lara_trump_file, conversation_type="lara_trump")
all_pairs.extend(pairs_7)

# Document 8: Don Lemon Interview
raw_blocks_8 = load_speaker_transcript(don_lemon_file)
pairs_8 = process_document(raw_blocks_8, don_lemon_file, conversation_type="don_lemon")
all_pairs.extend(pairs_8)

# Document 9: 10X Every 6 Months Interview
raw_blocks_9 = load_speaker_transcript(ten_x_file)
pairs_9 = process_document(raw_blocks_9, ten_x_file, conversation_type="ten_x")
all_pairs.extend(pairs_9)

# Document 10: CPAC 2025 Interview (reversed speaker mapping)
raw_blocks_10 = load_speaker_transcript_reversed(cpac_file)
pairs_10 = process_document(raw_blocks_10, cpac_file, conversation_type="cpac")
all_pairs.extend(pairs_10)

# Document 11: Ben Shapiro Interview
raw_blocks_11 = load_speaker_transcript(ben_shapiro_file)
pairs_11 = process_document(raw_blocks_11, ben_shapiro_file, conversation_type="ben_shapiro")
all_pairs.extend(pairs_11)

# Document 12: X Takeover 2025 Interview
raw_blocks_12 = load_speaker_transcript(x_takeover_file)
pairs_12 = process_document(raw_blocks_12, x_takeover_file, conversation_type="x_takeover")
all_pairs.extend(pairs_12)

# Document 13: DOGE/Optimus Interview (multiple interviewers)
raw_blocks_13 = load_speaker_transcript(doge_optimus_file)
pairs_13 = process_document(raw_blocks_13, doge_optimus_file, conversation_type="doge_optimus")
all_pairs.extend(pairs_13)

# Document 14: Tesla/Politics Interview (reversed speaker mapping)
raw_blocks_14 = load_speaker_transcript_reversed(tesla_politics_file)
pairs_14 = process_document(raw_blocks_14, tesla_politics_file, conversation_type="tesla_politics")
all_pairs.extend(pairs_14)

# Document 15: Verdict Interview (reversed speaker mapping, multiple interviewers)
raw_blocks_15 = load_speaker_transcript_reversed(verdict_file)
pairs_15 = process_document(raw_blocks_15, verdict_file, conversation_type="verdict")
all_pairs.extend(pairs_15)

# Document 16: Grok 4 Introduction (multiple interviewers)
raw_blocks_16 = load_speaker_transcript(grok_4_file)
pairs_16 = process_document(raw_blocks_16, grok_4_file, conversation_type="grok_4")
all_pairs.extend(pairs_16)

# Document 17: Ben Shapiro Interview 2
raw_blocks_17 = load_speaker_transcript(ben_shapiro_2_file)
pairs_17 = process_document(raw_blocks_17, ben_shapiro_2_file, conversation_type="ben_shapiro_2")
all_pairs.extend(pairs_17)

# Document 18: TED2022 Interview
raw_blocks_18 = load_speaker_transcript(ted_2022_file)
pairs_18 = process_document(raw_blocks_18, ted_2022_file, conversation_type="ted_2022")
all_pairs.extend(pairs_18)

# Document 19: Rishi Sunak Interview
raw_blocks_19 = load_speaker_transcript(rishi_sunak_file)
pairs_19 = process_document(raw_blocks_19, rishi_sunak_file, conversation_type="rishi_sunak")
all_pairs.extend(pairs_19)

# Document 20: Digital Superintelligence Interview
raw_blocks_20 = load_speaker_transcript(superintelligence_file)
pairs_20 = process_document(raw_blocks_20, superintelligence_file, conversation_type="superintelligence")
all_pairs.extend(pairs_20)

# Document 21: Trump Interview 2
raw_blocks_21 = load_speaker_transcript(trump_2_file)
pairs_21 = process_document(raw_blocks_21, trump_2_file, conversation_type="trump_2")
all_pairs.extend(pairs_21)

# Document 22: Trump DOGE Interview (reversed speaker mapping)
raw_blocks_22 = load_speaker_transcript_reversed(trump_doge_file)
pairs_22 = process_document(raw_blocks_22, trump_doge_file, conversation_type="trump_doge")
all_pairs.extend(pairs_22)

# Document 23: Tucker Carlson Interview (reversed speaker mapping)
raw_blocks_23 = load_speaker_transcript_reversed(tucker_carlson_file)
pairs_23 = process_document(raw_blocks_23, tucker_carlson_file, conversation_type="tucker_carlson")
all_pairs.extend(pairs_23)

# ----------------------------
# Apply identity injection
# ----------------------------
print("\n" + "=" * 60)
print(f"Applying identity injection ({identity_injection_rate*100:.0f}% rate)...")
print("=" * 60)

# One RNG draw per pair selects it for the injection pass. The counter tracks
# how many pairs were handed to inject_identity_naturally(), which itself only
# rewrites a fraction of them, so it is an upper bound on modified pairs.
injected_count = 0
for pair in all_pairs:
    if not (random.random() < identity_injection_rate):
        continue
    pair["instruction"] = inject_identity_naturally(pair["instruction"])
    injected_count += 1

print(f"✓ Applied to {injected_count}/{len(all_pairs)} pairs")

# ----------------------------
# Save
# ----------------------------
# Persist the whole dataset as one pretty-printed JSON array (non-ASCII kept).
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_pairs, f, ensure_ascii=False, indent=2)

# Human-readable recap of the run; one item per output line.
print(
    "\n" + "=" * 60,
    "Dataset Summary",
    "=" * 60,
    f"Total training pairs: {len(all_pairs)}",
    f"System message: '{identity_system_message}'",
    f"Identity injection rate: {identity_injection_rate*100:.0f}%",
    f"Context window: up to {max_context_turns} previous turns",
    f"✓ Saved to: {output_file}",
    "=" * 60,
    sep="\n",
)

# ----------------------------
# Sample verification
# ----------------------------
if all_pairs:
    # Show the first pair as a quick sanity check of the generated format.
    print("\n" + "=" * 60, "Sample Training Pair", "=" * 60, sep="\n")
    sample = all_pairs[0]
    # Everything in the instruction except the system message is history.
    context_turns = sum(1 for m in sample['instruction'] if m['role'] != 'system')
    print(f"Context turns: {context_turns}")
    print(f"Response length: {len(sample['response'])} chars")
    print("\nFull sample:")
    # Preview only: the dump is cut at 800 characters.
    print(json.dumps(sample, indent=2)[:800] + "...")

Processing each document independently...

✓ elon_musk_4_transcript_clean.jsonl:
  → Loaded 566 raw blocks
  → Merged to 523 blocks
  → Created 261 pairs from elon_musk_4_transcript_clean.jsonl

✓ joe-rogan-experience-1169-elon-musk.csv:
  → Loaded 1831 raw blocks
  → Merged to 1792 blocks
  → Created 881 pairs from joe-rogan-experience-1169-elon-musk.csv

✓ output.jsonl:
  → Loaded 592 raw blocks
  → Merged to 551 blocks
  → Created 267 pairs from output.jsonl

✓ QR_Pairs (2).jsonl:
  → Loaded 164 raw blocks
  → Merged to 164 blocks
  → Created 82 pairs from QR_Pairs (2).jsonl

✓ ElonVCpodcast2010.jsonl:
  → Loaded 42 raw blocks
  → Merged to 42 blocks
  → Created 21 pairs from ElonVCpodcast2010.jsonl

✓ Elon_Trump_Transcript.json:
  → Loaded 373 raw blocks
  → Merged to 370 blocks
  → Created 185 pairs from Elon_Trump_Transcript.json

✓ Elon Musk With Lara Trump (FULL INTERVIEW).txt:
  → Loaded 60 raw blocks
  → Merged to 60 blocks
  → Created 30 pairs from Elon Musk With Lara Trump 