In [9]:
import sys
from pathlib import Path

# The notebook is expected to be started from the project root, so the current
# working directory doubles as the root for all project-local imports.
project_root = Path.cwd()

# Make the project root and its source packages importable. The membership check
# keeps the cell idempotent: re-running it does not pile up duplicate entries.
for import_root in (
    project_root,
    project_root / 'src',
    project_root / 'src' / 'data_processing',
    project_root / 'src' / 'fine_tuning',
):
    import_root_str = str(import_root)
    if import_root_str not in sys.path:
        sys.path.append(import_root_str)


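Simplified ID Strategy

| Source    | Current      | New ID               | Example               |
|-----------|--------------|----------------------|-----------------------|
| QA        | no id        | {video_id}_{order}   | 5R0FIkU97HY_01        |
| Interview | has chunk_id | already unique       | int_{interview_id}_17 |
| Article   | has chunk_id | already unique       | art_{article_id}_04   |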

In [None]:
import json
from pathlib import Path
from collections import defaultdict

def process_qa_file(input_path: str, output_path: str):
    """Give every QA record in a JSONL file an ``id`` of the form ``{video_id}_{NN}``.

    ``NN`` is the 1-based position of the record among the records that share its
    ``video_id``, in file order, zero-padded to at least two digits.

    Args:
        input_path: JSONL file to read; blank lines are ignored and every record
            must carry a ``video_id`` key.
        output_path: JSONL file to write; one record per line, non-ASCII kept as-is.

    Raises:
        KeyError: if a record has no ``video_id``.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Load every non-blank line before anything is written.
    with open(input_path, 'r', encoding='utf-8') as src:
        records = [json.loads(raw) for raw in src if raw.strip()]

    # Running count of records seen so far for each video.
    seen_per_video = defaultdict(int)
    for record in records:
        vid = record['video_id']
        seen_per_video[vid] += 1
        record['id'] = f"{vid}_{seen_per_video[vid]:02d}"

    with open(output_path, 'w', encoding='utf-8') as dst:
        dst.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )

    print(f"‚úì QA: {len(records)} items, {len(seen_per_video)} videos")




Run

In [1]:
import json

# Read all lines up front. The context manager closes the file handle right away
# (a bare open(...).readlines() leaves closing to the garbage collector).
with open('./data/fine_tuning/train_data.jsonl', encoding='utf-8') as f:
    lines = f.readlines()
print(f"Total lines: {len(lines)}")

# Report the size of every non-blank record; i + 1 is the physical line number.
for i, line in enumerate(lines):
    if line.strip():
        r = json.loads(line)
        chars = len(r['input'] + r['output'])
        # chars // 3 is a rough characters-per-token heuristic, not a tokenizer count.
        print(f"{i+1}: {chars} chars (~{chars//3} tokens)")

Total lines: 10
1: 1320 chars (~440 tokens)
2: 3632 chars (~1210 tokens)
3: 1786 chars (~595 tokens)
4: 2405 chars (~801 tokens)
5: 4532 chars (~1510 tokens)
6: 3583 chars (~1194 tokens)
7: 1235 chars (~411 tokens)
8: 4309 chars (~1436 tokens)
9: 2235 chars (~745 tokens)
10: 2359 chars (~786 tokens)


In [3]:
import json

# Check article format: preview the questions and the text of the first two lines.
with open('./data/processed/articles_with_questions.jsonl', encoding='utf-8') as f:
    for i, line in enumerate(f):
        # Only the first two physical lines are ever shown, so stop reading here
        # instead of iterating over the rest of the file.
        if i >= 2:
            break
        if line.strip():
            r = json.loads(line)
            print(f"=== Article {i+1} ===")
            # 'potential_questions' is optional; 'text' is required (KeyError if absent).
            print(f"Questions: {r.get('potential_questions', [])}")
            print(f"Text preview: {r['text'][:300]}...")
            print()

=== Article 1 ===
Questions: ['–ß—Ç–æ –∑–Ω–∞—á–∏—Ç –±—ã—Ç—å –∂–µ–Ω—Å—Ç–≤–µ–Ω–Ω–æ–π –∏ –ø—Ä–∏ —ç—Ç–æ–º –æ—Å—Ç–∞–≤–∞—Ç—å—Å—è —Å–∏–ª—å–Ω–æ–π –∏ –Ω–µ–∑–∞–≤–∏—Å–∏–º–æ–π –∂–µ–Ω—â–∏–Ω–æ–π?', '–ö–∞–∫ –Ω–µ –ø–æ—Ç–µ—Ä—è—Ç—å –∂–µ–Ω—Å—Ç–≤–µ–Ω–Ω–æ—Å—Ç—å, –∫–æ–≥–¥–∞ —Ö–æ—á–µ—à—å –±—ã—Ç—å —Å–∞–º–æ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ–π –∏ —É–≤–µ—Ä–µ–Ω–Ω–æ–π –≤ —Å–µ–±–µ?', '–ü–æ—á–µ–º—É –æ–±—Ä–∞–∑ —É—Å–ø–µ—à–Ω–æ–π –∏ —Å–∞–º–æ—Å—Ç–æ—è—Ç–µ–ª—å–Ω–æ–π –∂–µ–Ω—â–∏–Ω—ã –≤–æ—Å–ø—Ä–∏–Ω–∏–º–∞–µ—Ç—Å—è –∫–∞–∫ ¬´–º—É–∂–∏–∫ –≤ —é–±–∫–µ¬ª?']
Text preview: "–ë—ã—Ç—å –∂–µ–Ω—Å—Ç–≤–µ–Ω–Ω–æ–π –Ω–µ –∑–Ω–∞—á–∏—Ç –±—ã—Ç—å —Ç—Ä—è–ø–∫–æ–π" –ö–æ–≥–¥–∞ —è –≥–æ–≤–æ—Ä—é –æ —Å–∞–º–æ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ–π –∏ —É–≤–µ—Ä–µ–Ω–Ω–æ–π –≤ —Å–µ–±–µ –∂–µ–Ω—â–∏–Ω–µ, –º–Ω–æ–≥–∏–µ –≤–æ–∑—Ä–∞–∂–∞—é—Ç: –≤–µ–¥—å –ø–æ–ª—É—á–∏—Ç—Å—è –º—É–∂–∏–∫ –≤ —é–±–∫–µ! –î–∞–≤–∞–π—Ç–µ –æ–± —ç—Ç–æ–º –ø–æ–≥–æ–≤–æ—Ä–∏–º. –£ –º–Ω–æ–≥–∏—Ö –∂–µ–Ω—â–∏–Ω, —á–∏—Ç–∞—é—â–∏—Ö –º–æ–∏ –∫–æ–ª–æ–Ω–∫–∏ –∏ —Å–ª—É—à–∞—é—â–∏—Ö –ª–µ–∫—Ü–∏–∏, –≤–æ–∑–Ω–∏–∫–∞–µ—Ç –ø—Ä–æ—Ç–µ—Å—Ç. –ö–æ–≥–¥–∞ —è –≥–æ–≤–æ—Ä—

Prepare fine-tuning train and test data from qa_pairs (90% train / 10% test)

In [2]:
import json
import random

# Load the current contents of the train file (initially the hand-written examples).
train_data = []
with open('./data/fine_tuning/train_data.jsonl', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            train_data.append(json.loads(line))

print(f"Existing train examples: {len(train_data)}")

# Load qa_pairs and map them to the {"input", "output"} training schema.
qa_pairs = []
with open('./data/processed/qa_pairs.jsonl', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            r = json.loads(line)
            qa_pairs.append({
                "input": r['question'],
                "output": r['answer']
            })

print(f"QA pairs: {len(qa_pairs)}")

# Split qa_pairs: 90% train, 10% test. The fixed seed makes the shuffle, and
# therefore the split, reproducible across runs.
random.seed(42)
random.shuffle(qa_pairs)
split = int(len(qa_pairs) * 0.9)


def _pair_key(example):
    """Return a hashable identity for an example based on its input/output pair."""
    return json.dumps([example.get('input'), example.get('output')], ensure_ascii=False)


# This cell reads train_data.jsonl and then overwrites it, so a second run would
# see the QA pairs it wrote the first time. Skip pairs that are already in the
# train file so that re-running the cell does not duplicate them.
already_in_train = {_pair_key(example) for example in train_data}
new_train_pairs = [
    pair for pair in qa_pairs[:split] if _pair_key(pair) not in already_in_train
]
skipped = split - len(new_train_pairs)
if skipped:
    print(f"Skipped {skipped} QA pairs already present in train data")

# Combine
train_data.extend(new_train_pairs)
test_data = qa_pairs[split:]

print(f"Total train: {len(train_data)}")
print(f"Test: {len(test_data)}")

# Save
with open('./data/fine_tuning/train_data.jsonl', 'w', encoding='utf-8') as f:
    for item in train_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

with open('./data/fine_tuning/test_data.jsonl', 'w', encoding='utf-8') as f:
    for item in test_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print("‚úÖ Saved!")

Existing train examples: 10
QA pairs: 241
Total train: 226
Test: 25
‚úÖ Saved!


In [5]:
import json

# Turn each article chunk into one training example, pairing only its FIRST
# potential question with the chunk text. Chunks without questions are skipped.
# The train file is opened in append mode, so every run of this cell adds the
# examples again.
articles_added = 0

articles = []
with open('./data/processed/articles_with_questions.jsonl', encoding='utf-8') as src:
    for raw in src:
        if raw.strip():
            articles.append(json.loads(raw))

with open('./data/fine_tuning/train_data.jsonl', 'a', encoding='utf-8') as out:
    for article in articles:
        questions = article.get('potential_questions', [])
        if not questions:
            continue
        example = {"input": questions[0], "output": article['text']}
        out.write(json.dumps(example, ensure_ascii=False) + '\n')
        articles_added += 1

print(f"Articles added: {articles_added}")

# Count the non-blank lines now present in the train file.
with open('./data/fine_tuning/train_data.jsonl', encoding='utf-8') as f:
    total = sum(1 for row in f if row.strip())
print(f"Total train examples: {total}")

Articles added: 250
Total train examples: 476


Add the interview examples from interview_ft.jsonl to the training data

In [7]:
import json

# Read the entire file. It is not treated as strict JSONL: it may hold one JSON
# array or several JSON values separated by whitespace.
# NOTE(review): the file name is spelled 'intervie_ft.jsonl'; presumably that matches
# the name on disk (the recorded run loaded 18 interviews) - confirm before renaming.
with open('./data/processed/intervie_ft.jsonl', encoding='utf-8') as f:
    content = f.read()

interviews = []
decoder = json.JSONDecoder()
content = content.strip()
pos = 0

# Decode one JSON value at a time, starting where the previous one ended.
while pos < len(content):
    try:
        obj, pos = decoder.raw_decode(content, pos)
    except json.JSONDecodeError as e:
        # Report the failure instead of silently dropping the rest of the file.
        print(f"[WARN] Stopped parsing at char {pos} of {len(content)}: {e}")
        break
    # A top-level array contributes its elements; any other value is one record.
    if isinstance(obj, list):
        interviews.extend(obj)
    else:
        interviews.append(obj)
    # Skip whitespace between values; raw_decode does not accept leading whitespace.
    while pos < len(content) and content[pos] in ' \n\t\r':
        pos += 1

print(f"Loaded interviews: {len(interviews)}")

# Add to train_data (append mode: every run of this cell adds the examples again).
with open('./data/fine_tuning/train_data.jsonl', 'a', encoding='utf-8') as f:
    for r in interviews:
        if not isinstance(r, dict):
            print(f"[WARN] Skipping non-object record of type {type(r).__name__}")
            continue
        messages = r.get('messages', [])
        user_msg = None
        assistant_msg = None
        # The LAST user and the LAST assistant message win; earlier turns of a
        # multi-turn conversation are not carried into the training example.
        for m in messages:
            if m['role'] == 'user':
                user_msg = m['content']
            elif m['role'] == 'assistant':
                assistant_msg = m['content']

        # Only records with both a non-empty question and a non-empty answer are kept.
        if user_msg and assistant_msg:
            item = {
                "input": user_msg,
                "output": assistant_msg
            }
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

# Count total
with open('./data/fine_tuning/train_data.jsonl', encoding='utf-8') as f:
    total = len([l for l in f if l.strip()])
print(f"Total train examples: {total}")
print("‚úÖ Done!")

Loaded interviews: 18
Total train examples: 494
‚úÖ Done!


In [1]:
# Check data balance
import json

# Collect the answer length (in characters) of every non-blank training record.
with open('./data/fine_tuning/train_data.jsonl', encoding='utf-8') as f:
    output_lengths = [len(json.loads(row)['output']) for row in f if row.strip()]

# Length is used as a proxy for the source: answers longer than 2000 characters
# are counted as "likely articles", everything else as "likely Q&A".
long_answers = sum(1 for length in output_lengths if length > 2000)
short_answers = sum(1 for length in output_lengths if length < 500)
article_count = long_answers
qa_count = len(output_lengths) - long_answers

print(f"Short answers (<500 chars): {short_answers}")
print(f"Long answers (>2000 chars): {long_answers}")
print(f"Likely Q&A: {qa_count}")
print(f"Likely articles: {article_count}")

Short answers (<500 chars): 127
Long answers (>2000 chars): 21
Likely Q&A: 473
Likely articles: 21


In [3]:
import json

# Append qa_pairs to existing 10
with open('./data/processed/qa_pairs.jsonl', encoding='utf-8') as f:
    qa_pairs = [json.loads(line) for line in f if line.strip()]

# If the train file does not end with a newline, the first appended record would be
# glued onto the last existing line, giving one line fewer than expected (likely the
# reason a run reported 250 instead of 251). Check the last byte before appending.
try:
    with open('./data/fine_tuning/train_data.jsonl', 'rb') as f:
        f.seek(0, 2)  # jump to the end of the file
        if f.tell() > 0:
            f.seek(-1, 2)
            needs_newline = f.read(1) != b'\n'
        else:
            needs_newline = False
except FileNotFoundError:
    # Append mode creates the file, so there is nothing to terminate.
    needs_newline = False

with open('./data/fine_tuning/train_data.jsonl', 'a', encoding='utf-8') as f:
    if needs_newline:
        f.write('\n')
    for r in qa_pairs:
        item = {"input": r['question'], "output": r['answer']}
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

# Count total
with open('./data/fine_tuning/train_data.jsonl', encoding='utf-8') as f:
    total = len([l for l in f if l.strip()])

print(f"Total: {total}")  # Should be 251

Total: 250


In [6]:
import json
from pathlib import Path
from transformers import AutoTokenizer

# Tokenizer used only to measure answer length in tokens.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
data_path = Path("data/fine_tuning/train_data.jsonl")
output_path = Path("data/fine_tuning/train_data_clean.jsonl")

MIN_ANSWER_TOKENS = 50  # Minimum for a "real" answer with substance

kept = []     # full records whose answer is long enough
removed = []  # short previews of the dropped records, for the report below

with open(data_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line); json.loads would raise on them.
        if not line.strip():
            continue
        r = json.loads(line)
        answer_tokens = len(tokenizer.encode(r['output']))

        if answer_tokens >= MIN_ANSWER_TOKENS:
            kept.append(r)
        else:
            removed.append({
                'tokens': answer_tokens,
                'q': r['input'][:60],
                'a': r['output'][:100]
            })

# Save cleaned data
with open(output_path, 'w', encoding='utf-8') as f:
    for r in kept:
        f.write(json.dumps(r, ensure_ascii=False) + '\n')

# Report
print(f"‚úÖ Kept: {len(kept)} examples")
print(f"‚ùå Removed: {len(removed)} examples\n")

# List the dropped examples, shortest answer first.
print("Removed examples:")
for ex in sorted(removed, key=lambda x: x['tokens']):
    print(f"  [{ex['tokens']} tok] Q: {ex['q']}...")
    print(f"           A: {ex['a']}")
    print()

‚úÖ Kept: 245 examples
‚ùå Removed: 6 examples

Removed examples:
  [3 tok] Q: –ö–∞–∫ –∂–∏—Ç—å —Å –º—É–∂–µ–º-–Ω–µ–≤—Ä–æ—Ç–∏–∫–æ–º –∏ –±—ã—Ç—å —Å—á–∞—Å—Ç–ª–∏–≤–æ–π?...
           A: –ù–∏–∫–∞–∫.

  [9 tok] Q: –ù–µ —Ö–æ—á–µ—Ç—Å—è –≤—ã—Ö–æ–¥–∏—Ç—å –∏–∑ –¥–æ–º–∞. –≠—Ç–æ —Å–æ—Ü–∏–æ—Ñ–æ–±–∏—è?...
           A: –ù–µ—Ç, —ç—Ç–æ –¥–µ–ø—Ä–µ—Å—Å–∏—è.

  [12 tok] Q: –£ –º–µ–Ω—è –µ—Å—Ç—å –º—É–∂ –∏ –µ—Å—Ç—å —Ä–æ–¥–∏—Ç–µ–ª–∏. –Ø –∑–∞—Ä–∞–±–∞—Ç—ã–≤–∞—é –¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –º–Ω...
           A: –ú–æ–∂–µ—Ç, –Ω–æ –Ω–µ –¥–æ–ª–∂–µ–Ω, —è —Ç–∞–∫ —Å—á–∏—Ç–∞—é.

  [28 tok] Q: –ú–Ω–µ 20 –ª–µ—Ç. –Ø –∂–∏–ª–∞ —Å –±–∞–±—É—à–∫–æ–π –∏ –¥–µ–¥—É—à–∫–æ–π, —Å –º–∞–º–æ–π –∏ –ø–∞–ø–æ–π –æ–±...
           A: –ê –≤ —à–∫–æ–ª–µ –≤–ª—é–±–ª—è–ª–∏—Å—å –≤ –∫–æ–≥–æ-–Ω–∏–±—É–¥—å? –ï—Å–ª–∏ –¥–∞ ‚Äî –±—É–¥–µ—Ç–µ –∂–∏—Ç—å –Ω–æ—Ä–º–∞–ª—å–Ω–æ. –≠—Ç–æ –Ω–æ—Ä–º–∞–ª—å–Ω–æ.

  [37 tok] Q: –ï—Å—Ç—å –ª–∏ —Ç–∞–∫–æ–µ –ø–æ–Ω—è—Ç–∏–µ, –∫–∞–∫ –∑–¥–æ—Ä–æ–≤–∞—è –∏ –Ω–µ–∑–¥–æ—Ä–æ–≤–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ –≤ –≤...
           A: –ù–µ—Ç –ø—Ä–æ–±–ª–µ–º. –≠—Ç–æ

In [13]:
# `inference_lora` is a project-local module that is not shown here; presumably it
# is importable because src/fine_tuning was put on sys.path earlier - TODO confirm.
from inference_lora import load_model, ask

# load_model() is unpacked into a (model, tokenizer) pair that is later passed to ask().
model, tokenizer = load_model()

prompt = """Generate 2-3 questions (in Russian) for this text:

–°–æ–≤–µ—Ç ¬´–¥–µ–ª–∞—Ç—å —Ç–æ–ª—å–∫–æ —Ç–æ, —á—Ç–æ —Ö–æ—á–µ—Ç—Å—è¬ª –Ω–∞—à–∏ –≥—Ä–∞–∂–¥–∞–Ω–µ –≤–æ—Å–ø—Ä–∏–Ω–∏–º–∞—é—Ç –∫–∞–∫ –ø—Ä–∏–∑—ã–≤ –∫ –∞–Ω–∞—Ä—Ö–∏–∏. –°–≤–æ–∏ —Å–∞–º—ã–µ —Å–∏–ª—å–Ω—ã–µ –∂–µ–ª–∞–Ω–∏—è –æ–Ω–∏ —Å—á–∏—Ç–∞—é—Ç –Ω–µ–ø—Ä–µ–º–µ–Ω–Ω–æ –Ω–∏–∑–º–µ–Ω–Ω—ã–º–∏, –ø–æ—Ä–æ—á–Ω—ã–º–∏, –æ–ø–∞—Å–Ω—ã–º–∏ –¥–ª—è –æ–∫—Ä—É–∂–∞—é—â–∏—Ö. –õ—é–¥–∏ —É–≤–µ—Ä–µ–Ω—ã, —á—Ç–æ –æ–Ω–∏ —Ç–∞–π–Ω—ã–µ –±–µ—Å–ø—Ä–µ–¥–µ–ª—å—â–∏–∫–∏, –∏ –ø–æ–ø—Ä–æ—Å—Ç—É –±–æ—è—Ç—Å—è –¥–∞—Ç—å —Å–µ–±–µ –≤–æ–ª—é! –Ø –≤–∏–∂—É –≤ —ç—Ç–æ–º —Å–µ—Ä—å–µ–∑–Ω—ã–π —Å–∏–º–ø—Ç–æ–º –≤—Å–µ–æ–±—â–µ–≥–æ –Ω–µ–≤—Ä–æ–∑–∞.\n\n–ì–æ–≤–æ—Ä–∏—à—å —á–µ–ª–æ–≤–µ–∫—É: ¬´–î–µ–ª–∞–π —Ç–æ, —á—Ç–æ —Ö–æ—á–µ—à—å!¬ª –ê –æ–Ω: ¬´–ù—É —á—Ç–æ –≤—ã! –†–∞–∑–≤–µ —Ç–∞–∫ –º–æ–∂–Ω–æ?!¬ª\n\n–û—Ç–≤–µ—á–∞—é: ¬´–ï—Å–ª–∏ –≤—ã —Å—á–∏—Ç–∞–µ—Ç–µ —Å–µ–±—è —Ö–æ—Ä–æ—à–∏–º —á–µ–ª–æ–≤–µ–∫–æ–º, —Ç–æ –¥–∞. –ú–æ–∂–Ω–æ –∏ –Ω—É–∂–Ω–æ¬ª. –ñ–µ–ª–∞–Ω–∏—è —Ö–æ—Ä–æ—à–µ–≥–æ —á–µ–ª–æ–≤–µ–∫–∞ —Å–æ–≤–ø–∞–¥–∞—é—Ç —Å –∏–Ω—Ç–µ—Ä–µ—Å–∞–º–∏ –æ–∫—Ä—É–∂–∞—é—â–∏—Ö.\n\n–®–µ—Å—Ç—å –ø—Ä–∞–≤–∏–ª, –∫–æ—Ç–æ—Ä—ã–µ –ø–æ–º–æ–≥–ª–∏ –Ω–µ –æ–¥–Ω–æ–º—É –¥–µ—Å—è—Ç–∫—É –ª—é–¥–µ–π –≤—ã–π—Ç–∏ –∏–∑ –Ω–µ–≤—Ä–æ–∑–∞, ‚Äì —Ä–µ–∑—É–ª—å—Ç–∞—Ç 30 –ª–µ—Ç –ø—Ä–∞–∫—Ç–∏–∫–∏. –≠—Ç–æ –Ω–µ –∑–Ω–∞—á–∏—Ç, —á—Ç–æ —è –¥—É–º–∞–ª –Ω–∞–¥ –Ω–∏–º–∏ 30 –ª–µ—Ç. –°–∫–æ—Ä–µ–µ, –æ–¥–Ω–∞–∂–¥—ã –æ–Ω–∏ —Å–∞–º–∏ —Å—Ç–∏—Ö–∏–π–Ω–æ –≤—ã—Å—Ç—Ä–æ–∏–ª–∏—Å—å, –∫–∞–∫ —Ç–∞–±–ª–∏—Ü–∞ –ú–µ–Ω–¥–µ–ª–µ–µ–≤–∞ –≤ –≥–æ–ª–æ–≤–µ –ú–µ–Ω–¥–µ–ª–µ–µ–≤–∞, –∫–æ–≥–¥–∞ –æ–Ω –ø—Ä–æ—Å–Ω—É–ª—Å—è.\n\n–ü—Ä–∞–≤–∏–ª–∞ –Ω–∞ –ø–µ—Ä–≤—ã–π –≤–∑–≥–ª—è–¥ –ø—Ä–æ—Å—Ç—ã–µ:\n\n1. –î–µ–ª–∞—Ç—å —Ç–æ–ª—å–∫–æ —Ç–æ, —á—Ç–æ —Ö–æ—á–µ—Ç—Å—è.\n\n2. –ù–µ –¥–µ–ª–∞—Ç—å —Ç–æ–≥–æ, —á–µ–≥–æ –¥–µ–ª–∞—Ç—å –Ω–µ —Ö–æ—á–µ—Ç—Å—è.\n\n3. –°—Ä–∞–∑—É –≥–æ–≤–æ—Ä–∏—Ç—å –æ —Ç–æ–º, —á—Ç–æ –Ω–µ –Ω—Ä–∞–≤–∏—Ç—Å—è.\n\n4. –ù–µ –æ—Ç–≤–µ—á–∞—Ç—å, –∫–æ–≥–¥–∞ –Ω–µ —Å–ø—Ä–∞—à–∏–≤–∞—é—Ç.\n\n5. –û—Ç–≤–µ—á–∞—Ç—å —Ç–æ–ª—å–∫–æ –Ω–∞ –≤–æ–ø—Ä–æ—Å.\n\n6. –í—ã—è—Å–Ω—è—è –æ—Ç–Ω–æ—à–µ–Ω–∏—è, –≥–æ–≤–æ—Ä–∏—Ç—å —Ç–æ–ª—å–∫–æ –æ —Å–µ–±–µ.

Response - JSON array ONLY:
["–≤–æ–ø—Ä–æ—Å 1", "–≤–æ–ø—Ä–æ—Å 2", "–≤–æ–ø—Ä–æ—Å 3"]"""

# Send the prompt through the loaded model and show the reply. The prompt asks for
# a JSON array only, but the reply is printed as-is without parsing or validation.
response = ask(model, tokenizer, prompt)
print(response)

ü§ñ Loading model...


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:08<00:00,  2.16s/it]


‚úÖ Ready
[
    "–ß—Ç–æ —Ç–∞–∫–æ–µ –Ω–µ–≤—Ä–æ–∑ –ø–æ –≤–∞—à–µ–º—É –º–Ω–µ–Ω–∏—é?",
    "–ö–∞–∫–∏–µ –∏–∑ —à–µ—Å—Ç–∏ –ø—Ä–∞–≤–∏–ª –≤–∞–º —Å–ª–æ–∂–Ω–æ —Ä–µ–∞–ª–∏–∑–æ–≤–∞—Ç—å –∏ –ø–æ—á–µ–º—É?",
    "–ú–æ–∂–µ—Ç–µ —Ä–∞—Å—Å–∫–∞–∑–∞—Ç—å –ø—Ä–æ —à–µ—Å—Ç–æ–µ –ø—Ä–∞–≤–∏–ª–æ –ø–æ–¥—Ä–æ–±–Ω–µ–µ?"


In [3]:
import json
from pathlib import Path

# Resolve all locations from the working directory, which the rest of the notebook
# treats as the project root, instead of a machine-specific absolute Windows path.
PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT / "data" / "fine_tuning"

# Input files
QA_FILE = DATA_DIR / "train_data_clean.jsonl"
DIALOGUES_FILE = PROJECT_ROOT / "src" / "fine_tuning" / "data" / "Hochu_i_budu_dialogues.jsonl"

# Output
OUTPUT_FILE = DATA_DIR / "train_data_final.jsonl"

all_records = []

# 1. Q&A -> messages: each {"input", "output"} record becomes a two-message chat.
with open(QA_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            r = json.loads(line)
            all_records.append({
                "messages": [
                    {"role": "user", "content": r["input"]},
                    {"role": "assistant", "content": r["output"]}
                ]
            })

print(f"Q&A: {len(all_records)}")

# 2. Dialogues -> messages (rename turns to messages). The turns are copied through
# unchanged; every dialogue record must have a "turns" key (KeyError otherwise).
count = 0
with open(DIALOGUES_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            r = json.loads(line)
            all_records.append({
                "messages": r["turns"]
            })
            count += 1

print(f"Dialogues: {count}")
print(f"Total: {len(all_records)}")

# Save
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    for r in all_records:
        f.write(json.dumps(r, ensure_ascii=False) + '\n')

print(f"Saved to {OUTPUT_FILE}")

Q&A: 245
Dialogues: 44
Total: 289
Saved to C:\Projects\projects_py\labkovsky-model\data\fine_tuning\train_data_final.jsonl


In [22]:
import json
from collections import Counter, defaultdict

def collect_decision_types(jsonl_path: str, show_tags: bool = False):
    """Count ``decision_type`` values in a JSONL corpus and gather their ``aux_tags``.

    Blank lines are skipped. Lines that are not valid JSON, or that are valid JSON
    but not an object, are reported with a ``[WARN]`` message and skipped. Records
    without a ``decision_type`` (missing or null) are ignored.

    Args:
        jsonl_path: Path of the JSONL file to scan.
        show_tags: When true, additionally print the tag frequencies within each
            decision type. Defaults to False, which prints only the type counts.

    Returns:
        A ``(dt_counter, tags_by_dt)`` tuple: a ``Counter`` mapping each decision
        type to its number of records, and a ``defaultdict(list)`` mapping each
        decision type to the flat list of all ``aux_tags`` seen for it.
    """
    dt_counter = Counter()
    tags_by_dt = defaultdict(list)  # dt -> list of all tags

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"[WARN] Line {line_num}: invalid JSON ({e})")
                continue

            # A valid JSON line may still be a list/number/string, which has no .get().
            if not isinstance(obj, dict):
                print(f"[WARN] Line {line_num}: expected a JSON object, got {type(obj).__name__}")
                continue

            dt = obj.get("decision_type")
            if dt is None:
                continue

            # "aux_tags": null must behave like a missing key, and a bare string is
            # one tag (extending a list with a string would add its characters).
            aux_tags = obj.get("aux_tags") or []
            if isinstance(aux_tags, str):
                aux_tags = [aux_tags]

            dt_counter[dt] += 1
            tags_by_dt[dt].extend(aux_tags)

    print("\n=== DECISION TYPES ===")
    for dt, count in dt_counter.most_common():
        print(f"{dt}: {count}")

    # Optional breakdown of tag frequencies within each decision type.
    if show_tags:
        print("\n=== TAGS BY DECISION TYPE ===")
        for dt, count in dt_counter.most_common():
            tag_counter = Counter(tags_by_dt[dt])
            print(f"\n{dt} ({count} total):")
            if tag_counter:
                for tag, tag_count in tag_counter.most_common():
                    print(f"  {tag}: {tag_count}")
            else:
                print("  (no tags)")

    return dt_counter, tags_by_dt

if __name__ == "__main__":
    # Forward slashes work on every platform; a backslash-separated path is only
    # understood on Windows.
    collect_decision_types("./data/fine_tuning/qa_dt_corpus.jsonl")


=== DECISION TYPES ===
SELF_ESTEEM_CORRECTIVE: 81
EXPLANATION: 65
DEPENDENCY_BOUNDARIES: 42
ANXIETY_MANAGEMENT: 20
ADDICTION_PATTERN: 14
CLINICAL_ESCALATION: 13
AFFECTIVE_ADDICTION: 12
PARENTING_MODEL: 12
FEAR_SCENARIO_COPING: 11
PARENTING_LIMITS: 11
