### Training Data Creation

In [9]:
import json
import os
import re
from datetime import datetime

In [10]:
def extract_messages(chat_file):
    """Parse a chat transcript into a list of message dictionaries.

    The file is read as UTF-8, blank lines are discarded, and every remaining
    (whitespace-stripped) line becomes one message.

    Line parsing:
      * A leading ``[...]`` group is taken as the timestamp.
      * After the timestamp, everything up to the first ``:`` is the speaker
        and the remainder is the message text.
      * With a timestamp but no ``:``, the speaker is ``"unknown"`` and the
        whole remainder is the text.
      * With no leading ``[...]``, the speaker is ``"unknown"``, the text is
        the whole line, and the timestamp is the current local time at the
        moment of parsing.

    Args:
        chat_file: Path to the transcript file.

    Returns:
        A list of dicts with the keys ``id`` (``"<basename>_<index>"``),
        ``speaker``, ``text``, ``context`` (up to three preceding raw lines),
        ``timestamp``, ``summary`` (empty string) and ``importance`` (None).
    """
    with open(chat_file, 'r', encoding='utf-8') as handle:
        # Strip each line once and keep only the non-empty results.
        lines = [stripped for stripped in (raw.strip() for raw in handle) if stripped]

    file_label = os.path.basename(chat_file)

    def _split_line(line):
        """Return (timestamp, speaker, content) for one stripped line."""
        stamp = re.match(r'\[(.*?)\]', line)
        if stamp is None:
            # No bracketed prefix: fall back to the wall-clock time.
            return datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "unknown", line.strip()

        remainder = line[stamp.end():].strip()
        who = re.match(r'(.*?):', remainder)
        if who is None:
            return stamp.group(1), "unknown", remainder
        return stamp.group(1), who.group(1).strip(), remainder[who.end():].strip()

    messages = []
    for index, line in enumerate(lines):
        timestamp, speaker, content = _split_line(line)
        messages.append({
            "id": f"{file_label}_{index}",
            "speaker": speaker,
            "text": content,
            # Sliding window of at most three previous raw lines (empty for the first).
            "context": lines[max(0, index - 3):index],
            "timestamp": timestamp,
            "summary": "",
            "importance": None,
        })

    return messages

In [11]:
def detect_importance_heuristic(message):
    """Label a message as important (1) or not (0) using a simple score.

    Scoring (all matching is case-insensitive substring matching, so a
    keyword also matches inside longer words):
      * +1   for each distinct importance keyword found in the text
      * +0.5 for each question indicator found in the text
      * +1   if the text contains "http" or "www"
      * +1   if the original text is entirely upper-case and longer than
             five characters
      * +0.5 if the text is longer than 50 characters

    A total score of 2 or more yields 1, otherwise 0.

    Args:
        message: Dict with a ``"text"`` key holding the message string.

    Returns:
        int: 1 if the message is considered important, else 0.
    """
    original_text = message["text"]
    text = original_text.lower()

    # Keywords suggesting importance. Each entry is listed once so that a
    # single occurrence contributes exactly one point.
    important_keywords = (
        "deadline", "assignment", "project", "urgent", "important",
        "please", "devoir", "send", "tomorrow", "today", "test", "exam",
        "meeting", "class", "cours", "emta", "aya se3a", "bedna", "badna",
        "lesim", "due", "code", "tebaato",
    )

    # Question indicators
    question_indicators = ("?", "chou", "wen", "kif", "min", "shu", "chi", "ayya")

    score = 0

    # Keyword matching
    score += sum(1 for keyword in important_keywords if keyword in text)

    # Question detection
    score += 0.5 * sum(1 for indicator in question_indicators if indicator in text)

    # URLs or links are often important
    if "http" in text or "www" in text:
        score += 1

    # ALL CAPS messages often indicate importance. This must inspect the
    # original casing: the lower-cased copy can never be upper-case.
    if original_text.isupper() and len(text) > 5:
        score += 1

    # Length can be a signal (very short messages like "hi" are often less important)
    if len(text) > 50:
        score += 0.5

    return 1 if score >= 2 else 0

In [12]:
def basic_summarize(message):
    """Return a short canned summary for a message based on regex templates.

    The message text is lower-cased and tested against the templates in the
    order listed below; the summary of the first matching template is
    returned. If nothing matches, ``"General message"`` is returned.

    Args:
        message: Dict with a ``"text"`` key holding the message string.

    Returns:
        str: The summary label.
    """
    text = message["text"].lower()

    # (pattern, summary) pairs, tried in order.
    summary_templates = (
        # Greeting words are matched as whole words only. Without the word
        # boundaries "hi" would match inside "this", "something", "which",
        # etc., and since this template is tried first it would shadow all
        # the others.
        (r"\b(?:hi|hello|hey|bonjour|sabaho)\b", "Greeting"),
        # The remaining templates are substring matches, so inflected forms
        # such as "sending" or "meetings" also match.
        ("deadline|due|emta", "Asking about deadline"),
        ("assignment|devoir|homework", "Discussing assignment"),
        ("send|share|show|tebaato|baato", "Requesting to share something"),
        ("code|github|programming", "Discussing code"),
        ("meeting|join|meet", "Discussion about meeting"),
        ("url|http|www", "Shared a link"),
        # Text ending in a question mark.
        (r"\?$", "Asked a question"),
    )

    # Try to match text with templates
    for pattern, summary in summary_templates:
        if re.search(pattern, text):
            return summary

    # Default summary
    return "General message"

In [13]:
def process_chats(input_dir, output_file):
    """Process all chat files in a directory and save them as structured JSON.

    Every regular file in ``input_dir`` whose name ends with ``.txt`` is parsed
    with ``extract_messages``; each message is then labelled with
    ``detect_importance_heuristic`` and ``basic_summarize``. The combined
    list is written to ``output_file`` as ``{"messages": [...]}`` (UTF-8,
    non-ASCII characters kept as-is, 2-space indentation).

    Files are processed in sorted name order so that the order of messages
    in the output is the same on every run and every platform.

    Args:
        input_dir: Directory to scan for ``.txt`` chat files (not recursive).
        output_file: Path of the JSON file to write. Its parent directory is
            created if it does not exist.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    all_messages = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file in sorted(os.listdir(input_dir)):
        if not file.endswith('.txt'):
            continue

        file_path = os.path.join(input_dir, file)
        # Skip directories (or other non-regular entries) that happen to end in ".txt".
        if not os.path.isfile(file_path):
            continue

        print(f"Processing {file_path}...")

        messages = extract_messages(file_path)

        # Apply basic heuristics for importance and summary
        for message in messages:
            message["importance"] = detect_importance_heuristic(message)
            message["summary"] = basic_summarize(message)

        all_messages.extend(messages)

    # Make sure the destination directory exists before writing.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save to JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({"messages": all_messages}, f, ensure_ascii=False, indent=2)

    print(f"Processed {len(all_messages)} messages and saved to {output_file}")
    print("Note: This is a basic processing with heuristic importance detection.")
    print("You should review and refine the importance labels and summaries.")

In [14]:
# Directory that is scanned for .txt chat files.
input_dir = "./data/processed_chats"
# Destination of the combined JSON corpus.
output_file = "data/structured_corpus.json"

# Build the corpus; labels and summaries come from the heuristics defined above.
process_chats(input_dir=input_dir, output_file=output_file)

Processing ./data/processed_chats/chat_1.txt...
Processing ./data/processed_chats/chat_2.txt...
Processing ./data/processed_chats/chat_3.txt...
Processing ./data/processed_chats/chat_6.txt...
Processing ./data/processed_chats/chat_4.txt...
Processed 86237 messages and saved to data/structured_corpus.json
Note: This is a basic processing with heuristic importance detection.
You should review and refine the importance labels and summaries.
