### Dataset Statistics

In [12]:
import os
from pathlib import Path
import datetime

In [15]:
raw_chats_dir = Path("data/processed_chats")

In [16]:
total_messages = 0

for file in raw_chats_dir.glob("*.txt"):
    with open(file, "r", encoding="utf-8") as f:
        total_messages += len(f.readlines())

print(f"Total messages across all files: {total_messages}")


Total messages across all files: 86237


In [9]:
unique_speakers = set()

for file in raw_chats_dir.glob("*.txt"):
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            if ":" in line:
                speaker = line.split("]")[1].split(":")[0].strip()
                unique_speakers.add(speaker)

print(f"Total unique participants: {len(unique_speakers)}")


Total unique participants: 253


In [8]:
total_words = 0
total_messages = 0

for file in raw_chats_dir.glob("*.txt"):
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            if ":" in line:
                message = line.split(":", 1)[1].strip()
                total_words += len(message.split())
                total_messages += 1

avg_length = total_words / total_messages if total_messages > 0 else 0
print(f"Average message length: {avg_length:.2f} words")


Average message length: 7.76 words


In [10]:
import re
from langdetect import detect, LangDetectException

code_switched_messages = 0
total_messages = 0

for file in raw_chats_dir.glob("*.txt"):
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            if ":" in line:
                message = line.split(":", 1)[1].strip()
                if message:  # Skip empty messages
                    total_messages += 1
                    try:
                        # Split message into words and detect language for each
                        words = re.findall(r'\w+', message)
                        languages = set()
                        for word in words:
                            try:
                                lang = detect(word)
                                languages.add(lang)
                            except LangDetectException:
                                continue
                        if len(languages) > 1:
                            code_switched_messages += 1
                    except LangDetectException:
                        continue

percentage = (code_switched_messages / total_messages * 100) if total_messages > 0 else 0
print(f"Code-switched messages: {code_switched_messages} ({percentage:.2f}%)")


Code-switched messages: 85994 (99.72%)


In [14]:
thread_lengths = []
current_thread = []
last_timestamp = None

for file in raw_chats_dir.glob("*.txt"):
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            if ":" in line:
                try:
                    # Extract timestamp from line
                    timestamp_str = line.split("]")[0].strip("[")
                    timestamp = datetime.datetime.strptime(timestamp_str, "%m/%d/%Y, %I:%M:%S %p")
                    
                    if last_timestamp is None:
                        last_timestamp = timestamp
                        current_thread.append(line)
                    else:
                        time_diff = (timestamp - last_timestamp).total_seconds() / 60
                        if time_diff <= 30:
                            current_thread.append(line)
                        else:
                            if current_thread:
                                thread_lengths.append(len(current_thread))
                            current_thread = [line]
                        last_timestamp = timestamp
                except (ValueError, IndexError):
                    continue
            else:
                if current_thread:
                    thread_lengths.append(len(current_thread))
                    current_thread = []
                    last_timestamp = None
        if current_thread:
            thread_lengths.append(len(current_thread))
            current_thread = []
            last_timestamp = None

avg_thread_length = sum(thread_lengths) / len(thread_lengths) if thread_lengths else 0
print(f"Average messages per conversation thread (1 conversation ~ 30 mins): {avg_thread_length:.2f}")

Average messages per conversation thread (1 conversation ~ 30 mins): 26.38
