### Remove irrelevant messages (media placeholders, system notices, @-number mentions, emoji)

In [38]:
import re
import os

In [43]:
# Compiled character class that matches a run of one or more code points
# drawn from the Unicode ranges this notebook treats as emoji.
emoji_pattern = re.compile(
    "[{}]+".format(
        "".join(
            (
                "\U0001F600-\U0001F64F",  # Emoticons
                "\U0001F300-\U0001F5FF",  # Symbols & pictographs
                "\U0001F680-\U0001F6FF",  # Transport & map symbols
                "\U0001F1E0-\U0001F1FF",  # Flags
                "\U00002700-\U000027BF",  # Dingbats
                "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
                "\U0001FA70-\U0001FAFF",  # Extended symbols
                "\U00002600-\U000026FF",  # Misc symbols
                "\U00002B50-\U00002B55",  # U+2B50..U+2B55 (star, heavy circle, ...)
            )
        )
    ),
    flags=re.UNICODE,
)

def is_media_or_system_message(message):
    """Decide whether a chat message should be discarded.

    A message is flagged (``True``) when, after removing every U+200E
    (LEFT-TO-RIGHT MARK) character, any of the following holds:

    * it contains one of the placeholder / system phrases listed below,
      compared case-insensitively as a substring;
    * it contains an ``@`` immediately followed by a digit
      (tags like @961XXXXXXXX);
    * it contains at least one character matched by ``emoji_pattern``.

    Args:
        message: The message text (str) to inspect.

    Returns:
        bool: ``True`` if the message should be dropped, ``False`` otherwise.
    """
    # Invisible direction marks would otherwise break the substring checks.
    text = message.replace('\u200e', '')
    lowered_text = text.lower()

    # Phrases marking media omissions and system notices.
    blocked_phrases = (
        "audio omitted",
        "sticker omitted",
        "image omitted",
        "video omitted",
        "document omitted",
        "This message was deleted",
        "You deleted this message",
        "You created group",
        "Messages and calls are end-to-end encrypted",
        "<This message was edited>",
        "changed their phone number to a new number",
    )
    for phrase in blocked_phrases:
        if phrase.lower() in lowered_text:
            return True

    # Tags such as @961XXXXXXXX: an '@' directly followed by a digit.
    if re.search(r'@\d', text):
        return True

    # Last criterion: any emoji character anywhere in the message.
    return emoji_pattern.search(text) is not None

def clean_whatsapp_chat(input_file, output_file):
    """Filter one chat export and write the surviving messages to a new file.

    Each input line is expected to look like ``[timestamp] sender: message``.
    A line is kept only when it matches that shape and
    ``is_media_or_system_message`` returns a falsy value for its (stripped)
    message text. Kept lines are re-emitted as
    ``[timestamp] sender: message`` followed by a newline.

    Blank lines and any line that does not match the expected shape are
    dropped silently.

    NOTE(review): if the export wraps a multi-line message onto following
    lines that carry no ``[timestamp] sender:`` prefix, those continuation
    lines are lost here - confirm this is intended.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to (over)write.
    """
    line_shape = re.compile(r'\[(.*?)\] (.*?): (.*)')

    # The input is read completely before the output is opened.
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    kept = []
    for raw in raw_lines:
        # Whitespace-only lines are never parsed.
        parsed = line_shape.match(raw) if raw.strip() else None
        if parsed is None:
            continue

        timestamp, sender, message = parsed.groups()
        message = message.strip()

        # Keep only messages that are not media placeholders / system notices.
        if not is_media_or_system_message(message):
            kept.append(f"[{timestamp}] {sender}: {message}\n")

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(kept)

In [44]:
def process_all_chats(raw_chats_dir='./data/raw_chats',
                      processed_chats_dir='./data/processed_chats'):
    """Clean every ``.txt`` chat export in a directory.

    Each regular file in ``raw_chats_dir`` whose name ends in ``.txt`` is
    passed to ``clean_whatsapp_chat`` and written under the same file name
    in ``processed_chats_dir``. A ``Processed <filename>`` line is printed
    for each file handled.

    Args:
        raw_chats_dir: Directory holding the raw exports. Defaults to
            ``./data/raw_chats``.
        processed_chats_dir: Directory receiving the cleaned files; created
            (including missing parents) when absent. Defaults to
            ``./data/processed_chats``.

    Raises:
        FileNotFoundError: If ``raw_chats_dir`` does not exist.
        FileExistsError: If ``processed_chats_dir`` exists but is not a
            directory.
    """
    # exist_ok=True replaces the check-then-create pair, so there is no
    # window between the existence test and the directory creation.
    os.makedirs(processed_chats_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort so the processing
    # order (and the printed log) is the same on every run.
    for filename in sorted(os.listdir(raw_chats_dir)):
        if not filename.endswith('.txt'):
            continue

        input_path = os.path.join(raw_chats_dir, filename)
        # A directory that happens to be named "*.txt" cannot be opened as
        # a chat file; skip anything that is not a regular file.
        if not os.path.isfile(input_path):
            continue

        output_path = os.path.join(processed_chats_dir, filename)
        clean_whatsapp_chat(input_path, output_path)
        print(f"Processed {filename}")

In [45]:
# Run the cleaning pass over every .txt export in ./data/raw_chats.
process_all_chats()

Processed chat_1.txt
Processed chat_2.txt
Processed chat_3.txt
Processed chat_6.txt
Processed chat_4.txt
Processed chat_5.txt
