In [4]:
import os
import re

def extract_text_from_chats():
    # Create output directory if it doesn't exist
    os.makedirs('../data/only_text_chats', exist_ok=True)
    
    # Get all chat files
    chat_files = [f for f in os.listdir('../data/processed_chats') if f.endswith('.txt')]
    
    for chat_file in chat_files:
        input_path = os.path.join('../data/processed_chats', chat_file)
        output_path = os.path.join('../data/only_text_chats', chat_file)
        
        with open(input_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            
        # Extract only the text part after the timestamp and speaker
        text_lines = []
        for line in lines:
            # Match pattern: [timestamp] speaker: text
            match = re.match(r'^\[.*?\]\s+.*?:\s+(.+)$', line)
            if match:
                text = match.group(1).strip()
                if text:  # Only add non-empty messages
                    text_lines.append(text + '\n')
        
        # Write only the text to new file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.writelines(text_lines)
            
        print(f"Processed {chat_file}")

In [5]:
extract_text_from_chats()

Processed chat_1.txt
Processed chat_2.txt
Processed chat_3.txt
Processed chat_6.txt
Processed chat_4.txt


In [7]:
pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Installing collected packages: python-Levenshtein
Successfully installed python-Levenshtein-0.27.1
Note: you may need to restart the kernel to use updated packages.


In [8]:
import os
import re
from collections import defaultdict
from Levenshtein import distance
from typing import Dict, Set

def cluster_similar_words(threshold: float = 0.8) -> Dict[str, Set[str]]:
    """
    Cluster words based on similarity using Levenshtein distance.
    
    Args:
        threshold: Minimum similarity ratio (0-1) for words to be considered similar
    
    Returns:
        Dictionary mapping each word to its cluster of similar words
    """
    # Get all words from files
    words = set()
    chat_dir = '../data/only_text_chats'
    
    for filename in os.listdir(chat_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(chat_dir, filename), 'r', encoding='utf-8') as f:
                text = f.read().lower()
                # Extract words, keeping only alphanumeric characters
                file_words = re.findall(r'\b\w+\b', text)
                words.update(file_words)
    
    # Convert to list for easier indexing
    words = list(words)
    clusters = defaultdict(set)
    
    # Compare each word with every other word
    for i, word1 in enumerate(words):
        if word1 in clusters:  # Skip if already clustered
            continue
            
        clusters[word1].add(word1)  # Add word to its own cluster
        
        for word2 in words[i+1:]:
            if word2 in clusters:  # Skip if already clustered
                continue
                
            # Calculate similarity ratio
            max_len = max(len(word1), len(word2))
            if max_len == 0:
                continue
                
            similarity = 1 - (distance(word1, word2) / max_len)
            
            if similarity >= threshold:
                clusters[word1].add(word2)
                clusters[word2].add(word1)
    
    return clusters

In [9]:
def print_clusters(clusters: Dict[str, Set[str]], min_cluster_size: int = 2):
    """Print clusters with at least min_cluster_size words"""
    for word, cluster in clusters.items():
        if len(cluster) >= min_cluster_size:
            print(f"Cluster for '{word}': {sorted(cluster)}")

In [None]:
import json

clusters = cluster_similar_words(threshold=0.8)
with open('normal_clusters.json', 'w', encoding='utf-8') as f:
    json.dump({k: sorted(list(v)) for k, v in clusters.items() if len(v) >= 2}, f, ensure_ascii=False, indent=2)