### Transliterate Data

In [1]:
import json
from langdetect import detect
from googletrans import Translator
import re

In [5]:
def is_french_or_english(text):
    """Return True when ``detect`` classifies ``text`` as French or English.

    Args:
        text: The string to classify (callers in this file pass single words
            with punctuation already stripped).

    Returns:
        bool: True if ``detect(text)`` returns ``'fr'`` or ``'en'``; False for
        any other language code, or when detection raises an error.
    """
    try:
        lang = detect(text)
    except Exception:
        # Best-effort: a failed detection is treated as "not French/English".
        # Catching Exception (rather than a bare except) still lets
        # KeyboardInterrupt and SystemExit propagate, so a long run can be
        # interrupted.
        return False
    return lang in ('fr', 'en')

def transliterate_text(text):
    """Translate into English every word not detected as French or English.

    Despite the name, this calls ``Translator.translate(..., dest='en')``, so
    words are translated rather than transliterated.

    ``text`` is split on whitespace and processed word by word:

    * a word made only of punctuation is kept unchanged;
    * a word whose punctuation-stripped form is detected as French or English
      is kept unchanged (including its punctuation);
    * any other word is replaced by the English translation of its
      punctuation-stripped form; only a single trailing character from
      ``.,!?;:`` is re-attached, other punctuation in that word is dropped;
    * if translation fails for a word, the original word is kept.

    Args:
        text: The string to process.

    Returns:
        str: The processed words joined by single spaces (runs of whitespace
        in the input are therefore collapsed, and an empty or
        whitespace-only input yields ``''``).
    """
    punctuation = re.compile(r'[^\w\s]')  # compiled once, reused per word
    # Created lazily so that text needing no translation (empty input,
    # punctuation only, or all French/English) never constructs a client.
    translator = None
    result = []

    for word in text.split():
        # Clean word of punctuation
        clean_word = punctuation.sub('', word)

        # Nothing to translate: pure punctuation, or already French/English.
        if not clean_word or is_french_or_english(clean_word):
            result.append(word)
            continue

        if translator is None:
            translator = Translator()

        try:
            # NOTE(review): in googletrans releases where ``translate`` is a
            # coroutine, ``.text`` would fail here and the word would be kept
            # as-is by the handler below -- confirm the installed version.
            translated = translator.translate(clean_word, dest='en').text
            # Preserve original punctuation (single trailing mark only)
            if word[-1] in '.,!?;:':
                translated += word[-1]
        except Exception:
            # Keep original if translation fails; unlike a bare except this
            # does not swallow KeyboardInterrupt/SystemExit.
            translated = word
        result.append(translated)

    return ' '.join(result)

def process_corpus(input_file='../data/structured_corpus.json', output_file='../data/transliterated_corpus.json'):
    """Run ``transliterate_text`` over a JSON corpus and save the result.

    The input JSON is read as UTF-8 and must contain a ``'messages'`` list;
    each message's ``'text'`` string and every string in its ``'context'``
    list is replaced by the output of ``transliterate_text``. The modified
    structure is written to ``output_file`` as UTF-8 JSON (non-ASCII kept
    verbatim, 2-space indent).

    Args:
        input_file: Path of the JSON corpus to read.
        output_file: Path the processed JSON is written to.

    Returns:
        The loaded data structure, modified in place.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        corpus = json.load(source)

    for entry in corpus['messages']:
        entry['text'] = transliterate_text(entry['text'])
        # The surrounding context strings get the same treatment.
        converted_context = []
        for context_line in entry['context']:
            converted_context.append(transliterate_text(context_line))
        entry['context'] = converted_context

    with open(output_file, 'w', encoding='utf-8') as target:
        json.dump(corpus, target, ensure_ascii=False, indent=2)

    return corpus

In [None]:
# Run the whole pipeline with the default input/output paths; the returned
# (already processed) corpus is kept for inspection in later cells.
processed_data = process_corpus()

  translated = translator.translate(clean_word, dest='en').text
