In [None]:
from src.requirements import *
from src.tokenizer import *

In [None]:
# Invisible zero-width code points looked up by the transcript cleaner.
FORMATTING_REMOVE = {
    '\u200d',  # ZERO WIDTH JOINER
    '\u200c',  # ZERO WIDTH NON-JOINER
    '\u200b',  # ZERO WIDTH SPACE
    '\ufeff',  # ZERO WIDTH NO-BREAK SPACE (byte-order mark)
}

# Directional marks, the no-break space and curly double quotes.
JUNK = {
    '\u200e',  # LEFT-TO-RIGHT MARK
    '\u200f',  # RIGHT-TO-LEFT MARK
    '\xa0',    # NO-BREAK SPACE
    '“',       # opening curly double quote
    '”',       # closing curly double quote
}

In [None]:
def is_valid_char(ch):
    """Tell whether a single character may stay in a cleaned transcript.

    Accepted: any code point in the Devanagari block (U+0900..U+097F) and the
    whitespace / punctuation characters listed below. Everything else,
    including the ASCII digits 0-9, is rejected.

    Args:
        ch: A one-character string (``ord`` raises ``TypeError`` otherwise).

    Returns:
        True if the character is allowed, False otherwise.
    """
    codepoint = ord(ch)

    in_devanagari_block = 0x0900 <= codepoint <= 0x097F
    # NOTE(review): ASCII digits are rejected explicitly; confirm that dropping
    # them (rather than keeping them) is the intended policy.
    is_ascii_digit = ch in "0123456789"
    is_basic_punctuation = ch in " \n\t.,?!-()\"'।॥"

    if in_devanagari_block:
        return True
    if is_ascii_digit:
        return False
    return is_basic_punctuation

In [None]:
def clean_transcript(text):
    """Normalize one transcript and drop every character the pipeline rejects.

    Steps, in order:
      1. A non-string input (for example a missing value) yields "".
      2. The text is NFD-normalized.
      3. No-break spaces become ordinary spaces.
      4. Characters in FORMATTING_REMOVE or JUNK, and any character for which
         is_valid_char() is false, are removed.
      5. RRA (U+0931) is folded to RA (U+0930).

    Args:
        text: The raw transcript; any non-``str`` value is treated as empty.

    Returns:
        The cleaned transcript as a ``str``.
    """
    if not isinstance(text, str):
        return ""

    text = unicodedata.normalize("NFD", text)

    # Convert NBSP to a plain space before filtering so the word boundary
    # survives instead of being deleted with the other rejected characters.
    text = text.replace('\xa0', ' ')

    kept = "".join(
        ch for ch in text
        if ch not in FORMATTING_REMOVE and ch not in JUNK and is_valid_char(ch)
    )

    # NFD has already split U+0931 into RA (U+0930) + NUKTA (U+093C), so a
    # precomposed literal can never match at this point. Fold the decomposed
    # sequence instead; doing it after filtering also catches pairs that were
    # separated only by a removed invisible character.
    return kept.replace('\u0930\u093c', '\u0930')

# Source metadata table and the destination for its cleaned copy.
input_path = os.path.join("data", "metadata.tsv")
output_path = os.path.join("data", "metadata_normal.tsv")

# Read the tab-separated metadata into a DataFrame.
df = pd.read_csv(input_path, sep="\t")

# Run every transcript through the cleaner, overwriting the column in place.
print("Cleaning transcripts...")
df["transcript"] = df["transcript"].apply(clean_transcript)

# Write the result as UTF-8 TSV without the DataFrame index.
df.to_csv(output_path, sep="\t", index=False, encoding="utf-8")
print(f"Done! Cleaned file saved to: {output_path}")

In [None]:
# Load the tokenizer definition from data/tokenizer.json.
# NOTE(review): `Tokenizer` is presumably supplied by the `src.tokenizer`
# star import at the top of the notebook — confirm.
token_path = os.path.join("data", "tokenizer.json")
tokenizer = Tokenizer.load(token_path)

In [None]:
# Every distinct character that occurs anywhere in the transcript column.
all_chars = set("".join(df['transcript'].astype(str)))

# Characters with no entry in the tokenizer's token_to_id lookup. Sorted so the
# report is identical on every run: iteration order of a set of strings varies
# between interpreter sessions because string hashing is randomized.
missing = sorted(c for c in all_chars if c not in tokenizer.token_to_id)

if missing:
    print(f"Still missing these characters: {repr(''.join(missing))}")
else:
    print("Tokenizer is fully compatible with the cleaned metadata!")