### Imports

In [72]:
import re
import pandas as pd
from tqdm import tqdm
import os

### Configuration

In [73]:
# Selects which script this run processes; used as a position into
# `languages` and `unicodes` below.
index = 0

# Each script name is kept next to the regex that matches a run of characters
# in its code-point range, then split back into the two parallel lists that
# the rest of the notebook indexes with `index`.
_script_table = [
    ('devanagari', '[\u0900-\u097F]+'),
    ('tamil', '[\u0B80-\u0BFF]+'),
    ('telugu', '[\u0C00-\u0C7F]+'),
]
languages = [_name for _name, _pattern in _script_table]
unicodes = [_pattern for _name, _pattern in _script_table]

# Roots of the training and validation label sets (one sub-folder per script).
train_path = '/data/BADRI/IHTR/trainset/'
val_path = '/data/BADRI/IHTR/validationset_small/'

# Output folder for the files generated for the selected script.
dest_path = './../dataset/' + languages[index]

### Load Training and Validation Labels

In [74]:
def _read_labels(root, list_file):
    """Read one space-separated label file for the selected script.

    The file is parsed without a header row; its two columns are named
    `file` and `token`.
    """
    return pd.read_csv(root + languages[index] + list_file, names=['file','token'], sep=' ')


# Training and validation labels are loaded separately, then stacked into the
# single frame the vocabulary scan reads from.
df1 = _read_labels(train_path, '/train.txt')
df2 = _read_labels(val_path, '/val.txt')
df = pd.concat([df1,df2])

### Fetching Vocabulary

In [75]:
# Matcher for the selected script, compiled once from the configured pattern.
hin_re = re.compile(unicodes[index])

# Accumulators filled by the vocabulary scan:
#   unique_words - distinct matched words, in first-seen order
#   characters   - every character occurring in those words
unique_words = list()
characters = set()

In [76]:
# Scan every label token once and collect the vocabulary.
#
# The previous version wrapped this scan in `for path in sets:`; `sets` is not
# defined anywhere in the notebook (NameError on a fresh kernel) and `path` was
# never used in the body, so the outer loop is dropped.

# Set mirror of `unique_words` for O(1) membership tests; the list itself is
# kept so that first-seen order is preserved. Seeding it from the list keeps
# the cell idempotent when it is re-run.
seen_words = set(unique_words)

# dropna() skips rows whose token is missing, which would otherwise make
# `hin_re.match` raise TypeError.
for word in df['token'].dropna().astype(str):
    # `match` anchors at the start of the token: only a leading run of
    # in-script characters is kept; tokens starting with anything else are skipped.
    match = hin_re.match(word)
    if not match:
        continue
    tword = match.group()
    if len(tword) > 32:
        # Over-long words are reported and left out of the vocabulary.
        print(tword)
        continue
    if tword not in seen_words:
        seen_words.add(tword)
        unique_words.append(tword)
        characters.update(tword)

print("Unique Characters observed",len(characters))
print(characters, end='\n\n')
print("No of unique words:",len(unique_words))

Unique Characters observed 107
{'ध', 'ं', 'ऑ', 'म', 'य़', 'य', 'ज़', 'प', 'ॐ', '॓', '॰', 'स', 'ृ', 'ञ', 'ए', 'ू', 'व', 'अ', '७', '।', 'ऍ', 'ख़', 'ङ', '१', 'ड़', 'ष', 'ण', 'ऋ', 'ओ', 'ग', 'च', 'ऴ', 'ढ', 'थ', 'ऎ', 'ॄ', 'ज', 'ॻ', 'ी', 'न', 'त', 'ग़', '४', 'ँ', 'ठ', 'औ', 'ल', '॔', 'फ', 'उ', 'द', 'ब', 'ॢ', 'ट', '॒', 'े', 'ई', 'ऽ', 'ॾ', 'भ', 'आ', 'ऩ', 'ॠ', '॥', 'ॲ', 'ॅ', '़', 'ॱ', 'ॉ', 'ॽ', '्', 'ौ', 'ऊ', 'ॼ', '८', '५', 'ख', 'ड', 'ु', 'छ', 'घ', 'ळ', 'ऱ', '॑', 'ा', 'ः', 'झ', 'ो', 'ह', 'श', 'इ', 'ि', 'ै', '३', '९', '६', '०', 'र', 'ऐ', 'ॊ', 'ऌ', 'फ़', 'ऒ', 'क', 'ढ़', '२', 'क़'}

No of unique words: 9810


In [77]:
# Persist the collected vocabulary for the selected script.

# Create the destination folder if it is missing, so `open(..., 'w')` does not
# fail with FileNotFoundError on a first run.
os.makedirs(dest_path, exist_ok=True)

vocab_loc = os.path.join(dest_path, "vocab.txt")
unique_words_loc = os.path.join(dest_path, "unique_words.txt")

# Explicit UTF-8: the default encoding is platform/locale dependent and may
# not be able to represent the characters being written.
with open(vocab_loc, 'w', encoding='utf-8') as f:
    # Sorted so the file is identical across runs; iterating a set of strings
    # directly gives an order that can change from one kernel to the next.
    f.write(" ".join(sorted(characters)))

# One word per line, in the order the words were first seen.
with open(unique_words_loc, 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % word for word in unique_words)