In [1]:
import os
import re

In [5]:
# Read every file in the corpus directory into memory, one string per file.
# os.listdir() returns names in arbitrary order, so they are sorted here to
# keep the order of raw_texts -- and therefore the positional
# train/valid/test split made further down -- the same on every run.
corpus_dir = "CORPUS_TEXT_UTF8SIG"
raw_texts = []
for fname in sorted(os.listdir(corpus_dir)):
    # 'utf-8-sig' drops a leading byte-order mark if the file starts with one.
    with open(os.path.join(corpus_dir, fname), 'r', encoding='utf-8-sig') as f:
        raw_texts.append(f.read())

In [6]:
# Tibetan punctuation marks, kept as named constants because several of the
# glyphs are easy to confuse on screen.

# Marks the cells below actually use:
shad = "།"             # U+0F0D; raw texts are split into sentences on this
tsheg = "་"            # U+0F0B; syllable separator inside a word

# Marks that are defined but not referenced in the cells visible here:
yig_mgo = "༄༅"         # U+0F04 followed by U+0F05
sbrul_shad = "༈"       # U+0F08
gter_ma = "༔"          # U+0F14
visarga = "ཿ"          # U+0F7F
double_tsheg = "："     # NOTE(review): looks like a fullwidth colon (U+FF1A), not a Tibetan code point -- confirm

In [7]:
# Deletion passes applied, in this order, to every shad-delimited segment.
# The URL pass has to run first: the later passes delete the letters and
# punctuation that the URL pattern needs in order to match.
_STRIP_PATTERNS = [
    # urls
    re.compile(r'http\S+'),
    # English characters
    re.compile(r'[a-zA-Z]'),
    # numbers (\d on a str pattern matches any Unicode decimal digit, not just 0-9)
    re.compile(r'\d'),
    # punctuation (inside the class, ",-/" is a range covering , - . /)
    re.compile(r'[\."\(\)《》#%&\'+,-/:;<=>?@\[\]_\{\}«»°]'),
    # weird spaces: BOM, no-break space, zero-width space
    re.compile(r'[\ufeff\xa0\u200b]'),
    # other characters
    re.compile(r'[\—‘’‚“”•…⁃℃①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬○善四基川师广慈播根父组织藏语金会﹙﹚﹪＂％（）、，－：～äéê]'),
]


def clean_segment(segment):
    """Return ``segment`` with every pattern in _STRIP_PATTERNS deleted and
    surrounding whitespace trimmed.

    Tibetan punctuation is NOT removed here; interior whitespace (the word
    boundaries) is left untouched.
    """
    for pattern in _STRIP_PATTERNS:
        segment = pattern.sub('', segment)
    return segment.strip()


# all_sents    : cleaned sentences, words still separated by whitespace.
# joined_sents : the same sentences with the whitespace between words
#                replaced by a tsheg, i.e. with the word boundaries erased.
# Both lists are appended to in the same step, so entry k of one always
# corresponds to entry k of the other. Segments that are empty after cleaning
# are dropped from both.
#
# NOTE(review): the character counts printed further down still contain the
# U+0F0C tsheg variant; it is not normalised here, so later split(tsheg)
# calls do not treat it as a syllable boundary -- confirm this is intended.
all_sents = []
joined_sents = []
for raw_text in raw_texts:
    for segment in raw_text.split(shad):
        sentence = clean_segment(segment)
        if sentence == '':
            continue
        all_sents.append(sentence)
        joined_sents.append(tsheg.join(sentence.split()))

In [8]:
from collections import Counter

# Per-character frequency tables, used to see what the cleaning step removed.
# `s` is the whole cleaned corpus as one string; find() below reads it too.
s = ''.join(all_sents)
c_clean = Counter(s)

# Same tally over the untouched input, accumulated one file at a time.
c_raw = Counter()
for raw_text in raw_texts:
    c_raw.update(raw_text)

In [9]:
def find(ch, text=None, width=10):
    """Print the first occurrence of ``ch`` together with its surrounding context.

    Parameters
    ----------
    ch : str
        Character (or substring) to look for.
    text : str, optional
        String to search. Defaults to the module-level cleaned corpus ``s``.
    width : int, optional
        Number of characters shown before the match; the printed window ends
        ``width`` characters after the start of the match.

    Prints a "not found" message instead of a window when ``ch`` does not occur.
    Returns None.
    """
    if text is None:
        text = s
    i = text.find(ch)
    if i == -1:
        print('{!r} not found'.format(ch))
        return
    # Clamp the start: a negative start index would wrap around to the end of
    # the string and yield an empty or unrelated slice for matches near the
    # beginning.
    start = max(i - width, 0)
    print(text[start:i + width])

In [10]:
c_clean

Counter({' ': 1118992,
         'ༀ': 3,
         '༄': 225,
         '༅': 225,
         '་': 632337,
         '༌': 1490,
         '༎': 7,
         '༑': 16,
         '༔': 62,
         '༛': 1,
         '༸': 60,
         '༼': 170,
         '༽': 226,
         'ཀ': 50129,
         'ཁ': 60718,
         'ག': 374444,
         'ང': 245372,
         'ཅ': 34336,
         'ཆ': 45427,
         'ཇ': 8769,
         'ཉ': 23862,
         'ཊ': 248,
         'ཋ': 1,
         'ཌ': 66,
         'ཎ': 401,
         'ཏ': 25678,
         'ཐ': 34116,
         'ད': 381844,
         'ན': 223304,
         'པ': 121858,
         'ཕ': 28850,
         'བ': 261763,
         'བྷ': 6,
         'མ': 181396,
         'ཙ': 13259,
         'ཚ': 50675,
         'ཛ': 9350,
         'ཛྷ': 1,
         'ཝ': 797,
         'ཞ': 55961,
         'ཟ': 24039,
         'འ': 179858,
         'ཡ': 83403,
         'ར': 255913,
         'ལ': 161316,
         'ཤ': 29574,
         'ཥ': 8,
         'ས': 480395,
         'ཧ': 4367,
         'ཨ': 1

In [11]:
# Build one label sequence per cleaned sentence: every piece produced by
# splitting a word on tsheg gets a label, '1' for the first piece of the word
# and '0' for each remaining piece.
all_targets = []
for sent in all_sents:
    targets = []
    for word in sent.split():
        n_pieces = len(word.split(tsheg))
        targets.append('1')
        targets.extend(['0'] * (n_pieces - 1))
    all_targets.append(targets)

In [12]:
len(all_targets)

122111

In [13]:
all_targets[0] 

['1',
 '0',
 '0',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '0']

In [14]:
all_sents[0]

'སྟག་ལུང་བཀའ་རྒྱུད ཀྱི མ་དགོན ཁྲ འཁྱིལ་མ ལྷ་ཁང གི ཉམས་བཟོ ལེགས་འགྲུབ བྱུང་བ'

In [15]:
# Serialise each label list as a single space-separated string ("1 0 0 1 ...").
# all_targets is rebound under the same name, so the isinstance guard keeps
# this cell idempotent: without it, running the cell a second time would join
# the individual characters of the already-joined strings and corrupt them.
all_targets = [t if isinstance(t, str) else ' '.join(t) for t in all_targets]

In [16]:
# Positional train / valid / test split. Text and label lists are sliced in
# parallel, so they must be index-aligned; fail loudly rather than write out
# mismatched pairs.
assert len(joined_sents) == len(all_targets), "text/target lists are misaligned"

# Slice boundaries (sentence indices). With the 122111 sentences reported
# above this gives 100000 train, 12111 valid and 10000 test sentences.
TRAIN_END = 100000
VALID_END = 112111

train_text = joined_sents[:TRAIN_END]
train_target = all_targets[:TRAIN_END]
valid_text = joined_sents[TRAIN_END:VALID_END]
valid_target = all_targets[TRAIN_END:VALID_END]
test_text = joined_sents[VALID_END:]
test_target = all_targets[VALID_END:]

In [137]:
# Write each split to Corpora/Nanhai/data/<name>.txt, one sentence (or one
# label string) per line, with no trailing newline. 'utf-8-sig' puts a
# byte-order mark at the start of every file.
for split_name, split_lines in [
    ("train_text", train_text),
    ("train_target", train_target),
    ("valid_text", valid_text),
    ("valid_target", valid_target),
    ("test_text", test_text),
    ("test_target", test_target),
]:
    with open("Corpora/Nanhai/data/" + split_name + ".txt", "w", encoding='utf-8-sig') as f:
        f.write('\n'.join(split_lines))

In [18]:
# Length of every entry of all_targets. By this point the entries are the
# space-joined label strings, so each value is a character count (2*n - 1 for
# n labels), not the number of labels.
dist = list(map(len, all_targets))

In [19]:
dist

[37,
 37,
 81,
 45,
 43,
 13,
 13,
 77,
 45,
 11,
 87,
 15,
 35,
 27,
 35,
 39,
 11,
 43,
 71,
 7,
 33,
 9,
 43,
 7,
 21,
 23,
 89,
 19,
 53,
 71,
 43,
 35,
 3,
 23,
 13,
 13,
 13,
 15,
 51,
 41,
 7,
 59,
 55,
 43,
 61,
 3,
 59,
 75,
 87,
 27,
 7,
 9,
 5,
 91,
 41,
 103,
 37,
 31,
 25,
 75,
 67,
 33,
 39,
 17,
 5,
 31,
 47,
 25,
 59,
 15,
 13,
 3,
 3,
 3,
 7,
 3,
 17,
 31,
 9,
 69,
 47,
 71,
 19,
 13,
 43,
 71,
 1,
 21,
 25,
 27,
 69,
 39,
 43,
 35,
 41,
 13,
 7,
 7,
 51,
 1,
 7,
 75,
 75,
 1,
 7,
 77,
 25,
 69,
 31,
 1,
 7,
 33,
 9,
 23,
 13,
 111,
 3,
 17,
 15,
 47,
 67,
 73,
 1,
 23,
 63,
 55,
 39,
 17,
 7,
 7,
 17,
 33,
 51,
 47,
 1,
 21,
 69,
 55,
 47,
 1,
 13,
 25,
 43,
 17,
 9,
 9,
 9,
 49,
 1,
 7,
 39,
 29,
 21,
 3,
 11,
 9,
 13,
 37,
 3,
 27,
 31,
 71,
 3,
 63,
 51,
 31,
 55,
 7,
 7,
 7,
 7,
 7,
 11,
 7,
 25,
 11,
 27,
 21,
 13,
 7,
 11,
 5,
 13,
 9,
 7,
 15,
 35,
 7,
 5,
 31,
 13,
 7,
 49,
 41,
 21,
 7,
 19,
 53,
 7,
 13,
 7,
 7,
 19,
 35,
 7,
 63,
 51,
 11,
 17,
 11,
 11,
 4

In [20]:
import matplotlib.pyplot as plt

In [23]:
# Removes the first occurrence of the value 3673 from dist, in place.
# NOTE(review): presumably this was the largest length at the time; the cell
# that showed it is not in the notebook. Raises ValueError if 3673 is not in
# dist, and re-running the cell removes another occurrence or fails.
dist.remove(3673)

In [26]:
max(dist)

1953

In [27]:
# Drop the value just reported by max(dist). Mutates dist in place, so this
# cell is not re-runnable: a second run raises ValueError unless another 1953
# is present.
dist.remove(1953)

In [28]:
max(dist)

1143

In [29]:
# Drop the value just reported by max(dist). Mutates dist in place, so this
# cell is not re-runnable: a second run raises ValueError unless another 1143
# is present.
dist.remove(1143)

In [30]:
max(dist)

875

In [31]:
import numpy as np

In [32]:
np.median

<function numpy.lib.function_base.median>

In [33]:
np.median(dist)

23.0

In [35]:
np.mean(dist)

29.569872571821666