In [23]:
from collections import Counter
from nltk import ngrams

import itertools
import os
import pybo

Store the texts in the following directory structure, with one subdirectory per corpus and each text as a UTF-8 encoded `.txt` file:

    texts
      Bon
        file1.txt
        file2.txt
        ...
      Nyingma
        file1.txt
        file2.txt
        ...

In [2]:
# Build the pybo tokenizer once and reuse it everywhere below: construction
# loads a trie, which takes a few seconds (see the timing printed by this cell).
tok = pybo.BoTokenizer('POS')

Loading Trie...
Time: 4.4831321239471436


In [16]:
# Tibetan syllable delimiter; stripped from token edges and used to join bigrams.
tsek = '་'
# Function words / particles excluded from the filtered word and bigram lists.
filter_list = ['གི','ལ','གིས','དང','ནི','ནས','པ','གོ','ཀྱང','དུ','ཀྱིས','དེ','ཏེ','མ','སྟེ','སུ','ཀྱི','གྱིས','ན','ཅེས','འདི','གྱི','ཅི','བ','པར','ཡིས','ཅིང','མོ','ཅེས་པ','རྣམས','ཏུ']

def read_texts(text_dir, tokenizer=None):
    """Tokenize every ``.txt`` file in ``text_dir`` and build word / bigram lists.

    Files are processed in sorted file-name order and read as UTF-8 (a leading
    BOM, if present, is dropped by the ``utf-8-sig`` codec).

    Args:
        text_dir: Directory containing the text files of one corpus.
        tokenizer: Object with a ``tokenize(str)`` method returning tokens that
            expose ``type``, ``pos``, ``lemma`` and ``content``. Defaults to the
            notebook-level ``tok``.

    Returns:
        A tuple ``(texts, texts_f, bigrams_f, fnames)`` of parallel lists with
        one entry per file:
          * ``texts``     - all lemmas of the file,
          * ``texts_f``   - the lemmas that are not in ``filter_list``,
          * ``bigrams_f`` - tsek-joined pairs of adjacent lemmas in which
                            neither member is in ``filter_list``,
          * ``fnames``    - the file name.
    """
    if tokenizer is None:
        tokenizer = tok
    # Build the lookup set once instead of scanning the list for every token.
    stopwords = set(filter_list)

    fnames = []
    texts = []
    texts_f = []
    bigrams_f = []

    for filename in sorted(os.listdir(text_dir)):
        # Suffix test: a substring test would also pick up e.g. "a.txt.bak".
        if not filename.endswith(".txt"):
            continue

        with open(os.path.join(text_dir, filename), 'r', encoding='utf-8-sig') as f:
            raw = f.read()
        fnames.append(filename)

        tokens = [t for t in tokenizer.tokenize(raw)
                  if t.type != "non-bo" and t.pos != "punct"]

        # Prefer the lemma; fall back to the surface form when there is none.
        lemmas = [(t.lemma or t.content).strip(tsek) for t in tokens]
        texts.append(lemmas)

        texts_f.append([l for l in lemmas if l not in stopwords])

        # Adjacent pairs are taken from the UNFILTERED lemma sequence, so a
        # bigram never bridges a removed function word.
        bigrams_f.append([tsek.join(pair)
                          for pair in zip(lemmas, lemmas[1:])
                          if stopwords.isdisjoint(pair)])

    return texts, texts_f, bigrams_f, fnames

In [21]:
# Corpus directories, in the order shared by every all_* list below.
_corpus_dirs = [
    "texts/Bon",
    "texts/Bon-Large",
    "texts/Nyingma",
    "texts/Nyingma-Large",
    "texts/Nyingma-GN",
    "texts/Bon-Dzogchen",
    "texts/Nyingma-Dzogchen",
    "texts/Vairo",
    "texts/Dunhuang",
    "texts/Dunhuang-Chan",
    "texts/Mahamudra",
]

# One (texts, texts_f, bigrams_f, fnames) tuple per corpus, as returned by read_texts.
_loaded = [read_texts(corpus_dir) for corpus_dir in _corpus_dirs]

# Bind the per-corpus names used by the rest of the notebook.
(
    (bon, bon_f, bon_bigrams_f, bon_fnames),
    (bon_large, bon_large_f, bon_large_bigrams_f, bon_large_fnames),
    (nyingma, nyingma_f, nyingma_bigrams_f, nyingma_fnames),
    (nyingma_large, nyingma_large_f, nyingma_large_bigrams_f, nyingma_large_fnames),
    (nyingma_gn, nyingma_gn_f, nyingma_gn_bigrams_f, nyingma_gn_fnames),
    (bon_dzogchen, bon_dzogchen_f, bon_dzogchen_bigrams_f, bon_dzogchen_fnames),
    (nyingma_dzogchen, nyingma_dzogchen_f, nyingma_dzogchen_bigrams_f, nyingma_dzogchen_fnames),
    (vairo, vairo_f, vairo_bigrams_f, vairo_fnames),
    (dunhuang, dunhuang_f, dunhuang_bigrams_f, dunhuang_fnames),
    (dunhuang_chan, dunhuang_chan_f, dunhuang_chan_bigrams_f, dunhuang_chan_fnames),
    (mahamudra, mahamudra_f, mahamudra_bigrams_f, mahamudra_fnames),
) = _loaded

# Column-wise views over all corpora: element k of each list belongs to _corpus_dirs[k].
all_texts = [corpus[0] for corpus in _loaded]
all_texts_f = [corpus[1] for corpus in _loaded]
all_bigrams_f = [corpus[2] for corpus in _loaded]
all_fnames = [corpus[3] for corpus in _loaded]



In [12]:
# Per corpus, print two CSV-style lines: the file names, then the
# double-quoted token count of each file in the same order.
for ts, fs in zip(all_texts, all_fnames):
    print(','.join(fs))
    print('"{}"'.format('","'.join(str(len(t)) for t in ts)))

01-SMDG-gser-lung-non-che.txt,02-SMDG-gser-lung-non-chung.txt,03-SMDG-sems-phran-rig-pa-khu-byug-sa-gcod.txt,04-SMDG-rig-pa-khu-byug-gzhung.txt,05-SMDG-rig-pa-khu-byug-grel.txt,06-SMDG-cog-bzhag-sa-gcod.txt,07-SMDG-cog-bzhag-gzhung.txt,08-SMDG-cog-bzhag-grel.txt,09a-SMDG-rgyun-thag-sa-gcod.-144-145docx.txt,09b-SMDG-rgyun-thag-gzhung-145-147.txt,09c-SMDG-rgyun-thag-grel-147-160.txt,10-SMDG-sems-lung-rgyun-thag.txt,11-SMDG-sems-lung-rgyun-thag-'grel.txt,12-SMDG-lhug-par-bzhag-pa.txt,13-ngang-thag_sems-smad-sde-dgu-TTN-v3.txt,14-SMDG-thig-le-dbyings-kyi-ti-ka-v2.txt
"14446","2274","642","1797","16341","183","213","3720","630","534","3653","288","1337","1565","5838","7227"
Kanjur-172-gab-pa-dgu་_pp.1-565.txt,kanjur-174-1_gser-gyi-rus-sbal.txt
"82517","7041"
L1-rig-pa'i-khu-byug_p302-324.txt,L2-rtsal-chen_p436-447་.txt,L3-khyung-chen_p447-468.txt,L4_byang-sems-bsgom-rdo-la-gser-zhun-p416-425.txt,L5_nam-mkha'-che-rgyas-pa-yi-ge-med-pa_V8-p.468-473.txt,LL01_L14_rtse-mo-byung-rgyal_vol8-p480-4

In [25]:
# Number of most frequent words / bigrams listed for each text.
TOP_N = 20

# Per corpus, print a CSV-style table: a header row of file names, then TOP_N
# rows in which every text contributes two cells ("word: count" and
# "bigram: count") for that frequency rank.
for tfs, bfs, fs in zip(all_texts_f, all_bigrams_f, all_fnames):
    # ', ,' puts a spare cell after each file name so the header lines up with
    # the two columns each text occupies in the rows below.
    print(', ,'.join(fs))
    print('\n')
    word_cs = [Counter(t) for t in tfs]
    word_most_commons = [c.most_common(TOP_N) for c in word_cs]
    bigram_cs = [Counter(b) for b in bfs]
    bigram_most_commons = [c.most_common(TOP_N) for c in bigram_cs]

    for i in range(TOP_N):
        # most_common(TOP_N) returns fewer than TOP_N pairs for a text with
        # fewer distinct items; emit an empty cell instead of raising
        # IndexError so the remaining columns stay aligned.
        word_arr = ["{}: {}".format(p[i][0], p[i][1]) if i < len(p) else ""
                    for p in word_most_commons]
        bigram_arr = ["{}: {}".format(p[i][0], p[i][1]) if i < len(p) else ""
                      for p in bigram_most_commons]

        print(",".join([val for pair in zip(word_arr, bigram_arr) for val in pair]))
    print("\n\n")

01-SMDG-gser-lung-non-che.txt, ,02-SMDG-gser-lung-non-chung.txt, ,03-SMDG-sems-phran-rig-pa-khu-byug-sa-gcod.txt, ,04-SMDG-rig-pa-khu-byug-gzhung.txt, ,05-SMDG-rig-pa-khu-byug-grel.txt, ,06-SMDG-cog-bzhag-sa-gcod.txt, ,07-SMDG-cog-bzhag-gzhung.txt, ,08-SMDG-cog-bzhag-grel.txt, ,09a-SMDG-rgyun-thag-sa-gcod.-144-145docx.txt, ,09b-SMDG-rgyun-thag-gzhung-145-147.txt, ,09c-SMDG-rgyun-thag-grel-147-160.txt, ,10-SMDG-sems-lung-rgyun-thag.txt, ,11-SMDG-sems-lung-rgyun-thag-'grel.txt, ,12-SMDG-lhug-par-bzhag-pa.txt, ,13-ngang-thag_sems-smad-sde-dgu-TTN-v3.txt, ,14-SMDG-thig-le-dbyings-kyi-ti-ka-v2.txt


སེམས: 166,ལུང་བརྣན: 80,མཚན་མ: 54,ལུང་བརྣན: 25,སེམས: 17,བྱང་ཆུབ་སེམས: 6,ཁུ་བྱུག: 62,རེ་སྤྲོ: 21,སེམས: 207,བོན་ཉིད: 61,ཅོག: 5,ཅོག་བཞག: 4,རྣམ་པ: 9,རྣམ་པ་གསུམ་པོ: 9,མྱེད: 95,རྣམ་པ་གསུམ: 15,དོན: 41,བྱང་ཆུབ་སེམས: 11,སེམས: 17,བྱང་ཆུབ་སེམས: 7,མྱེད: 134,མྱེད་པས: 26,མི: 9,མི་དམྱིགས: 5,ཉིད: 30,བོན་ཉིད: 11,མི: 50,བོན་ཉིད: 12,བྱ: 74,གཤེན་ལྷ་ཀ: 22,ལས: 86,སྤྲུལ་ལོ: 17
བདག: 157,སྙམ་བྱེད: 34,མྱེད: 48,མཚན་མ་ཐམས་ཅ

མཛད: 4,ཨཱ་ཙཱརྻ: 2,སེམས: 45,བྱང་ཆུབ་སེམས: 16,ཡིན: 2107,ཕྱག་རྒྱ་ཆེན་པོ: 229,མེད: 158,སྐྱེ་མེད: 18,དགའ: 6,བྱེད་རང: 4
ཐམས་ཅད: 4,ཙཱརྻ་དཔའ་བོ: 2,ཡིན: 38,བྱང་ཆུབ་སེམས་དཔའ: 5,མི: 1942,ངོ་བོ་ཉིད: 196,མི: 76,གཉིས་མེད: 18,མི: 6,རང་དགའ: 4
འདོད: 4,དཔའ་བོ་རྣམ་པར: 2,བྱང་ཆུབ: 35,རང་བཞིན་ཅན: 4,མེད: 1834,མི་འགྱུར: 188,ཡི: 55,རྟོག་མེད: 16,མེད: 6,དགའ་ཡེ་ཤེས: 4
དཔའ་བོ: 3,རྣམ་པར་སྣང: 2,མེད: 27,མི་འགྱུར: 4,ལས: 1551,འགྱུར་རོ: 170,ཆོས་ཉིད: 53,ཕྱག་རྒྱ་ཆེན་པོ: 12,ད: 5,ཡེ་ཤེས་སེང་གེ: 4
ཆེན་པོ: 3,སྣང་མཛད: 2,མིན: 25,མཚན་ཉིད་ཅན: 4,ཉིད: 1280,ཕྱག་འཚལ: 156,བདེ་ཆེན: 44,ལས་འདས: 10,ད་ལྟ: 5,སེང་གེ་བཞིན: 4
ཆུ: 3,བྱང་ཆུབ་སེམས: 2,འགྱུར: 25,ཕུང་པོ་ཁམས: 3,མེད་པ: 1057,ལྷན་ཅིག་སྐྱེས་པ: 143,ལས: 44,དོན་མཐོང: 10,རབ: 5,ཞགས་པ་རབ: 4
སེམས: 3,བརྟན་གཡོ་ཐམས་ཅད: 2,ཉིད: 19,དོན་བྱེད་པ: 3,བྱ་བ: 1045,བདག་ཉིད་ཅན: 138,བྲལ: 40,ཞེན་མེད: 10,ཁྱོད: 4,རབ་བཅད་པ: 4
གནས: 3,གླུ་བཞུགས: 1,ཡི: 19,མི་འདོད: 3,འགྱུར: 1000,འཚལ་ལོ: 137,བླ་མ: 37,ནོར་བུ་རིན་ཆེན: 9,བྱེད: 4,ད་ལྟ་རེ་བ: 4
ཨཱ: 2,དཔའ་བོ་རྣམ: 1,སངས་རྒྱས: 18,འགའ་ཡང་མེད: 3,བྱ: 951,གཉིས་མེད: 134,སངས་རྒྱས: 35,