In [1]:
from utils import load_dataset, gather_text 

# Dataset files whose text is pooled for the n-gram statistics.
fpathes = [
    '../data/train.tsv',
    '../data/dev.tsv',
    '../data/test.tsv',
]

# Flatten whatever gather_text yields for each loaded dataset into one list.
texts = []
for ds in load_dataset(fpathes):
    texts += gather_text(ds)

# Displayed as the cell output: number of collected texts.
len(texts)

9613

In [2]:
import json
from collections import Counter
from threading import Lock, Thread
from time import time

from paddlenlp.transformers.bert.tokenizer import BasicTokenizer
from tqdm import tqdm

from utils import saveJson, ngramGenerator


# Guards the shared n-gram counters: build_ngram_dataset calls add_ngram_data
# from many threads at once, and merging a Counter into another is a
# per-key read-modify-write that is not guaranteed to be atomic.
_NGRAM_LOCK = Lock()


def add_ngram_data(text):
    """Tokenize ``text`` and add its 1- to 4-gram counts to the global counters.

    The token sequence is wrapped in ``'<START>'`` / ``'<END>'`` markers before
    the n-grams are generated. Texts that tokenize to nothing are ignored.

    Reads the module-level ``tokenize`` callable and updates the module-level
    ``unigram``, ``bigram``, ``trigram`` and ``fourgram`` counters in place.
    Safe to call concurrently from several threads.

    Args:
        text: The raw text to tokenize and count.
    """
    tks = tokenize(text)
    if not tks:
        return

    tks = ['<START>'] + tks + ['<END>']
    bi_tks = ngramGenerator(tks, 2)
    tri_tks = ngramGenerator(tks, 3)
    four_tks = ngramGenerator(tks, 4)

    # Count into private Counters first so that only the merge step below
    # has to run while holding the lock.
    local_counts = (
        Counter(tks),
        Counter(bi_tks),
        Counter(tri_tks),
        Counter(four_tks),
    )

    with _NGRAM_LOCK:
        unigram.update(local_counts[0])
        bigram.update(local_counts[1])
        trigram.update(local_counts[2])
        fourgram.update(local_counts[3])
        
        
def build_ngram_dataset(step=100):
    """Feed every entry of the global ``texts`` list to ``add_ngram_data``.

    The texts are processed in consecutive batches of at most ``step`` items.
    Each item in a batch gets its own thread, and the whole batch is joined
    before the next one starts. Progress is reported per batch through tqdm.

    Args:
        step: Maximum number of texts (and therefore threads) per batch.
    """
    total = len(texts)
    for start in tqdm(range(0, total, step)):
        # Slicing clamps at the end of the list, so the last batch may be short.
        workers = [
            Thread(target=add_ngram_data, args=(text, ))
            for text in texts[start:start + step]
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()


def trimCounterDic(counterDic, min_freq=0, addUNK=False):
    """Turn a frequency Counter into a dict of relative frequencies.

    Entries are visited in ``most_common()`` order. Those whose raw count is
    below ``min_freq`` are dropped; the remaining counts are divided by the
    total of *all* counts (including the dropped ones), so the returned values
    need not sum to 1.

    Args:
        counterDic: A ``collections.Counter`` mapping items to raw counts.
        min_freq: Minimum raw count an item needs in order to be kept.
        addUNK: If true, add an ``'<UNK>'`` entry valued at ``1 / total``.

    Returns:
        A plain ``dict`` mapping each kept item to ``count / total``. When the
        total count is zero (e.g. an empty counter) there is nothing to
        normalize by: the result is ``{'<UNK>': 1.0}`` if ``addUNK`` is set and
        ``{}`` otherwise.
    """
    total = sum(counterDic.values())
    if total == 0:
        # Avoid dividing by zero; with no observations at all, the only
        # sensible mass assignment is everything on the unknown token.
        return {'<UNK>': 1.0} if addUNK else {}

    trimmed = {
        item: count / total
        for item, count in counterDic.most_common()
        if count >= min_freq
    }
    if addUNK:
        trimmed['<UNK>'] = 1 / total
    return trimmed


def main():
    """Count n-grams over the corpus, normalize the counts and save them as JSON.

    Rebinds the module-level ``unigram``, ``bigram``, ``trigram`` and
    ``fourgram`` names from raw Counters to the plain dicts returned by
    ``trimCounterDic``. Because of that, the counters must be re-created
    before ``main`` is run a second time.
    """
    global unigram, bigram, trigram, fourgram

    build_ngram_dataset()

    # Drop rare entries and convert raw counts to relative frequencies.
    # Only the unigram table receives an '<UNK>' entry.
    unigram = trimCounterDic(unigram, min_freq=2, addUNK=True)
    bigram = trimCounterDic(bigram, min_freq=5)
    trigram = trimCounterDic(trigram, min_freq=5)
    fourgram = trimCounterDic(fourgram, min_freq=5)

    outputs = (
        (unigram, '../data/Unigram.json'),
        (bigram, '../data/Bigram.json'),
        (trigram, '../data/Trigram.json'),
        (fourgram, '../data/Fourgram.json'),
    )
    for table, path in outputs:
        saveJson(table, path)

In [3]:
if __name__ == '__main__':
    # Timing starts before the tokenizer is constructed, so setup is included.
    s = time()

    # add_ngram_data and main look these names up as module globals.
    tokenize = BasicTokenizer().tokenize
    unigram, bigram, trigram, fourgram = (Counter() for _ in range(4))

    main()

    e = time()
    print(f"Total time: {e - s}")

100%|███████████████████████████████████████████| 97/97 [00:04<00:00, 23.14it/s]


../data/Unigram.json has been saved!
../data/Bigram.json has been saved!
../data/Trigram.json has been saved!
../data/Fourgram.json has been saved!
Total time: 4.405871868133545
