# Things I have learned in this project
* In type annotations, write Tuple[str] rather than Tuple(str): a type hint is subscripted with square brackets, not called like an object constructor
* One-element tuples need a trailing comma: ("hello") evaluates to the string 'hello', whereas ("hello",) evaluates to the tuple ('hello',)
* Use of control tokens (e.g. BOS, EOS) in tokenization
* Structure tasks in increasing order of complexity: reuse simpler functions, and break complex functions down into more manageable tasks.
* Add unit tests for those manageable tasks wherever possible
* Apply the model to the VNExpress dataset -> the results are easier to interpret. A nice adaptation.

In [1]:
from typing import List, Tuple, Dict
import unittest
from collections import defaultdict
from common_utils import *

TOKENS = ["hello", "world", "!"]

In [2]:
# (kwargs, expected) pairs checked against build_basic_ngram: bigrams, then
# unigrams, of TOKENS.
test_cases_1 = [
    (
        {"tokens": TOKENS, "n": 2},
        [("hello", "world"), ("world", "!")],
    ),
    (
        {"tokens": TOKENS, "n": 1},
        # Each unigram is a one-element tuple. The trailing comma matters:
        # ("hello",) is the tuple ('hello',), while ("hello") is just 'hello'.
        [("hello",), ("world",), ("!",)],
    ),
]

def test_assertion(func, test_cases):
    for args, expected_result in test_cases:
        assert func(**args) == expected_result, f"{func(**args)}, {expected_result}"

test_assertion(build_basic_ngram, test_cases_1)

# Adding control tokens BOS, EOS

In [3]:
# A single (kwargs, expected) case: the expected bigrams start with a BOS
# marker and end with an EOS marker around the sentence.
test_cases_2 = [
    (
        {"tokens": ["hello", "world", "!"], "n": 2},
        [(BOS, "hello"), ("hello", "world"), ("world", "!"), ("!", EOS)],
    ),
]
test_assertion(test_cases=test_cases_2, func=build_ngrams_ctrl)

# Count ngram frequency -> Build the model

In [2]:
# [TOKENS] * 3 is three copies of the same sentence, so every
# (context -> next token) count in the displayed result is 3.
count_frequencies(texts=[TOKENS] * 3, n=3)

{('<BOS>', '<BOS>'): defaultdict(int, {'hello': 3}),
 ('<BOS>', 'hello'): defaultdict(int, {'world': 3}),
 ('hello', 'world'): defaultdict(int, {'!': 3}),
 ('world', '!'): defaultdict(int, {'<EOS>': 3}),
 ('!', '<EOS>'): defaultdict(int, {'<EOS>': 3})}

In [6]:
[TOKENS] * 3

[['hello', 'world', '!'], ['hello', 'world', '!'], ['hello', 'world', '!']]

In [2]:
# NOTE: [TOKENS*3] is a single text of nine tokens (TOKENS repeated three
# times back to back), not three separate texts as [TOKENS] * 3 would be.
# With verbose=True each (context, token) pair is printed.
model = build_ngram_model(texts=[TOKENS*3], n=3, verbose=True)

context=('<BOS>', '<BOS>'), token=hello
context=('<BOS>', 'hello'), token=world
context=('hello', 'world'), token=!
context=('world', '!'), token=hello
context=('!', 'hello'), token=world
context=('hello', 'world'), token=!
context=('world', '!'), token=hello
context=('!', 'hello'), token=world
context=('hello', 'world'), token=!
context=('world', '!'), token=<EOS>
context=('!', '<EOS>'), token=<EOS>


In [6]:
from data_loader import get_vn_express_dataset
data = get_vn_express_dataset()

topic = suc-khoe
Loaded 50 articles
topic = khoa-hoc
Loaded 37 articles
topic = so-hoa
Loaded 43 articles
topic = the-gioi
Loaded 47 articles
topic = phap-luat
Loaded 47 articles
topic = du-lich
Loaded 38 articles
topic = giai-tri
Loaded 47 articles
topic = kinh-doanh
Loaded 50 articles
topic = thoi-su
Loaded 38 articles
topic = giao-duc
Loaded 38 articles
topic = the-thao
Loaded 43 articles


# Fit the model on VN Express dataset

In [7]:
model_vnexpress = build_ngram_model(texts=data, n=3, verbose=False)

In [8]:
# Look up the entry stored for the two-token context ('tiêm', 'phòng').
# NOTE(review): the values look like next-token probabilities (they sum to 1
# in the displayed output) - confirm against build_ngram_model.
ans = model_vnexpress[('tiêm', 'phòng')]
# Rank the (token, value) pairs by value, largest first.
sorted(ans.items(), key=lambda x:x[1], reverse=True)

[('cho', 0.42857142857142855),
 ('Covid-19', 0.14285714285714285),
 ('hàng', 0.14285714285714285),
 ('vaccine', 0.14285714285714285),
 ('đầy', 0.14285714285714285)]