Unigrams

In [17]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/lanphgphm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
with open('testing_data.txt', 'r') as f:
    test_data = f.read()

with open('training_data.txt', 'r') as f:
    train_data = f.read()

In [4]:
unigrams = nltk.tokenize.word_tokenize(train_data) 
n_unigrams = len(unigrams)

unique_unigrams = set(unigrams) 
n_unique_unigrams = len(unique_unigrams)

print(f"Number of unigrams: {n_unigrams}\nNumber of unique unigrams: {n_unique_unigrams}")

Number of unigrams: 742
Number of unique unigrams: 302


In [5]:
def ngram_count(n, ngrams): 
    '''ngram is tokenized data'''
    counts = {} 
    counts['<UNK>'] = 0
    last_start = len(ngrams) - n + 1 

    for i in range(last_start): 
        ngram = tuple(ngrams[i:i+n])
        if ngram in counts.keys():
            counts[ngram] += 1
        else:
            counts[ngram] = 1
    
    return counts 
    

In [6]:
def add_one_smoothing(ngrams_count, N, V): 
    '''N: Number of word token, V: vocab size (unique ngram)
    this function returns dictionary of ngram probabilities 
    after applying Laplace smoothing. 
    '''
    ngrams_prob = {} 
    ngrams_prob['<UNK>'] = 1 / (N + V)

    for ngram in ngrams_count.keys(): 
        ci = ngrams_count[ngram]
        ngrams_prob[ngram] = (ci + 1) / (N + V)
    
    return ngrams_prob

In [7]:
unigrams_count = ngram_count(1, unigrams)
unigrams_prob = add_one_smoothing(unigrams_count, n_unigrams, n_unique_unigrams)

unigrams_prob

{'<UNK>': 0.0009578544061302681,
 ('The',): 0.0028735632183908046,
 ('Computer',): 0.009578544061302681,
 ('Science',): 0.01053639846743295,
 ('major',): 0.008620689655172414,
 ('prepares',): 0.0019157088122605363,
 ('students',): 0.009578544061302681,
 ('with',): 0.006704980842911878,
 ('an',): 0.005747126436781609,
 ('adaptable',): 0.0019157088122605363,
 ('skill',): 0.0019157088122605363,
 ('set',): 0.0019157088122605363,
 ('to',): 0.020114942528735632,
 ('respond',): 0.0019157088122605363,
 ('the',): 0.029693486590038315,
 ('astonishing',): 0.0019157088122605363,
 ('speed',): 0.0019157088122605363,
 ('of',): 0.022030651340996167,
 ('technological',): 0.0019157088122605363,
 ('change',): 0.0019157088122605363,
 ('and',): 0.03639846743295019,
 ('develop',): 0.0028735632183908046,
 ('solutions',): 0.0019157088122605363,
 ('for',): 0.009578544061302681,
 ('problems',): 0.0038314176245210726,
 ('today',): 0.0019157088122605363,
 ('tomorrow',): 0.0019157088122605363,
 ('.',): 0.030651340

In [14]:
def unigram_ppl(unigrams, n_unigrams, unigram_probability):
    test_ppl = 1

    for unigram in unigrams:
        if unigram in unigram_probability.keys():
            test_ppl *= 1/unigram_probability[unigram]
        else: 
            test_ppl *= 1/unigram_probability['<UNK>']

    test_ppl = test_ppl ** (1/n_unigrams)
    return test_ppl

In [16]:
test_unigrams = nltk.tokenize.word_tokenize(test_data)
n_test_unigrams = len(test_unigrams)

unigram_ppl(test_unigrams, n_test_unigrams, unigrams_prob)


1043.9999999999998

Bigrams

In [32]:
def get_ngram(n, text): 
    unigrams = nltk.tokenize.word_tokenize(text)
    ngrams = [] 
    if n == 1: 
        return unigrams 
    else: 
        last_start = len(unigrams) - n + 1
        for i in range(last_start):
            ngram = tuple(unigrams[i: i+n])
            ngrams.append(ngram)
        return ngrams  

In [33]:
x = get_ngram(2, train_data)
x

[('The', 'Computer'),
 ('Computer', 'Science'),
 ('Science', 'major'),
 ('major', 'prepares'),
 ('prepares', 'students'),
 ('students', 'with'),
 ('with', 'an'),
 ('an', 'adaptable'),
 ('adaptable', 'skill'),
 ('skill', 'set'),
 ('set', 'to'),
 ('to', 'respond'),
 ('respond', 'to'),
 ('to', 'the'),
 ('the', 'astonishing'),
 ('astonishing', 'speed'),
 ('speed', 'of'),
 ('of', 'technological'),
 ('technological', 'change'),
 ('change', 'and'),
 ('and', 'develop'),
 ('develop', 'solutions'),
 ('solutions', 'for'),
 ('for', 'the'),
 ('the', 'problems'),
 ('problems', 'of'),
 ('of', 'today'),
 ('today', 'and'),
 ('and', 'tomorrow'),
 ('tomorrow', '.'),
 ('.', 'Using'),
 ('Using', 'a'),
 ('a', 'student-centered'),
 ('student-centered', ','),
 (',', 'interdisciplinary'),
 ('interdisciplinary', ','),
 (',', 'and'),
 ('and', 'future-focused'),
 ('future-focused', 'approach'),
 ('approach', ','),
 (',', 'the'),
 ('the', 'Computer'),
 ('Computer', 'Science'),
 ('Science', 'major'),
 ('major', 'ai

In [26]:
ls = [] 

for i in range(len(unigrams)-1): 
    ls.append(tuple(unigrams[i: i+2]))

ls

[('The', 'Computer'),
 ('Computer', 'Science'),
 ('Science', 'major'),
 ('major', 'prepares'),
 ('prepares', 'students'),
 ('students', 'with'),
 ('with', 'an'),
 ('an', 'adaptable'),
 ('adaptable', 'skill'),
 ('skill', 'set'),
 ('set', 'to'),
 ('to', 'respond'),
 ('respond', 'to'),
 ('to', 'the'),
 ('the', 'astonishing'),
 ('astonishing', 'speed'),
 ('speed', 'of'),
 ('of', 'technological'),
 ('technological', 'change'),
 ('change', 'and'),
 ('and', 'develop'),
 ('develop', 'solutions'),
 ('solutions', 'for'),
 ('for', 'the'),
 ('the', 'problems'),
 ('problems', 'of'),
 ('of', 'today'),
 ('today', 'and'),
 ('and', 'tomorrow'),
 ('tomorrow', '.'),
 ('.', 'Using'),
 ('Using', 'a'),
 ('a', 'student-centered'),
 ('student-centered', ','),
 (',', 'interdisciplinary'),
 ('interdisciplinary', ','),
 (',', 'and'),
 ('and', 'future-focused'),
 ('future-focused', 'approach'),
 ('approach', ','),
 (',', 'the'),
 ('the', 'Computer'),
 ('Computer', 'Science'),
 ('Science', 'major'),
 ('major', 'ai

In [27]:
x = get_ngram(2, unigrams)
x

[(),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),


In [19]:
ngram_count(2, unigrams)

{'<UNK>': 0,
 ('The', 'Computer'): 2,
 ('Computer', 'Science'): 9,
 ('Science', 'major'): 4,
 ('major', 'prepares'): 1,
 ('prepares', 'students'): 1,
 ('students', 'with'): 2,
 ('with', 'an'): 2,
 ('an', 'adaptable'): 1,
 ('adaptable', 'skill'): 1,
 ('skill', 'set'): 1,
 ('set', 'to'): 1,
 ('to', 'respond'): 1,
 ('respond', 'to'): 1,
 ('to', 'the'): 2,
 ('the', 'astonishing'): 1,
 ('astonishing', 'speed'): 1,
 ('speed', 'of'): 1,
 ('of', 'technological'): 1,
 ('technological', 'change'): 1,
 ('change', 'and'): 1,
 ('and', 'develop'): 1,
 ('develop', 'solutions'): 1,
 ('solutions', 'for'): 1,
 ('for', 'the'): 2,
 ('the', 'problems'): 1,
 ('problems', 'of'): 1,
 ('of', 'today'): 1,
 ('today', 'and'): 1,
 ('and', 'tomorrow'): 1,
 ('tomorrow', '.'): 1,
 ('.', 'Using'): 1,
 ('Using', 'a'): 1,
 ('a', 'student-centered'): 1,
 ('student-centered', ','): 1,
 (',', 'interdisciplinary'): 1,
 ('interdisciplinary', ','): 1,
 (',', 'and'): 12,
 ('and', 'future-focused'): 1,
 ('future-focused', 'appr

In [18]:
bigrams = ngram_count(2, )
n_bigrams = len(bigrams)

unique_bigrams = set(bigrams) 
n_unique_bigrams = len(unique_bigrams)

print(f"Number of bigrams: {n_bigrams}\nNumber of unique bigrams: {n_unique_bigrams}")

Number of bigrams: 742
Number of unique bigrams: 302


In [None]:
unigrams_count = ngram_count(1, unigrams)
unigrams_prob = add_one_smoothing(unigrams_count, n_unigrams, n_unique_unigrams)

unigrams_prob

{'<UNK>': 0.0009578544061302681,
 ('The',): 0.0028735632183908046,
 ('Computer',): 0.009578544061302681,
 ('Science',): 0.01053639846743295,
 ('major',): 0.008620689655172414,
 ('prepares',): 0.0019157088122605363,
 ('students',): 0.009578544061302681,
 ('with',): 0.006704980842911878,
 ('an',): 0.005747126436781609,
 ('adaptable',): 0.0019157088122605363,
 ('skill',): 0.0019157088122605363,
 ('set',): 0.0019157088122605363,
 ('to',): 0.020114942528735632,
 ('respond',): 0.0019157088122605363,
 ('the',): 0.029693486590038315,
 ('astonishing',): 0.0019157088122605363,
 ('speed',): 0.0019157088122605363,
 ('of',): 0.022030651340996167,
 ('technological',): 0.0019157088122605363,
 ('change',): 0.0019157088122605363,
 ('and',): 0.03639846743295019,
 ('develop',): 0.0028735632183908046,
 ('solutions',): 0.0019157088122605363,
 ('for',): 0.009578544061302681,
 ('problems',): 0.0038314176245210726,
 ('today',): 0.0019157088122605363,
 ('tomorrow',): 0.0019157088122605363,
 ('.',): 0.030651340