In [1]:
# N-gram counts live in a plain dict whose keys are the n-grams themselves
# (tuples of tokens are hashable, so they can be used directly as keys).
n_gram_counts = dict([
    (('i', 'am', 'happy'), 2),
    (('am', 'happy', 'because'), 1),
])

# First lookup: this trigram has not been counted yet.
if ('i', 'am', 'learning') not in n_gram_counts:
    print(f"n-gram {('i', 'am', 'learning')} missing")
else:
    print(f"n-gram {('i', 'am', 'learning')} found")

# Record the trigram with a count of one, then repeat the same lookup.
n_gram_counts.update({('i', 'am', 'learning'): 1})
if ('i', 'am', 'learning') not in n_gram_counts:
    print(f"n-gram {('i', 'am', 'learning')} missing")
else:
    print(f"n-gram {('i', 'am', 'learning')} found")

n-gram ('i', 'am', 'learning') missing
n-gram ('i', 'am', 'learning') found


In [2]:
# Extend an n-gram prefix by one word: unpack the prefix tuple into a new
# tuple and append the word (tuples are immutable, so `prefix` is untouched).
prefix = ('i', 'am', 'happy')
word = 'because'

merged = (*prefix, word)
print(merged)

('i', 'am', 'happy', 'because')


In [3]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [6]:
def single_pass_trigram_count_matrix(corpus):
    """
    Build a trigram count matrix from a tokenized corpus in a single pass.

    Every window of three consecutive tokens is split into its leading
    bigram (row) and its final word (column); the matrix cell holds how many
    times that word followed that bigram.

    Args:
        corpus: pre-processed, tokenized corpus. It must support len() and
            slicing, and its tokens must be hashable (e.g. a list of str).

    Returns:
        A tuple (bigrams, vocab, count_matrix) where
        bigrams is the list of distinct leading bigrams (tuples) in order of
            first appearance,
        vocab is the list of distinct trigram-final words in order of first
            appearance,
        count_matrix is a pandas DataFrame of float counts with `bigrams` as
            its index and `vocab` as its columns.
        A corpus with fewer than three tokens yields two empty lists and an
        empty DataFrame.
    """
    # Map each label to its row/column position. Dicts keep insertion order,
    # so these double as the "first appearance" orderings and give O(1)
    # membership and position lookups (instead of list scans / list.index).
    bigram_rows = {}
    vocab_cols = {}
    # Integer counter keyed by (bigram, last_word); missing keys start at 0.
    trigram_counts = defaultdict(int)

    for i in range(len(corpus) - 2):
        trigram = tuple(corpus[i:i + 3])
        bigram = tuple(corpus[i:i + 2])
        last_word = trigram[-1]

        # setdefault only assigns a position the first time a label is seen.
        bigram_rows.setdefault(bigram, len(bigram_rows))
        vocab_cols.setdefault(last_word, len(vocab_cols))

        trigram_counts[(bigram, last_word)] += 1

    bigrams = list(bigram_rows)
    vocab = list(vocab_cols)

    # Dense matrix: one row per bigram, one column per following word.
    count_matrix = np.zeros((len(bigrams), len(vocab)))
    for (bigram, last_word), count in trigram_counts.items():
        count_matrix[bigram_rows[bigram], vocab_cols[last_word]] = count

    count_matrix = pd.DataFrame(count_matrix, index=bigrams, columns=vocab)
    return bigrams, vocab, count_matrix

In [7]:
# Toy, already-tokenized corpus used for the trigram counting demo.
corpus = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']

# Row labels (bigrams), column labels (vocab) and the count matrix itself.
bigrams, vocab, count_mat = single_pass_trigram_count_matrix(corpus)

In [8]:
# Display the trigram count matrix built from `corpus` in the previous cell.
count_mat

Unnamed: 0,happy,because,i,am,learning,.
"(i, am)",1.0,0.0,0.0,0.0,1.0,0.0
"(am, happy)",0.0,1.0,0.0,0.0,0.0,0.0
"(happy, because)",0.0,0.0,1.0,0.0,0.0,0.0
"(because, i)",0.0,0.0,0.0,1.0,0.0,0.0
"(am, learning)",0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Total count of each row of the count matrix (one total per bigram).
rowsums = count_mat.sum(axis=1)

# Divide every row by its own total so that each row sums to 1.
prob_mat = count_mat.div(rowsums, axis='index')

In [12]:
# Display the row-normalized matrix computed from `count_mat`.
prob_mat

Unnamed: 0,happy,because,i,am,learning,.
"(i, am)",0.5,0.0,0.0,0.0,0.5,0.0
"(am, happy)",0.0,1.0,0.0,0.0,0.0,0.0
"(happy, because)",0.0,0.0,1.0,0.0,0.0,0.0
"(because, i)",0.0,0.0,0.0,1.0,0.0,0.0
"(am, learning)",0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
# Look up one trigram's entry in prob_mat: the leading bigram selects the
# row and the final word selects the column.
trigram = ('i', 'am', 'happy')
bigram = trigram[:-1]
word = trigram[-1]

# Column first (by word), then the row label (the bigram tuple).
prob = prob_mat[word][bigram]
prob

0.5

In [17]:
# Print every vocabulary word that begins with the given prefix.
vocabulary = [
    'i', 'am', 'happy', 'because', 'learning', '.',
    'have', 'you', 'seen', 'it', '?',
]
starts_with = 'ha'

for word in vocabulary:
    # Skip anything that does not start with the prefix.
    if not word.startswith(starts_with):
        continue
    print(word)

happy
have


In [21]:
import random
def train_val_test_split(data, train_percent, val_percent):
    """
    Shuffle `data` deterministically and cut it into train/validation/test.

    NOTE: `data` is shuffled IN PLACE, so the caller's list is reordered by
    every call. The shuffle uses a private generator with a fixed seed (21),
    so the module-level `random` state is left untouched.

    Args:
        data: mutable sequence (e.g. a list) to shuffle and split.
        train_percent: percentage (0-100) of items for the training split.
        val_percent: percentage (0-100) of items for the validation split.
            Whatever remains after train + validation is the test split.

    Returns:
        A tuple (train_data, val_data, test_data) of consecutive slices of
        the shuffled `data`.

    Raises:
        ValueError: if a percentage is negative or the two percentages sum
            to more than 100 (the slices would silently be wrong otherwise).
    """
    if train_percent < 0 or val_percent < 0 or train_percent + val_percent > 100:
        raise ValueError(
            "train_percent and val_percent must be non-negative and sum to at most 100"
        )

    # A dedicated Random(21) produces the same permutation as seeding the
    # global generator with 21, without resetting the process-wide RNG.
    random.Random(21).shuffle(data)

    # Slice boundaries; fractional positions are truncated by int().
    train_end = int(len(data)*train_percent/100)
    val_end = int(len(data)*(train_percent+val_percent)/100)

    train_data = data[:train_end]
    val_data = data[train_end:val_end]
    test_data = data[val_end:]

    return train_data, val_data, test_data

In [22]:
# Integers 1..100 stand in for a dataset.
data = list(range(1, 101))

# Run the same split-and-report step for each configuration, in order.
for header, train_pct, val_pct in (("split 80/10/10:\n", 80, 10),
                                   ("split 98/1/1:\n", 98, 1)):
    train_data, validation_data, test_data = train_val_test_split(data, train_pct, val_pct)
    print(header,
          f"train data: {train_data}\n",
          f"validation data: {validation_data}\n",
          f"test data: {test_data}\n")

split 80/10/10:
 train data: [18, 96, 41, 93, 7, 90, 40, 92, 91, 45, 4, 29, 94, 42, 23, 21, 80, 34, 95, 77, 50, 86, 81, 97, 88, 59, 35, 17, 69, 63, 33, 27, 83, 76, 46, 25, 39, 49, 58, 8, 74, 38, 51, 44, 14, 13, 73, 71, 87, 67, 72, 98, 10, 32, 26, 100, 47, 36, 11, 78, 85, 60, 52, 16, 12, 20, 3, 84, 15, 64, 70, 43, 5, 57, 53, 56, 6, 79, 30, 19]
 validation data: [9, 55, 75, 48, 2, 1, 31, 68, 65, 24]
 test data: [66, 61, 28, 62, 37, 82, 99, 89, 54, 22]

split 98/1/1:
 train data: [34, 82, 74, 28, 40, 24, 8, 61, 66, 14, 93, 69, 62, 38, 81, 50, 19, 76, 37, 6, 67, 1, 9, 99, 68, 11, 46, 80, 15, 52, 83, 35, 75, 56, 13, 88, 58, 87, 36, 92, 57, 49, 72, 44, 42, 94, 5, 70, 31, 3, 43, 89, 45, 27, 59, 22, 73, 25, 4, 79, 2, 78, 98, 21, 29, 77, 41, 48, 23, 16, 64, 51, 7, 47, 10, 100, 90, 30, 63, 95, 91, 26, 53, 71, 96, 18, 33, 84, 12, 97, 20, 85, 17, 60, 39, 55, 54, 65]
 validation data: [32]
 test data: [86]



In [25]:
# Perplexity example: raise the probability p to the power -1/M.
# (Presumably p is the probability of a test set and M its size.)
p = 10 ** -250
M = 100
perplexity = pow(p, -1 / M)
perplexity

316.22776601683796