# Load libraries

In [1]:
import sys
sys.path.append('../')
from utilities import *

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Load data

In [2]:
# Input directories, relative to this notebook. The trailing slash matters:
# file names are appended directly via f-strings (e.g. f'{corpus_dir}perpl.csv').
corpus_dir = '../../data/corpus/'  # holds perpl.csv
model_dir = '../../data/ngrams/'  # holds counter.pickle and vocabulary.pickle

In [5]:
# Sentences used to score the language models; the `cor` column is tokenized in a later cell.
perpl = pd.read_csv(f'{corpus_dir}perpl.csv')

In [6]:
# Sanity check: (rows, columns) of the loaded frame.
perpl.shape

(2573, 2)

In [4]:
# Tokenize every entry of the `cor` column into its own list of tokens.
tokenized_text = [
    list(word_tokenize(sentence))
    for sentence in perpl['cor']
]

# Helper function

In [5]:
def trigram_sent(sent):
    """Return the padded trigrams of a single tokenized sentence.

    Args:
        sent: The tokens of one sentence.

    Returns:
        The result of ``ngrams`` for n=3 with padding enabled on both sides,
        using '<s>' as the left pad symbol and '</s>' as the right pad symbol.
        NOTE(review): ``ngrams`` arrives through the star import from
        ``utilities`` (presumably nltk's); if it returns a one-shot iterator,
        call this function again rather than reusing the result.
    """
    return ngrams(
        sent,
        3,
        pad_left=True,
        pad_right=True,
        left_pad_symbol='<s>',
        right_pad_symbol='</s>',
    )

# Load Ngram Language Models

In [6]:
# Restore the pickled `counter` and `vocabulary` objects; both are handed to the
# model constructors inside the tuning functions further down.
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so these files must come from a trusted source.
with open(f'{model_dir}counter.pickle', 'rb') as file:
    counter = pickle.load(file)

with open(f'{model_dir}vocabulary.pickle', 'rb') as file:
    vocabulary = pickle.load(file)

# Lidstone (add-α) trigram

In [7]:
def best_alpha(alpha):
    """Objective for the alpha search: mean sentence perplexity for a given alpha.

    Despite its name, this does not search for anything itself; it scores one
    candidate value and is what gets passed to the optimizer as ``fn``.

    Args:
        alpha: Value forwarded as ``gamma`` to ``MLidstone``.

    Returns:
        ``np.mean`` of the per-sentence perplexities over ``tokenized_text``.
    """
    model = MLidstone(gamma=alpha, order=3, vocabulary=vocabulary, counter=counter)
    sentence_scores = []
    for tokens in tokenized_text:
        # Trigrams are rebuilt for every sentence on every call.
        sentence_scores.append(model.perplexity(trigram_sent(tokens)))
    return np.mean(sentence_scores)

In [8]:
# Search alpha uniformly between 0 and 0.2 with TPE, minimizing the mean
# perplexity returned by best_alpha.
# NOTE(review): no random state is passed to fmin, so a rerun may settle on a
# different alpha — confirm whether a fixed seed is wanted.
best = fmin(
    fn=best_alpha,
    space=hp.uniform('alpha', 0, 0.2),
    algo=tpe.suggest,
    max_evals=100,
)

100%|██████████| 100/100 [02:21<00:00,  1.60s/it, best loss: 4353.882109219711]


In [9]:
# Alpha value selected by the search above.
best['alpha']

0.006295889336373706

# Interpolated Kneser-Ney Smoothing trigram

In [10]:
def best_discount(discount):
    """Objective for the discount search: mean sentence perplexity for a given discount.

    Despite its name, this does not search for anything itself; it scores one
    candidate value and is what gets passed to the optimizer as ``fn``.

    Args:
        discount: Value forwarded as ``discount`` to ``MKneserNeyInterpolated``.

    Returns:
        ``np.mean`` of the per-sentence perplexities over ``tokenized_text``.
    """
    model = MKneserNeyInterpolated(order=3, discount=discount, vocabulary=vocabulary, counter=counter)
    sentence_scores = []
    for tokens in tokenized_text:
        # Trigrams are rebuilt for every sentence on every call.
        sentence_scores.append(model.perplexity(trigram_sent(tokens)))
    return np.mean(sentence_scores)

In [11]:
# Search the discount uniformly between 0 and 1 with TPE, minimizing the mean
# perplexity returned by best_discount. This rebinds `best`, replacing the
# result of the alpha search.
# The recorded run took roughly 5h41m for the 100 evaluations.
# NOTE(review): no random state is passed to fmin, so a rerun may settle on a
# different discount — confirm whether a fixed seed is wanted.
best = fmin(
    fn=best_discount,
    space=hp.uniform('discount', 0, 1),
    algo=tpe.suggest,
    max_evals=100,
)

100%|██████████| 100/100 [5:41:01<00:00, 193.73s/it, best loss: 591.7693824536899] 


In [12]:
# Discount selected by the search above (`best` now holds the discount result, not the alpha one).
best['discount']

0.9275733392408017