In [1]:
import sys
sys.path.append('../')
from utilities import *

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Load data

In [2]:
# Relative data locations. Later cells build file paths by plain f-string
# concatenation (e.g. f'{corpus_dir}noisy.csv'), so the trailing slash is required.
corpus_dir = '../../data/corpus/'  # holds noisy.csv
model_dir = '../../data/ngrams/'  # holds counter.pickle and vocabulary.pickle

In [3]:
# Evaluation set. Later cells read its `err` column as the input to decode and
# its `cor` column as the reference (presumably erroneous / corrected sentences — confirm).
noisy = pd.read_csv(f'{corpus_dir}noisy.csv')

In [4]:
# Sanity check: (rows, columns) of the loaded evaluation set.
noisy.shape

(50, 2)

# Load n-gram language model data (counts and vocabulary)

In [5]:
# Load the pre-computed n-gram counts and the vocabulary that both language
# models below are built from.
# NOTE: pickle.load can execute arbitrary code while unpickling; only use it on
# files produced by this project.
with open(f'{model_dir}counter.pickle', 'rb') as file:
    counter = pickle.load(file)

with open(f'{model_dir}vocabulary.pickle', 'rb') as file:
    vocabulary = pickle.load(file)

# Lidstone (additive / add-α smoothing) trigram

In [6]:
# Trigram (order=3) Lidstone model built from the loaded counts and vocabulary.
# gamma=0.0063 is hard-coded here — presumably tuned in another notebook; TODO confirm.
trigramL = MLidstone(gamma=0.0063, order=3, vocabulary=vocabulary, counter=counter)
# `channel` is the module-level object that best_l() mutates and decodes with.
channel = NoisyChannelModel(trigramL)

## Poisson Channel model

In [7]:
def best_l(l):
    """Objective for the hyperparameter search: mean ``wer`` obtained with a given ``l``.

    Stores ``l`` on the module-level ``channel`` object (mutating it), decodes
    every sentence in ``noisy.err`` with ``channel.beam_search_sentences``,
    lower-cases column 0 of the result and scores it pairwise against
    ``noisy.cor`` with ``wer`` (presumably word error rate — confirm in utilities).

    Parameters
    ----------
    l
        Value assigned to ``channel.l``. What it controls is defined inside
        ``NoisyChannelModel`` and is not visible from this notebook.

    Returns
    -------
    The arithmetic mean of the per-sentence ``wer`` scores; this is the loss
    that ``fmin`` minimises.
    """
    channel.l = l
    decoded = channel.beam_search_sentences(noisy.err)
    hypotheses = decoded[0].str.lower()

    scores = []
    for reference, hypothesis in zip(noisy.cor, hypotheses):
        scores.append(wer(reference, hypothesis))
    return np.array(scores).mean()

In [8]:
# Switch the shared `channel` object to its Poisson channel variant.
# NOTE(review): the meaning of channel_prob_param in this mode is defined inside
# NoisyChannelModel and cannot be confirmed from this notebook.
channel.channel_method_poisson = True
channel.channel_prob_param = 0.01

In [9]:
# Minimise best_l over `l`, drawn uniformly from [0.2, 5], using TPE for 20 trials.
# Slow: the recorded run below took about 1 h 04 min.
# NOTE(review): no `rstate` is passed, so the trial points are not pinned by this
# cell; seed the search if the reported optimum must be reproducible.
best = fmin(fn=best_l, space=hp.uniform('l', 0.2, 5), algo=tpe.suggest, max_evals=20)

100%|██████████| 20/20 [1:04:07<00:00, 185.34s/it, best loss: 0.31727504315878474]


In [10]:
# Best `l` found for the Lidstone trigram with the Poisson channel.
best['l']

0.269518851825738

## Channel model with normalized probabilities inversely proportional to edit distance

In [11]:
# Switch the shared `channel` object away from the Poisson variant.
# The warning text captured in the next cell's output shows
# (1 - channel_prob_param) being spread over `inv_eds` after dividing by its sum
# — presumably 0.99 is the mass kept for the unchanged word; TODO confirm.
channel.channel_method_poisson = False
channel.channel_prob_param = 0.99

In [12]:
# Same search as before (uniform `l` in [0.2, 5], TPE, 20 trials), now with the
# non-Poisson channel. Overwrites `best` from the previous search.
# Slow: the recorded run below took about 1 h 04 min and emitted warnings from
# the `probs = ...` line inside the channel model.
# NOTE(review): no `rstate` is passed, so results may differ between runs.
best = fmin(fn=best_l, space=hp.uniform('l', 0.2, 5), algo=tpe.suggest, max_evals=20)

  0%|          | 0/20 [00:00<?, ?it/s, best loss: ?]

  probs = (1-self.channel_prob_param)/inv_eds.sum() * inv_eds

  probs = (1-self.channel_prob_param)/inv_eds.sum() * inv_eds



100%|██████████| 20/20 [1:03:35<00:00, 184.59s/it, best loss: 0.7003744354224736]


In [13]:
# Best `l` found for the Lidstone trigram with the non-Poisson channel.
best['l']

0.4197552886522665

# Interpolated Kneser-Ney smoothed trigram

In [14]:
# Trigram (order=3) interpolated Kneser-Ney model built from the same counts and
# vocabulary. discount=0.9276 is hard-coded — presumably tuned elsewhere; TODO confirm.
trigramKNI = MKneserNeyInterpolated(order=3, discount=0.9276, vocabulary=vocabulary, counter=counter)
# Rebinds the module-level `channel`; best_l() looks the name up at call time,
# so every search from here on evaluates this model instead of the Lidstone one.
channel = NoisyChannelModel(trigramKNI)

## Poisson Channel model

In [15]:
# Select the Poisson channel variant on the Kneser-Ney `channel`, using the same
# settings as for the Lidstone model so the two runs are comparable.
channel.channel_method_poisson = True
channel.channel_prob_param = 0.01

In [16]:
# Same search (uniform `l` in [0.2, 5], TPE, 20 trials) for the Kneser-Ney model
# with the Poisson channel. Overwrites `best` from the previous search.
# Slow: the recorded run below took about 3 h 10 min.
# NOTE(review): no `rstate` is passed, so results may differ between runs.
best = fmin(fn=best_l, space=hp.uniform('l', 0.2, 5), algo=tpe.suggest, max_evals=20)

100%|██████████| 20/20 [3:10:14<00:00, 578.14s/it, best loss: 0.27098692568696015]  


In [17]:
# Best `l` found for the Kneser-Ney trigram with the Poisson channel.
best['l']

0.2104752435030852

## Channel model with normalized probabilities inversely proportional to edit distance

In [18]:
# Switch the Kneser-Ney `channel` to the non-Poisson variant, using the same
# settings as for the Lidstone model so the two runs are comparable.
channel.channel_method_poisson = False
channel.channel_prob_param = 0.99

In [19]:
# Same search (uniform `l` in [0.2, 5], TPE, 20 trials) for the Kneser-Ney model
# with the non-Poisson channel. Overwrites `best` from the previous search.
# Slow: the recorded run below took about 3 h 22 min.
# NOTE(review): no `rstate` is passed, so results may differ between runs.
best = fmin(fn=best_l, space=hp.uniform('l', 0.2, 5), algo=tpe.suggest, max_evals=20)

100%|██████████| 20/20 [3:21:32<00:00, 606.81s/it, best loss: 0.6686602454700643]  


In [20]:
# Best `l` found for the Kneser-Ney trigram with the non-Poisson channel.
best['l']

0.31665685543409694