In [1]:
import random
import numpy as np
from utils import get_logs, parse_logs, clean_data
from model import NgramModel

In [2]:
# Load the raw logs and parse them into `logs`, which later cells index by key
# and read as a sequence of rows per key.
# NOTE(review): get_logs / parse_logs come from the local `utils` module; the
# exact row schema is not visible here — confirm against utils.py.
logs = parse_logs(get_logs())

In [3]:
# Partition the log keys by what the final row of each entry looks like.
# NOTE(review): presumably 'w'/'b' mark the winning colour, 'r' a resignation
# and 'a'/'d' a draw — confirm against parse_logs before relying on the names.

# Entries whose final row has more than one field.
all_keys = np.array([key for key, rows in logs.items() if len(rows[-1]) > 1])

# Final row's second field starts with 'w' / 'b'.
white_keys = np.array([
    key for key, rows in logs.items()
    if len(rows[-1]) > 1 and rows[-1][1][0] == 'w'
])
black_keys = np.array([
    key for key, rows in logs.items()
    if len(rows[-1]) > 1 and rows[-1][1][0] == 'b'
])

# Everything not yet classified as white or black.
remaining_keys = np.array([
    key for key in logs.keys()
    if not (key in white_keys or key in black_keys)
])

# Final row is exactly ['d'].
draw_keys = np.array([key for key in remaining_keys if logs[key][-1] == ['d']])

# Final row tagged 'r': classify by the first character of the second field
# of the row before it.
white_keys = np.append(white_keys, [
    key for key in remaining_keys
    if len(logs[key][-1]) > 1
    and logs[key][-1][1][0] == 'r'
    and logs[key][-2][1][0] == 'w'
])
black_keys = np.append(black_keys, [
    key for key in remaining_keys
    if len(logs[key][-1]) > 1
    and logs[key][-1][1][0] == 'r'
    and logs[key][-2][1][0] == 'b'
])

# Final row tagged 'a' is added to the draws.
draw_keys = np.append(draw_keys, [
    key for key in remaining_keys
    if len(logs[key][-1]) > 1 and logs[key][-1][1][0] == 'a'
])

In [4]:
def extract_words(logs, key):
    """Flatten the three-field rows of one log entry into a token list.

    Args:
        logs: mapping from key to that entry's sequence of rows.
        key: key of the entry to read.

    Returns:
        A flat list holding, for every row with exactly three fields, that
        row's second and third fields, in row order. Rows of any other
        length are skipped.
    """
    tokens = []
    for row in logs[key]:
        if len(row) == 3:
            tokens.extend(row[1:3])
    return tokens

In [5]:
# Token sequence for every key in all_keys.
words = [extract_words(logs, key) for key in all_keys]
# Tokens of the last key only, extracted directly instead of re-extracting
# every entry and discarding all but the final one.
word_test = extract_words(logs, all_keys[-1])

In [6]:
def concatenate_ngrams(logs, all_keys, n):
    """Build one array holding the n-grams of every requested log entry.

    Args:
        logs: mapping from key to that entry's rows.
        all_keys: iterable of keys whose entries should be included.
        n: n-gram order passed through to extract_ngrams.

    Returns:
        The per-entry n-gram arrays joined with np.concatenate, in the
        order of all_keys.
    """
    per_entry = []
    for key in all_keys:
        tokens = extract_words(logs, key)
        per_entry.append(extract_ngrams(tokens, n))
    return np.concatenate(per_entry)

In [7]:
def extract_ngrams(tokens, n):
    """Slide a window of size ``n`` over a start-padded token sequence.

    The sequence is left-padded with the start marker 's': ``n - 1`` copies
    for n >= 2, and a single copy for n == 1 (so the unigram array also
    contains the start marker).

    Args:
        tokens: sequence of token strings.
        n: n-gram order; must be >= 1.

    Returns:
        For n == 1, a 1-D array of the padded tokens. For n >= 2, a 2-D
        array of shape (count, n) with one n-gram per row; the shape is
        (0, n) when there are no n-grams, so the result can always be
        concatenated with other arrays of the same order.

    Raises:
        ValueError: if n < 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    padded = ['s'] * max(n - 1, 1) + list(tokens)

    if n == 1:
        # Every padded token is its own unigram.
        return np.array(padded)

    # A sequence of length L contains L - n + 1 windows; stopping at L - n
    # would silently drop the final n-gram.
    windows = [tuple(padded[i:i + n]) for i in range(len(padded) - n + 1)]
    if not windows:
        # Keep the (count, n) layout even when there is nothing to return.
        return np.empty((0, n), dtype=str)
    return np.array(windows)

In [8]:
# n-gram arrays of order 1 through 4 over every key in all_keys.
one, two, three, four = (
    concatenate_ngrams(logs, all_keys, order) for order in range(1, 5)
)

In [9]:
# Peek at the first ten bigrams.
two[:10]

array([['s', 'wG1'],
       ['wG1', '.'],
       ['.', 'bG1'],
       ['bG1', '\\wG1'],
       ['\\wG1', 'wA1'],
       ['wA1', 'wG1\\'],
       ['wG1\\', 'bG2'],
       ['bG2', '\\bG1'],
       ['\\bG1', 'wQ'],
       ['wQ', 'wG1-']], dtype='<U4')

By the chain rule of probability, the joint probability of a sequence factorizes as:
$$P(X_1 \ldots X_n) = P(X_1) P(X_2|X_1) P(X_3|X_{1:2}) \ldots P(X_n|X_{1:n-1}) = \prod_{k=1}^{n} P(X_k|X_{1:k-1})$$

When we use a bigram model to predict the conditional probability of the next word, we are thus making the following approximation:
$$ P(w_n|w_{1:n-1}) \approx P(w_n|w_{n-1})$$


More generally, an N-gram model approximates the probability of a word given its entire context by conditioning on only the previous $N-1$ words:
$$ P(w_n|w_{1:n-1}) \approx P(w_n|w_{n-N+1:n-1})$$


To estimate $P(w_n | w_{n-1})$, we compute the count of the bigram $C(w_{n-1} w_n)$ and normalize by the sum of all the bigrams that share the same first word $w_{n-1}$. Since the sum of all bigram counts that start with a given word $w_{n-1}$ must be equal to the unigram count for that word, this simplifies to:
$$
P(w_n | w_{n-1}) = \frac{{C(w_{n-1}w_n)}}{{C(w_{n-1})}}
$$
Example:
$$
P(\texttt{wG1} \mid \texttt{-bG1}) = \frac{C(\texttt{-bG1}\ \texttt{wG1})}{C(\texttt{-bG1})}
$$

In [10]:
def get_count_for_key(ngram_counts, desired_key):
    """Look up a count by comparing keys element by element.

    Unlike a plain dict lookup, the match only requires equal length and
    pairwise-equal elements, so ``desired_key`` does not have to be the
    same type as the stored key (a list matches a tuple, for example).

    Args:
        ngram_counts: mapping from n-gram key to count.
        desired_key: sequence to search for.

    Returns:
        The value of the first matching key in iteration order, or None
        when no key matches.
    """
    wanted_len = len(desired_key)
    for candidate, count in ngram_counts.items():
        if len(candidate) != wanted_len:
            continue
        if all(have == want for have, want in zip(candidate, desired_key)):
            return count
    return None

def calculate_probs(counted_ngrams, oneless_ngrams):
    """Estimate conditional n-gram probabilities from raw counts.

    Each n-gram count is divided by the count of its context, i.e. the
    n-gram with its last element removed.

    Args:
        counted_ngrams: mapping from n-gram key to count. Keys are indexed
            and sliced, so they are expected to be tuples of length >= 2.
        oneless_ngrams: mapping from (n-1)-gram key to count. For a
            two-element key the context is looked up by its bare first
            element; for longer keys it is looked up by the tuple of all
            but the last element.

    Returns:
        A dict mapping every key of ``counted_ngrams`` to
        count(ngram) / count(context).

    Raises:
        KeyError: if a context is missing from ``oneless_ngrams``.
    """
    probs = {}
    for ngram, count in counted_ngrams.items():
        # Two-element keys use the bare first element as their context key.
        context = ngram[0] if len(ngram) == 2 else ngram[:-1]
        # np.true_divide keeps numpy division semantics (np.float64 result,
        # inf/nan with a warning rather than ZeroDivisionError).
        probs[ngram] = np.true_divide(count, oneless_ngrams[context])
    return probs

In [11]:
# First ten tokens extracted for the last key in all_keys.
word_test[:10]

['wG1', '.', 'bG1', 'wG1\\', 'wQ', '-wG1', 'bQ', '/bG1', 'wA1', '\\wQ']

In [12]:
# Feed the concatenated bigram array to an NgramModel and generate a sequence.
# NOTE(review): NgramModel comes from the local `model` module; presumably 2 is
# the n-gram order and 15 the number of tokens to generate — confirm in model.py.
# NOTE(review): no random seed is set anywhere in this notebook, so if
# generate_text samples randomly the output below will differ between runs.
model = NgramModel(2)
model.update(two)
model.generate_text(15)

['wG1',
 'wS2-',
 'wB1',
 'wA1/',
 'wA3',
 '-bQ',
 'bS2',
 'bA1-',
 'bG2',
 'bB2',
 'bQ',
 'wA1-',
 'bA1',
 '-wA1',
 'wB2']