In [1]:
import random
import numpy as np
from utils import get_logs, parse_logs, clean_data
from model import NgramModel

In [2]:
logs = parse_logs(get_logs())

In [3]:
all_keys = np.array([k for k, v in zip(logs.keys(), logs.values()) if len(v[-1]) > 1])
white_keys = np.array([k for k, v in zip(logs.keys(), logs.values()) if len(v[-1]) > 1 and v[-1][1][0] == 'w'])
black_keys = np.array([k for k, v in zip(logs.keys(), logs.values()) if len(v[-1]) > 1 and v[-1][1][0] == 'b'])
remaining_keys = np.array([k for k in logs.keys() if k not in white_keys and k not in black_keys])
draw_keys = np.array([k for k in remaining_keys if logs[k][-1] == ['d']])
white_keys = np.append(white_keys, [k for k in remaining_keys if len(logs[k][-1]) > 1 and logs[k][-1][1][0] == 'r' and logs[k][-2][1][0] == 'w'])
black_keys = np.append(black_keys, [k for k in remaining_keys if len(logs[k][-1]) > 1 and logs[k][-1][1][0] == 'r' and logs[k][-2][1][0] == 'b'])
draw_keys = np.append(draw_keys, [k for k in remaining_keys if len(logs[k][-1]) > 1 and logs[k][-1][1][0] == 'a'])

In [4]:
def extract_words(logs, key):
    pieces = []
    rows = logs[key]
    rows_count = len(rows)
    for i, row in enumerate(rows):
        if len(row) == 3:
            pieces.extend(row[1:3])
    return pieces

In [5]:
words = [extract_words(logs, key) for key in all_keys]
word_test = [extract_words(logs, key) for key in all_keys][-1]

In [6]:
def extract_ngrams(tokens, n):
        if n == 1:
            tokens_extended = ['s'] + tokens    
        else:
            tokens_extended = ['s'] * (n - 1) + tokens 
        ngrams = []
        for i in range(len(tokens_extended) - n):
            if n == 1:
                ngram = tokens_extended[i:i + n][0]
            else:
                ngram = tuple(tokens_extended[i:i + n])
            ngrams.append(ngram)
        ngrams = np.array(ngrams)
        return ngrams

In [7]:
def concatenate_ngrams(logs, all_keys, n):
    ngrams = [extract_ngrams(extract_words(logs, key), n) for key in all_keys]
    concatenated_array = np.concatenate(ngrams)
    return concatenated_array

In [8]:
one = concatenate_ngrams(logs, all_keys, 1)
two = concatenate_ngrams(logs, all_keys, 2)
three = concatenate_ngrams(logs, all_keys, 3)
four = concatenate_ngrams(logs, all_keys, 4)

In [9]:
two[:10]

array([['s', 'wG1'],
       ['wG1', '.'],
       ['.', 'bG1'],
       ['bG1', '\\wG1'],
       ['\\wG1', 'wA1'],
       ['wA1', 'wG1\\'],
       ['wG1\\', 'bG2'],
       ['bG2', '\\bG1'],
       ['\\bG1', 'wQ'],
       ['wQ', 'wG1-']], dtype='<U4')

In [10]:
word_test[:10]

['wG1', '.', 'bG1', 'wG1\\', 'wQ', '-wG1', 'bQ', '/bG1', 'wA1', '\\wQ']

In [11]:
model = NgramModel(2)
model.update(two)
model.generate_text(4)

['wG1', 'bG2-', 'wG3', 'wA2\\']

In [12]:
model.calculate_probs()

{('s', 'wG1'): 0.8134328358208955,
 ('wG1', '.'): 0.3318112633181126,
 ('.', 'bG1'): 0.7835820895522388,
 ('bG1', '\\wG1'): 0.007692307692307693,
 ('\\wG1', 'wA1'): 0.05154639175257732,
 ('wA1', 'wG1\\'): 0.021566401816118047,
 ('wG1\\', 'bG2'): 0.05442176870748299,
 ('bG2', '\\bG1'): 0.004975124378109453,
 ('\\bG1', 'wQ'): 0.22033898305084745,
 ('wQ', 'wG1-'): 0.03940886699507389,
 ('wG1-', 'bQ'): 0.10357142857142858,
 ('bQ', '-bG1'): 0.022764227642276424,
 ('-bG1', 'wA1'): 0.171875,
 ('wA1', 'bG2/'): 0.007945516458569807,
 ('bG2/', 'bA1'): 0.04,
 ('bA1', '-bG2'): 0.008403361344537815,
 ('-bG2', 'wG2'): 0.047619047619047616,
 ('wG2', 'wG1\\'): 0.006688963210702341,
 ('wG1\\', 'bA1'): 0.1360544217687075,
 ('bA1', '/wG2'): 0.007202881152460984,
 ('/wG2', 'wS1'): 0.16129032258064516,
 ('wS1', '\\wA1'): 0.004166666666666667,
 ('\\wA1', 'bA2'): 0.025,
 ('bA2', '-bA1'): 0.012328767123287671,
 ('-bA1', 'wS1'): 0.04,
 ('wS1', '-bQ'): 0.018055555555555554,
 ('-bQ', 'bS1'): 0.059322033898305086

In [13]:
test_moves = word_test
model.perplexity(test_moves)

perplexity: [1m0[0m
ngram probability: 0.3318112633181126, perplexity: 0.3318112633181126
perplexity: [1m3.0[0m
ngram probability: 0.7835820895522388, perplexity: 3.404604161083835
perplexity: [1m3.4[0m
ngram probability: 0.026153846153846153, perplexity: 11.469771623815792
perplexity: [1m11.5[0m
ngram probability: 0.05442176870748299, perplexity: 23.747156131467946
perplexity: [1m23.7[0m
ngram probability: 0.07060755336617405, perplexity: 40.349798943125016
perplexity: [1m40.3[0m
ngram probability: 0.19844357976653695, perplexity: 52.83258570789248
perplexity: [1m52.8[0m
ngram probability: 0.05203252032520325, perplexity: 80.59185861127665
perplexity: [1m80.6[0m
ngram probability: 0.22058823529411764, perplexity: 97.35166380516806
perplexity: [1m97.4[0m
ngram probability: 0.012485811577752554, perplexity: 158.4355461744815
perplexity: [1m158.4[0m
ngram probability: 0.18947368421052632, perplexity: 187.1103173455967
perplexity: [1m187.1[0m
ngram probability: 0.025