In [21]:
# allowing better python 2 & python 3 compatibility 
from __future__ import print_function 

def hangman(secret_word, guesser, max_mistakes=8, verbose=True, **guesser_args):
    """
        secret_word: a string of lower-case alphabetic characters, i.e., the answer to the game
        guesser: a function which guesses the next character at each stage in the game
            The function takes a:
                mask: what is known of the word, as a string with _ denoting an unknown character
                guessed: the set of characters which already been guessed in the game
                guesser_args: additional (optional) keyword arguments, i.e., name=value
        max_mistakes: limit on length of game, in terms of allowed mistakes
        verbose: be chatty vs silent
        guesser_args: keyword arguments to pass directly to the guesser function
    """
    secret_word = secret_word.lower()
    mask = ['_'] * len(secret_word)
    guessed = set()
    if verbose:
        print("Starting hangman game. Target is", ' '.join(mask), 'length', len(secret_word))
    
    mistakes = 0
    while mistakes < max_mistakes:
        if verbose:
            print("You have", (max_mistakes-mistakes), "attempts remaining.")
        guess = guesser(mask, guessed, **guesser_args)

        if verbose:
            print('Guess is', guess)
        if guess in guessed:
            if verbose:
                print('Already guessed this before.')
            mistakes += 1
        else:
            guessed.add(guess)
            if guess in secret_word:
                for i, c in enumerate(secret_word):
                    if c == guess:
                        mask[i] = c
                if verbose:
                    print('Good guess:', ' '.join(mask))
            else:
                if verbose:
                    print('Sorry, try again.')
                mistakes += 1
                
        if '_' not in mask:
            if verbose:
                print('Congratulations, you won.')
            return mistakes
        
    if verbose:
        print('Out of guesses. The word was', secret_word)    
    return mistakes

def human(mask, guessed, **kwargs):
    """
    simple function for manual play
    """
    print('Enter your guess:')
    try:
        return raw_input().lower().strip() # python 3
    except NameError:
        return input().lower().strip() # python 2

In [8]:
hangman('whatever', human, 8, True)

Starting hangman game. Target is _ _ _ _ _ _ _ _ length 8
You have 8 attempts remaining.
Enter your guess:
w
Guess is w
Good guess: w _ _ _ _ _ _ _
You have 8 attempts remaining.
Enter your guess:
asd
Guess is asd
Sorry, try again.
You have 7 attempts remaining.
Enter your guess:
asd
Guess is asd
Already guessed this before.
You have 6 attempts remaining.
Enter your guess:

Guess is 
Good guess: w _ _ _ _ _ _ _
You have 6 attempts remaining.
Enter your guess:
zxc
Guess is zxc
Sorry, try again.
You have 5 attempts remaining.
Enter your guess:
zx
Guess is zx
Sorry, try again.
You have 4 attempts remaining.
Enter your guess:
c
Guess is c
Sorry, try again.
You have 3 attempts remaining.
Enter your guess:
zxc
Guess is zxc
Already guessed this before.
You have 2 attempts remaining.
Enter your guess:
z
Guess is z
Sorry, try again.
You have 1 attempts remaining.
Enter your guess:
c
Guess is c
Already guessed this before.
Out of guesses. The word was whatever


8

In [22]:
from nltk.corpus import brown
import re
import numpy as np
word_set = []
words = brown.words()
for word in words:
    word = word.lower()
    if word.isalpha():
        word_set.append(word)
word_set = list(set(word_set))
np.random.shuffle(word_set)
test_set = word_set[0:1000]
train_set = word_set[1000:]


print(len(test_set))
print(len(train_set))

1000
39234


In [23]:

def ai(mask, guessed, **kwargs):
    alpha = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
    guess = np.random.choice(alpha,1)[0]
    while guess in guessed:
        guess =  np.random.choice(alpha,1)[0]
    return guess

def get_output(test_set,ai):
    total_guess = 0
    for i in test_set:
        total_guess += hangman(i, ai, 26,False)
    return float(total_guess)/1000

print (get_output(test_set,ai))

16.748


In [11]:
from collections import Counter
def get_counts(train_set):
    unigram_counts = Counter()
    for word in train_set:
        for characters in word:
            unigram_counts[characters] += 1
    count = dict(unigram_counts)   
    count = sorted(count.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
    list_alpha = []
    for i in range(len(count)):
        list_alpha.append(count[i][0])
    return list_alpha


alpha = get_counts(train_set)

def uniai(mask, guessed, **kwargs):
    guess = alpha[len(guessed)]
    return guess

def get_output(test_set,ai):
    total_guess = 0
    for i in test_set:
        total_guess += hangman(i, ai, 26,False)
    return float(total_guess)/1000

print (get_output(test_set,uniai))



10.637


In [14]:
def get_new_counts(train_set,length):
    unigram_counts = Counter()
    for word in train_set:
        if len(word) == length:
            for characters in word:
                unigram_counts[characters] += 1
    count = dict(unigram_counts)   
    count = sorted(count.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
    list_alpha = []
    for i in range(len(count)):
        list_alpha.append(count[i][0])
    if len(list_alpha) == 0:
        return get_counts(train_set)
    else:
        return list_alpha


def new_uniai(mask, guessed, **kwargs):
    alpha = get_new_counts(train_set,len(mask))
    if len(alpha) != 26:
        alpha = get_counts(train_set)
    guess = alpha[len(guessed)]
    return guess
                           
def get_output(test_set,ai):
    total_guess = 0
    for i in test_set:
        total_guess += hangman(i, ai, 26,False)
    return float(total_guess)/1000

print (get_output(test_set,new_uniai))

10.269


In [24]:
from nltk import ngrams
import math
from collections import defaultdict
from collections import Counter
def get_count(dataset,n):
    ngram = defaultdict(Counter)
    unigram = Counter()
    for word in dataset:
        w = ["<s4>","<s3>","<s2>","<s1>"] + [alpha.lower() for alpha in word] + ["</s1>","</s2>","</s3>","</s4>"]      
        template = ngrams(w,n)
        for gram in template:
            prev = list(gram)
            target = prev.pop()
            if n > 2:
                ngram[tuple(prev)][target] += 1
            elif n == 2:
                ngram[prev.pop()][target] += 1
            elif n == 1:
                unigram[target] += 1
    if n > 1:
        return ngram
    else:
        return unigram

    
unigram_counts = get_count(train_set,1)   
bigram_counts = get_count(train_set,2)
trigram_counts = get_count(train_set,3)
fourgram_counts = get_count(train_set,4)
fivegram_counts = get_count(train_set,5)
token_counts = float(sum(unigram_counts.values()))

In [25]:
alpha = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']

#token_count = float(sum(unigrams.values()))
def get_log_prob_interp(prev_word, word, unigram_counts, bigram_counts, trigram_counts, fourgram_counts, fivegram_counts, token_count, lambdas):
    fivegram_lambda = lambdas[0]
    fourgram_lambda = lambdas[1]
    trigram_lambda = lambdas[2]
    bigram_lambda = lambdas[3]
    unigram_lambda = lambdas[4]

    
    
    # fivegram probability
    sm_fivegram_counts = fivegram_counts[prev_word][word] * fivegram_lambda
    if sm_fivegram_counts == 0.0:
        interp_fivegram_counts = 0
    else:
        interp_fivegram_counts = sm_fivegram_counts / (float(fourgram_counts[(prev_word[-4],prev_word[-3],prev_word[-2])][prev_word[-1]]))
    
    # fourgram_probability
    sm_fourgram_counts = fourgram_counts[prev_word][word] * fourgram_lambda
    if sm_fourgram_counts == 0.0:
        interp_fourgram_counts = 0
    else:
        interp_fourgram_counts = sm_fourgram_counts / (float(trigram_count[(prev_word[-3],prev_word[-2])][prev_word[-1]]))
    
    
    # trigram probability
    sm_trigram_counts = trigram_counts[prev_word][word] * trigram_lambda
    if sm_trigram_counts == 0.0:
        interp_trigram_counts = 0
    else:
        interp_trigram_counts = sm_trigram_counts / (float(bigram_counts[prev_word[-2]][prev_word[-1]]))
    
    # bigram probability
    sm_bigram_counts = bigram_counts[prev_word[-1]][word] * bigram_lambda
    if sm_bigram_counts == 0.0:
        interp_bigram_counts = 0
    else:
        interp_bigram_counts = sm_bigram_counts / (float(unigram_counts[prev_word[-1]]))
     
    # unigram probability
    interp_unigram_counts = (unigram_counts[word]/(token_count)) * unigram_lambda
    
    
    # sum of 1-5gram log probability
    prob = math.log(interp_fivegram_counts + interp_fourgram_counts + interp_trigram_counts + interp_bigram_counts + interp_unigram_counts)
    return prob
        

def get_word_log_prob_interp(prev, unigram_counts, bigram_counts, trigram_counts, fourgram_counts, fivegram_counts, token_count, lambdas):
    prev_word = prev
    pred = [(get_log_prob_interp(prev_word, 
                                    word, 
                                    unigram_counts, 
                                    bigram_counts,
                                    trigram_counts,
                                    fourgram_counts,
                                    fivegram_counts,
                                    token_count, 
                                    lambdas),word) for word in alpha]
    
    pred = sorted(pred, key=lambda x: x[0])
    return pred
    

In [26]:
def trigram_ai(mask, guessed, **kwargs):
    """change lambdas to fit a trigram model
    """
    mask = list(mask)
    mask = ['<s4>','<s3>','<s2>','<s1>']+mask+['</s1>','</s2>','</s3>','</s4>']
    for i in range(len(mask)):
        if mask[i] == '_':
            prev = (mask[i-4],mask[i-3],mask[i-2],mask[i-1])
            break
    possible = get_word_log_prob_interp(prev, 
                                        unigram_counts, 
                                        bigram_counts, 
                                        trigram_counts, 
                                        fourgram_counts, 
                                        fivegram_counts, 
                                        token_counts, 
                                        (0, 0,0.9,0.09,0.001))   # trigram
    pred = [x for y,x in possible]  
    guess = pred.pop()
    while guess in guessed or guess is "<s4>" or guess is "<s3>" or guess is "<s2>" or guess is "<s1>":
        guess = pred.pop()
    #print(guess)
    return guess
print("guess for 3-grams")    
print (get_output(test_set,trigram_ai)) 

def fourgram_ai(mask, guessed, **kwargs):
    """change lambdas to fit a trigram model
    """
    mask = list(mask)
    mask = ['<s4>','<s3>','<s2>','<s1>']+mask+['</s1>','</s2>','</s3>','</s4>']
    for i in range(len(mask)):
        if mask[i] == '_':
            prev = (mask[i-4],mask[i-3],mask[i-2],mask[i-1])
            break
    possible = get_word_log_prob_interp(prev, 
                                        unigram_counts, 
                                        bigram_counts, 
                                        trigram_counts, 
                                        fourgram_counts, 
                                        fivegram_counts, 
                                        token_counts, 
                                        (0,0.9,0.09,0.009,0.0001))   # trigram
    pred = [x for y,x in possible]  
    guess = pred.pop()
    while guess in guessed or guess is "<s4>" or guess is "<s3>" or guess is "<s2>" or guess is "<s1>":
        guess = pred.pop()
    #print(guess)
    return guess
print("guess for 4-grams")        
print (get_output(test_set,fourgram_ai)) 

def fivegram_ai(mask, guessed, **kwargs):
    """change lambdas to fit a trigram model
    """
    mask = list(mask)
    mask = ['<s4>','<s3>','<s2>','<s1>']+mask+['</s1>','</s2>','</s3>','</s4>']
    for i in range(len(mask)):
        if mask[i] == '_':
            prev = (mask[i-4],mask[i-3],mask[i-2],mask[i-1])
            break
    possible = get_word_log_prob_interp(prev, 
                                        unigram_counts, 
                                        bigram_counts, 
                                        trigram_counts, 
                                        fourgram_counts, 
                                        fivegram_counts, 
                                        token_counts, 
                                        (0.9,0.09,0.009,0.0009,0.00001))   # trigram
    pred = [x for y,x in possible]  
    guess = pred.pop()
    while guess in guessed or guess is "<s4>" or guess is "<s3>" or guess is "<s2>" or guess is "<s1>":
        guess = pred.pop()
    #print(guess)
    return guess
print("guess for 5-grams")     
print (get_output(test_set,fivegram_ai)) 

guess for 3-grams
9.15
guess for 4-grams
9.15
guess for 5-grams
8.21
