# Codenames clue giving

These cells will be in the same order as in the blog post [here](https://jamesmullenbach.github.io/2018/01/02/codenames-fun.html) if you want to follow along.

Let's start by building the board.

In [4]:
import gensim
# Read only the first 200,000 vectors from the pretrained word2vec file.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=200000,
)

# The board: our team's words, the other team's words, and the assassin.
ours = [
    'watch', 'star', 'mole', 'Berlin', 'limousine',
    'day', 'wind', 'cap', 'thumb',
]
theirs = [
    'smuggler', 'crown', 'cotton', 'palm',
    'pumpkin', 'giant', 'link', 'dog',
]
assassin = 'tie'

Simple one-word clue method:

In [5]:
# Map each of our words to (at most) its five best rule-abiding clue candidates.
clues = {}
for our_word in ours:
    # Candidates are lowercased below, so the board word must be lowercased too
    # before the substring checks; otherwise a capitalized board word such as
    # 'Berlin' would never match a lowercased candidate like 'berliner'.
    our_lower = our_word.lower()
    # get similar words from google vocab, and lowercase them
    candidates = [(c.lower(), s) for c, s in model.most_similar(positive=[our_word], topn=50)]
    # per game rules, we should exclude multi-word results and words that use part of the clue word.
    # we could use stemmers for this maybe, but let's keep it simple
    clues[our_word] = [
        (c, s) for c, s in candidates
        if '_' not in c and c not in our_lower and our_lower not in c
    ][:5]

In [9]:
# Best-scoring (clue, similarity) pair for each of our words.
dict((board_word, ranked[0]) for board_word, ranked in clues.items())

{'Berlin': (u'munich', 0.6743212938308716),
 'cap': (u'hat', 0.4268711507320404),
 'day': (u'week', 0.65529865026474),
 'limousine': (u'limos', 0.6517890095710754),
 'mole': (u'birthmark', 0.46605658531188965),
 'star': (u'heartthrob', 0.543801486492157),
 'thumb': (u'pinkie', 0.6484930515289307),
 'watch': (u'see', 0.5326846837997437),
 'wind': (u'gusts', 0.5962637662887573)}

Rule keeping:

In [29]:
def verify(candidate, word_list):
    """Return True if `candidate` is a legal clue for the words in `word_list`.

    A candidate is rejected when it contains an underscore (multi-word
    vocabulary entries are joined with '_'), or when it contains, or is
    contained in, any word of `word_list`.

    The substring comparison is case-insensitive, so the result does not
    depend on whether the caller lowercased the candidate or on capitalized
    board words such as 'Berlin'.

    Args:
        candidate: the proposed clue string.
        word_list: iterable of board-word strings the clue must not overlap.

    Returns:
        bool: True when the candidate passes every check, False otherwise.
    """
    if '_' in candidate:
        return False
    candidate_lower = candidate.lower()
    for word in word_list:
        word_lower = word.lower()
        if word_lower in candidate_lower or candidate_lower in word_lower:
            return False
    return True

Two-word clues!

In [30]:
import itertools
# For every unordered pair of our words, keep up to five valid clue candidates
# drawn from the 50 vocabulary entries most similar to the pair.
clues_2 = {}
for pair in itertools.combinations(ours, 2):
    our_list = list(pair)
    neighbours = model.most_similar(positive=our_list, topn=50)
    candidates = [(entry.lower(), score) for entry, score in neighbours]
    valid = [(entry, score) for entry, score in candidates if verify(entry, our_list)]
    clues_2[pair] = valid[:5]

In [32]:
# Filtered clue candidates (at most five) for the pair ('watch', 'star').
clues_2[('watch', 'star')]

[(u'standout', 0.4403938949108124),
 (u'legend', 0.4198486804962158),
 (u'fans', 0.41028615832328796),
 (u'phenom', 0.4011392295360565),
 (u'idol', 0.4011325240135193)]

In [33]:
# Filtered clue candidates (at most five) for the pair ('mole', 'Berlin').
clues_2[('mole', 'Berlin')]

[(u'stasi', 0.5160060524940491),
 (u'hamburg', 0.47616061568260193),
 (u'german', 0.47304385900497437),
 (u'budapest', 0.4673951268196106),
 (u'munich', 0.4670976400375366)]

Using the whole board:

In [41]:
# Query with all of our words as positives and every word we must avoid
# (the other team's words plus the assassin) as negatives.
candidates = model.most_similar(
    positive=ours,
    negative=theirs + [assassin],
    topn=100,
)

In [42]:
import operator
# For each rule-abiding candidate, record the five of our words it is most
# similar to, as (word, similarity) pairs in descending order of similarity.
cand_words = []
for cand, _ in candidates:
    if not verify(cand, ours):
        continue
    scores = {our_word: model.similarity(cand, our_word) for our_word in ours}
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    cand_words.append((cand, ranked[:5]))

In [43]:
# First ten candidates, each mapped to its closest board words (scores dropped).
[{clue: [target for target, _sim in ranked]} for clue, ranked in cand_words[:10]]

[{u'flashbulbs': ['star', 'limousine', 'wind', 'watch', 'day']},
 {u'eve': ['day', 'star', 'Berlin', 'watch', 'limousine']},
 {u'rookies': ['star', 'cap', 'day', 'watch', 'wind']},
 {u'VIPs': ['limousine', 'watch', 'star', 'mole', 'Berlin']},
 {u'Tomczyk': ['Berlin', 'cap', 'wind', 'watch', 'day']},
 {u'Stasi': ['Berlin', 'mole', 'wind', 'watch', 'thumb']},
 {u'countdown': ['day', 'watch', 'mole', 'star', 'wind']},
 {u'##/#-hour': ['day', 'watch', 'limousine', 'wind', 'thumb']},
 {u'invitees': ['limousine', 'star', 'watch', 'day', 'Berlin']},
 {u'hour': ['day', 'watch', 'limousine', 'wind', 'star']}]

Classification!

In [90]:
import numpy as np
from sklearn import linear_model
# Training set: one embedding per board word, labelled +1 for our words and
# -1 for the other team's words and the assassin.
labeled_words = ours + theirs + [assassin]
# model[word] is the plain indexing form of the vector lookup; it avoids the
# word_vec() method, which newer gensim releases deprecate.
X = np.array([model[word] for word in labeled_words])
Y = np.array([1] * len(ours) + [-1] * (len(theirs) + 1))
# SGDClassifier's default loss is 'hinge', i.e. this is a linear SVM.
# penalty=None disables regularization (the string 'none' is rejected by
# current scikit-learn), and random_state pins the SGD shuffling so the cell
# gives the same weights on every run.
clf = linear_model.SGDClassifier(max_iter=1000, fit_intercept=False, penalty=None, random_state=0)
clf.fit(X, Y)
# With fit_intercept=False the weight vector alone defines the separating
# direction; use it as a query vector in embedding space.
vec = clf.coef_[0]

In [91]:
# The 100 vocabulary entries most similar to the classifier's weight vector.
clf_candidates = model.similar_by_vector(
    vec,
    topn=100,
)

Getting valid clues similarly:

In [92]:
# For each rule-abiding classifier candidate, list the five of our words it is
# most similar to, in descending order of similarity.
clf_cand_words = []
for cand, _ in clf_candidates:
    if not verify(cand, ours):
        continue
    scores = {our_word: model.similarity(cand, our_word) for our_word in ours}
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    closest = [w for w, s in ranked[:5]]
    clf_cand_words.append({cand: closest})

In [93]:
# These may not match the blog post exactly, since SGD training is stochastic,
# but there should be a good bit of overlap.
clf_cand_words[:10]

[{u'sprain': ['thumb', 'star', 'mole', 'watch', 'cap']},
 {u'wattage': ['star', 'wind', 'Berlin', 'limousine', 'watch']},
 {u'gametime': ['watch', 'thumb', 'day', 'wind', 'star']},
 {u'limos': ['limousine', 'star', 'watch', 'Berlin', 'mole']},
 {u'flashbulbs': ['star', 'limousine', 'wind', 'watch', 'day']},
 {u'invitees': ['limousine', 'star', 'watch', 'day', 'Berlin']},
 {u'MRI': ['thumb', 'mole', 'watch', 'wind', 'cap']},
 {u'tweeting': ['watch', 'day', 'thumb', 'star', 'mole']},
 {u'inactives': ['thumb', 'watch', 'day', 'mole', 'limousine']},
 {u'schedule': ['day', 'watch', 'wind', 'thumb', 'cap']}]

Using only a couple of our team's words:

In [94]:
# Target only two of our words, still pushing away from every word to avoid.
ours_sub = ['watch', 'star']
labeled_words = ours_sub + theirs + [assassin]
# model[word] is the plain indexing form of the vector lookup; it avoids the
# word_vec() method, which newer gensim releases deprecate.
X = np.array([model[word] for word in labeled_words])
Y = np.array([1] * len(ours_sub) + [-1] * (len(theirs) + 1))
# Linear SVM (SGDClassifier's default hinge loss). penalty=None disables
# regularization (the string 'none' is rejected by current scikit-learn), and
# random_state pins the SGD shuffling so the cell is reproducible.
clf = linear_model.SGDClassifier(max_iter=1000, fit_intercept=False, penalty=None, random_state=0)
clf.fit(X, Y)
vec = clf.coef_[0]

In [95]:
# The 100 vocabulary entries most similar to the subset classifier's weights.
sub_candidates = model.similar_by_vector(
    vec,
    topn=100,
)

In [99]:
# Again, may be slightly different.
# Clues are checked against the whole of `ours`, not just `ours_sub`.
valid_sub = [entry for entry, _ in sub_candidates if verify(entry.lower(), ours)]
valid_sub[:5]

[u'viewing', u'marquee', u'seeing', u'RATINGS', u'tuned']