In [1]:
import json
import os
import random

import numpy as np

In [2]:
# Step up one directory so the relative paths used below resolve against the parent folder.
# Note: re-running this cell moves up one more level each time.
os.chdir('..')

In [3]:
# Seed Python's `random` module so draws made with it are repeatable
random.seed(42)

In [5]:
# Alternative input file: 'games_with_tags.json'
# Read as UTF-8 explicitly: open() otherwise falls back to a locale-dependent
# default encoding, which can mis-decode non-ASCII characters in game titles.
with open('games_with_tags_min_20_freq.json', 'r', encoding='utf-8') as in_json:
    games_with_tags = json.load(in_json)

Generate mappings from games to indices, and likewise for tags:

In [6]:
# Number the games in the order they appear in the loaded dict, then invert
# that numbering to get the title -> index lookup.
index_game = dict(enumerate(games_with_tags))
game_index = {title: position for position, title in index_game.items()}

In [8]:
# Assign each distinct tag the next free index, in order of first appearance
tag_index = {}
for tag_list in games_with_tags.values():
    for tag_name in tag_list:
        # setdefault only stores the new index when the tag has not been seen yet
        tag_index.setdefault(tag_name, len(tag_index))
tag_count = len(tag_index)

# Reverse lookup: index -> tag name
index_tag = {position: tag_name for tag_name, position in tag_index.items()}

In [9]:
# Report how many distinct games and tags were indexed
print(f'n game: {len(game_index)}')
print(f'n tags: {len(tag_index)}')

n game: 32421
n tags: 358


Create true pairs

In [10]:
# One (game index, tag index) tuple for every tag attached to every game
pairs = [
    (game_index[title], tag_index[tag_name])
    for title, tag_list in games_with_tags.items()
    for tag_name in tag_list
]

In [11]:
# Peek at the first (game index, tag index) pair
pairs[0]

(0, 0)

In [12]:
# Translate the first pair back into its game title and tag name
index_game[pairs[0][0]], index_tag[pairs[0][1]]

('Counter-Strike', '1980s')

In [13]:
# Spot-check another pair (position 5000) by decoding it to names
index_game[pairs[5000][0]], index_tag[pairs[5000][1]]

('Spectrum', 'Adventure')

In [14]:
# Spot-check a third pair (position 20000) by decoding it to names
index_game[pairs[20000][0]], index_tag[pairs[20000][1]]

('Aberoth', 'Pixel_Graphics')

In [15]:
# Set version of the true pairs, used for membership tests when sampling negatives
pairs_set = set(pairs)

In [16]:
# Sanity check: the first pair shown above should be found in the set
(0, 0) in pairs_set

True

In [18]:
random.seed(100)

def generate_batch(pairs, n_positive = 50, negative_ratio = 1.0, classification = False,
                   n_games = None, n_tags = None, positive_set = None):
    """Generate batches of samples for training.

    Endless generator. Every batch holds `n_positive` true (game, tag) pairs
    sampled from `pairs` plus randomly drawn pairs that are not in
    `positive_set`, shuffled together.

    Parameters
    ----------
    pairs : sequence of (game_id, tag_id) tuples
        True pairs to sample positives from; must hold at least `n_positive` items.
    n_positive : int
        Number of positive samples per batch.
    negative_ratio : int or float
        Negatives per positive; the batch size is int(n_positive * (1 + negative_ratio)).
    classification : bool
        Negatives are labelled 0 when True, otherwise -1. Positives are always 1.
    n_games, n_tags : int, optional
        Exclusive upper bounds for randomly drawn game / tag ids. Default to
        len(game_index) and len(tag_index) from the notebook.
    positive_set : set of (game_id, tag_id), optional
        Pairs that must never be emitted as negatives. Defaults to the
        notebook-level `pairs_set`.

    Yields
    ------
    ({'game': ndarray, 'tag': ndarray}, ndarray)
        Game ids, tag ids and labels, each a float array of length batch size.

    Notes
    -----
    Negatives are found by rejection sampling, so the generator would loop
    forever if every possible (game, tag) combination were in `positive_set`.
    """
    # Fall back to the notebook-level lookups when no overrides are given
    if n_games is None:
        n_games = len(game_index)
    if n_tags is None:
        n_tags = len(tag_index)
    if positive_set is None:
        positive_set = pairs_set

    # negative_ratio may be a float (the default is 1.0) but numpy needs an integer shape
    batch_size = int(n_positive * (1 + negative_ratio))
    batch = np.zeros((batch_size, 3))

    # Adjust label based on task
    neg_label = 0 if classification else -1

    # This creates a generator
    while True:
        # Randomly choose positive examples
        for idx, (game_id, tag_id) in enumerate(random.sample(pairs, n_positive)):
            batch[idx, :] = (game_id, tag_id, 1)

        # Negatives start right after the positives (also well-defined when n_positive is 0)
        idx = n_positive

        # Add negative examples until the batch is full
        while idx < batch_size:

            # Random selection
            random_game = random.randrange(n_games)
            random_tag = random.randrange(n_tags)

            # Only keep the draw when it is not a known positive example
            if (random_game, random_tag) not in positive_set:
                batch[idx, :] = (random_game, random_tag, neg_label)
                idx += 1

        # Make sure to shuffle order
        np.random.shuffle(batch)

        # Yield copies: the buffer is reused on the next iteration, so views into it
        # would silently change under a consumer that still holds an earlier batch
        yield {'game': batch[:, 0].copy(), 'tag': batch[:, 1].copy()}, batch[:, 2].copy()

In [19]:
# Draw a single tiny batch (2 positives, 2 negatives per positive) to inspect the output format
next(generate_batch(pairs, n_positive = 2, negative_ratio = 2))

({'game': array([23984., 23118., 14907.,  5196., 14205., 24564.]),
  'tag': array([179., 201.,  89.,  48., 259.,  40.])},
 array([-1., -1., -1.,  1., -1.,  1.]))

In [20]:
# Decode a game index to its title
index_game[24504]

'GT Legends'

In [21]:
# Decode a tag index to its name
index_tag[30]

'Indie'

In [22]:
# Decode another game index to its title
index_game[5726]

'Far Beyond: A space odyssey VR'

In [24]:
# Decode another tag index to its name
index_tag[350]

'Photo_Editing'

In [25]:
# Draw one batch with 5 positive pairs and one negative per positive
x, y = next(generate_batch(pairs, n_positive = 5, negative_ratio = 1))

# Print each sampled pair by name together with its label
for game_idx, tag_idx, label in zip(x['game'], x['tag'], y):
    game_name = index_game[game_idx]
    tag_name = index_tag[tag_idx]
    print(f'Game: {game_name:30} Tag: {tag_name:40} Label: {label}')

Game: Gal*Gun VR                     Tag: Anime                                    Label: 1.0
Game: Ruin City Gasolina             Tag: Cats                                     Label: -1.0
Game: INVESTMENT HERO                Tag: Atmospheric                              Label: 1.0
Game: Lichtspeer                     Tag: 2D                                       Label: 1.0
Game: Poly Mole                      Tag: Satire                                   Label: -1.0
Game: End of Realms                  Tag: Hand-drawn                               Label: 1.0
Game: It Stares Back                 Tag: Souls-like                               Label: 1.0
Game: Ultimate General: Gettysburg   Tag: Chess                                    Label: -1.0
Game: Cardinal Quest 2               Tag: Family_Friendly                          Label: -1.0
Game: Zup! Zero 2                    Tag: Horses                                   Label: -1.0


In [26]:
from keras.layers import (
    Input,
    Embedding,
    Dot,
    Reshape,
    Dense
)
from keras.models import Model

Using TensorFlow backend.


In [86]:
def game_embedding_model(embedding_size = 200, classification = False):
    """Model to embed games and tags using the functional API.

    Trained to discern if a tag is present for a game.

    Parameters
    ----------
    embedding_size : int
        Length of the embedding vector learned for every game and every tag.
    classification : bool
        When True, a single sigmoid unit is stacked on the similarity and the
        model is compiled with binary cross-entropy and an accuracy metric.
        When False, the similarity itself is the output and the model is
        compiled with mean squared error.

    Returns
    -------
    A compiled Keras `Model` with the inputs 'game' and 'tag'. The embedding
    layers are named 'game_embedding' and 'tag_embedding'.
    """

    # Both inputs are 1-dimensional
    game = Input(name = 'game', shape = [1])
    tag = Input(name = 'tag', shape = [1])

    # Embedding the game (shape will be (None, 1, embedding_size))
    game_embedding = Embedding(name = 'game_embedding',
                               input_dim = len(game_index),
                               output_dim = embedding_size)(game)

    # Embedding the tag (shape will be (None, 1, embedding_size))
    tag_embedding = Embedding(name = 'tag_embedding',
                              input_dim = len(tag_index),
                              output_dim = embedding_size)(tag)

    # Merge the layers with a normalized dot product along the embedding axis
    # (shape will be (None, 1, 1)); normalize = True makes this a cosine similarity
    merged = Dot(name = 'dot_product', normalize = True, axes = 2)([game_embedding, tag_embedding])

    # Reshape to be a single number (shape will be (None, 1))
    merged = Reshape(target_shape = [1])(merged)

    # If classification, add extra layer and loss function is binary cross entropy
    if classification:
        merged = Dense(1, activation = 'sigmoid')(merged)
        # The inputs are the game/tag tensors defined above; the previous code
        # referenced undefined names here and raised NameError
        model = Model(inputs = [game, tag], outputs = merged)
        model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

    # Otherwise loss function is mean squared error
    else:
        model = Model(inputs = [game, tag], outputs = merged)
        model.compile(optimizer = 'Adam', loss = 'mse')

    return model

In [87]:
# Build the model with its defaults (embedding size 200, MSE loss) and list its layers
model = game_embedding_model()
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
game (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
tag (InputLayer)                (None, 1)            0                                            
__________________________________________________________________________________________________
game_embedding (Embedding)      (None, 1, 200)       6484200     game[0][0]                       
__________________________________________________________________________________________________
tag_embedding (Embedding)       (None, 1, 200)       71600       tag[0][0]                        
____________________________________________________________________________________________

In [107]:
# Number of true pairs per batch; with negative_ratio = 1 each batch holds twice as many samples
n_positive = 1024

gen = generate_batch(pairs, n_positive, negative_ratio = 1)

# Train
# steps_per_epoch is chosen so that one epoch draws about as many positives as there are true pairs.
# NOTE(review): fit_generator is deprecated in newer Keras/TensorFlow releases in favour of fit —
# confirm against the installed version before changing.
h = model.fit_generator(
    gen,
    epochs = 10, 
    steps_per_epoch = len(pairs) // n_positive,
    verbose = 2
)

Epoch 1/10
 - 19s - loss: 0.3906
Epoch 2/10
 - 19s - loss: 0.3972
Epoch 3/10
 - 19s - loss: 0.3793
Epoch 4/10
 - 19s - loss: 0.3824
Epoch 5/10
 - 19s - loss: 0.3824
Epoch 6/10
 - 19s - loss: 0.4062
Epoch 7/10
 - 19s - loss: 0.3930
Epoch 8/10
 - 19s - loss: 0.3829
Epoch 9/10
 - 19s - loss: 0.3754
Epoch 10/10
 - 19s - loss: 0.3910


In [108]:
# Persist the trained model as HDF5 under ./models, relative to the current working directory
# NOTE(review): presumably the ./models folder has to exist already — confirm
model.save('./models/second_attempt_min_20_freq.h5')

In [109]:
# Extract embeddings
# Pull the learned weight matrix out of the game embedding layer (one row per game)
game_layer = model.get_layer('game_embedding')
game_weights = game_layer.get_weights()[0]
game_weights.shape

(32421, 200)

Each game is now represented as a 200-dimensional vector.

We need to normalize the embeddings so that the dot product between two embeddings becomes the cosine similarity.

In [110]:
# Scale every row to unit L2 norm so that dot products between rows are cosine similarities
game_weights = game_weights / np.linalg.norm(game_weights, axis = 1, keepdims = True)

# Sanity check: the squared entries of a normalized row should sum to 1
np.sum(np.square(game_weights[0]))

1.0

In [111]:
# Show the embedding row stored for the game 'Portal'
game_weights[game_index['Portal']]

array([-0.02028892, -0.04083905, -0.03770131, -0.06191832,  0.0363252 ,
        0.10930686,  0.04621289,  0.16038361,  0.01269828, -0.04753675,
        0.04659893, -0.02544025,  0.02250916,  0.1077145 , -0.01074699,
       -0.04704662, -0.01812878,  0.14190973, -0.08162078,  0.00452935,
       -0.07139829,  0.03305717,  0.00649044, -0.08766993,  0.07420082,
       -0.00081735,  0.12735492,  0.02561213,  0.00573422, -0.00379396,
        0.03417899, -0.04833051, -0.03268033,  0.10434254,  0.00289319,
        0.05448822,  0.05231246, -0.03145514, -0.03707538,  0.12277426,
        0.00377354,  0.11227879,  0.04453736,  0.10264253,  0.09225872,
       -0.07955199,  0.09692035,  0.07006463,  0.12184901, -0.16252285,
       -0.10421647, -0.04378475,  0.04611482, -0.02412887, -0.00918506,
       -0.00749663, -0.00036915, -0.11830124,  0.03889192,  0.04704228,
        0.024077  , -0.11215495,  0.00719643, -0.00635523,  0.05302636,
        0.12598021, -0.10939269, -0.04585391,  0.02096593, -0.07

In [126]:
# Score every game against the query title via a dot product of embedding rows
query_row = game_weights[game_index['Age of Empires II: Definitive Edition']]
dists = np.dot(game_weights, query_row)

# Six highest scores, best first (the query title itself is expected at the top)
ranking = np.argsort(dists)
for c in ranking[-6:][::-1]:
    print(f'GAME: {index_game[c]:40} Similarity: {dists[c]:.2}')

GAME: Age of Empires II: Definitive Edition    Similarity: 1.0
GAME: Rise of Nations: Extended Edition        Similarity: 0.97
GAME: Age of Empires II (2013)                 Similarity: 0.97
GAME: Stronghold Crusader HD                   Similarity: 0.96
GAME: Age of Mythology: Extended Edition       Similarity: 0.95
GAME: Medieval II: Total War Kingdoms          Similarity: 0.95


In [127]:
# List every indexed game title that contains the substring 'Total War'
total_war_titles = [title for title in game_index if 'Total War' in title]
for title in total_war_titles:
    print(title)

Total War: MEDIEVAL II – Definitive Edition
Total War: WARHAMMER
Rome: Total War - Collection
Total War: EMPIRE – Definitive Edition
Total War: ROME II - Emperor Edition
Total War: NAPOLEON – Definitive Edition
Total War: THREE KINGDOMS
Total War: WARHAMMER II
Total War: ATTILA
Total War Battles: KINGDOM
Total War Saga: THRONES OF BRITANNIA
SHOGUN: Total War - Collection
Medieval: Total War - Collection
Rome: Total War - Alexander
Medieval II: Total War Kingdoms


In [128]:
# Score every game against the query title via a dot product of embedding rows
query_row = game_weights[game_index['Total War: WARHAMMER']]
dists = np.dot(game_weights, query_row)

# Six highest scores, best first (the query title itself is expected at the top)
ranking = np.argsort(dists)
for c in ranking[-6:][::-1]:
    print(f'GAME: {index_game[c]:40} Similarity: {dists[c]:.2}')

GAME: Total War: WARHAMMER                     Similarity: 1.0
GAME: Phantom Doctrine                         Similarity: 0.97
GAME: Total War: THREE KINGDOMS                Similarity: 0.96
GAME: Warhammer 40,000: Dawn of War II         Similarity: 0.96
GAME: Total War: WARHAMMER II                  Similarity: 0.95
GAME: Warhammer 40,000: Dawn of War II Chaos Rising Similarity: 0.94


In [129]:
def extract_weights(name, model):
    """Return the row-normalized weights of the layer called `name`.

    Takes the first weight array of `model.get_layer(name)` and divides each
    row by its L2 norm, so every returned row has unit length.
    """
    layer_matrix = model.get_layer(name).get_weights()[0]

    # keepdims leaves the norms as a column so they broadcast across each row
    row_norms = np.linalg.norm(layer_matrix, axis = 1, keepdims = True)
    return layer_matrix / row_norms

In [130]:
# Row-normalized embedding matrix of the tag embedding layer
tag_weights = extract_weights('tag_embedding', model)

In [138]:
# Score every tag against the query tag via a dot product of embedding rows
query_row = tag_weights[tag_index['RTS']]
dists = np.dot(tag_weights, query_row)

# Six highest scores, best first (the query tag itself is expected at the top).
# These rows are tags, so label them 'TAG:' rather than the copied 'GAME:'.
ranking = np.argsort(dists)
for c in ranking[-6:][::-1]:
    print(f'TAG: {index_tag[c]:40} Similarity: {dists[c]:.2}')

GAME: RTS                                      Similarity: 1.0
GAME: War                                      Similarity: 0.86
GAME: Historical                               Similarity: 0.86
GAME: Tactical                                 Similarity: 0.86
GAME: Grand_Strategy                           Similarity: 0.82
GAME: 4X                                       Similarity: 0.82


In [139]:
# Score every tag against the query tag via a dot product of embedding rows
query_row = tag_weights[tag_index['Fantasy']]
dists = np.dot(tag_weights, query_row)

# Six highest scores, best first (the query tag itself is expected at the top).
# These rows are tags, so label them 'TAG:' rather than the copied 'GAME:'.
ranking = np.argsort(dists)
for c in ranking[-6:][::-1]:
    print(f'TAG: {index_tag[c]:40} Similarity: {dists[c]:.2}')

GAME: Fantasy                                  Similarity: 1.0
GAME: Story_Rich                               Similarity: 0.78
GAME: Great_Soundtrack                         Similarity: 0.73
GAME: Atmospheric                              Similarity: 0.71
GAME: Funny                                    Similarity: 0.71
GAME: Comedy                                   Similarity: 0.71
