# Model

Here, the model will be trained and inspected.

In [1]:
import json
import os
import random

import numpy as np

In [2]:
os.chdir('..')

In [3]:
random.seed(42)
np.random.seed(42)

Only tags with a frequency of at least 20 are used; this should help reduce noise in the data.

In [4]:
# Alternative inputs tried earlier (swap the filename below to use one of them):
#   games_with_tags.json
#   games_with_tags_min_20_freq.json
#   games_with_tags_min_50_freq.json
# The encoding is given explicitly because game titles contain non-ASCII
# characters and the platform default encoding is not guaranteed to be UTF-8.
with open('games_with_tags_double_filter.json', 'r', encoding = 'utf-8') as in_json:
    games_with_tags = json.load(in_json)

In [105]:
games_with_tags['Age of Mythology: Extended Edition']

['Strategy',
 'Masterpiece',
 'RTS',
 'Classic',
 'Multiplayer',
 'Great Soundtrack',
 'Singleplayer',
 'Fantasy',
 'Simulation',
 'Historical',
 'Atmospheric',
 'Adventure',
 'Tactical',
 'Real-Time',
 'Story Rich',
 'Co-op',
 'Remake']

In [107]:
games_with_tags['Surviving Mars']

['Colony Sim',
 'City Builder',
 'Strategy',
 'Survival',
 'Space',
 'Base-Building',
 'Simulation',
 'Sci-fi',
 'Resource Management',
 'Singleplayer',
 'Building',
 'Management',
 'Space Sim',
 'Futuristic',
 'Sandbox']

Generate mapping for games to indices and the same for tags:

In [5]:
# Two-way lookup between game titles and integer ids; the ids follow the
# iteration order of `games_with_tags`.
index_game = dict(enumerate(games_with_tags))
game_index = {title: position for position, title in index_game.items()}

In [6]:
# Give every distinct tag the next free integer id, in order of first appearance.
tag_index = {}
for game, tags in games_with_tags.items():
    for tag in tags:
        # len(tag_index) is evaluated before insertion, so ids run 0, 1, 2, ...
        tag_index.setdefault(tag, len(tag_index))
tag_count = len(tag_index)

# Reverse lookup: id -> tag name.
index_tag = dict(zip(tag_index.values(), tag_index.keys()))

In [7]:
print(f'n game: {len(game_index)}')
print(f'n tags: {len(tag_index)}')

n game: 3070
n tags: 230


Creating a list of true pairs where a game actually has a given tag. Note that the game index and the tag index are what is stored in the `pairs` list.

In [8]:
# Positive examples: one (game id, tag id) tuple for every tag attached to a game.
pairs = [
    (game_index[game], tag_index[tag])
    for game, tags in games_with_tags.items()
    for tag in tags
]

Taking a look at the first pair:

In [9]:
pairs[0]

(0, 0)

Now, relate those indices back to a game name and tag name

In [10]:
index_game[pairs[0][0]], index_tag[pairs[0][1]]

('Counter-Strike', 'Action')

In [11]:
index_game[pairs[5000][0]], index_tag[pairs[5000][1]]

('Spelunky', 'Singleplayer')

In [12]:
index_game[pairs[20000][0]], index_tag[pairs[20000][1]]

('Kingdom Rush Origins', 'Tower Defense')

Creating a set object of the pairs. This set will be used to check whether a game–tag relationship exists and to create the negative training data set. Using a set should speed things up a bit, as the items in the set are hashed; this should be faster than `pair in pairs` on the list, which is a linear search.

In [13]:
pairs_set = set(pairs)

In [14]:
(0, 0) in pairs_set

True

Create a generator method to yield data as required:

In [15]:
random.seed(100)

def generate_batch(pairs, n_positive = 50, negative_ratio = 1.0, classification = False,
                   n_games = None, n_tags = None):
    """Yield an endless stream of training batches of (game, tag) samples.

    Every batch contains `n_positive` pairs drawn without replacement from
    `pairs` (label 1) and is filled up with randomly drawn (game, tag)
    combinations that do not occur in `pairs` (negative label).

    Args:
        pairs: sequence of (game_id, tag_id) pairs that are true examples.
        n_positive: number of positive samples per batch.
        negative_ratio: negatives per positive; may be fractional, the batch
            size is rounded to the nearest integer.
        classification: if true the negative label is 0, otherwise -1.
        n_games: exclusive upper bound for random game ids; defaults to
            len(game_index) from the notebook namespace.
        n_tags: exclusive upper bound for random tag ids; defaults to
            len(tag_index) from the notebook namespace.

    Yields:
        ({'game': game_ids, 'tag': tag_ids}, labels), three 1-D float arrays
        of equal length. Each yield hands out fresh copies, so a batch is not
        changed when the generator is advanced.

    Raises:
        ValueError: raised by random.sample when n_positive > len(pairs).
    """
    # The default negative_ratio is a float; np.zeros needs an integer size.
    batch_size = int(round(n_positive * (1 + negative_ratio)))
    batch = np.zeros((batch_size, 3))

    # Adjust label based on task
    neg_label = 0 if classification else -1

    if n_games is None:
        n_games = len(game_index)
    if n_tags is None:
        n_tags = len(tag_index)

    # Hashed lookup built from the argument itself, so negatives are always
    # checked against exactly the positives this generator samples from.
    positives = set(map(tuple, pairs))

    # This creates a generator
    while True:
        # randomly choose positive examples
        for idx, (game_id, tag_id) in enumerate(random.sample(pairs, n_positive)):
            batch[idx, :] = (game_id, tag_id, 1)

        # Negatives start right after the positives (also valid for n_positive == 0)
        idx = n_positive

        # Add negative examples until reach batch size
        while idx < batch_size:

            # random selection
            random_game = random.randrange(n_games)
            random_tag = random.randrange(n_tags)

            # Check to make sure this is not a positive example
            if (random_game, random_tag) not in positives:

                # Add to batch and increment index
                batch[idx, :] = (random_game, random_tag, neg_label)
                idx += 1

        # Make sure to shuffle order
        np.random.shuffle(batch)

        # Copy the columns: `batch` is reused and overwritten on the next iteration
        yield {'game': batch[:, 0].copy(), 'tag': batch[:, 1].copy()}, batch[:, 2].copy()

In [16]:
next(generate_batch(pairs, n_positive = 2, negative_ratio = 2))

({'game': array([ 619., 2049., 1432., 1863., 1609.,  715.]),
  'tag': array([ 31.,  19., 110., 197., 187., 180.])},
 array([ 1.,  1., -1., -1., -1., -1.]))

In [18]:
index_game[619]

'Sniper Elite 3'

In [19]:
index_tag[31]

'Online Co-Op'

In [20]:
index_game[2049]

'Bayonetta'

In [21]:
index_tag[19]

'Mature'

Have a look at a training batch:

In [22]:
# Draw a single small batch: 5 positives plus 5 negatives (ratio 1).
x, y = next(generate_batch(pairs, n_positive = 5, negative_ratio = 1))

# Print each sample with its ids translated back to readable names
for game_id, tag_id, label in zip(x['game'], x['tag'], y):
    print(f'Game: {index_game[game_id]:30} Tag: {index_tag[tag_id]:40} Label: {label}')

Game: XCOM 2                         Tag: Real Time Tactics                        Label: -1.0
Game: Kingdom Two Crowns             Tag: Dystopian                                Label: -1.0
Game: Sundered: Eldritch Edition     Tag: Indie                                    Label: 1.0
Game: La-Mulana                      Tag: Metroidvania                             Label: 1.0
Game: Jamestown                      Tag: Lore-Rich                                Label: -1.0
Game: Loren The Amazon Princess      Tag: Sandbox                                  Label: -1.0
Game: OKAMI HD / 大神 絶景版              Tag: Adventure                                Label: 1.0
Game: Shadow Warrior Classic Redux   Tag: Classic                                  Label: 1.0
Game: Resident Evil Revelations 2 / Biohazard Revelations 2 Tag: Military                                 Label: -1.0
Game: Guacamelee! Gold Edition       Tag: Beat 'em up                              Label: 1.0


In [23]:
from keras.layers import (
    Input,
    Embedding,
    Dot,
    Reshape,
    Dense
)
from keras.models import Model
import tensorflow as tf

Using TensorFlow backend.


In [24]:
tf.random.set_seed(42)

A function to build the embedding model

In [112]:
def game_embedding_model(embedding_size = 100, classification = False):
    """Build and compile the game/tag embedding model with the functional API.

    The model embeds a game id and a tag id and is trained to discern whether
    the tag is present for the game.

    Args:
        embedding_size: width of both the game and the tag embedding vectors.
        classification: if true, a single sigmoid unit is placed on top of the
            similarity and the model is compiled with binary cross entropy and
            an accuracy metric; otherwise the similarity itself is the output
            and the model is compiled with mean squared error.

    Returns:
        The compiled Keras model with inputs named 'game' and 'tag'.
    """

    # Both inputs are 1-dimensional
    game = Input(name = 'game', shape = [1])
    tag = Input(name = 'tag', shape = [1])

    # Embedding the game (shape will be (None, 1, embedding_size))
    game_embedding = Embedding(name = 'game_embedding',
                               input_dim = len(game_index),
                               output_dim = embedding_size)(game)

    # Embedding the tag (shape will be (None, 1, embedding_size))
    tag_embedding = Embedding(name = 'tag_embedding',
                              input_dim = len(tag_index),
                              output_dim = embedding_size)(tag)

    # Merge the layers with a normalized dot product along the embedding axis
    # (shape will be (None, 1, 1))
    merged = Dot(name = 'dot_product', normalize = True, axes = 2)([game_embedding, tag_embedding])

    # Reshape to be a single number (shape will be (None, 1))
    merged = Reshape(target_shape = [1])(merged)

    # If classification, add extra layer and loss function is binary cross entropy
    if classification:
        merged = Dense(1, activation = 'sigmoid')(merged)
        # The inputs are the `game` and `tag` tensors defined above
        model = Model(inputs = [game, tag], outputs = merged)
        model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

    # Otherwise loss function is mean squared error
    else:
        model = Model(inputs = [game, tag], outputs = merged)
        model.compile(optimizer = 'Adam', loss = 'mse')

    return model

In [167]:
model = game_embedding_model(embedding_size=200)
model.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
game (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
tag (InputLayer)                (None, 1)            0                                            
__________________________________________________________________________________________________
game_embedding (Embedding)      (None, 1, 200)       614000      game[0][0]                       
__________________________________________________________________________________________________
tag_embedding (Embedding)       (None, 1, 200)       46000       tag[0][0]                        
___________________________________________________________________________________________

In [168]:
# Each batch holds 1024 positives plus twice as many sampled negatives.
n_positive = 1024
gen = generate_batch(pairs, n_positive, negative_ratio = 2)

# Train: the number of steps is chosen so that one epoch draws about as many
# positives as there are true pairs.
h = model.fit_generator(gen,
                        steps_per_epoch = len(pairs) // n_positive,
                        epochs = 20,
                        verbose = 2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/20
 - 1s - loss: 0.9946
Epoch 2/20
 - 0s - loss: 0.9679
Epoch 3/20
 - 0s - loss: 0.9287
Epoch 4/20
 - 0s - loss: 0.8730
Epoch 5/20
 - 0s - loss: 0.7779
Epoch 6/20
 - 0s - loss: 0.6723
Epoch 7/20
 - 0s - loss: 0.5791
Epoch 8/20
 - 0s - loss: 0.5196
Epoch 9/20
 - 0s - loss: 0.4832
Epoch 10/20
 - 0s - loss: 0.4663
Epoch 11/20
 - 0s - loss: 0.4479
Epoch 12/20
 - 0s - loss: 0.4390
Epoch 13/20
 - 0s - loss: 0.4307
Epoch 14/20
 - 0s - loss: 0.4205
Epoch 15/20
 - 0s - loss: 0.4182
Epoch 16/20
 - 0s - loss: 0.4166
Epoch 17/20
 - 0s - loss: 0.4154
Epoch 18/20
 - 0s - loss: 0.4066
Epoch 19/20
 - 0s - loss: 0.4040
Epoch 20/20
 - 0s - loss: 0.4025


The loss was reduced over successive epochs which means the network was learning something!

In [169]:
# Make sure the target folder exists before writing the model file.
os.makedirs('./models', exist_ok = True)
model.save('./models/double_filter_emb_200.h5')

In [170]:
# Extract embeddings
game_layer = model.get_layer('game_embedding')
game_weights = game_layer.get_weights()[0]
game_weights.shape

(3070, 200)

Each game is now represented as a 200-dimensional vector (the `embedding_size` passed to the model above).

We need to normalize the embeddings so that the dot product between two embeddings becomes the cosine similarity.

In [171]:
# Scale every row to unit L2 norm so that the dot product of two rows is
# their cosine similarity.
game_weights = game_weights / np.linalg.norm(game_weights, axis = 1, keepdims = True)

# Sanity check: the squared norm of a normalized row should be 1.
np.sum(np.square(game_weights[0]))

1.0

In [172]:
game_weights[game_index['Portal']]

array([-0.00109073, -0.01724519,  0.06001356, -0.02607019,  0.02891507,
       -0.01699745,  0.09929603, -0.02611912,  0.00795362, -0.1755628 ,
       -0.01310005,  0.11739302,  0.05751131,  0.01500415,  0.06135628,
        0.01482367, -0.06131312, -0.03841453,  0.1068726 ,  0.10221927,
        0.13364178,  0.03754778,  0.07873269,  0.02576359,  0.03266801,
        0.10079321, -0.00823136,  0.01456807,  0.0077336 ,  0.10560231,
        0.00041216, -0.12101679,  0.05460078, -0.06164695, -0.03260313,
        0.04724703,  0.00618768,  0.14591552,  0.00061356,  0.07496168,
        0.02256386, -0.08729154, -0.02515487, -0.02831933,  0.09597023,
        0.02227032, -0.03774486, -0.0313034 , -0.01692515, -0.08252463,
        0.12335243,  0.0608047 , -0.0518028 , -0.07511599,  0.03980004,
       -0.02810827, -0.1307546 , -0.07446088,  0.05529015, -0.10727797,
       -0.10108518,  0.06855935, -0.11287209,  0.01309502, -0.07384143,
        0.03019584,  0.08735685, -0.00470799, -0.08862843,  0.09

In [173]:
# Rows of `game_weights` are unit vectors, so this product gives the cosine
# similarity of every game to the query title.
dists = game_weights @ game_weights[game_index['Age of Empires II: Definitive Edition']]
sorted_dists = np.argsort(dists)

# The last six entries of the ascending sort are the best matches; show them best first.
closest = sorted_dists[-6:]
for c in closest[::-1]:
    print(f'GAME: {index_game[c]:{40}} Similarity: {dists[c]:.{2}}')

GAME: Age of Empires II: Definitive Edition    Similarity: 1.0
GAME: Tooth and Tail                           Similarity: 0.96
GAME: Kingdom Wars                             Similarity: 0.96
GAME: Total Annihilation                       Similarity: 0.96
GAME: Warhammer 40,000: Dawn of War II Chaos Rising Similarity: 0.95
GAME: SpellForce 3                             Similarity: 0.95


In [174]:
for key in game_index:
    if 'Myth' in key:
        print(key)

Age of Mythology: Extended Edition
The Lost Mythologies


In [175]:
# Cosine similarity of every game to the query title (rows are unit vectors).
dists = game_weights @ game_weights[game_index['Total War: WARHAMMER']]
sorted_dists = np.argsort(dists)

# Six highest-scoring games, printed from most to least similar.
closest = sorted_dists[-6:]
for c in closest[::-1]:
    print(f'GAME: {index_game[c]:{40}} Similarity: {dists[c]:.{2}}')

GAME: Total War: WARHAMMER                     Similarity: 1.0
GAME: Total War: WARHAMMER II                  Similarity: 0.9
GAME: Endless Space 2                          Similarity: 0.89
GAME: Age of Mythology: Extended Edition       Similarity: 0.88
GAME: Total War: THREE KINGDOMS                Similarity: 0.87
GAME: Age of Wonders III                       Similarity: 0.86


In [176]:
def extract_weights(name, model):
    """Return the row-normalized weights of the layer called `name`.

    The first array returned by the layer's get_weights() is taken and each
    of its rows is divided by that row's L2 norm.
    """
    raw = model.get_layer(name).get_weights()[0]

    # keepdims gives a column of norms that broadcasts across each row
    row_norms = np.linalg.norm(raw, axis = 1, keepdims = True)
    return raw / row_norms

In [177]:
tag_weights = extract_weights('tag_embedding', model)

In [178]:
# Cosine similarity of every tag to the 'RTS' tag.
dists = np.dot(tag_weights, tag_weights[tag_index['RTS']])
sorted_dists = np.argsort(dists)

# Six most similar tags, best first. The rows are tags, so label them as such.
closest = sorted_dists[-6:]
for c in reversed(closest):
    print(f'TAG: {index_tag[c]:{40}} Similarity: {dists[c]:.{2}}')

GAME: RTS                                      Similarity: 1.0
GAME: Grand Strategy                           Similarity: 0.9
GAME: Historical                               Similarity: 0.85
GAME: Real-Time                                Similarity: 0.82
GAME: Base-Building                            Similarity: 0.79
GAME: Military                                 Similarity: 0.78


In [179]:
# Cosine similarity of every tag to the 'Fantasy' tag.
dists = np.dot(tag_weights, tag_weights[tag_index['Fantasy']])
sorted_dists = np.argsort(dists)

# Six most similar tags, best first. The rows are tags, so label them as such.
closest = sorted_dists[-6:]
for c in reversed(closest):
    print(f'TAG: {index_tag[c]:{40}} Similarity: {dists[c]:.{2}}')

GAME: Fantasy                                  Similarity: 1.0
GAME: Action RPG                               Similarity: 0.8
GAME: Character Customization                  Similarity: 0.78
GAME: Third Person                             Similarity: 0.73
GAME: Hack and Slash                           Similarity: 0.72
GAME: Masterpiece                              Similarity: 0.7


In [180]:
def subtract_tag(tag: str, game: str) -> np.ndarray:
    """Return the unit-length embedding of `game` with the `tag` embedding subtracted.

    Reads the notebook-level `game_weights` / `tag_weights` matrices through
    the `game_index` / `tag_index` lookups.

    Raises:
        KeyError: if `game` or `tag` is not present in its lookup.
    """
    shifted = game_weights[game_index[game]] - tag_weights[tag_index[tag]]

    # The norm of a 1-D vector is a scalar and broadcasts over the vector.
    return shifted / np.linalg.norm(shifted)

In [181]:
def add_tag(tag: str, game: str) -> np.ndarray:
    """Return the unit-length embedding of `game` with the `tag` embedding added.

    Reads the notebook-level `game_weights` / `tag_weights` matrices through
    the `game_index` / `tag_index` lookups.

    Raises:
        KeyError: if `game` or `tag` is not present in its lookup.
    """
    shifted = game_weights[game_index[game]] + tag_weights[tag_index[tag]]

    # The norm of a 1-D vector is a scalar and broadcasts over the vector.
    return shifted / np.linalg.norm(shifted)

In [182]:
def find_closest(game_embedding: np.ndarray, n: int = 6) -> None:
    """Print the `n` games whose embeddings score highest against `game_embedding`.

    The score is the dot product with each row of the notebook-level
    `game_weights`; results are printed from highest to lowest score.

    Args:
        game_embedding: 1-D vector with the same width as the rows of `game_weights`.
        n: how many games to print (default 6).
    """
    dists = np.dot(game_weights, game_embedding)

    # Reverse the ascending sort first so that slicing also behaves for n == 0.
    for c in np.argsort(dists)[::-1][:n]:
        print(f'GAME: {index_game[c]:40} Similarity: {dists[c]:.2}')

In [183]:
find_closest(subtract_tag('Classic', 'Age of Mythology: Extended Edition'))

GAME: CryoFall                                 Similarity: 0.82
GAME: My Lands: Black Gem Hunting              Similarity: 0.8
GAME: Free Company VR                          Similarity: 0.8
GAME: All Guns On Deck                         Similarity: 0.8
GAME: INTERSTELLAR PRIME                       Similarity: 0.79
GAME: 太吾绘卷 The Scroll Of Taiwu                 Similarity: 0.79


In [184]:
find_closest(subtract_tag('Fantasy', 'Age of Mythology: Extended Edition'))

GAME: Steel Division: Normandy 44              Similarity: 0.82
GAME: Ashes of the Singularity: Escalation     Similarity: 0.82
GAME: ARMA: Cold War Assault                   Similarity: 0.82
GAME: Takedown: Red Sabre                      Similarity: 0.81
GAME: War of Rights                            Similarity: 0.79
GAME: Ultimate General: Civil War              Similarity: 0.79


In [185]:
find_closest(game_weights[game_index['Rocket League']])

GAME: Rocket League                            Similarity: 1.0
GAME: Stick Fight: The Game                    Similarity: 0.82
GAME: Duck Game                                Similarity: 0.77
GAME: SpeedRunners                             Similarity: 0.77
GAME: Hot Shot Burn                            Similarity: 0.77
GAME: Xenon Racer                              Similarity: 0.75


In [186]:
find_closest(subtract_tag('Sports', 'Rocket League'))

GAME: Mirador                                  Similarity: 0.87
GAME: Magicka                                  Similarity: 0.87
GAME: Overcooked! 2                            Similarity: 0.86
GAME: Monaco: What's Yours Is Mine             Similarity: 0.84
GAME: FORCED: Slightly Better Edition          Similarity: 0.84
GAME: BattleBlock Theater                      Similarity: 0.84


In [187]:
find_closest(subtract_tag('Sports', 'GRID 2'))

GAME: Far Cry 2: Fortune's Edition             Similarity: 0.92
GAME: E.Y.E: Divine Cybermancy                 Similarity: 0.9
GAME: Outward                                  Similarity: 0.9
GAME: Far Cry New Dawn                         Similarity: 0.89
GAME: American Fugitive                        Similarity: 0.89
GAME: Dead Rising 2                            Similarity: 0.89


In [188]:
find_closest(game_weights[game_index['GRID 2']])

GAME: GRID 2                                   Similarity: 1.0
GAME: RaceRoom Racing Experience               Similarity: 0.92
GAME: Need For Speed: Hot Pursuit              Similarity: 0.9
GAME: The Crew 2                               Similarity: 0.88
GAME: Wreckfest                                Similarity: 0.88
GAME: Project CARS                             Similarity: 0.85


In [189]:
find_closest(add_tag('Fantasy', 'Surviving Mars'))

GAME: Galactic Civilizations III               Similarity: 0.75
GAME: Stellaris                                Similarity: 0.74
GAME: Banished                                 Similarity: 0.7
GAME: Craft The World                          Similarity: 0.7
GAME: Oxygen Not Included                      Similarity: 0.69
GAME: Sid Meier's Civilization: Beyond Earth   Similarity: 0.69


In [190]:
find_closest(subtract_tag('Colony Sim', 'Surviving Mars'))

GAME: Endless Sky                              Similarity: 0.95
GAME: X4: Foundations                          Similarity: 0.93
GAME: X Rebirth                                Similarity: 0.91
GAME: Starpoint Gemini 2                       Similarity: 0.91
GAME: Ashes of the Singularity: Escalation     Similarity: 0.9
GAME: Wenjia                                   Similarity: 0.89
