# Game Classifier

## Software prerequisites

Import all the modules needed for the whole process.

In [1]:
# Python built-ins
from collections import Counter
import hashlib
import os
import pickle
import subprocess
import sys
import xml.etree.ElementTree as ET
# third party modules
from keras import optimizers
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import VGG16
from keras.layers import Dense, Activation
from keras.models import load_model
from keras.models import Sequential
from keras.preprocessing import image
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

Using TensorFlow backend.


Define some user arguments.

In [2]:
# User-tunable settings for the notebook.
DOWNLOADS = './downloads' # Directory holding the downloaded data, one sub-directory per game ID
GAME_START_ID = 1 # First game ID to process (inclusive)
GAME_END_ID = 3000 # Last game ID to process (inclusive)

Copy some functions from download.py so that this notebook can parse the downloaded data.

In [3]:
def parseGameInfo(id, location, filename='info', allow=False):
    """Parse one game's XML info file into a plain dictionary.

    Args:
        id: Game ID; stored unchanged under the 'id' key of the result.
        location: Directory that contains the info file.
        filename: Name of the XML file inside ``location``.
        allow: If False, a missing file terminates via ``sys.exit(1)``;
            if True, a result holding only 'id' is returned instead.

    Returns:
        dict with 'id' and, for every element present in the XML,
        'baseImgUrl', 'GameTitle', 'Platform' and 'Overview' (element text),
        'Genres' (list of the <genre> texts) and 'Images' (list of the
        <original> texts found below <screenshot> elements).

    Raises:
        SystemExit: the file is missing and ``allow`` is False.
    """
    result = {'id': id}
    filepath = os.path.join(location, filename)
    if not os.path.isfile(filepath):
        print('File {} is missing'.format(filepath))
        if not allow:
            sys.exit(1)
        return result

    root = ET.parse(filepath).getroot()
    for child in root:
        if child.tag == 'baseImgUrl':
            result['baseImgUrl'] = child.text
        elif child.tag == 'Game':
            for field in child:
                # Fields whose text is copied as-is under the tag's own name.
                if field.tag in ('GameTitle', 'Platform', 'Overview'):
                    result[field.tag] = field.text
                elif field.tag == 'Genres':
                    # Only direct <genre> children are collected.
                    result['Genres'] = [
                        genre.text for genre in field if genre.tag == 'genre'
                    ]
                elif field.tag == 'Images':
                    # iter() walks all descendants, so only the <original>
                    # entries nested inside <screenshot> are kept.
                    result['Images'] = [
                        original.text
                        for screenshot in field.iter(tag='screenshot')
                        for original in screenshot.iter(tag='original')
                    ]
    return result

def hashURL(url):
    """Return the hexadecimal MD5 digest of ``url`` encoded as UTF-8."""
    return hashlib.md5(url.encode('utf-8')).hexdigest()

Define more functions that will be used when building and trying the model

In [4]:
# VGG16 with ImageNet weights; include_top=False leaves out the final
# classification layers so the network can serve as a feature extractor.
model = VGG16(weights='imagenet', include_top=False)

def extractFeatureFromImage(filepath):
    """Return the VGG16 features of one image as a single-row 2-D array.

    The image at ``filepath`` is loaded at 224x224, turned into a batch of
    one, passed through ``preprocess_input`` and then through the
    module-level ``model``; the prediction is flattened to shape (1, N).
    """
    picture = image.load_img(filepath, target_size=(224, 224))
    batch = np.expand_dims(image.img_to_array(picture), axis=0)
    batch = preprocess_input(batch)
    return model.predict(batch).reshape(1, -1)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5

## Dataset information

Let us see how much information we have collected.

In [5]:
# Gather the parsed info of every game that can be used for training:
# it must list its genres and have at least one screenshot.
metaData = []
for gameId in range(GAME_START_ID, GAME_END_ID + 1):
    gameInfo = parseGameInfo(gameId, os.path.join(DOWNLOADS, str(gameId)))
    if 'Genres' not in gameInfo or not gameInfo.get('Images', []):
        continue
    metaData.append(gameInfo)

totalGames = len(metaData)
totalImages = sum(len(gameInfo['Images']) for gameInfo in metaData)
print('Total games collected: {}'.format(totalGames))
print('Total images collected: {}'.format(totalImages))

Total games collected: 1064
Total images collected: 2509


Let us find out how many unique genres they belong to.

In [15]:
# Count how often each genre occurs across the collected games.
genres = Counter()
for info in metaData:
    if 'Genres' in info:
        genres.update(info['Genres'])
    else:
        # Should not happen for entries of metaData; show the offender and stop.
        print(info)
        break
# Alphabetically sorted genre names; a genre's position in this list is its ID.
genreList = sorted(genres)
numGenres = len(genreList)
print('{} genres with occurance:'.format(numGenres))
print(dict(genres))

19 genres with occurance:
{'Shooter': 218, 'Action': 474, 'Flight Simulator': 9, 'Adventure': 224, 'Sandbox': 6, 'Racing': 60, 'Role-Playing': 122, 'Horror': 17, 'Fighting': 62, 'Platform': 141, 'Sports': 94, 'Strategy': 58, 'Stealth': 8, 'MMO': 4, 'Construction and Management Simulation': 9, 'Puzzle': 69, 'Vehicle Simulation': 3, 'Music': 3, 'Life Simulation': 7}


Helper functions to convert from genre to genre ID and vice versa.

In [7]:
def genreToID(genre):
    """Return the position of ``genre`` in the module-level ``genreList``.

    Raises ValueError (from ``list.index``) if the genre is not in the list.
    """
    position = genreList.index(genre)
    return position

def idToGenre(id):
    """Return the genre name stored at index ``id`` of ``genreList``."""
    name = genreList[id]
    return name

## Data pre-processing

### Go straight to the section "Try the model" if you want to skip the time-consuming part of training the model

Extract the features from the images based on the pre-trained network.

In [None]:
# One entry per screenshot: its VGG16 features and the genre IDs of its game.
feature_list = []
genre_list = []
i = 0
for info in metaData:
    gameImageDir = os.path.join(DOWNLOADS, str(info['id']), 'images')
    for imageUrl in info['Images']:
        i += 1
        # Each downloaded image is stored under the MD5 hash of its full URL.
        fullUrl = info['baseImgUrl'] + imageUrl
        localPath = os.path.join(gameImageDir, hashURL(fullUrl))
        feature_list.append(extractFeatureFromImage(localPath))
        genre_list.append(list(map(genreToID, info['Genres'])))
        if not i % 100:
            print('Finished {} images'.format(i))
        # break # Uncomment it if you only want one image for each game.

Transform to X and Y

In [None]:
# Stack the per-image feature rows into one 2-D matrix X.
rows, cols = feature_list[0].shape
feature_size = rows * cols
np_features = np.zeros((len(feature_list), feature_size))
for idx, feat in enumerate(feature_list):
    np_features[idx] = feat
X = np_features

# Turn each image's list of genre IDs into a multi-label 0/1 indicator row.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(genre_list)

Select the training set and testing set.

In [None]:
# Fixed seed so that the random split -- and with it the precision/recall
# reported further down -- is the same on every run.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
# Every sample goes to the training set with probability 0.8, so the split is
# only approximately 80/20.
# NOTE(review): the split is per image, so screenshots of one game can land in
# both sets -- consider splitting per game if that matters for the evaluation.
mask = split_rng.rand(len(X)) < 0.8
X_train = X[mask]
X_test = X[~mask]
Y_train = Y[mask]
Y_test = Y[~mask]

## Deep Learning

Create the model.

In [None]:
# Fully-connected classifier trained on the flattened VGG16 features.
# The input size 25088 must equal the number of columns of X.
model_visual = Sequential([
    Dense(1024, input_shape=(25088,)),
    Activation('relu'),
    Dense(256),
    Activation('relu'),
    # One sigmoid score per genre, since an image can carry several genres.
    Dense(numGenres),
    Activation('sigmoid'),
])
# Use the canonical class name: the lowercase `rmsprop` alias is not
# available in every Keras release.
opt = optimizers.RMSprop(lr=0.0001, decay=1e-6)

model_visual.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

Train it!

In [None]:
# Train the genre classifier on the training split.
model_visual.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=50,
    verbose=1,
)

Make the prediction.

In [None]:
# Per-genre sigmoid scores for the held-out samples in X_test.
Y_preds=model_visual.predict(X_test)

## Evaluate the model

Evaluation criteria.

In [None]:
def precision_recall(gt, preds):
    """Compute precision and recall of ``preds`` against ``gt``.

    Args:
        gt: Ground-truth labels.
        preds: Predicted labels.

    Returns:
        Tuple ``(precision, recall)``. A value is 0 when its denominator
        (number of predictions, respectively number of ground-truth labels)
        is zero.
    """
    # True per ground-truth label that was predicted, False otherwise.
    found = [label in preds for label in gt]
    hits = sum(found)
    misses = len(found) - hits
    false_alarms = sum(1 for label in preds if label not in gt)

    predicted_total = hits + false_alarms
    actual_total = hits + misses
    precision = hits / float(predicted_total) if predicted_total else 0
    recall = hits / float(actual_total) if actual_total else 0
    return precision, recall

How is the model?

In [None]:
precs = []
recs = []
for i, row in enumerate(Y_preds):
    gt_genres = Y_test[i]
    # Names of all genres flagged in this sample's ground-truth indicator row.
    gt_genre_names = [idToGenre(j) for j, flag in enumerate(gt_genres) if flag == 1]
    # Only the single highest-scoring genre counts as the prediction.
    # (Alternative: take the last len(gt_genre_names) entries of the argsort.)
    tops = np.argsort(row)[-1:]
    predicted_genres = [idToGenre(genre) for genre in tops]
    precision, recall = precision_recall(gt_genre_names, predicted_genres)
    precs.append(precision)
    recs.append(recall)
print('Precision is {}'.format(np.mean(precs)))
print('Recall is {}'.format(np.mean(recs)))

Save the model

In [None]:
# Write the trained classifier to disk so it can be reloaded without retraining.
model_visual.save('GameClassifier.hdf5')

## Try the model

Load the model

In [8]:
# Reload the classifier from the file written in the "Save the model" step.
my_model = load_model('GameClassifier.hdf5')

Make a guess!

In [None]:
image_path = '/tmp/test'
url = input('Image URL: ')
# Remove any previous download, then fetch the image to the fixed path.
# The arguments are passed as a list, so no shell interprets the URL.
subprocess.check_call(['rm', '-rf', image_path])
subprocess.check_call(['wget', '-O', image_path, url])

# Dedicated names on purpose: binding X / Y here would overwrite the training
# matrices and break a later re-run of the split and training cells.
guess_features = extractFeatureFromImage(image_path)
guess_scores = my_model.predict(guess_features)[0]
# Genre IDs ordered from the highest to the lowest predicted score.
orders = np.argsort(guess_scores)[::-1]
for i, guess in enumerate(orders, start=1):
    answer = input('Guess {}: Is {} correct? '.format(i, idToGenre(guess)))
    if answer.lower() in ('y', 'yes'):
        print('Great!')
        break

Image URL: http://images9.gry-online.pl/galeria/galeria_duze3/417648453.jpg
Guess 1: Is Adventure correct? no
Guess 2: Is Platform correct? no
Guess 3: Is Role-Playing correct? no
Guess 4: Is Puzzle correct? no
Guess 5: Is Action correct? no
Guess 6: Is Strategy correct? no
Guess 7: Is MMO correct? no
Guess 8: Is Flight Simulator correct? no
Guess 9: Is Vehicle Simulation correct? 
Guess 10: Is Horror correct? 
Guess 11: Is Shooter correct? 
Guess 12: Is Life Simulation correct? 
Guess 13: Is Sports correct? 
Guess 14: Is Stealth correct? 
Guess 15: Is Racing correct? 
