## Content-Based Filtering of Game Metadata
## Based on TheGamesDB

In [1]:
import pandas as pd
import numpy as np

In [2]:
# mdb file was split into 3 files due to MS Acces export limitations
Game = pd.read_excel('Game.xlsx')
Game1 = pd.read_excel('Game1.xlsx')
Game3 = pd.read_excel('Game3.xlsx')

In [3]:
# combine all the dataframes into one
Game = pd.concat([Game, Game1, Game3], ignore_index=True)

### EDA and Data Cleaning

In [4]:
Game.head()

Unnamed: 0,Name,ReleaseYear,Overview,MaxPlayers,ReleaseType,Cooperative,VideoURL,CommunityRating,Platform,ESRB,...,AlternateName_China,AlternateName_Europe,AlternateName_France,AlternateName_Germany,AlternateName_Japan,AlternateName_Korea,AlternateName_NorthAmerica,AlternateName_Spain,AlternateName_UnitedStates,AlternateName_World
0,Shiren Monsters: Netsal,,"Netsal is a Sports game, published by ChunSoft...",1.0,Released,False,,2.166667,Nintendo Game Boy Advance,Not Rated,...,,,,,,,,,,
1,Shonen Jump's One Piece,,You can try to grab the greatest treasure of t...,,Released,False,,4.153846,Nintendo Game Boy Advance,E - Everyone,...,,,,,,,,,,
2,Shonen Jump's Shaman King: Legacy of the Spiri...,,Assume the role of Yoh Asakura in the ultimate...,1.0,Released,False,https://www.youtube.com/watch?v=Th323uP886Y,3.428571,Nintendo Game Boy Advance,E - Everyone,...,,,,,,,,,,
3,Shonen Jump's Shaman King: Legacy of the Spiri...,,Assume the role of Yoh Asakura in the ultimate...,1.0,Released,False,https://www.youtube.com/watch?v=tjY9-A-eGzE,4.029412,Nintendo Game Boy Advance,E - Everyone,...,,,,,,,,,,
4,Shonen Jump's Shaman King: Master of Spirits,,In this game you play Yoh Asakura from the Sha...,1.0,Released,False,https://www.youtube.com/watch?v=GbrAXjtTiLo,3.625,Nintendo Game Boy Advance,E - Everyone,...,,,,,,,,,,


In [5]:
# check columns
Game.columns

Index(['Name', 'ReleaseYear', 'Overview', 'MaxPlayers', 'ReleaseType',
       'Cooperative', 'VideoURL', 'CommunityRating', 'Platform', 'ESRB',
       'CommunityRatingCount', 'Genres', 'Developer', 'Publisher', 'N°',
       'DatabaseID', 'WikipediaURL', 'AlternateName_Australia',
       'AlternateName_Brazil', 'AlternateName_China', 'AlternateName_Europe',
       'AlternateName_France', 'AlternateName_Germany', 'AlternateName_Japan',
       'AlternateName_Korea', 'AlternateName_NorthAmerica',
       'AlternateName_Spain', 'AlternateName_UnitedStates',
       'AlternateName_World'],
      dtype='object')

In [6]:
games_df = Game[['Name', 'Overview', 'Cooperative', 'Platform', 'Genres', 'Developer', 'Publisher']]

In [7]:
# drop where no overview is available
games_df = games_df.dropna(subset=['Overview'])

In [8]:
len(games_df)

214097

In [9]:
# show zero values for each column
games_df.isna().sum()

Name               0
Overview           0
Cooperative        0
Platform           0
Genres         10513
Developer      16604
Publisher      16196
dtype: int64

In [10]:
# duplicates drop
games_df = games_df.drop_duplicates(subset=['Name'], keep='first')

In [14]:
# Write developer to publisher if publisher is empty
games_df['Publisher'] = games_df['Publisher'].fillna(games_df['Developer'])

In [15]:
# Write publisher to developer if developer is empty
games_df['Developer'] = games_df['Developer'].fillna(games_df['Publisher'])

In [16]:
# fill empty values with empty string
games_df['Genres'].fillna('', inplace=True)
games_df['Developer'].fillna('', inplace=True)
games_df['Publisher'].fillna('', inplace=True)

In [17]:
games_df.isna().sum()

Name           0
Overview       0
Cooperative    0
Platform       0
Genres         0
Developer      0
Publisher      0
dtype: int64

### Data Preprocessing

In [18]:
# prepend developer to overview
games_df['Overview'] = games_df['Developer'] + ' ' + games_df['Overview']

In [19]:
# one-hot encode cooperative
CoopDummies = pd.get_dummies(games_df['Cooperative'], prefix='Coop')

In [20]:
# add CoopDummies back and drop the original column
games_df = pd.concat([games_df, CoopDummies], axis=1)
games_df.drop(['Cooperative'], axis=1, inplace=True)

In [21]:
# multilabel binarizer
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [22]:
# One-hot encode genres that are separated by '; '
games_df['Genres'] = games_df['Genres'].str.split('; ')
genres_df = pd.DataFrame(mlb.fit_transform(games_df.pop('Genres')), columns=mlb.classes_, index=games_df.index)

In [23]:
# join genres_df back to games_df
games_df = pd.concat([games_df, genres_df], axis=1)

In [24]:
games_df.reset_index(drop=True, inplace=True)

In [25]:
# drop '' column
games_df.drop([''], axis=1, inplace=True)

In [26]:
games_df.columns

Index(['Name', 'Overview', 'Platform', 'Developer', 'Publisher', 'Coop_False',
       'Coop_True', 'Action', 'Adventure', 'Beat 'em Up', 'Board Game',
       'Casino', 'Construction and Management Simulation', 'Education',
       'Fighting', 'Flight Simulator', 'Horror', 'Life Simulation', 'MMO',
       'Music', 'Party', 'Platform', 'Puzzle', 'Quiz', 'Racing',
       'Role-Playing', 'Sandbox', 'Shooter', 'Sports', 'Stealth', 'Strategy',
       'Vehicle Simulation', 'Visual Novel'],
      dtype='object')

In [27]:
import nltk
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package wordnet to /home/berry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
# lemmatizer and stemmer. Only use one of them
# porter = PorterStemmer()
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import string

In [29]:
# function to clean text
def clean_text(text):
    # hyphens to space
    text = text.replace('-', ' ')
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove numbers
    # text = text.translate(str.maketrans('', '', string.digits))
    # make lowercase
    text = text.lower()
    # tokenize text
    tokens = word_tokenize(text)
    # remove stopwords
    tokens = [w for w in tokens if w not in stopwords]
    # lemmatize words
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    # stem words
    # tokens = [porter.stem(w) for w in tokens]
    return tokens

In [30]:
# apply clean_text function to description column
games_df['Overview_clean'] = games_df['Overview'].apply(clean_text)

In [31]:
game_tags = games_df.drop(columns=['Name', 'Overview', 'Platform', 'Developer', 'Publisher', 'Overview_clean'])

### Word2Vec

In [32]:
games_df['Overview_clean']

0        [chunsoft, netsal, sport, game, published, chu...
1        [dimps, corporation, try, grab, greatest, trea...
2        [kcej, assume, role, yoh, asakura, ultimate, q...
3        [kcej, assume, role, yoh, asakura, ultimate, q...
4        [kcej, game, play, yoh, asakura, shaman, king,...
                               ...                        
85959    [robert, dillinger, space, invadersgorf, style...
85960    [stuart, smith, player, travel, land, greece, ...
85961    [jon, mattson, text, adventure, sequel, dial, ...
85962    [gareth, pitchford, designed, small, four, loc...
85963    [bimbarlade, upon, time, king, obsessed, hunti...
Name: Overview_clean, Length: 85964, dtype: object

In [33]:
from gensim.models import Word2Vec


# instantiate model
model = Word2Vec(min_count=20, window=5)

In [34]:
# overview column to list
game_desc_tokens = games_df['Overview_clean'].tolist()

In [35]:
model.build_vocab(game_desc_tokens, progress_per=1000)

In [36]:
model.train(game_desc_tokens, total_examples=model.corpus_count, epochs=30, report_delay=1)

(162902635, 181279230)

In [37]:
model.wv.most_similar('mario')

[('luigi', 0.5947633385658264),
 ('yoshi', 0.5783502459526062),
 ('metroid', 0.5585972666740417),
 ('marios', 0.5278715491294861),
 ('waluigi', 0.5268223285675049),
 ('bros', 0.5211812257766724),
 ('sonic', 0.5068606734275818),
 ('kart', 0.5012959837913513),
 ('kirby', 0.4876628518104553),
 ('yoshis', 0.48446398973464966)]

In [38]:
model.wv.most_similar('zombie')

[('undead', 0.6855775713920593),
 ('mutant', 0.5581833124160767),
 ('monstrosity', 0.55033940076828),
 ('insect', 0.5427649617195129),
 ('horde', 0.5257471203804016),
 ('infected', 0.5071948766708374),
 ('ghoul', 0.5051590204238892),
 ('skeleton', 0.4895174205303192),
 ('corps', 0.48681408166885376),
 ('rampaging', 0.4863361716270447)]

In [39]:
# process game_descriptions into vectors based on word2vec model
game_desc_vectors = []
for desc in game_desc_tokens:
    vector = np.zeros(100)
    for word in desc:
        try:
            vector += model.wv[word]
        except KeyError:
            pass
    game_desc_vectors.append(vector)

In [40]:
game_desc_vectors = np.array(game_desc_vectors)

In [41]:
# append game_tags to game_desc_vectors
matrix = np.concatenate((game_desc_vectors, game_tags.values), axis=1)

In [42]:
# store matrix in pickle
import pickle
pickle.dump(matrix, open('matrix.pkl', 'wb'))

### Cosine Similarity

In [43]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [44]:
# Construct a reverse map of indices and game titles
indices = pd.Series(games_df.index, index=games_df['Name'])

In [45]:
def recommend_content(title, matrix=matrix):
    '''Get similar games based on content using the similarity matrix'''
    # get index for our game
    idx = indices[title]

    sim_scores = cosine_similarity(matrix[idx].reshape(1, -1), matrix)
    
    # sort scores based on similarity
    sorted_sim_scores = sorted(list(enumerate(sim_scores[0])), key=lambda x: x[1], reverse=True)
    
    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[0]]: i[1] for i in sorted_sim_scores[1:21]}

    return content_similar_scores

In [48]:
recommend_content('Samurai Shodown II', matrix=game_desc_vectors)

{"Bamber's Fighting Clone: Duel Sword Doods": 0.8007226017384659,
 'Demon Warrior': 0.7860933195040416,
 'Kasumi Ninja': 0.7682937866963682,
 'Samurai Shodown: Warriors Rage': 0.7646426057421187,
 'Duel Sword Doods': 0.7596236156339276,
 'Spectral vs. Generation': 0.7592766689494024,
 'Shakii the Wolf': 0.7584396632489964,
 'Double Dragon V: The Shadow Falls': 0.7559188660108631,
 'Spectral Vs. Generation': 0.7549709899563433,
 'The Revenge of Shinobi': 0.7524692436533136,
 'Hiryuu no Ken III: 5 Nin no Ryuu Senshi': 0.751994609005994,
 'Samurai Bringer': 0.7500590858737819,
 'Ninja Gaiden Sigma': 0.748431206341419,
 'Warriors Orochi': 0.7451409857944225,
 'Warriors of Fate': 0.7429973583274536,
 'SoulCalibur V': 0.7383394234516375,
 "Shogun's Blade": 0.7332532332100025,
 'Kengo: Legend of the 9': 0.7324164354038082,
 'The Age of Heroes: Silkroad 2': 0.7257021384278578,
 'Ninja Gaiden II': 0.7254868521384746}

In [200]:
# do cosine_similarity one by one so i wont run out of memory, takes a long time
similarity_matrix = []
for vector in tqdm(matrix):
    similarity_matrix.append(cosine_similarity([vector], matrix)[0])

  0%|          | 111/85964 [00:35<7:33:42,  3.15it/s]


KeyboardInterrupt: 