## Content-Based Filtering of Game Metadata
## Based on TheGamesDB

In [1]:
import pandas as pd

### Theres is also a DOS games database but not very big

In [3]:
dos_database = pd.read_excel('dos_database.xlsx')

In [5]:
dos_database.tail()

Unnamed: 0,gametitle_mame,developer_mame,year_mame,genre_mame,description_mame,publisher_mame
7477,Zyconix,Miracle Games,19920101T000000,"Puzzle,Strategy",Zyconix is an arcade puzzle game.\n\nColored b...,"Accolade, Inc."
7478,Zyll,IBM,19840101T000000,"Adventure,Interactive Fiction,Role-Playing,Tex...","A completely unique concept--a real-time, 1 or...",IBM
7479,ZZT,"Epic MegaGames, Inc.",19910101T000000,"Action,Role-Playing,Text-Based",A text-mode adventure game in the style of Rog...,"Epic MegaGames, Inc."
7480,ZZT's City,Softdisk Publishing,19910101T000000,"Action,Role-Playing,Text-Based","Ah, life in the big city. Everyone said it wo...",Softdisk Publishing
7481,ZZT's Revenge!,"Epic MegaGames, Inc.",19920101T000000,"Action,Role-Playing,Text-Based","Shortly after the release of ZZT, Sweeney star...","Epic MegaGames, Inc."


In [7]:
dos_database['description_mame'].isna().sum()

12

### Also some other game dataset that is not as cool as the one that follows

In [2]:
gamelist = pd.read_excel('gamelist.xlsx')

### EDA and Data Cleaning

In [3]:
# mdb file was split into 3 files due to MS Acces export limitations
Game = pd.read_excel('Game.xlsx')
Game1 = pd.read_excel('Game1.xlsx')
Game3 = pd.read_excel('Game3.xlsx')

In [4]:
# combine all the dataframes into one
Game = pd.concat([Game, Game1, Game3], ignore_index=True)

In [5]:
Game.head()

Unnamed: 0,Name,ReleaseYear,Overview,MaxPlayers,ReleaseType,Cooperative,VideoURL,CommunityRating,Platform,ESRB,...,AlternateName_China,AlternateName_Europe,AlternateName_France,AlternateName_Germany,AlternateName_Japan,AlternateName_Korea,AlternateName_NorthAmerica,AlternateName_Spain,AlternateName_UnitedStates,AlternateName_World
0,Shiren Monsters: Netsal,,"Netsal is a Sports game, published by ChunSoft...",1.0,Released,False,,2.166667,Nintendo Game Boy Advance,Not Rated,...,,,,,,,,,,
1,Shonen Jump's One Piece,,You can try to grab the greatest treasure of t...,,Released,False,,4.153846,Nintendo Game Boy Advance,E - Everyone,...,,,,,,,,,,
2,Shonen Jump's Shaman King: Legacy of the Spiri...,,Assume the role of Yoh Asakura in the ultimate...,1.0,Released,False,https://www.youtube.com/watch?v=Th323uP886Y,3.428571,Nintendo Game Boy Advance,E - Everyone,...,,,,,,,,,,
3,Shonen Jump's Shaman King: Legacy of the Spiri...,,Assume the role of Yoh Asakura in the ultimate...,1.0,Released,False,https://www.youtube.com/watch?v=tjY9-A-eGzE,4.029412,Nintendo Game Boy Advance,E - Everyone,...,,,,,,,,,,
4,Shonen Jump's Shaman King: Master of Spirits,,In this game you play Yoh Asakura from the Sha...,1.0,Released,False,https://www.youtube.com/watch?v=GbrAXjtTiLo,3.625,Nintendo Game Boy Advance,E - Everyone,...,,,,,,,,,,


In [6]:
Game.columns

Index(['Name', 'ReleaseYear', 'Overview', 'MaxPlayers', 'ReleaseType',
       'Cooperative', 'VideoURL', 'CommunityRating', 'Platform', 'ESRB',
       'CommunityRatingCount', 'Genres', 'Developer', 'Publisher', 'N°',
       'DatabaseID', 'WikipediaURL', 'AlternateName_Australia',
       'AlternateName_Brazil', 'AlternateName_China', 'AlternateName_Europe',
       'AlternateName_France', 'AlternateName_Germany', 'AlternateName_Japan',
       'AlternateName_Korea', 'AlternateName_NorthAmerica',
       'AlternateName_Spain', 'AlternateName_UnitedStates',
       'AlternateName_World'],
      dtype='object')

In [37]:
games_df = Game[['Name', 'Overview', 'Cooperative', 'Platform', 'Genres', 'Developer', 'Publisher']]

In [38]:
# drop where no overview is available
games_df = games_df.dropna(subset=['Overview'])

In [39]:
len(games_df)

214097

In [40]:
# show zero values for each column
games_df.isna().sum()

Name               0
Overview           0
Cooperative        0
Platform           0
Genres         10513
Developer      16604
Publisher      16196
dtype: int64

In [41]:
# for duplicates drop the one where Genres is empty
games_df = games_df.drop_duplicates(subset=['Name'], keep='first')

In [42]:
# Write developer to publisher if publisher is empty
games_df['Publisher'] = games_df['Publisher'].fillna(games_df['Developer'])

In [43]:
games_df['Developer'] = games_df['Developer'].fillna(games_df['Publisher'])

In [44]:
games_df['Genres'].fillna('', inplace=True)

In [45]:
games_df['Developer'].fillna('', inplace=True)
games_df['Publisher'].fillna('', inplace=True)

### Data Preprocessing

In [46]:
# prepend developer to overview
games_df['Overview'] = games_df['Developer'] + ' ' + games_df['Overview']

In [47]:
# one-hot encode cooperative
CoopDummies = pd.get_dummies(games_df['Cooperative'], prefix='Coop')

In [48]:
# add CoopDummies back and drop the original column
games_df = pd.concat([games_df, CoopDummies], axis=1)
games_df.drop(['Cooperative'], axis=1, inplace=True)

In [49]:
# multilabel binarizer
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [50]:
# One-hot encode genres that are separated by '; '
games_df['Genres'] = games_df['Genres'].str.split('; ')
genres_df = pd.DataFrame(mlb.fit_transform(games_df.pop('Genres')), columns=mlb.classes_, index=games_df.index)

In [51]:
# join genres_df back to games_df
games_df = pd.concat([games_df, genres_df], axis=1)

In [52]:
games_df.reset_index(drop=True, inplace=True)

In [54]:
# drop '' column
games_df.drop([''], axis=1, inplace=True)

In [53]:
games_df.columns

Index(['Name', 'Overview', 'Platform', 'Developer', 'Publisher', 'Coop_False',
       'Coop_True', '', 'Action', 'Adventure', 'Beat 'em Up', 'Board Game',
       'Casino', 'Construction and Management Simulation', 'Education',
       'Fighting', 'Flight Simulator', 'Horror', 'Life Simulation', 'MMO',
       'Music', 'Party', 'Platform', 'Puzzle', 'Quiz', 'Racing',
       'Role-Playing', 'Sandbox', 'Shooter', 'Sports', 'Stealth', 'Strategy',
       'Vehicle Simulation', 'Visual Novel'],
      dtype='object')

In [55]:
import nltk
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package wordnet to /home/berry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [56]:
# lemmatizer and stemmer. Only use one of them
# porter = PorterStemmer()
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import string

In [69]:
# function to clean text
def clean_text(text):
    # remove hex
    # text = re.sub(r'[^\x00-\x7f]',r'', text)
    # hyphens to space
    text = text.replace('-', ' ')
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove numbers
    # text = text.translate(str.maketrans('', '', string.digits))
    # make lowercase
    text = text.lower()
    # tokenize text
    tokens = word_tokenize(text)
    # remove stopwords
    tokens = [w for w in tokens if w not in stopwords]
    # lemmatize words
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    # stem words
    # tokens = [porter.stem(w) for w in tokens]
    # make tokens into string
    # text = ' '.join(tokens)
    # return clean text
    return tokens

In [70]:
# apply clean_text function to description column
games_df['Overview_clean'] = games_df['Overview'].apply(clean_text)

In [71]:
game_tags = games_df.drop(columns=['Name', 'Overview', 'Platform', 'Developer', 'Publisher', 'Overview_clean'])

### CountVectorizer

In [183]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [184]:
countvec = CountVectorizer(max_features=1000, min_df=5, ngram_range=(1, 2))
cv_matrix = countvec.fit_transform(games_df['Overview_clean'])

In [185]:
matrix = np.concatenate((cv_matrix.toarray(), game_tags.values), axis=1)

In [186]:
matrix.shape

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Dimensionality Reduction Test

In [201]:
# reduce dimensionality using locality sensitive hashing
from sklearn.random_projection import SparseRandomProjection
srp = SparseRandomProjection(n_components=100)
matrix = srp.fit_transform(matrix)

In [203]:
matrix.shape

(85964, 100)

### Word2Vec

In [72]:
games_df['Overview_clean']

0        [chunsoft, netsal, sport, game, published, chu...
1        [dimps, corporation, try, grab, greatest, trea...
2        [kcej, assume, role, yoh, asakura, ultimate, q...
3        [kcej, assume, role, yoh, asakura, ultimate, q...
4        [kcej, game, play, yoh, asakura, shaman, king,...
                               ...                        
85959    [robert, dillinger, space, invadersgorf, style...
85960    [stuart, smith, player, travel, land, greece, ...
85961    [jon, mattson, text, adventure, sequel, dial, ...
85962    [gareth, pitchford, designed, small, four, loc...
85963    [bimbarlade, upon, time, king, obsessed, hunti...
Name: Overview_clean, Length: 85964, dtype: object

In [73]:
from gensim.models import Word2Vec

# overview column to list
game_desc_tokens = games_df['Overview_clean'].tolist()


# train the model
model = Word2Vec(min_count=20, window=5, sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20)

In [75]:
model.build_vocab(game_desc_tokens, progress_per=10000)

In [76]:
model.train(game_desc_tokens, total_examples=model.corpus_count, epochs=30, report_delay=1)

(102542798, 181279230)

In [77]:
model.wv.most_similar('mario')

[('bros', 0.7459353804588318),
 ('luigi', 0.741351306438446),
 ('marios', 0.6861761808395386),
 ('bowsers', 0.6475427150726318),
 ('waluigi', 0.6158074140548706),
 ('yoshi', 0.6095926761627197),
 ('kart', 0.6034002900123596),
 ('bowser', 0.5985233187675476),
 ('super', 0.5884424448013306),
 ('smw', 0.5859856009483337)]

In [95]:
model.wv.most_similar('zombie')

[('undead', 0.6956534385681152),
 ('horde', 0.6028316020965576),
 ('apocalypse', 0.5984140634536743),
 ('corps', 0.5807664394378662),
 ('infected', 0.55157870054245),
 ('monstrosity', 0.5394513010978699),
 ('mutant', 0.5382084250450134),
 ('mindless', 0.5296018719673157),
 ('outbreak', 0.5289222002029419),
 ('infection', 0.5239735841751099)]

In [96]:
# process game_descriptions into vectors based on word2vec model
game_desc_vectors = []
for desc in game_desc_tokens:
    vector = np.zeros(100)
    for word in desc:
        try:
            vector += model.wv[word]
        except KeyError:
            pass
    game_desc_vectors.append(vector)

In [97]:
game_desc_vectors = np.array(game_desc_vectors)

In [100]:
# append game_tags to game_desc_vectors
matrix = np.concatenate((game_desc_vectors, game_tags.values), axis=1)

### Cosine Similarity

In [103]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [200]:
# do cosine_similarity one by one so i wont run out of memory
similarity_matrix = []
for vector in tqdm(matrix):
    similarity_matrix.append(cosine_similarity([vector], matrix)[0])

  0%|          | 111/85964 [00:35<7:33:42,  3.15it/s]


KeyboardInterrupt: 

In [91]:
# Construct a reverse map of indices and game titles
indices = pd.Series(games_df.index, index=games_df['Name'])

In [101]:
def recommend_content(title, matrix=matrix):
    '''Get similar games based on content using the similarity matrix'''
    # get index for our game
    idx = indices[title]

    sim_scores = cosine_similarity(matrix[idx].reshape(1, -1), matrix)
    
    # sort scores based on similarity
    sorted_sim_scores = sorted(list(enumerate(sim_scores[0])), key=lambda x: x[1], reverse=True)
    
    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[0]]: i[1] for i in sorted_sim_scores[1:21]}

    return content_similar_scores

In [110]:
recommend_content('Samurai Shodown', matrix=game_desc_vectors)

{'The King of Fighters XIV': 0.8725421392158721,
 'Marvel vs. Capcom: Infinite': 0.8509478814305282,
 'Virtua Fighter 5 Ultimate Showdown': 0.8501745253431301,
 'Ultra Street Fighter II: The Final Challengers': 0.8463247625445645,
 "Spawn: In the Demon's Hand": 0.8430006567916725,
 'TEPPEN': 0.8419402231818001,
 'Groove on Fight: Gouketsuji Ichizoku 3': 0.8378511722228943,
 'SoulCalibur VI': 0.836656280106243,
 'Samurai Warriors 2': 0.8356113080831784,
 'Tekken Tag Tournament 2': 0.8344604723437308,
 'Warriors Orochi 3: Ultimate': 0.8306557517034472,
 'SoulCalibur IV & Tekken 6': 0.8285276711083543,
 'Street Fighter EX3': 0.8238885570053209,
 'Onimusha: Blade Warriors': 0.8211195769850643,
 'Halo 5: Guardians': 0.8193991154690048,
 'Halo 5: Guardians: Limited Edition': 0.8193991154690048,
 'Street Fighter 6': 0.8193635342156067,
 'Marvel vs. Capcom: Clash of Super Heroes': 0.8160364857929046,
 'Street Fighter III: Third Strike Online Edition': 0.8146280391061557,
 'Naruto to Boruto: Sh