## Content-Based Filtering of Game Metadata
## Based on TheGamesDB

In [37]:
import pandas as pd
import numpy as np

### Theres is also a DOS games database but not very big

In [3]:
dos_database = pd.read_excel('dos_database.xlsx')

In [5]:
dos_database.tail()

Unnamed: 0,gametitle_mame,developer_mame,year_mame,genre_mame,description_mame,publisher_mame
7477,Zyconix,Miracle Games,19920101T000000,"Puzzle,Strategy",Zyconix is an arcade puzzle game.\n\nColored b...,"Accolade, Inc."
7478,Zyll,IBM,19840101T000000,"Adventure,Interactive Fiction,Role-Playing,Tex...","A completely unique concept--a real-time, 1 or...",IBM
7479,ZZT,"Epic MegaGames, Inc.",19910101T000000,"Action,Role-Playing,Text-Based",A text-mode adventure game in the style of Rog...,"Epic MegaGames, Inc."
7480,ZZT's City,Softdisk Publishing,19910101T000000,"Action,Role-Playing,Text-Based","Ah, life in the big city. Everyone said it wo...",Softdisk Publishing
7481,ZZT's Revenge!,"Epic MegaGames, Inc.",19920101T000000,"Action,Role-Playing,Text-Based","Shortly after the release of ZZT, Sweeney star...","Epic MegaGames, Inc."


In [7]:
dos_database['description_mame'].isna().sum()

12

### Also some other game dataset that is not as cool as the one that follows

In [2]:
gamelist = pd.read_excel('gamelist.xlsx')

### EDA and Data Cleaning

In [2]:
# mdb file was split into 3 files due to MS Acces export limitations
Game = pd.read_excel('Game.xlsx')
Game1 = pd.read_excel('Game1.xlsx')
Game3 = pd.read_excel('Game3.xlsx')

In [3]:
# combine all the dataframes into one
Game = pd.concat([Game, Game1, Game3], ignore_index=True)

In [4]:
Game.head()

Unnamed: 0,Name,ReleaseYear,Overview,MaxPlayers,ReleaseType,Cooperative,VideoURL,CommunityRating,Platform,ESRB,...,AlternateName_China,AlternateName_Europe,AlternateName_France,AlternateName_Germany,AlternateName_Japan,AlternateName_Korea,AlternateName_NorthAmerica,AlternateName_Spain,AlternateName_UnitedStates,AlternateName_World
0,Shiren Monsters: Netsal,,"Netsal is a Sports game, published by ChunSoft...",1.0,Released,False,,2.166667,Nintendo Game Boy Advance,Not Rated,...,,,,,,,,,,
1,Shonen Jump's One Piece,,You can try to grab the greatest treasure of t...,,Released,False,,4.153846,Nintendo Game Boy Advance,E - Everyone,...,,,,,,,,,,
2,Shonen Jump's Shaman King: Legacy of the Spiri...,,Assume the role of Yoh Asakura in the ultimate...,1.0,Released,False,https://www.youtube.com/watch?v=Th323uP886Y,3.428571,Nintendo Game Boy Advance,E - Everyone,...,,,,,,,,,,
3,Shonen Jump's Shaman King: Legacy of the Spiri...,,Assume the role of Yoh Asakura in the ultimate...,1.0,Released,False,https://www.youtube.com/watch?v=tjY9-A-eGzE,4.029412,Nintendo Game Boy Advance,E - Everyone,...,,,,,,,,,,
4,Shonen Jump's Shaman King: Master of Spirits,,In this game you play Yoh Asakura from the Sha...,1.0,Released,False,https://www.youtube.com/watch?v=GbrAXjtTiLo,3.625,Nintendo Game Boy Advance,E - Everyone,...,,,,,,,,,,


In [5]:
# check columns
Game.columns

Index(['Name', 'ReleaseYear', 'Overview', 'MaxPlayers', 'ReleaseType',
       'Cooperative', 'VideoURL', 'CommunityRating', 'Platform', 'ESRB',
       'CommunityRatingCount', 'Genres', 'Developer', 'Publisher', 'N°',
       'DatabaseID', 'WikipediaURL', 'AlternateName_Australia',
       'AlternateName_Brazil', 'AlternateName_China', 'AlternateName_Europe',
       'AlternateName_France', 'AlternateName_Germany', 'AlternateName_Japan',
       'AlternateName_Korea', 'AlternateName_NorthAmerica',
       'AlternateName_Spain', 'AlternateName_UnitedStates',
       'AlternateName_World'],
      dtype='object')

In [6]:
games_df = Game[['Name', 'Overview', 'Cooperative', 'Platform', 'Genres', 'Developer', 'Publisher']]

In [7]:
# drop where no overview is available
games_df = games_df.dropna(subset=['Overview'])

In [8]:
len(games_df)

214097

In [9]:
# show zero values for each column
games_df.isna().sum()

Name               0
Overview           0
Cooperative        0
Platform           0
Genres         10513
Developer      16604
Publisher      16196
dtype: int64

In [10]:
# duplicates drop
games_df = games_df.drop_duplicates(subset=['Name'], keep='first')

In [11]:
# Write developer to publisher if publisher is empty
games_df['Publisher'] = games_df['Publisher'].fillna(games_df['Developer'])

In [12]:
games_df['Developer'] = games_df['Developer'].fillna(games_df['Publisher'])

In [13]:
games_df['Genres'].fillna('', inplace=True)

In [14]:
games_df['Developer'].fillna('', inplace=True)
games_df['Publisher'].fillna('', inplace=True)

In [15]:
games_df.isna().sum()

Name           0
Overview       0
Cooperative    0
Platform       0
Genres         0
Developer      0
Publisher      0
dtype: int64

### Data Preprocessing

In [16]:
# prepend developer to overview
games_df['Overview'] = games_df['Developer'] + ' ' + games_df['Overview']

In [17]:
# one-hot encode cooperative
CoopDummies = pd.get_dummies(games_df['Cooperative'], prefix='Coop')

In [18]:
# add CoopDummies back and drop the original column
games_df = pd.concat([games_df, CoopDummies], axis=1)
games_df.drop(['Cooperative'], axis=1, inplace=True)

In [19]:
# multilabel binarizer
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [20]:
# One-hot encode genres that are separated by '; '
games_df['Genres'] = games_df['Genres'].str.split('; ')
genres_df = pd.DataFrame(mlb.fit_transform(games_df.pop('Genres')), columns=mlb.classes_, index=games_df.index)

In [21]:
# join genres_df back to games_df
games_df = pd.concat([games_df, genres_df], axis=1)

In [22]:
games_df.reset_index(drop=True, inplace=True)

In [23]:
# drop '' column
games_df.drop([''], axis=1, inplace=True)

In [24]:
games_df.columns

Index(['Name', 'Overview', 'Platform', 'Developer', 'Publisher', 'Coop_False',
       'Coop_True', 'Action', 'Adventure', 'Beat 'em Up', 'Board Game',
       'Casino', 'Construction and Management Simulation', 'Education',
       'Fighting', 'Flight Simulator', 'Horror', 'Life Simulation', 'MMO',
       'Music', 'Party', 'Platform', 'Puzzle', 'Quiz', 'Racing',
       'Role-Playing', 'Sandbox', 'Shooter', 'Sports', 'Stealth', 'Strategy',
       'Vehicle Simulation', 'Visual Novel'],
      dtype='object')

In [25]:
import nltk
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package wordnet to /home/berry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
# lemmatizer and stemmer. Only use one of them
# porter = PorterStemmer()
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import string

In [27]:
# function to clean text
def clean_text(text):
    # remove hex
    # text = re.sub(r'[^\x00-\x7f]',r'', text)
    # hyphens to space
    text = text.replace('-', ' ')
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove numbers
    # text = text.translate(str.maketrans('', '', string.digits))
    # make lowercase
    text = text.lower()
    # tokenize text
    tokens = word_tokenize(text)
    # remove stopwords
    tokens = [w for w in tokens if w not in stopwords]
    # lemmatize words
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    # stem words
    # tokens = [porter.stem(w) for w in tokens]
    # make tokens into string
    # text = ' '.join(tokens)
    # return clean text
    return tokens

In [28]:
# apply clean_text function to description column
games_df['Overview_clean'] = games_df['Overview'].apply(clean_text)

In [29]:
game_tags = games_df.drop(columns=['Name', 'Overview', 'Platform', 'Developer', 'Publisher', 'Overview_clean'])

### Word2Vec

In [30]:
games_df['Overview_clean']

0        [chunsoft, netsal, sport, game, published, chu...
1        [dimps, corporation, try, grab, greatest, trea...
2        [kcej, assume, role, yoh, asakura, ultimate, q...
3        [kcej, assume, role, yoh, asakura, ultimate, q...
4        [kcej, game, play, yoh, asakura, shaman, king,...
                               ...                        
85959    [robert, dillinger, space, invadersgorf, style...
85960    [stuart, smith, player, travel, land, greece, ...
85961    [jon, mattson, text, adventure, sequel, dial, ...
85962    [gareth, pitchford, designed, small, four, loc...
85963    [bimbarlade, upon, time, king, obsessed, hunti...
Name: Overview_clean, Length: 85964, dtype: object

In [31]:
from gensim.models import Word2Vec

# overview column to list
game_desc_tokens = games_df['Overview_clean'].tolist()


# train the model
model = Word2Vec(min_count=20, window=5, sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20)

In [32]:
model.build_vocab(game_desc_tokens, progress_per=10000)

In [33]:
model.train(game_desc_tokens, total_examples=model.corpus_count, epochs=30, report_delay=1)

(102541503, 181279230)

In [34]:
model.wv.most_similar('mario')

[('bros', 0.7492160201072693),
 ('luigi', 0.7383193969726562),
 ('marios', 0.6962532997131348),
 ('bowsers', 0.6583023071289062),
 ('waluigi', 0.6090022325515747),
 ('kart', 0.6082103848457336),
 ('yoshi', 0.6006398797035217),
 ('bowser', 0.5784978270530701),
 ('yoshis', 0.5756335258483887),
 ('super', 0.5712643265724182)]

In [35]:
model.wv.most_similar('zombie')

[('undead', 0.689279317855835),
 ('apocalypse', 0.6172518134117126),
 ('horde', 0.6037349700927734),
 ('corps', 0.5745407938957214),
 ('infected', 0.5548469424247742),
 ('outbreak', 0.5343813300132751),
 ('monstrosity', 0.5336241126060486),
 ('mutant', 0.5324799418449402),
 ('rampaging', 0.5165082216262817),
 ('bloodthirsty', 0.5156662464141846)]

In [38]:
# process game_descriptions into vectors based on word2vec model
game_desc_vectors = []
for desc in game_desc_tokens:
    vector = np.zeros(100)
    for word in desc:
        try:
            vector += model.wv[word]
        except KeyError:
            pass
    game_desc_vectors.append(vector)

In [39]:
game_desc_vectors = np.array(game_desc_vectors)

In [40]:
# append game_tags to game_desc_vectors
matrix = np.concatenate((game_desc_vectors, game_tags.values), axis=1)

In [41]:
# store matrix in pickle
import pickle
pickle.dump(matrix, open('matrix.pkl', 'wb'))

### Cosine Similarity

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [43]:
# Construct a reverse map of indices and game titles
indices = pd.Series(games_df.index, index=games_df['Name'])

In [44]:
def recommend_content(title, matrix=matrix):
    '''Get similar games based on content using the similarity matrix'''
    # get index for our game
    idx = indices[title]

    sim_scores = cosine_similarity(matrix[idx].reshape(1, -1), matrix)
    
    # sort scores based on similarity
    sorted_sim_scores = sorted(list(enumerate(sim_scores[0])), key=lambda x: x[1], reverse=True)
    
    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[0]]: i[1] for i in sorted_sim_scores[1:21]}

    return content_similar_scores

In [48]:
recommend_content('Samurai Shodown', matrix=game_desc_vectors)

{'The King of Fighters XIV': 0.8754204040699904,
 'Ultra Street Fighter II: The Final Challengers': 0.8536641856553766,
 'Virtua Fighter 5 Ultimate Showdown': 0.8534674332667624,
 'Marvel vs. Capcom: Infinite': 0.8519850909369832,
 "Spawn: In the Demon's Hand": 0.8508398099711982,
 'Warriors Orochi 3: Ultimate': 0.8472299532201764,
 'TEPPEN': 0.8469666014836583,
 'Groove on Fight: Gouketsuji Ichizoku 3': 0.8412556435661185,
 'SoulCalibur VI': 0.8397968998695178,
 'Street Fighter EX3': 0.8379844136007386,
 'Samurai Warriors 2': 0.8343853779266217,
 'Tekken Tag Tournament 2': 0.8291746868791681,
 'SoulCalibur IV & Tekken 6': 0.8289612616225681,
 'Onimusha: Blade Warriors': 0.8274058668574362,
 'Street Fighter 6': 0.8253897342466634,
 'Marvel vs. Capcom: Clash of Super Heroes': 0.8241961492541835,
 'Halo 5: Guardians': 0.821395311034639,
 'Halo 5: Guardians: Limited Edition': 0.821395311034639,
 'Street Fighter III: Third Strike Online Edition': 0.820814743410681,
 'Street Fighter III: 3r

In [200]:
# do cosine_similarity one by one so i wont run out of memory, takes a long time
similarity_matrix = []
for vector in tqdm(matrix):
    similarity_matrix.append(cosine_similarity([vector], matrix)[0])

  0%|          | 111/85964 [00:35<7:33:42,  3.15it/s]


KeyboardInterrupt: 