# Board Game Recommender
## Collaborative filtering based on games theme features only

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

#ML Packages

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV

# Step 1: Clean and Prepare Data

## EDA


### Explore the games data, then clean and transform into a useable format

In [2]:
# Import games data
df = pd.read_csv('games.csv')

df_games = df.copy(deep=True)

df_games.head()

Unnamed: 0,BGGId,Name,Description,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,...,Rank:partygames,Rank:childrensgames,Cat:Thematic,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
0,1,Die Macher,die macher game seven sequential political rac...,1986,4.3206,7.61428,7.10363,1.57979,3,5,...,21926,21926,0,1,0,0,0,0,0,0
1,2,Dragonmaster,dragonmaster tricktaking card game base old ga...,1981,1.963,6.64537,5.78447,1.4544,3,4,...,21926,21926,0,1,0,0,0,0,0,0
2,3,Samurai,samurai set medieval japan player compete gain...,1998,2.4859,7.45601,7.23994,1.18227,2,4,...,21926,21926,0,1,0,0,0,0,0,0
3,4,Tal der Könige,triangular box luxurious large block tal der k...,1992,2.6667,6.60006,5.67954,1.23129,2,4,...,21926,21926,0,0,0,0,0,0,0,0
4,5,Acquire,acquire player strategically invest business t...,1964,2.5031,7.33861,7.14189,1.33583,2,6,...,21926,21926,0,1,0,0,0,0,0,0


In [3]:
#get description of features in df

df_games.info()
df_describe = df_games.describe()

with pd.option_context('display.max_columns', None):
  display(df_describe)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21925 entries, 0 to 21924
Data columns (total 48 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   BGGId                21925 non-null  int64  
 1   Name                 21925 non-null  object 
 2   Description          21924 non-null  object 
 3   YearPublished        21925 non-null  int64  
 4   GameWeight           21925 non-null  float64
 5   AvgRating            21925 non-null  float64
 6   BayesAvgRating       21925 non-null  float64
 7   StdDev               21925 non-null  float64
 8   MinPlayers           21925 non-null  int64  
 9   MaxPlayers           21925 non-null  int64  
 10  ComAgeRec            16395 non-null  float64
 11  LanguageEase         16034 non-null  float64
 12  BestPlayers          21925 non-null  int64  
 13  GoodPlayers          21925 non-null  object 
 14  NumOwned             21925 non-null  int64  
 15  NumWant              21925 non-null 

Unnamed: 0,BGGId,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,ComAgeRec,LanguageEase,BestPlayers,NumOwned,NumWant,NumWish,NumWeightVotes,MfgPlaytime,ComMinPlaytime,ComMaxPlaytime,MfgAgeRec,NumUserRatings,NumComments,NumAlternates,NumExpansions,NumImplementations,IsReimplementation,Kickstarted,Rank:boardgame,Rank:strategygames,Rank:abstracts,Rank:familygames,Rank:thematic,Rank:cgs,Rank:wargames,Rank:partygames,Rank:childrensgames,Cat:Thematic,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
count,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,16395.0,16034.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0
mean,117652.663216,1985.494914,1.982131,6.424922,5.685673,1.516374,2.007343,5.707868,10.004391,216.461819,0.311517,1467.848164,41.690946,228.457013,49.480137,90.513523,63.678586,90.513523,9.613409,861.668324,0.0,1.603786,1.380068,0.308734,0.116762,0.153341,11019.514071,19729.826956,20839.347184,19732.270011,20736.185815,21625.086796,18680.185678,21295.352201,21062.680274,0.055827,0.10577,0.161003,0.105633,0.01382,0.050855,0.02919,0.040182
std,104628.721777,212.486214,0.848983,0.932477,0.365311,0.285578,0.693093,15.014643,3.269157,236.595136,1.067002,5294.120574,117.255229,788.477151,205.762375,529.657389,443.916212,529.657389,3.64156,3638.680857,0.0,9.619364,7.701036,0.848095,0.321143,0.360324,6372.926817,6389.614077,4695.170448,6387.089598,4893.928466,2542.033794,7420.906104,3637.139987,4219.776597,0.229592,0.30755,0.367542,0.307374,0.116745,0.219707,0.168344,0.196391
min,1.0,-3500.0,0.0,1.04133,3.57481,0.196023,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12346.0,2001.0,1.3333,5.83696,5.5103,1.32072,2.0,4.0,8.0,24.027778,0.0,150.0,3.0,14.0,4.0,25.0,20.0,25.0,8.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,5488.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,105305.0,2011.0,1.9688,6.45395,5.54654,1.47688,2.0,4.0,10.0,138.0,0.0,320.0,9.0,39.0,9.0,45.0,30.0,45.0,10.0,123.0,0.0,0.0,0.0,0.0,0.0,0.0,11022.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,206169.0,2017.0,2.5252,7.05245,5.67989,1.66547,2.0,6.0,12.0,351.0,0.0,899.0,28.0,127.0,26.0,90.0,60.0,90.0,12.0,395.0,0.0,1.0,1.0,0.0,0.0,0.0,16544.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,349161.0,2021.0,5.0,9.91429,8.51488,4.27728,10.0,999.0,21.0,1757.0,15.0,166497.0,2031.0,19182.0,7673.0,60000.0,60000.0,60000.0,25.0,108101.0,0.0,850.0,525.0,38.0,1.0,1.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
df_na = df_games[df_games['Description'].isnull()]
print(df_na)
# Set description of game to its name
df_games.loc[14243, 'Description'] = 'Timeline: Sports et Loisirs'

        BGGId                         Name Description  YearPublished  \
14243  170984  Timeline: Sports et Loisirs         NaN           2014   

       GameWeight  AvgRating  BayesAvgRating   StdDev  MinPlayers  MaxPlayers  \
14243         1.0    6.64387         5.53658  1.62527           2           8   

       ...  Rank:partygames  Rank:childrensgames  Cat:Thematic Cat:Strategy  \
14243  ...            21926                21926             0            0   

       Cat:War  Cat:Family  Cat:CGS  Cat:Abstract  Cat:Party  Cat:Childrens  
14243        0           0        0             0          0              0  

[1 rows x 48 columns]


In [5]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21925 entries, 0 to 21924
Data columns (total 48 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   BGGId                21925 non-null  int64  
 1   Name                 21925 non-null  object 
 2   Description          21925 non-null  object 
 3   YearPublished        21925 non-null  int64  
 4   GameWeight           21925 non-null  float64
 5   AvgRating            21925 non-null  float64
 6   BayesAvgRating       21925 non-null  float64
 7   StdDev               21925 non-null  float64
 8   MinPlayers           21925 non-null  int64  
 9   MaxPlayers           21925 non-null  int64  
 10  ComAgeRec            16395 non-null  float64
 11  LanguageEase         16034 non-null  float64
 12  BestPlayers          21925 non-null  int64  
 13  GoodPlayers          21925 non-null  object 
 14  NumOwned             21925 non-null  int64  
 15  NumWant              21925 non-null 

In [6]:
# check if all BGGIds are unique
df_games['BGGId'] = df_games['BGGId'].astype('string')

df_games['BGGId'].describe()

count     21925
unique    21925
top           1
freq          1
Name: BGGId, dtype: object

In [7]:
#for features with categorical data

df_games.describe(include=('object', 'bool'))

Unnamed: 0,Name,Description,GoodPlayers,Family,ImagePath
count,21925,21925,21925,6663,21908
unique,21521,21904,78,1456,21904
top,Robin Hood,dice throne game intriguing dice tactical card...,[],18xx,https://cf.geekdo-images.com/6vlkpViMExKmOvg0C...
freq,6,4,19946,103,2


In [8]:
df_games['ImagePath'][0]

'https://cf.geekdo-images.com/rpwCZAjYLD940NWwP3SRoA__original/img/yR0aoBVKNrAmmCuBeSzQnMflLYg=/0x0/filters:format(jpeg)/pic4718279.jpg'

In [9]:
#df_games.set_index('Name', inplace=True)

In [10]:
#df_games.head()

### While there may be duplicate names, the BGG identities are unique. As we may have version differences that or different games with the same name, we will use the BGGId as the identifier.

In [16]:
# Import themes data
df3 = pd.read_csv('themes.csv')

df_theme = df3.copy(deep=True)

df_theme.describe()

Unnamed: 0,BGGId,Adventure,Fantasy,Fighting,Environmental,Medical,Economic,Industry / Manufacturing,Transportation,Science Fiction,...,Theme_Fashion,Theme_Geocaching,Theme_Ecology,Theme_Chernobyl,Theme_Photography,Theme_French Foreign Legion,Theme_Cruise ships,Theme_Apache Tribes,Theme_Rivers,Theme_Flags identification
count,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,...,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0
mean,117652.663216,0.053683,0.123238,0.076078,0.008848,0.003968,0.069282,0.013546,0.018016,0.076397,...,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05
std,104628.721777,0.225396,0.328718,0.265128,0.093651,0.062869,0.253938,0.1156,0.133012,0.265638,...,0.009551,0.009551,0.009551,0.009551,0.009551,0.009551,0.009551,0.009551,0.009551,0.009551
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12346.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,105305.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,206169.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,349161.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Prepare Themes df for Similarity Matrix Creation

In [25]:
#merge game mechanics and game themes dataframes

df_features = df_theme

df_features.head()

Unnamed: 0,BGGId,Adventure,Fantasy,Fighting,Environmental,Medical,Economic,Industry / Manufacturing,Transportation,Science Fiction,...,Theme_Fashion,Theme_Geocaching,Theme_Ecology,Theme_Chernobyl,Theme_Photography,Theme_French Foreign Legion,Theme_Cruise ships,Theme_Apache Tribes,Theme_Rivers,Theme_Flags identification
0,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Add Game names from games df

df_features['Name'] = df_games['Name']

df_features.head()

Unnamed: 0_level_0,BGGId,Adventure,Fantasy,Fighting,Environmental,Medical,Economic,Industry / Manufacturing,Transportation,Science Fiction,...,Theme_Geocaching,Theme_Ecology,Theme_Chernobyl,Theme_Photography,Theme_French Foreign Legion,Theme_Cruise ships,Theme_Apache Tribes,Theme_Rivers,Theme_Flags identification,Name
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Die Macher,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,
Dragonmaster,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
Samurai,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
Tal der Könige,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
Acquire,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,


In [27]:
# Format matrix for cosine similarity function

df_features.set_index('Name', inplace=True)

df_matrix = df_features.drop(['BGGId'], axis=1)

df_matrix.head()

Unnamed: 0_level_0,Adventure,Fantasy,Fighting,Environmental,Medical,Economic,Industry / Manufacturing,Transportation,Science Fiction,Space Exploration,...,Theme_Fashion,Theme_Geocaching,Theme_Ecology,Theme_Chernobyl,Theme_Photography,Theme_French Foreign Legion,Theme_Cruise ships,Theme_Apache Tribes,Theme_Rivers,Theme_Flags identification
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Die Macher,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dragonmaster,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Samurai,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Tal der Könige,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acquire,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Create Pivot Table

In [28]:
similarity_matrix = cosine_similarity(df_matrix)

In [29]:
similarity_matrix.shape

(21925, 21925)

In [40]:
def game_recommender(game_title):
    # get index for game title
    index = np.where(df_matrix.index==game_title)[0][0]
    print(index)
    # get scores for the game title, sort, and get first 5 highest scoring game titles
    scores = list(enumerate(similarity_matrix[index]))
    print(scores[0:5])
    recommendations = sorted(scores, key=lambda x:x[1], reverse=True)[1:15]
    
    data_list = []
    for i in recommendations:
        inner_list=[]
        df_temp = df_games[df_games['Name']==df_matrix.index[i[0]]]
        inner_list.extend(df_temp.drop_duplicates('Name')['Name'])
        inner_list.extend(df_temp.drop_duplicates('Name')['ImagePath'])
        inner_list.extend(df_temp.drop_duplicates('Name')['BayesAvgRating'])
        inner_list.extend(df_temp.drop_duplicates('Name')['Rank:boardgame'])
        inner_list.extend(df_temp.drop_duplicates('Name')['MinPlayers'])
        inner_list.extend(df_temp.drop_duplicates('Name')['MaxPlayers'])
        data_list.append(inner_list)
        
    return data_list

In [41]:
game_recommender('Terraforming Mars')


14059
[(0, 0.3162277660168379), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.4472135954999579)]


[['High Frontier',
  'https://cf.geekdo-images.com/UuswhCjZ1kd2MsdFGv8zxw__original/img/gugpi_m3sKXiuI3qwbTmmoPSob0=/0x0/filters:format(jpeg)/pic727149.jpg',
  6.3792,
  1287,
  2,
  5],
 ['The Final Frontier',
  'https://cf.geekdo-images.com/eTFLWgWJPD5FI4I174MrjA__original/img/XTESGXlUboq8SrhMs3Ok9LMuHCw=/0x0/filters:format(jpeg)/pic6365972.jpg',
  5.51374,
  15603,
  2,
  4],
 ['Stellar Horizons',
  'https://cf.geekdo-images.com/KVk_KKHXZPPalLlCfvRhXQ__original/img/Il4W2VyFlgKm1rL2XvIVEn0iOfs=/0x0/filters:format(jpeg)/pic4233224.jpg',
  5.76598,
  4167,
  1,
  7],
 ['High Frontier 4 All',
  'https://cf.geekdo-images.com/SjfNTsR84Ad--u5lo6HY-A__original/img/xMVJcFIcFMAECB2IWTh_mRG-o4w=/0x0/filters:format(png)/pic5316685.png',
  6.37094,
  1308,
  1,
  5],
 ['Dog Eat Dog',
  'https://cf.geekdo-images.com/XOJE4CuUjj0dIQ_W-7GqPQ__original/img/TpgR8LNkW158I3r7uZcnLxpgjo4=/0x0/filters:format(jpeg)/pic451258.jpg',
  5.4714,
  20436,
  2,
  6],
 ['Space Empires',
  'https://cf.geekdo-images

In [32]:
import pickle
pickle.dump(similarity_matrix, open('similarity_scores_theme.pkl','wb'))