Importing the libraries needed for the project, and reading in the first dataset (boardgames_ranks.csv)

In [1]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
import numpy as np
import xml.etree.ElementTree as ET
import time
from bs4 import BeautifulSoup
import re

# Load the raw BoardGameGeek rankings export. Forward slashes keep this relative
# path valid on Windows, macOS and Linux (backslash separators only work on Windows).
df_board_game_rankings = pd.read_csv('./.venv/data/boardgames_ranks.csv')

Data Read-in & Cleaning
1) Reading in the same dataset under a different variable name to preserve the original.
2) Filtering out board games with an overall rank of 0, as we only want games with an actual ranking.
3) Creating dataframe df_bgr_top which constrains boardgames to those with an overall rank of 1 - 250.
4) Replacing NaN values in the DataFrame with 0.
5) Renaming fields.


In [None]:
##1
# Re-read the raw file so the cleaning below never mutates the frame loaded in the
# first cell. Forward slashes keep the relative path valid on every platform.
df_bgr2 = pd.read_csv('./.venv/data/boardgames_ranks.csv')

##2
# DataFrame.drop(..., inplace=True) returns None, which previously left
# df_bgr_nozero as an empty DataFrame. Drop without inplace and keep the result.
zerorank = df_bgr2[df_bgr2['rank'] == 0].index
df_bgr_nozero = df_bgr2.drop(zerorank)
# Rebind df_bgr2 to the filtered frame so any cell reading df_bgr2 still sees only
# ranked games.
df_bgr2 = df_bgr_nozero

##3
# Explicit .copy() so the steps below operate on an independent frame rather than
# a slice of df_bgr_nozero.
in_top_250 = (df_bgr_nozero['rank'] >= 1) & (df_bgr_nozero['rank'] <= 250)
df_bgr_top = df_bgr_nozero.loc[in_top_250].copy()

##4
# Category ranks are NaN for games outside that category; 0 marks "not ranked".
df_bgr_top = df_bgr_top.fillna(0)

##5
df_bgr_top = df_bgr_top.rename(columns={
    'id': 'BGG_ID',
    'name': 'GAME_NAME',
    'yearpublished': 'PUBLISH_YR',
    'rank': 'OVERALL_RANK',
    'bayesaverage': 'BAYES_AVG',
    'average': 'AVG_RATING',
    'usersrated': 'USER_RATING',
    'is_expansion': 'EXPANSION_IND',
    'abstracts_rank': 'ABSTRACTS_RANK',
    'cgs_rank': 'CGS_RANK',
    'childrensgames_rank': 'CHILDREN_GAME_RANK',
    'familygames_rank': 'FAMILY_RANK',
    'partygames_rank': 'PARTY_RANK',
    'strategygames_rank': 'STRATEGY_RANK',
    'thematic_rank': 'THEMATIC_RANK',
    'wargames_rank': 'WARGAME_RANK',
})


<class 'pandas.core.frame.DataFrame'>
Index: 250 entries, 0 to 249
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   BGG_ID              250 non-null    int64  
 1   GAME_NAME           250 non-null    object 
 2   PUBLISH_YR          250 non-null    int64  
 3   OVERALL_RANK        250 non-null    int64  
 4   BAYES_AVG           250 non-null    float64
 5   AVG_RATING          250 non-null    float64
 6   USER_RATING         250 non-null    int64  
 7   EXPANSION_IND       250 non-null    int64  
 8   ABSTRACTS_RANK      250 non-null    float64
 9   CGS_RANK            250 non-null    float64
 10  CHILDREN_GAME_RANK  250 non-null    float64
 11  FAMILY_RANK         250 non-null    float64
 12  PARTY_RANK          250 non-null    float64
 13  STRATEGY_RANK       250 non-null    float64
 14  THEMATIC_RANK       250 non-null    float64
 15  WARGAME_RANK        250 non-null    float64
dtypes: float64(10

Extracting Information from BGG API
1) Extracting list of 250 BGG_IDs to feed into BGG_XML_API_2
2) Setting up variables and list storage for the loop.
3) Iterate through all 250 BGG_IDs and request the XML for each, pausing for 5 seconds roughly once every 7 games to adhere to the API usage terms.
4) Parse XML for Mechanic Name and ID.
5) Extract all mechanics associated with a game, and append to a list.
6) Create pd.Series from lists.
7) Create pd.Dataframe from multiple series.
8) Convert fields to needed data types for Board Game Ranking dataframe (df_bgr_top)
9) Convert field to needed data type for Mechanics dataframe (df_mechv3)
10) Merge Game Rankings Dataframe and Mechanics Dataframe.
11) Cleanup - round averages to 2 places.

**NOTE:** This block will take some time to run.

In [None]:
##1
bg_ids = df_bgr_top['BGG_ID'].tolist()

##2
API_base_string = 'https://boardgamegeek.com/xmlapi2/thing?id='
REQUESTS_PER_PAUSE = 7   # pause after this many consecutive requests
PAUSE_SECONDS = 5        # length of each pause
REQUEST_TIMEOUT = 30     # seconds before a stalled request is abandoned
mech_id_ls = []
mech_name_ls = []
mech_bggid_ls = []

##3
# Throttle on the number of requests sent rather than on the value of the game id,
# so the pause is evenly spaced no matter which ids are in the list.
for request_no, bgg_id in enumerate(bg_ids, start=1):
    api_rec = requests.get(API_base_string + str(bgg_id), timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error status instead of parsing an error body and
    # silently recording no mechanics for this game.
    api_rec.raise_for_status()
    root = ET.fromstring(api_rec.content)

    ##4,5
    # Each <item> is a game; its <link type="boardgamemechanic"> children carry the
    # mechanic id and name.
    for item in root:
        for link in item.findall('link'):
            if link.get('type') == 'boardgamemechanic':
                mech_id_ls.append(link.get('id'))
                mech_name_ls.append(link.get('value'))
                mech_bggid_ls.append(item.get('id'))

    if request_no % REQUESTS_PER_PAUSE == 0:
        time.sleep(PAUSE_SECONDS)

##6
mech_bggid_ser = pd.Series(mech_bggid_ls)
mech_id_ser = pd.Series(mech_id_ls)
mech_name_ser = pd.Series(mech_name_ls)

##7
mechv3_frame = {'BGG_ID': mech_bggid_ser, 'MECH_ID': mech_id_ser, 'MECH_NAME': mech_name_ser}
df_mechv3 = pd.DataFrame(mechv3_frame)

##8
convert_dict = {'BGG_ID': int,
                'GAME_NAME': object,
                'PUBLISH_YR': int,
                'OVERALL_RANK': int,
                'BAYES_AVG': float,
                'AVG_RATING': float,
                'USER_RATING': int,
                'EXPANSION_IND': bool,
                'ABSTRACTS_RANK': int,
                'CGS_RANK': int,
                'CHILDREN_GAME_RANK': int,
                'FAMILY_RANK': int,
                'PARTY_RANK': int,
                'STRATEGY_RANK': int,
                'THEMATIC_RANK': int,
                'WARGAME_RANK': int}
df_bgr_top = df_bgr_top.astype(convert_dict)

##9
# XML attribute values are strings, so BGG_ID must be cast to int here; otherwise
# the merge below fails because the key is int on one side and object on the other.
mech_convert_dict = {'BGG_ID': int, 'MECH_ID': object}
df_mechv3 = df_mechv3.astype(mech_convert_dict)

##10
# Left join keeps every top-250 game, including any with no mechanics listed.
df_gamemech_merge = pd.merge(df_bgr_top, df_mechv3, on='BGG_ID', how='left')
df_game_mech = pd.DataFrame(df_gamemech_merge)

##11
df_game_mech['BAYES_AVG'] = df_game_mech['BAYES_AVG'].round(2)
df_game_mech['AVG_RATING'] = df_game_mech['AVG_RATING'].round(2)


1) Read in Designers with Location CSV file.
2) Creating new dataframe constrained by only the BGG_IDs existing in the top 250.

In [3]:
##1
df_bgdesigner_loc = pd.read_csv('.\\.venv\\data\\BGG_Designer_Location.csv')

##2
df_topbgdes_loc = pd.DataFrame(df_bgdesigner_loc.loc[df_bgdesigner_loc['BGG_ID'].isin(bg_ids)])
df_topbgdes_loc.reset_index(drop=True, inplace=True)
df_topbgdes_loc.head(30)

NameError: name 'bg_ids' is not defined