<a href="https://colab.research.google.com/github/kdambrowski/Board_Game_Data_Scraper_and_Processor/blob/main/web_scraping_program.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project description

This project focuses on scraping and processing board game data from BoardGameGeek (BGG) using various Python functions and libraries. The goal is to collect detailed information about board games and prepare it for further analysis or presentation. The project utilizes web scraping techniques, data manipulation with Pandas, and data extraction from JSON-like structures.

# Library

In [55]:
import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
from urllib.parse import urljoin, urlparse
import re
import json
import shutil
import os

# Settings

In [56]:
BASE_PAGE_ADDRESS = 'https://boardgamegeek.com'
BGG_USER_COLLECTION_URL = ''
HTML_PARSER = 'html.parser'
GAME_LIST_CSV_FILENAME = 'game_data.csv'
CSV_DESTINATION_PATH = ''

# Functions

In [57]:
def pretty_print_nested_dict(dictionary):
	"""Printing pretty dictionary in a nasted form.
	Args:
		dictionary (dict): The nested dictionary to be printed.
	"""
	for main_key, sub_dict in dictionary.items():
		if isinstance(sub_dict, dict):
			print(f"{main_key}:")
			for key, value in sub_dict.items():
				print(f"\t{key}: {value}")
		else:
			print(f"{main_key}: {sub_dict}")


def get_page_soup(url, parser = HTML_PARSER):
	"""Retrieves and parses the HTML content of a URL.
	Args:
		url (str): The URL to be fetched and parsed.
		parser (str, optional): The parser to be used for page analysis.
		Default: HTML_PARSER.
	Returns:
		BeautifulSoup: Parsed HTML content using the specified parser.
	"""
	response = urlopen(url)
	url_content = response.read()
	return bs(url_content, parser)


def create_game_link_dict(url, base_page_adress, parser = HTML_PARSER):
	"""Create a dictionary of game links from the provided URL and base URL.
	Args:
		url (str): The URL of the page containing game links.
		base_url (str): The base URL used to construct full game links.
	Returns:
		dict: A dictionary where keys are game names and
		values are their corresponding links.
	"""
	soup = get_page_soup(url, parser)
	game_link_dict = dict()

	for tag in soup.find_all('td', class_="collection_objectname"):
		game_link_tag = tag.a
		href = game_link_tag['href']
		game_name = game_link_tag.text
		game_link_http = base_page_adress + href
		game_link_dict[game_name] = game_link_http

	return game_link_dict


def extract_avg_players_rating(game_stats_dict):
	"""Extracts game statistics data about players rating.
	Args:
		stats (dict): A dictionary containing game statistics.
	Returns:
		dict: A dictionary containing extracted statistics data about
		average players rating (if available).
	"""
	collected_stats = {}
	for item, value in game_stats_dict.items():
		if item == 'average':
			collected_stats[item +'_players_rating'] = value
	return collected_stats


def extract_min_max_players_data(item_data_dict):
	"""Extracts player count data.
	Args:
		item_data (dict): A dictionary containing game data.
	Returns:
		dict: A dictionary containing extracted player count data (if available).
	"""
	player_data = {}
	for item in ['minplayers', 'maxplayers']:
		if item in item_data_dict:
			player_data[item] = item_data_dict[item]
	return player_data


def extract_game_weight_data(polls_dict):
	"""Extracts game weight data.
	Args:
		polls (dict): A dictionary containing poll results.
	Returns:
		dict: A dictionary containing extracted game weight data (if available).
	"""
	weight_data = {}
	if 'boardgameweight' in polls_dict:
		weight_subdata = polls_dict['boardgameweight']
		if 'averageweight' in weight_subdata:
			weight_data['averageweight'] = weight_subdata['averageweight']
	return weight_data


def scrape_game_data(game_link, parser = HTML_PARSER):
	"""Scrapes game data based on the provided link.
	Args:
		game_link (str): Link to the game page.
		parser (str, optional): Parser to be used for page analysis.
		Default: HTML_PARSER.
	Returns:
		dict: A dictionary containing the collected game data.
	"""
	collected_data = {'link': game_link}
	game_bs = get_page_soup(game_link, parser)
	reg_compiler = re.compile(r'GEEK\.geekitemPreload')
	reg_searcher = r'GEEK\.geekitemPreload\s*=\s*({.*?});'
	# Find <script> tags containing data
	script_tags = game_bs.find_all('script', text = reg_compiler)

	for script_tag in script_tags:
		match = re.search(reg_searcher, script_tag.string)
		if match:
			geekitem_preload_text = match.group(1)
			geekitem_preload = json.loads(geekitem_preload_text)
			# Extract data from individual sections
			stats_data = extract_avg_players_rating(geekitem_preload['item']['stats'])
			players_data = extract_min_max_players_data(geekitem_preload['item'])
			weight_data = extract_game_weight_data(geekitem_preload['item']['polls'])
			collected_data.update(stats_data)
			collected_data.update(players_data)
			collected_data.update(weight_data)

	return collected_data


def scrape_all_game_data(game_link_dict, parser = HTML_PARSER):
	"""Scrape data for all games in the provided dictionary of game links.
	Args:
		game_link_dict (dict): A dictionary where keys are game names and values
		are their corresponding links.
		parser (str, optional): Parser to be used for page analysis.
		Default: HTML_PARSER.
	Returns:
		dict: A dictionary where keys are game names and values are dictionaries
		containing the collected game data.
	"""
	game_data_dict = dict()
	for game_name, game_link in game_link_dict.items():
		collected_data = scrape_game_data(game_link, parser)
		game_data_dict[game_name] = collected_data
	return game_data_dict


def convert_objectcolumn_to_numeric(df):
	"""Convert object-type columns in a DataFrame to numeric data types
	(float or int).
	Args:
			df (pandas.DataFrame): The DataFrame containing columns to be converted.
	Returns:
			None: The function modifies the input DataFrame in place by converting
			eligible columns to numeric types (float or int) and replacing values that
			cannot be converted with NaN.
	"""
	for column in df.columns.tolist():
		if pd.to_numeric(df[column], errors='coerce').notna().all():
			df[column] = pd.to_numeric(df[column], errors='coerce')
	return df

def prepare_dataframe_from_dict(game_data_dict):
	"""Prepares a DataFrame from a dictionary of game data.
	Args:
			game_data_dict (dict): A dictionary containing game data with
			game names as keys and data as values.
	Returns:
			pandas.DataFrame: A prepared DataFrame with game data, where
			game names are in the 'nazwa_gry' column.
	"""
	df = pd.DataFrame.from_dict(game_data_dict, orient='index')
	df.reset_index(inplace=True)
	df.rename(columns={'index': 'nazwa_gry'}, inplace=True)
	df = convert_objectcolumn_to_numeric(df)
	return df


def convert_and_replace_decimal_separator(df):
	"""Converts all columns of a DataFrame to 'string' type
	and replaces dots with commas in numeric columns.
	This operation is necessary to usage DF in
	googlesheet as a numeric value.
	Args:
			param df: DataFrame whose columns need to be converted
			and where dots should be replaced with commas.
	Retuns:
			DataFrame after the conversion and replacement of decimal separators.
	"""
	for column in df.columns.tolist():
			if pd.to_numeric(df[column], errors='coerce').notna().all():
				df[column] = df[column].astype('string')
				df[column] = df[column].apply(lambda x: x.replace('.', ','))
	return df


# Code

## Create dict from BGG user's collection

In [58]:
game_link_dict = create_game_link_dict(BGG_USER_COLLECTION_URL, BASE_PAGE_ADDRESS)
game_data_dict = scrape_all_game_data(game_link_dict)

  script_tags = game_bs.find_all('script', text = reg_compiler)


## Create DataFrame based on scrapped information from BGG user's collection

In [59]:
game_list_df = prepare_dataframe_from_dict(game_data_dict)

## Prepare DataFrame to convert data for transition into GoogleSheet

In [60]:
game_list_df_for_sheet = convert_and_replace_decimal_separator(game_list_df)

## Save data and move them to indicated path

In [61]:
game_list_df_for_sheet.to_csv(GAME_LIST_CSV_FILENAME, index = False)

In [62]:
dest_url_for_csv = os.path.join(CSV_DESTINATION_PATH, GAME_LIST_CSV_FILENAME)

In [63]:
# shutil.move('/content/game_data.csv', dest_url_for_csv)

'game_data.csv'