# Scraping the Nintendo Switch game list from Wikipedia
---

In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# link urls and corresponding table classes
urls = [('https://en.wikipedia.org/wiki/List_of_Nintendo_Switch_games_(A–L)',
         'wikitable sortable plainrowheaders'),
        ('https://en.wikipedia.org/wiki/List_of_Nintendo_Switch_games_(M–Z)',
         'wikitable plainrowheaders sortable')]

elements = []
for url, url_class in urls:
    
    #a access the web
    website_url = requests.get(url).text

    soup = BeautifulSoup(website_url,'lxml')
    #print(soup.prettify())

    # find table
    table = soup.find('table', class_=url_class)

    for row in tqdm(table.findAll('tr')):
        tqdm._instances.clear()

        # only find game names with links to their own pages
        try:
            game = row.findAll('th')
            game_name = game[0].find(text=True)
            game_link = game[0].find(href=True)['href']
        
        except: pass

        # find rest of cells
        else:
            cells = row.findAll('td')
            atts = []
            for i in range(len(cells)-1):
                att = cells[i].find(text=True)
                atts.append(att)
            
            # append full row to list    
            elements.append([game_name, game_link] + atts)

100%|██████████| 1294/1294 [00:00<00:00, 8570.27it/s]
100%|██████████| 1275/1275 [00:00<00:00, 8441.32it/s]


In [2]:
import pandas as pd

cols = ['Title', 'Link', 'Genre', 'Developer', 'Publisher', 'Release_JP', 
        'Release_NA', 'Release_Pal']

df_games = pd.DataFrame(elements, columns=cols)
df_games = df_games.astype(str)

# clean links 
df_games = df_games.applymap(lambda x: x.replace('\n', ''))
df_games = df_games.applymap(lambda x: x.replace(':', ''))


print('Shape of dataframe: ', df_games.shape, '\n')
df_games.head()

Shape of dataframe:  (1262, 8) 



Unnamed: 0,Title,Link,Genre,Developer,Publisher,Release_JP,Release_NA,Release_Pal
0,1-2-Switch,/wiki/1-2-Switch,Party,Nintendo EPD,Nintendo,"March 3, 2017","March 3, 2017","March 3, 2017"
1,140,/wiki/140_(video_game),Action,Carlsen Games,Carlsen Games,Unreleased,"January 9, 2020","January 9, 2020"
2,1979 Revolution Black Friday,/wiki/1979_Revolution_Black_Friday,Adventure,Ink Stories,Ink Stories,Unreleased,"August 2, 2018","August 2, 2018"
3,198X,/wiki/198X,Arcade,Hi-Bit Studios,JP,"January 23, 2020","January 23, 2020","January 23, 2020"
4,2064 Read Only Memories Integral,/wiki/2064_Read_Only_Memories_Integral,Adventure,MidBoss,MidBoss,"December 27, 2018","August 14, 2018","August 14, 2018"


In [3]:
# Visit game own pages and extract plots

wiki_url = 'https://en.wikipedia.org'

plots = []
for idx, row in tqdm(df_games.iterrows(), total=df_games.shape[0]):
    tqdm._instances.clear()
    
    url = wiki_url + row['Link']
    
    website_url = requests.get(url).text

    soup = BeautifulSoup(website_url,'lxml')
    #print(soup.prettify())

    text = ''
    
    for section in soup.find_all('h2'):
        
        if section.text.startswith('Game') or section.text.startswith('Plot'):

            text += section.text + '\n\n'

            for element in section.next_siblings:
                if element.name and element.name.startswith('h'):
                    break

                elif element.name == 'p':
                    text += element.text + '\n'

        else: pass
    
    if not text:
        plots.append(None)
    else:
        plots.append(text)

100%|██████████| 1262/1262 [09:48<00:00,  2.14it/s]


In [4]:
len(plots)

1262

In [5]:
# Clean texts

import re

plots_clean = []
for text in plots:
    if text is not None:
        text = re.sub(r'\[.*?\]+', '', text)
        text = text.replace('\n', ' ')
        text = text.replace('Gameplay ', '')
        text = text.replace('Game-play ', '')
        text = text.replace('Plot ', '')
        plots_clean.append(text)
    else:
        plots_clean.append(None)

df_games['Plots'] = plots_clean

df_games.head()

Unnamed: 0,Title,Link,Genre,Developer,Publisher,Release_JP,Release_NA,Release_Pal,Plots
0,1-2-Switch,/wiki/1-2-Switch,Party,Nintendo EPD,Nintendo,"March 3, 2017","March 3, 2017","March 3, 2017",1-2-Switch is a party game in which players d...
1,140,/wiki/140_(video_game),Action,Carlsen Games,Carlsen Games,Unreleased,"January 9, 2020","January 9, 2020","As described by Carlsen, 140 is “an old schoo..."
2,1979 Revolution Black Friday,/wiki/1979_Revolution_Black_Friday,Adventure,Ink Stories,Ink Stories,Unreleased,"August 2, 2018","August 2, 2018",
3,198X,/wiki/198X,Arcade,Hi-Bit Studios,JP,"January 23, 2020","January 23, 2020","January 23, 2020",In an introductory sequence entitled Beating ...
4,2064 Read Only Memories Integral,/wiki/2064_Read_Only_Memories_Integral,Adventure,MidBoss,MidBoss,"December 27, 2018","August 14, 2018","August 14, 2018",


In [13]:
# Drop 'Untitled' games
idx_todrop = df_games[df_games.Title=='Untitled '].index.tolist()
df_games.drop(index=idx_todrop, inplace=True)

# Rename cols
rename = {'Release_JP': 'Released in: Japan', 'Release_NA': 'North America', 
 'Release_Pal': 'Rest of countries'}
df_games.rename(columns=rename, inplace=True)

In [14]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1260 entries, 0 to 1261
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Title               1260 non-null   object
 1   Link                1260 non-null   object
 2   Genre               1260 non-null   object
 3   Developer           1260 non-null   object
 4   Publisher           1260 non-null   object
 5   Released in: Japan  1260 non-null   object
 6   North America       1260 non-null   object
 7   Rest of countries   1260 non-null   object
 8   Plots               787 non-null    object
dtypes: object(9)
memory usage: 98.4+ KB


In [15]:
df_games.dropna().to_csv('datasets/Games_dataset.csv')