In [1]:
# import necessary libraries

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [3]:
# obtain url as page variable and turn into soup (html text)

In [4]:
url = "https://en.wikipedia.org/wiki/List_of_best-selling_video_games"
# a timeout keeps the cell from hanging indefinitely if the server never responds
page = requests.get(url, timeout = 30)
# fail loudly on a 4xx/5xx response instead of silently parsing an error page
page.raise_for_status()
# name the parser explicitly: the generic 'html' feature lets BeautifulSoup pick
# whichever HTML parser happens to be installed, so results could vary by machine
soup = BeautifulSoup(page.text, 'html.parser')

In [5]:
# acquire table from soup as its own subtext

In [6]:
# a class_ string containing spaces is matched against the exact class attribute
# value, so this lookup breaks if the page changes the class list or its order
table = soup.find('table', class_ = 'wikitable sortable plainrowheaders')
# find() returns None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError on None
if table is None:
    raise ValueError("could not find a table with class 'wikitable sortable plainrowheaders' on the page")

In [7]:
# now acquire all the titles from table html text

In [8]:
# the column headers are the <th scope="col"> cells of the table
table_titles = table.find_all('th', scope = 'col')

# pull the visible text out of each header cell, trimming surrounding whitespace
titles_list = []
for header_cell in table_titles:
    titles_list.append(header_cell.text.strip())

print(titles_list)

['Title', 'Sales', 'Series', 'Platform(s)', 'Initial release date', 'Developer(s)[b]', 'Publisher(s)[b]', 'Ref.']


In [9]:
# set up an empty df for now; the column titles are applied after the rows are loaded

In [10]:
# placeholder frame with no rows or columns; the row-scraping cell further down assigns the real data
game_df = pd.DataFrame()

In [11]:
# now collect every table row (<tr>), then extract each row's cell text into the dataframe

In [12]:
# NOTE: despite the name, this holds the table's <tr> row elements, not columns
# (the first entry is presumably the header row, which the loop skips via [1:])
column_data = table.find_all('tr')

In [13]:
# Gather the text of every data row first and build the DataFrame once at the end.
# Calling pd.concat inside the loop copies the whole frame on every iteration, and
# appending to the existing game_df duplicated every row whenever the cell was re-run.
scraped_rows = []
for row in column_data[1:]:  # [1:] skips the first <tr>
    # the first cell of a row is a <th scope="row"> (the game title); the rest are <td>
    row_data = row.find_all('th', scope = 'row') + row.find_all('td')
    scraped_rows.append([rd.text.strip() for rd in row_data])

# columns are positional integers at this point; they are renamed in a later cell
game_df = pd.DataFrame(scraped_rows)

In [14]:
# preview the scraped frame; the columns are still positional integers at this point
game_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,Minecraft,300000000,Minecraft,Multi-platform,"November 18, 2011[c]",Mojang Studios,Mojang Studios,[4]
1,Grand Theft Auto V,200000000,Grand Theft Auto,Multi-platform,"September 17, 2013",Rockstar North,Rockstar Games,[5]
2,Tetris (EA),100000000,Tetris,Multi-platform,"September 12, 2006",EA Mobile,Electronic Arts,[6]
3,Wii Sports,82900000,Wii,Wii,"November 19, 2006",Nintendo EAD,Nintendo,[7]
4,PUBG: Battlegrounds,75000000,PUBG Universe,Multi-platform,"December 20, 2017",PUBG Studios,Krafton,[8]
5,Mario Kart 8 / Deluxe,71360000,Mario Kart,Wii U / Switch,"May 29, 2014",Nintendo EAD / Nintendo EPD (Deluxe),Nintendo,[d]
6,Red Dead Redemption 2,65000000,Red Dead,Multi-platform,"October 26, 2018",Rockstar Games,Rockstar Games,[5]
7,Terraria,58700000,,Multi-platform,"May 16, 2011",Re-Logic,Re-Logic / 505 Games,[11]
8,Super Mario Bros.,58000000,Super Mario,Multi-platform,"September 13, 1985",Nintendo R&D4,Nintendo,[e]
9,Overwatch,50000000,Overwatch,Multi-platform,"May 24, 2016",Blizzard Entertainment,Blizzard Entertainment,[15]


In [15]:
# clean up the dataframe

In [16]:
# rename the positional columns to the header names of the scraped table
# (assign the result rather than using inplace = True, so the cell reads as a
# plain transformation and follows the usual pandas style)
game_df = game_df.rename(columns = {0:'Title', 1:'Sales', 2:'Series', 3:'Platform(s)', 4:'Initial release date', 5:'Developer(s)[b]', 6:'Publisher(s)[b]', 7:'Ref.'})

Unnamed: 0,Title,Sales,Series,Platform(s),Initial release date,Developer(s)[b],Publisher(s)[b],Ref.
0,Minecraft,300000000,Minecraft,Multi-platform,"November 18, 2011[c]",Mojang Studios,Mojang Studios,[4]
1,Grand Theft Auto V,200000000,Grand Theft Auto,Multi-platform,"September 17, 2013",Rockstar North,Rockstar Games,[5]
2,Tetris (EA),100000000,Tetris,Multi-platform,"September 12, 2006",EA Mobile,Electronic Arts,[6]
3,Wii Sports,82900000,Wii,Wii,"November 19, 2006",Nintendo EAD,Nintendo,[7]
4,PUBG: Battlegrounds,75000000,PUBG Universe,Multi-platform,"December 20, 2017",PUBG Studios,Krafton,[8]
5,Mario Kart 8 / Deluxe,71360000,Mario Kart,Wii U / Switch,"May 29, 2014",Nintendo EAD / Nintendo EPD (Deluxe),Nintendo,[d]
6,Red Dead Redemption 2,65000000,Red Dead,Multi-platform,"October 26, 2018",Rockstar Games,Rockstar Games,[5]
7,Terraria,58700000,,Multi-platform,"May 16, 2011",Re-Logic,Re-Logic / 505 Games,[11]
8,Super Mario Bros.,58000000,Super Mario,Multi-platform,"September 13, 1985",Nintendo R&D4,Nintendo,[e]
9,Overwatch,50000000,Overwatch,Multi-platform,"May 24, 2016",Blizzard Entertainment,Blizzard Entertainment,[15]


In [17]:
# remove the 'Ref.' column, which only holds footnote markers such as [4]
game_df = game_df.drop(columns = ['Ref.'])

In [19]:
# use the game title as the row index in place of the default integer index
game_df = game_df.set_index(keys = 'Title')

In [22]:
# display game_df (now indexed by Title, without the Ref. column) to confirm it is ready to export
game_df

Unnamed: 0_level_0,Sales,Series,Platform(s),Initial release date,Developer(s)[b],Publisher(s)[b]
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Minecraft,300000000,Minecraft,Multi-platform,"November 18, 2011[c]",Mojang Studios,Mojang Studios
Grand Theft Auto V,200000000,Grand Theft Auto,Multi-platform,"September 17, 2013",Rockstar North,Rockstar Games
Tetris (EA),100000000,Tetris,Multi-platform,"September 12, 2006",EA Mobile,Electronic Arts
Wii Sports,82900000,Wii,Wii,"November 19, 2006",Nintendo EAD,Nintendo
PUBG: Battlegrounds,75000000,PUBG Universe,Multi-platform,"December 20, 2017",PUBG Studios,Krafton
Mario Kart 8 / Deluxe,71360000,Mario Kart,Wii U / Switch,"May 29, 2014",Nintendo EAD / Nintendo EPD (Deluxe),Nintendo
Red Dead Redemption 2,65000000,Red Dead,Multi-platform,"October 26, 2018",Rockstar Games,Rockstar Games
Terraria,58700000,,Multi-platform,"May 16, 2011",Re-Logic,Re-Logic / 505 Games
Super Mario Bros.,58000000,Super Mario,Multi-platform,"September 13, 1985",Nintendo R&D4,Nintendo
Overwatch,50000000,Overwatch,Multi-platform,"May 24, 2016",Blizzard Entertainment,Blizzard Entertainment


In [24]:
# export dataset as csv file
# 'Title' was moved into the index earlier, so the index has to be written out:
# with index = False the game titles were missing from the CSV entirely
# NOTE(review): this is an absolute, machine-specific path; consider a relative or configurable output directory
game_df.to_csv(r"C:\Users\miria\OneDrive\Documents\DS Learn\output datasets\bestsellingvideogames(webscraping).csv", index = True)