# Overview of Scraped StarCraft 2 Data Set Acquisition and Cleaning

## Copyright Jason Martineau 2023, all Rights Reserved

## 0.0 Introduction:

### This notebook shares some details about the webscraper that I built to collect the eSports data contained in the following Tableau Public dashboard <a href="https://public.tableau.com/views/EsportsStarCraft2PlayerPerformanceDashboard/EsportsPlayerBrandSponsoringDashboard?:language=en-US&:display_count=n&:origin=viz_share_link">linked here</a>. 

### The first section shows an outline of the web scraper that I built. It does not show functional code because I do not want the website that I scraped to be overwhelmed with a ton of extra traffic. 

### The second section shows the cleaning and formatting I did to the scraped data to get it ready for the Tableau dashboard.

## 1.0 Web Scraper Module:

### I wrote a Python module to do my scraping. Below is a copy of the module code, with all of the functional website data removed. In section 1.1 I will show how I invoke the functions in this module to perform the scrape.

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import pickle
import datetime
import pandas as pd
import numpy as np
from uuid import uuid4
from ipywidgets import IntProgress;
from IPython.display import display;
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


def scrape_website(replay_id_pickle = '', replay_metadata_scrape_pickle = ''):
    
    # exctract total number of pages on the page with list of all casts.
    html = <REDACTED>  
    
    #get datetime at which the pickle file was extracted
    data_creation_datetime = str(datetime.datetime.now(datetime.timezone.utc).date())
    
    if not replay_id_pickle:

        # this addendum is for only pro replays. Should be something like ~20,000 matches.
        primary_addendum = 'replays/?pro_only=on'
        page = requests.get(html + primary_addendum)

        soup = BS(page.text, 'html.parser')

        # find the number of pages to cycle through.
        last_page = int(str(soup.body.find(
            'div' , {'class' : 'container', 'style' : 'padding-top: 60px;'}).find(
            'div' , {'class' : 'row'}).find('div' , {'class' : 'col-md-9'}).find(
            'h3')).strip('<h3>').strip('</>').strip('Results').strip(' (').strip(')').strip('Page').split('of')[1])

        # create all page addenda
        primitive = 'replays/?p=&pro_only=on&p='
        addenda_list = []
        for current_page in range(1, last_page+1):

            str_page = str(current_page)
            addenda_list.append(primitive + str_page)

        replay_ids = {}
        replay_id_pattern = re.compile('<a href="/([0-9]+)/">')

        for current_addendum in addenda_list:

            numbered_page = requests.get(html + current_addendum)
            current_replay_id_list = replay_id_pattern.findall(numbered_page.text)

            print(current_replay_id_list)

            replay_ids.update({current_addendum : current_replay_id_list})

        with open(<REDACTED> + '__&__' + data_creation_datetime + '__&__' + '.pickle', 'wb') as handle:
            pickle.dump(replay_ids, handle)

    else:
        print('\n loading replay id pickle \n');
        with open(replay_id_pickle, 'rb') as handle:
            replay_ids = pickle.load(handle);

            
    if not replay_metadata_scrape_pickle: 
        replay_page_dict = {}
    else:
        with open(replay_metadata_scrape_pickle, 'rb') as handle:
            replay_page_dict = pickle.load(handle);

    for key in replay_ids:
        for replay_id in replay_ids[key]:
            
            replay_page_html = html + replay_id + '/'
            print(replay_page_html)

            if replay_page_html in replay_page_dict.keys():
                continue;
            else:   
                try:    
                    replay_page = requests.get(replay_page_html)
                    replay_page_dict.update({replay_page_html : replay_page})

                except Exception as e:
                    print(e)
                    replay_page_scrape_dict_filename = save_scrape(replay_page_dict);  
                    return replay_page_scrape_dict_filename        

    replay_page_scrape_dict_filename = save_scrape(replay_page_dict);  
    return replay_page_scrape_dict_filename

def save_scrape(replay_page_dict): 
    """Pickle the scraped pages to disk and return the filename used.

    Parameters
    ----------
    replay_page_dict : dict
        The object to pickle. ``scrape_website`` passes a dict mapping each
        replay page address to the response it received for that address.

    Returns
    -------
    The filename the pickle was written to.
    """
    
    # UTC date of this save
    # NOTE(review): presumably used by the redacted filename expression below - confirm
    data_creation_datetime = str(datetime.datetime.now(datetime.timezone.utc).date())
    
    print('\n saving page content \n');
    
    replay_page_scrape_dict_filename = <REDACTED>      
    
    # binary mode is required by pickle
    with open(replay_page_scrape_dict_filename, 'wb') as handle:
        pickle.dump(replay_page_dict, handle)
        
    return replay_page_scrape_dict_filename

def extract_from_website_scrape(website_pickle_filename,  max_records = 0):
   
    data_creation_datetime = <REDACTED>

    with open(website_pickle_filename, 'rb') as handle:
        dict_replays = pickle.load(handle)
    
    if (max_records > 0):
        data_length = max_records;
    else:
        data_length = len(dict_replays);
    
    replay_metadata_from_bs4_scrape_dict = <REDACTED>
    
    count = 0;
    
    status_bar = IntProgress(min=count, max=data_length); 
    
    display(status_bar);
    
    for replay in dict_replays:
        
        replay_id = str(uuid4())

        scraped_from_site = <REDACTED>

        soup = BeautifulSoup(dict_replays[replay].text, 'lxml')

        try:
            player_1_name = <REDACTED>
        except:
            player_1_name =  <REDACTED>

        try:
            player_1_species =  <REDACTED>
        except:
            player_1_species =  <REDACTED>
            
        try:
            player_1_build_order_names =  <REDACTED>
        except:
            player_1_build_order_names =  <REDACTED>
            
        try:
            player_1_build_order =  <REDACTED>
        except:
            player_1_build_order =  <REDACTED>
            
        try:
            player_1_winner_status =  <REDACTED>
        except:
            player_1_winner_status =  <REDACTED>
            
        try:
            player_1_spawn_location_and_league = extract_player_spawn_location_and_league(soup, 'player-1');
            player_1_spawn_location = player_1_spawn_location_and_league[0];
            player_1_league = player_1_spawn_location_and_league[1];
        except:
            player_1_spawn_location =  <REDACTED>
            player_1_league =  <REDACTED>
        
        try:
            player_1_salt_encoding =  <REDACTED>
        except:
            player_1_salt_encoding =  <REDACTED>
        
        try:
            player_2_name =  <REDACTED>
        except:
            player_2_name =  <REDACTED>

        try:
            player_2_species =  <REDACTED>
        except:
            player_2_species =  <REDACTED>

        try:
            player_2_build_order_names =  <REDACTED>
        except:
            player_2_build_order_names =  <REDACTED>            
            
        try:
            player_2_build_order = listify_buildorder(soup, "player-2");
        except:
            player_2_build_order = np.nan; 
            
        try:
            player_2_winner_status =  <REDACTED>
        except:
            player_2_winner_status = np.nan;
            
        try:
            player_2_spawn_location_and_league = extract_player_spawn_location_and_league(soup, 'player-2');
            player_2_spawn_location = player_2_spawn_location_and_league[0];
            player_2_league = player_2_spawn_location_and_league[1];
        except:
            player_2_spawn_location = np.nan;
            player_2_league = np.nan;
        
        try:
            player_2_salt_encoding =  <REDACTED>
        except:
            player_2_salt_encoding = np.nan;
        
        try:
            sc2_map =  <REDACTED>
        except:
            sc2_map =  <REDACTED>
        
        try:
            tags = dictify_tags(soup);
        except:
            tags = np.nan;
        
        try:
            played_on_date_and_time =  <REDACTED>
        except:
            played_on_date_and_time =  <REDACTED>
            
        try:
            game_length =  <REDACTED>
        except:
            game_length =  <REDACTED>
        
        try:
            tournament_name =  <REDACTED>
        except:
            tournament_name =  <REDACTED>

        try:       
            other_games_and_addresses = extract_other_games_and_addresses(soup);
            other_games_in_series = other_games_and_addresses[0];
            other_games_in_series_addresses = other_games_and_addresses[1];
        except:
            other_games_in_series =  <REDACTED>
            other_games_in_series_addresses =  <REDACTED>

        try:
            replay_download_address =  <REDACTED>
        except:
            replay_download_address =  <REDACTED>

        try:
            vod_address =  <REDACTED>
        except:
            vod_address =  <REDACTED>

        try:
            replay_description =  <REDACTED>
        except:
            replay_description = np.nan

            
        replay_metadata_from_bs4_scrape_dict['replay_id'][count] = replay_id
        replay_metadata_from_bs4_scrape_dict['scraped_from_site'][count] = scraped_from_site
        replay_metadata_from_bs4_scrape_dict['player_1_name'][count] = player_1_name                                  
        replay_metadata_from_bs4_scrape_dict['player_1_species'][count] = player_1_species   
        replay_metadata_from_bs4_scrape_dict['player_1_build_order_names'][count] = player_1_build_order_names
        replay_metadata_from_bs4_scrape_dict['player_1_build_order'][count] = player_1_build_order   
        replay_metadata_from_bs4_scrape_dict['player_1_winner_status'][count] = player_1_winner_status   
        replay_metadata_from_bs4_scrape_dict['player_1_spawn_location'][count] = player_1_spawn_location   
        replay_metadata_from_bs4_scrape_dict['player_1_league'][count] = player_1_league    
        replay_metadata_from_bs4_scrape_dict['player_1_salt_encoding'][count] = player_1_salt_encoding   
        replay_metadata_from_bs4_scrape_dict['player_2_name'][count] = player_2_name  
        replay_metadata_from_bs4_scrape_dict['player_2_species'][count] = player_2_species  
        replay_metadata_from_bs4_scrape_dict['player_2_build_order_names'][count] = player_2_build_order_names     
        replay_metadata_from_bs4_scrape_dict['player_2_build_order'][count] = player_2_build_order   
        replay_metadata_from_bs4_scrape_dict['player_2_winner_status'][count] = player_2_winner_status    
        replay_metadata_from_bs4_scrape_dict['player_2_spawn_location'][count] = player_2_spawn_location    
        replay_metadata_from_bs4_scrape_dict['player_2_league'][count] = player_2_league    
        replay_metadata_from_bs4_scrape_dict['player_2_salt_encoding'][count] = player_2_salt_encoding
        replay_metadata_from_bs4_scrape_dict['map'][count] = sc2_map    
        replay_metadata_from_bs4_scrape_dict['tags'][count] = tags   
        replay_metadata_from_bs4_scrape_dict['played_on_date_and_time'][count] = played_on_date_and_time    
        replay_metadata_from_bs4_scrape_dict['game_length'][count] = game_length   
        replay_metadata_from_bs4_scrape_dict['tournament_name'][count] = tournament_name  
        replay_metadata_from_bs4_scrape_dict['other_games_in_series'][count] = other_games_in_series
        replay_metadata_from_bs4_scrape_dict['other_games_in_series_addresses'][count] = other_games_in_series_addresses  
        replay_metadata_from_bs4_scrape_dict['replay_download_address'][count] = replay_download_address   
        replay_metadata_from_bs4_scrape_dict['vod_address'][count] = vod_address  
        replay_metadata_from_bs4_scrape_dict['replay_description'][count] = replay_description    

        count += 1;
        
        status_bar.value = count;
        
        if (max_records > 0) & (count == max_records):
            break;                                                                                                                                                                                  
    
    df_replay_metadata_from_bs4_scrape = pd.DataFrame(replay_metadata_from_bs4_scrape_dict);
    df_replay_metadata_from_bs4_scrape["data_creation_datetime"] = data_creation_datetime
    
    replay_metadata_table_filename =  <REDACTED>
    
    df_replay_metadata_from_bs4_scrape.to_csv(replay_metadata_table_filename, index = False)
    
    return replay_metadata_table_filename

def listify_buildorder(soup, player):
    """Return a player's build order as a list of dicts, one per build-order row.

    Parameters
    ----------
    soup
        Parsed replay page (the caller passes a BeautifulSoup object).
    player
        Player selector; the caller passes "player-2".
        NOTE(review): presumably "player-1" is accepted as well - confirm.

    Returns
    -------
    list of dict
        Each dict has the keys 'inclusive_supply_at_time_of_creation',
        'unit_spawn_time', 'unit_category' and 'unit_type'.
    """
    output_list = [];
    # NOTE(review): presumably the table rows (<tr>) of the player's build-order table - confirm
    build_order_tr =  <REDACTED>
    for tr_index, tr_value in enumerate(build_order_tr):
        
        # supply value listed on this build-order row
        # NOTE(review): the original author was unsure what this value represents - confirm
        inclusive_supply_at_time_of_creation =  <REDACTED>
        
        # spawn time listed on this build-order row
        unit_spawn_time =  <REDACTED>
        
        # unit category listed on this build-order row
        unit_category =  <REDACTED>
        
        # unit type listed on this build-order row
        unit_type =  <REDACTED>
        
        output_list.append({'inclusive_supply_at_time_of_creation' : inclusive_supply_at_time_of_creation,
                            'unit_spawn_time' : unit_spawn_time,
                            'unit_category' : unit_category,
                            'unit_type' : unit_type});
        
    return output_list;

def extract_player_spawn_location_and_league(soup, player):
    
    player_spawn_location_and_league = [];
    
    spawn_location = np.nan
    league = np.nan
    player_index = np.nan;
    
    if (player == 'player-1'):
        player_index = 0;
        
    elif (player == 'player-2'):
        player_index = 1;
        
    else:
        raise ValueError('the only acceptable inputs for the \'player\' variable are \'player-1\' and \'player-2\'');
        
    if  <REDACTED>
        
        spawn_location =  <REDACTED>
        
    elif  <REDACTED>
        
        spawn_location =  <REDACTED>
        
        league =  <REDACTED>
    else:
        ValueError( <REDACTED>);
    
    player_spawn_location_and_league = [spawn_location, league];
    
    return player_spawn_location_and_league 

def dictify_tags(soup):
    """Collect the tags found on a replay page into a dict and return it.

    Parameters
    ----------
    soup
        Parsed replay page (the caller passes a BeautifulSoup object).

    Returns
    -------
    dict
        One or more entries per tag; the exact key/value layout is set by the
        redacted update expression below.
    """
    
    tag_list =  <REDACTED>
    tag_dict = {};
    
    for tag in tag_list:
        # maps any falsy value (e.g. '' or None) to the string 'none', leaves others unchanged
        # NOTE(review): loop-invariant; could be defined once above the loop
        player_specificity = lambda x : 'none' if not x else x;
        tag_dict.update( <REDACTED>) 
        
    return tag_dict;

## 1.1 Using the Webscraper Module:

### The following are the commands, using some of the methods from the module above, that I used to perform the data scrape.

### The first cell imports the scraper.

In [None]:
import custom_scraper

### The next cell is responsible for pulling the html text down from the website and storing the text in a dictionary keyed with the web address.

In [None]:
# Download the replay pages, resuming from the two saved pickles, and keep
# the filename of the pickle that holds the downloaded pages.
scrape_dict_filename = custom_scraper.scrape_website(replay_id_pickle = <REDACTED>,
                                                 replay_metadata_scrape_pickle = <REDACTED>)

### Finally, this call is responsible for parsing the html text and extracting the relevant data. In other words, this step converts the unstructured data to structured data.

In [None]:
# Parse at most 5000 of the saved pages into a CSV table; the call returns the CSV's filename.
replay_metadata_table_filename = custom_scraper.extract_from_website_scrape(<REDACTED>, max_records = 5000)

## 2.0 Cleaning the Scraped Data:

### In this section I go through the short cleaning process I did on the scraped data to get it ready for Tableau.

### First I import the data as well as the necessary Python libraries.

In [None]:
import pandas as pd
import numpy as np
from dateutil.parser import parse

In [2]:
# Load the table of scraped replay metadata.
df_raw = pd.read_csv('./esports-starcraft2-data.csv')

In [3]:
# List the available columns before deciding which ones to keep.
df_raw.columns

Index(['replay_id', 'scraped_from_site', 'player_1_name', 'player_1_species',
       'player_1_build_order_names', 'player_1_build_order',
       'player_1_salt_encoding', 'player_1_winner_status',
       'player_1_spawn_location', 'player_1_league', 'player_2_name',
       'player_2_species', 'player_2_build_order_names',
       'player_2_build_order', 'player_2_salt_encoding',
       'player_2_winner_status', 'player_2_spawn_location', 'player_2_league',
       'tags', 'map', 'played_on_date_and_time', 'game_length',
       'tournament_name', 'other_games_in_series',
       'other_games_in_series_addresses', 'replay_download_address',
       'vod_address', 'replay_description', 'data_creation_datetime'],
      dtype='object')

In [4]:
# Columns that are dropped before the data goes to the dashboard.
columns_not_needed = [
    'replay_id',
    'scraped_from_site',
    'player_1_build_order',
    'player_1_salt_encoding',
    'player_1_spawn_location',
    'player_2_build_order',
    'player_2_salt_encoding',
    'player_2_spawn_location',
    'tags',
    'other_games_in_series',
    'other_games_in_series_addresses',
    'replay_download_address',
    'vod_address',
    'replay_description',
    'data_creation_datetime',
]

# drop() returns a new frame, so df_raw itself is left untouched.
df_cleaned = df_raw.drop(columns = columns_not_needed)

In [5]:
# Preview the first rows of the reduced table.
df_cleaned.head()

Unnamed: 0,player_1_name,player_1_species,player_1_build_order_names,player_1_winner_status,player_1_league,player_2_name,player_2_species,player_2_build_order_names,player_2_winner_status,player_2_league,map,played_on_date_and_time,game_length,tournament_name
0,Replicant,Terran,Cyclone Opening,1,,Couguar,Protoss,"1 Gate Expand, Colossus Rush",0.0,,Map: Hecate LE,"Nov. 12, 2023, 11:16 a.m.",14:13,
1,Spirit,Terran,"Cyclone Opening, Reaper Expand",0,Grandmaster,MaxPax,Protoss,"1 Gate Expand, Blink Stalkers",1.0,Grandmaster,Map: Solaris LE,"Oct. 30, 2023, 3:43 p.m.",9:39,
2,Classic,Protoss,"1 Gate Expand, Oracle Opening",0,,DKZDark,Zerg,Hatch First,1.0,,Map: [ESL] NeoHumanity,"May 20, 2023, 12:10 a.m.",7:31,
3,Lambo,Zerg,Hatch First,0,,Serral,Zerg,Hatch First,1.0,,Map: [ESL] NeoHumanity,"Sept. 6, 2023, 6:34 a.m.",7:08,
4,Cure,Terran,,0,,MaxPax,Protoss,"1 Gate Expand, Phoenix Opening",1.0,Master,Map: Solaris LE,"Oct. 18, 2023, 9:32 a.m.",11:35,


### I want to see which columns are mostly NaN. Each value below is the fraction of rows in which the column is not NaN.

In [6]:
# Fraction of rows that have a tournament name.
int(df_cleaned['tournament_name'].notna().sum()) / len(df_cleaned)

0.8982

In [7]:
# Fraction of rows that have a league for player 1.
int(df_cleaned['player_1_league'].notna().sum()) / len(df_cleaned)

0.1442

In [8]:
# Fraction of rows that have a league for player 2.
int(df_cleaned['player_2_league'].notna().sum()) / len(df_cleaned)

0.154

### I want the dates to be formatted in a way that will work seamlessly with Tableau. I define a custom date parsing function and use `apply` to modify the column "played_on_date_and_time".

In [9]:
def custom_date_parse(x):
    """Parse a scraped "played on" string into a ``datetime``.

    Parameters
    ----------
    x
        The raw cell value, e.g. ``"Nov. 12, 2023, 11:16 a.m."``. Missing
        values arrive from pandas as float NaN.

    Returns
    -------
    datetime.datetime or float
        The parsed date and time, or ``np.nan`` when ``x`` is not a string or
        cannot be parsed.
    """

    # Words that stand in for a clock time, mapped to a form the parser accepts.
    # NOTE(review): the dates look like Django's default formatting, which
    # writes 'midnight' as well as 'noon'; 'midnight' was added on that
    # assumption - confirm against the raw data.
    custom_strings = {'noon' : '12:00 pm', 'midnight' : '12:00 am'}

    # NaN (a float) and any other non-text value cannot be parsed.
    if not isinstance(x, str):
        return np.nan

    for key, replacement in custom_strings.items():
        x = x.replace(key, replacement)

    try:
        return parse(x)
    except (ValueError, OverflowError):
        # dateutil signals an unparseable string with ValueError (ParserError)
        # and an out-of-range date with OverflowError.
        return np.nan

In [10]:
# Replace every raw date string with its parsed value (NaN when parsing fails).
parsed_dates = df_cleaned['played_on_date_and_time'].apply(custom_date_parse)
df_cleaned.loc[:, 'played_on_date_and_time'] = parsed_dates

In [11]:
# Show the table with the parsed dates.
df_cleaned

Unnamed: 0,player_1_name,player_1_species,player_1_build_order_names,player_1_winner_status,player_1_league,player_2_name,player_2_species,player_2_build_order_names,player_2_winner_status,player_2_league,map,played_on_date_and_time,game_length,tournament_name
0,Replicant,Terran,Cyclone Opening,1,,Couguar,Protoss,"1 Gate Expand, Colossus Rush",0.0,,Map: Hecate LE,2023-11-12 11:16:00,14:13,
1,Spirit,Terran,"Cyclone Opening, Reaper Expand",0,Grandmaster,MaxPax,Protoss,"1 Gate Expand, Blink Stalkers",1.0,Grandmaster,Map: Solaris LE,2023-10-30 15:43:00,9:39,
2,Classic,Protoss,"1 Gate Expand, Oracle Opening",0,,DKZDark,Zerg,Hatch First,1.0,,Map: [ESL] NeoHumanity,2023-05-20 00:10:00,7:31,
3,Lambo,Zerg,Hatch First,0,,Serral,Zerg,Hatch First,1.0,,Map: [ESL] NeoHumanity,2023-09-06 06:34:00,7:08,
4,Cure,Terran,,0,,MaxPax,Protoss,"1 Gate Expand, Phoenix Opening",1.0,Master,Map: Solaris LE,2023-10-18 09:32:00,11:35,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,wanted,Zerg,Hatch First,0,,iGCoffeeMs,Terran,Reaper Expand,1.0,,Map: 锈化山巅-天梯版,2021-10-07 07:54:00,14:17,Event: DreamHack SC2 Masters 2021 Winter
4996,wanted,Zerg,Hatch First,0,,iGCoffeeMs,Terran,"Liberator Opening, Reaper Expand",1.0,,Map: 紫晶浪漫-天梯版,2021-10-07 07:38:00,10:44,Event: DreamHack SC2 Masters 2021 Winter
4997,iGCoffeeMs,Terran,Reaper Expand,0,,wanted,Zerg,Pool First,1.0,,Map: 大气2000-天梯版,2021-10-07 07:25:00,20:40,Event: DreamHack SC2 Masters 2021 Winter
4998,wanted,Zerg,"Hatch First, Lurkers",0,,iGMacSed,Protoss,"1 Gate Expand, Oracle Opening, Carriers",1.0,,Map: 世界主宰-天梯版,2021-10-06 09:26:00,17:35,Event: DreamHack SC2 Masters 2021 Winter


In [12]:
# Fraction of rows whose date was parsed successfully.
int(df_cleaned['played_on_date_and_time'].notna().sum()) / len(df_cleaned)

0.9996

In [13]:
# Remove the few rows whose date could not be parsed.
df_cleaned.dropna(subset = ['played_on_date_and_time'], inplace = True)

In [14]:
# Show the table after removing the rows without a date.
df_cleaned

Unnamed: 0,player_1_name,player_1_species,player_1_build_order_names,player_1_winner_status,player_1_league,player_2_name,player_2_species,player_2_build_order_names,player_2_winner_status,player_2_league,map,played_on_date_and_time,game_length,tournament_name
0,Replicant,Terran,Cyclone Opening,1,,Couguar,Protoss,"1 Gate Expand, Colossus Rush",0.0,,Map: Hecate LE,2023-11-12 11:16:00,14:13,
1,Spirit,Terran,"Cyclone Opening, Reaper Expand",0,Grandmaster,MaxPax,Protoss,"1 Gate Expand, Blink Stalkers",1.0,Grandmaster,Map: Solaris LE,2023-10-30 15:43:00,9:39,
2,Classic,Protoss,"1 Gate Expand, Oracle Opening",0,,DKZDark,Zerg,Hatch First,1.0,,Map: [ESL] NeoHumanity,2023-05-20 00:10:00,7:31,
3,Lambo,Zerg,Hatch First,0,,Serral,Zerg,Hatch First,1.0,,Map: [ESL] NeoHumanity,2023-09-06 06:34:00,7:08,
4,Cure,Terran,,0,,MaxPax,Protoss,"1 Gate Expand, Phoenix Opening",1.0,Master,Map: Solaris LE,2023-10-18 09:32:00,11:35,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,wanted,Zerg,Hatch First,0,,iGCoffeeMs,Terran,Reaper Expand,1.0,,Map: 锈化山巅-天梯版,2021-10-07 07:54:00,14:17,Event: DreamHack SC2 Masters 2021 Winter
4996,wanted,Zerg,Hatch First,0,,iGCoffeeMs,Terran,"Liberator Opening, Reaper Expand",1.0,,Map: 紫晶浪漫-天梯版,2021-10-07 07:38:00,10:44,Event: DreamHack SC2 Masters 2021 Winter
4997,iGCoffeeMs,Terran,Reaper Expand,0,,wanted,Zerg,Pool First,1.0,,Map: 大气2000-天梯版,2021-10-07 07:25:00,20:40,Event: DreamHack SC2 Masters 2021 Winter
4998,wanted,Zerg,"Hatch First, Lurkers",0,,iGMacSed,Protoss,"1 Gate Expand, Oracle Opening, Carriers",1.0,,Map: 世界主宰-天梯版,2021-10-06 09:26:00,17:35,Event: DreamHack SC2 Masters 2021 Winter


### I want the game duration to be given in decimal form, in units of minutes. I write a custom function to do this and then use `apply` to apply it to the column.

In [15]:
def custom_game_duration_format(x):
    """Convert a game-length string to decimal minutes, rounded to 0.1.

    Short clock strings such as ``"14:13"`` or ``"1:02:03"`` are left-padded
    against the template ``"00:00:00"`` so that pandas reads them as
    hours:minutes:seconds. Surrounding whitespace is ignored. The original
    implementation padded by replacing a space inside ``x``, so it only worked
    when ``x`` contained exactly one space, and it failed for strings of
    8 or more characters.

    Parameters
    ----------
    x
        The raw cell value, e.g. ``" 14:13"``.

    Returns
    -------
    float
        The duration in minutes, rounded to one decimal place, or ``np.nan``
        when ``x`` is not a string or is blank.

    Raises
    ------
    ValueError
        If the string is not a duration pandas can parse.
    """
    # Missing values arrive from pandas as float NaN.
    if not isinstance(x, str):
        return np.nan

    y = x.strip()
    if not y:
        return np.nan

    template = '00:00:00'
    if len(y) < len(template) and set(y) <= set('0123456789:'):
        # e.g. '14:13' -> '00:14:13', '7' -> '00:00:07'
        z = template[:len(template) - len(y)] + y
    else:
        # already a full clock string, or another format pandas may understand
        z = y

    time = pd.to_timedelta(z)
    time_minutes = time.total_seconds()/60

    return round(10*time_minutes)/10

In [16]:
# Replace every game-length string with its duration in decimal minutes.
durations_in_minutes = df_cleaned['game_length'].apply(custom_game_duration_format)
df_cleaned.loc[:, 'game_length'] = durations_in_minutes

In [17]:
# Put the unit in the column name now that the values are decimal minutes.
duration_column_rename = {'game_length' : 'game_length_in_minutes'}
df_cleaned = df_cleaned.rename(columns = duration_column_rename)

In [18]:
# Show the table with the converted game lengths.
df_cleaned

Unnamed: 0,player_1_name,player_1_species,player_1_build_order_names,player_1_winner_status,player_1_league,player_2_name,player_2_species,player_2_build_order_names,player_2_winner_status,player_2_league,map,played_on_date_and_time,game_length_in_minutes,tournament_name
0,Replicant,Terran,Cyclone Opening,1,,Couguar,Protoss,"1 Gate Expand, Colossus Rush",0.0,,Map: Hecate LE,2023-11-12 11:16:00,14.2,
1,Spirit,Terran,"Cyclone Opening, Reaper Expand",0,Grandmaster,MaxPax,Protoss,"1 Gate Expand, Blink Stalkers",1.0,Grandmaster,Map: Solaris LE,2023-10-30 15:43:00,9.6,
2,Classic,Protoss,"1 Gate Expand, Oracle Opening",0,,DKZDark,Zerg,Hatch First,1.0,,Map: [ESL] NeoHumanity,2023-05-20 00:10:00,7.5,
3,Lambo,Zerg,Hatch First,0,,Serral,Zerg,Hatch First,1.0,,Map: [ESL] NeoHumanity,2023-09-06 06:34:00,7.1,
4,Cure,Terran,,0,,MaxPax,Protoss,"1 Gate Expand, Phoenix Opening",1.0,Master,Map: Solaris LE,2023-10-18 09:32:00,11.6,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,wanted,Zerg,Hatch First,0,,iGCoffeeMs,Terran,Reaper Expand,1.0,,Map: 锈化山巅-天梯版,2021-10-07 07:54:00,14.3,Event: DreamHack SC2 Masters 2021 Winter
4996,wanted,Zerg,Hatch First,0,,iGCoffeeMs,Terran,"Liberator Opening, Reaper Expand",1.0,,Map: 紫晶浪漫-天梯版,2021-10-07 07:38:00,10.7,Event: DreamHack SC2 Masters 2021 Winter
4997,iGCoffeeMs,Terran,Reaper Expand,0,,wanted,Zerg,Pool First,1.0,,Map: 大气2000-天梯版,2021-10-07 07:25:00,20.7,Event: DreamHack SC2 Masters 2021 Winter
4998,wanted,Zerg,"Hatch First, Lurkers",0,,iGMacSed,Protoss,"1 Gate Expand, Oracle Opening, Carriers",1.0,,Map: 世界主宰-天梯版,2021-10-06 09:26:00,17.6,Event: DreamHack SC2 Masters 2021 Winter


### Finally, I want the winner status for the second player to be an integer; it was imported as a float, most likely because the raw column contained missing values. Then I export the first 1,000 rows to a CSV file to be used in Tableau.

In [19]:
# Convert the float flags (0.0 / 1.0) to integers; int() raises if a NaN is still present.
df_cleaned['player_2_winner_status'] = df_cleaned['player_2_winner_status'].apply(int)

In [None]:
# Show the final cleaned table.
df_cleaned

In [None]:
# Export the first 1000 rows for the Tableau dashboard.
# FIX: the original referenced `df_cleanedi`, an undefined name (NameError).
df_cleaned.iloc[0:1000, : ].to_csv('./esports-starcraft2-dashboard-data.csv')