### Scrape GEMP
* Implement proof-of-concept scraper

In [1]:
## 0. Prerequisites
from selenium.webdriver import ChromeOptions
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import re
import time
import bs4 as bs
import hidden
# used to write completed games to csv file
from csv import DictWriter


In [2]:
class GempTable:
    """One game table scraped from the GEMP game hall.

    Stores the values of a single table row, the index of the browser window
    opened for the game (``window_id``, ``None`` until a tab is opened) and a
    ``messages`` list reserved for the game's messages.
    """

    # Column order of the rows appended to gemp_completed_games.csv.
    CSV_HEADERS = ['Recorded',
                   'TableID',
                   'Format',
                   'Tournament',
                   'Status',
                   'LS_Player',
                   'LS_Deck',
                   'DS_Player',
                   'DS_Deck',
                   'GameLink',
                   'Winner']

    # Placeholder strings stored in ``gamelink`` when a row carried no real
    # link; they are not URLs and must not be opened in a browser tab.
    PLACEHOLDER_LINKS = ('NA', 'unknown')

    def __init__(self, table):
        """Copy the row values out of *table*.

        Args:
            table: dict with the keys 'Recorded', 'TableID', 'Format',
                'Tournament', 'Status', 'LS_Player', 'LS_Deck', 'DS_Player',
                'DS_Deck', 'GameLink' and 'Winner'. A missing key raises
                KeyError.
        """
        self.recorded = table['Recorded']
        self.table_id = table['TableID']
        self.format = table['Format']
        self.tournament = table['Tournament']
        self.status = table['Status']
        self.ls_player = table['LS_Player']
        self.ls_deck = table['LS_Deck']
        self.ds_player = table['DS_Player']
        self.ds_deck = table['DS_Deck']
        self.gamelink = table['GameLink']
        self.winner = table['Winner']
        self.window_id = None
        self.messages = []

    def record_gemp_table(self):
        """Append this table to ``gemp_completed_games.csv`` if it qualifies.

        A row is written only when the status is 'Finished', the format is
        'Open' and ``recorded`` is still ``0``. After a successful write,
        ``recorded`` holds the local timestamp that was stored in the row.

        Returns:
            bool: True when a row was written, False when the table did not
            qualify and nothing was written.
        """
        is_recordable = (self.status == 'Finished'
                         and self.format == 'Open'
                         and self.recorded == 0)
        if not is_recordable:
            return False

        timestamp = time.strftime('%Y-%m-%d %H:%M:%S%z (%Z)')
        row = {'Recorded': timestamp,
               'TableID': self.table_id,
               'Format': self.format,
               'Tournament': self.tournament,
               'Status': self.status,
               'LS_Player': self.ls_player,
               'LS_Deck': self.ls_deck,
               'DS_Player': self.ds_player,
               'DS_Deck': self.ds_deck,
               'GameLink': self.gamelink,
               'Winner': self.winner,
               }
        # The with-block closes the file; no explicit close() is needed.
        with open('gemp_completed_games.csv', 'a', newline='') as f:
            DictWriter(f, fieldnames=self.CSV_HEADERS).writerow(row)
        # Only mark the table as recorded once the row is safely on disk.
        self.recorded = timestamp
        if DEBUG: print(f"recorded {row['TableID']}")
        return True

    def record_gemp_table_messages(self):
        """Placeholder: persisting collected messages is not implemented."""
        pass

    def append_gemp_table_messages(self, driver):
        """Switch *driver* to this table's game window.

        Collecting the messages themselves is not implemented yet. Does
        nothing when no window has been opened for this table.

        Args:
            driver: Selenium WebDriver controlling the browser.
        """
        if self.window_id is None:
            return
        # switch_to.window() replaces the removed switch_to_window().
        driver.switch_to.window(driver.window_handles[self.window_id])
        # append messages from current gemp-table game window
        # what is the object that contains these messages
        # are these messages new? do we need to append all of them?
        # how to best determine which messages are to be appended?

    def open_gemp_table_tab(self, driver):
        """Open this table's game link in a new browser window.

        On success ``window_id`` is set to the index of the last entry in
        ``driver.window_handles`` (assumes the new window is appended last —
        TODO confirm for the browser in use). Placeholder links are skipped
        and leave ``window_id`` untouched.

        Args:
            driver: Selenium WebDriver controlling the browser.
        """
        if self.gamelink in self.PLACEHOLDER_LINKS:
            return
        # Pass the URL and window name as script arguments instead of
        # formatting them into the JavaScript source: the URL was previously
        # inserted unquoted, which is a JavaScript syntax error.
        driver.execute_script('window.open(arguments[0], arguments[1]);',
                              self.gamelink, self.table_id)
        self.window_id = len(driver.window_handles) - 1

    

In [3]:
def parse_players_column(players_column):
    """Extract both players and their decks from a table row's players cell.

    The cell text is matched against
    ``"<player> (DARK: <deck>), <player> (LIGHT: <deck>)"`` first and, if that
    fails, against the same layout with the LIGHT side listed first.

    Args:
        players_column: object exposing the cell text as ``.text``.

    Returns:
        dict: keys 'DS_Player', 'DS_Deck', 'LS_Player' and 'LS_Deck'. When
        the text matches neither layout every value is 'unknown' instead of
        the lookup failing with IndexError.
    """
    text = players_column.text
    # Raw strings: '\s' and '\(' are invalid escapes in a normal literal.
    dark_first = re.findall(
        r'\s*(.*)\s\((?:DARK:\s)(.*)\)\,\s(.*)\s\((?:LIGHT:\s)(.*)\)', text)
    if dark_first:
        ds_player, ds_deck, ls_player, ls_deck = dark_first[0]
    else:
        light_first = re.findall(
            r'\s*(.*)\s\((?:LIGHT:\s)(.*)\)\,\s(.*)\s\((?:DARK:\s)(.*)\)', text)
        if light_first:
            ls_player, ls_deck, ds_player, ds_deck = light_first[0]
        else:
            ds_player = ds_deck = ls_player = ls_deck = 'unknown'
    return {
        'DS_Player': ds_player,
        'DS_Deck': ds_deck,
        'LS_Player': ls_player,
        'LS_Deck': ls_deck,
    }
        

def compose_gemp_table(gemp_table_entry, is_new_gemp_table):
    """Build a GempTable from one game-hall table row.

    The row's ``<td>`` cells are read by position: 0 = format, 1 = tournament,
    2 = status, 3 = players, 4 = winner (only read for finished tables). The
    row's ``class`` attribute is used as the table id.

    Args:
        gemp_table_entry: Selenium element for the table row.
        is_new_gemp_table: True when the caller is not tracking this table id
            yet.

    Returns:
        A GempTable for rows whose format is 'Open' and whose status is
        neither 'Preparation' nor 'Cancelled'; ``False`` for every other row.
    """
    cells = gemp_table_entry.find_elements(By.TAG_NAME, 'td')
    table_id = gemp_table_entry.get_attribute("class")

    table = {
        'Recorded': 0,
        'TableID': table_id,
        'Format': cells[0].text,
        'Tournament': cells[1].text,
        'Status': cells[2].text,
    }

    status = table['Status']
    is_open_format = table['Format'] == 'Open'
    is_finished = status == 'Finished'
    # Any status that is not one of the known non-playing states (or blank)
    # is treated as a game in progress.
    is_playing = status not in ('Preparation', 'Finished', 'Cancelled', '')
    is_relevant = status not in ('Preparation', 'Cancelled')

    if not (is_open_format and is_relevant):
        return False

    if is_new_gemp_table and is_playing:
        if DEBUG: print("adding Playing Table: "+table_id)
        table.update(parse_players_column(cells[3]))
        links = gemp_table_entry.find_elements(By.TAG_NAME, 'a')
        table['GameLink'] = links[0].get_attribute("href") if links else 'NA'
        table['Winner'] = 'NA'
    elif is_finished:
        if DEBUG: print("adding Finished Table: "+table_id)
        table.update(parse_players_column(cells[3]))
        # No link is read from a finished row. Always store the placeholder:
        # it used to be set only for new tables, so an already-known table
        # reached GempTable() without 'GameLink' and raised KeyError.
        table['GameLink'] = 'NA'
        table['Winner'] = cells[4].text
    else:
        # Blank status, or an already-known table that is still playing.
        if DEBUG: print(f"adding Unknown Table: {table_id} with {table['Status']} status")
        table['DS_Player'] = 'unknown'
        table['DS_Deck'] = 'unknown'
        table['LS_Player'] = 'unknown'
        table['LS_Deck'] = 'unknown'
        table['GameLink'] = 'unknown'
        table['Winner'] = 'unknown'

    return GempTable(table)
        


def get_gemp_table_entries(driver, xpath):
    """Return the ``<tr>`` elements inside the element located by *xpath*.

    Args:
        driver: Selenium WebDriver showing the game hall page.
        xpath: XPath of the table body to read.

    Returns:
        list: the row elements found under that table body.
    """
    # find_element(By...) replaces the removed find_element_by_* helpers and
    # is also available in older Selenium releases.
    table_body = driver.find_element(By.XPATH, xpath)
    return table_body.find_elements(By.TAG_NAME, 'tr')


def get_gemp_tables(driver, gemp_tables):
    """Scan the playing and finished tables and update *gemp_tables*.

    Args:
        driver: Selenium WebDriver showing the game hall page.
        gemp_tables: dict holding the lists 'active', 'finished' and
            'recorded' (table ids) plus one ``table_id -> GempTable`` entry
            per tracked table. It is modified in place.

    Returns:
        dict: the same *gemp_tables* object.
    """
    # <The Dudley Boyz: GET THE TABLE(S)! />
    playing_xpath = '/html/body/div[1]/div[1]/div[3]/div[1]/div[4]/table/tbody'
    finished_xpath = '//*[@id="hall"]/div[1]/div[6]/table/tbody'

    playing_entries = get_gemp_table_entries(driver, playing_xpath)
    finished_entries = get_gemp_table_entries(driver, finished_xpath)

    ## Add rows from Playing Tables
    # The first row of each table body is skipped (presumably a header row —
    # TODO confirm against the page markup).
    for entry in playing_entries[1:]:
        table_id = entry.get_attribute("class")
        if table_id in gemp_tables or table_id in gemp_tables['recorded']:
            continue
        relevant_table = compose_gemp_table(entry, True)
        if relevant_table is not False:
            gemp_tables[table_id] = relevant_table
            gemp_tables['active'].append(table_id)

    # check length of tables dictionary
    if DEBUG: print(f'length of gemp_tables after Playing Tables: {len(gemp_tables)}')

    ## Update tables with rows from Finished Tables
    for entry in finished_entries[1:]:
        table_id = entry.get_attribute("class")
        if table_id in gemp_tables['recorded']:
            continue
        is_new_gemp_table = table_id not in gemp_tables
        relevant_table = compose_gemp_table(entry, is_new_gemp_table)
        if relevant_table is False:
            continue
        if is_new_gemp_table:
            gemp_tables[table_id] = relevant_table
        else:
            # Keep the tracked object (it owns the game link and window) but
            # refresh the fields that change when a game ends; otherwise its
            # stale status stops it from ever being written to the CSV.
            tracked = gemp_tables[table_id]
            tracked.status = relevant_table.status
            tracked.winner = relevant_table.winner
            # 'active' is a list of ids: remove by value. list.pop() takes
            # an index and raised TypeError for a table id.
            if table_id in gemp_tables['active']:
                gemp_tables['active'].remove(table_id)
        gemp_tables['finished'].append(table_id)

    # check length of tables dictionary
    if DEBUG: print(f'length of tables after Finished Tables: {len(gemp_tables)}')

    return gemp_tables
    

In [4]:
def login_gemp(driver):
    """Open the login page and submit the credentials.

    Reads the 'login' URL, 'user' and 'pass' entries of the module-level
    ``secrets`` mapping.

    Args:
        driver: Selenium WebDriver to log in with.

    Returns:
        The same *driver*, after the login form has been submitted.
    """
    driver.get(secrets["login"])
    # find_element(By.ID, ...) replaces the removed find_element_by_id().
    driver.find_element(By.ID, "login").send_keys(secrets["user"])
    password_field = driver.find_element(By.ID, "password")
    password_field.send_keys(secrets["pass"])
    # Pressing RETURN in the password field submits the form.
    password_field.send_keys(Keys.RETURN)
    return driver



def listen_for_gemp_tables(driver):
    """Poll the game hall, recording finished games and visiting active ones.

    Each cycle rescans the hall via get_gemp_tables(), records every newly
    finished table and drops it from tracking, makes sure every active table
    has a browser window and visits it, then switches back to the game hall
    window and sleeps for ``sleep_time`` seconds.

    ``listening`` is never cleared inside the loop, so in practice the
    function runs until it is interrupted.

    Args:
        driver: logged-in Selenium WebDriver whose first window shows the
            game hall.

    Returns:
        int: 0 once the loop ends.
    """
    listening = True
    gemp_tables = {'active': [], 'finished': [], 'recorded': []}
    sleep_time = 120  # length of time between cycles
    gamehall_window = driver.window_handles[0]

    while listening:
        # 'finished' only describes the current cycle.
        gemp_tables['finished'] = []
        gemp_tables = get_gemp_tables(driver, gemp_tables)

        for table_id in gemp_tables['finished']:
            if table_id in gemp_tables['recorded']:
                continue
            gemp_tables[table_id].record_gemp_table()
            gemp_tables['recorded'].append(table_id)
            gemp_tables.pop(table_id)
            # An id without a table entry must not stay in 'active', or the
            # loop below would fail looking it up.
            if table_id in gemp_tables['active']:
                gemp_tables['active'].remove(table_id)

        for table_id in gemp_tables['active']:
            table = gemp_tables[table_id]
            if table.window_id is None:
                # no window yet: open one before reading messages
                table.open_gemp_table_tab(driver)
            table.append_gemp_table_messages(driver)

        if listening:
            # switch_to.window() replaces the removed switch_to_window().
            driver.switch_to.window(gamehall_window)
            if DEBUG: print(f"sleeping for {sleep_time} seconds...")
            time.sleep(sleep_time)
    return 0


def access_gemp(site_URL):
    """Start Chrome, log in, open *site_URL* and start listening for tables.

    Args:
        site_URL: URL of the page to open after logging in.

    Returns:
        str: the page source of the driver's current window, read after
        listen_for_gemp_tables() returns. The browser is left open.
    """
    # Local import: only this function needs the exception type.
    from selenium.common.exceptions import WebDriverException

    chrome_options = webdriver.ChromeOptions()
#     chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=chrome_options)
    driver = login_gemp(driver)
    driver.get(site_URL)
    try:
        # NOTE(review): presumably toggles a section of the hall page so its
        # table is rendered — confirm what this element is.
        driver.find_element(
            By.XPATH, '/html/body/div[1]/div[1]/div[3]/div[1]/div[5]/div').click()
    except WebDriverException:
        # Best effort: carry on when the element is missing or not clickable,
        # without also swallowing KeyboardInterrupt/SystemExit as a bare
        # ``except`` would.
        pass
    # Short pause before the first scan of the page.
    time.sleep(2)
    listen_for_gemp_tables(driver)

    source = driver.page_source
#     driver.quit()
    return source
  

In [5]:
# Entry point of the notebook: choose the credential set, then run the scraper.
DEBUG = True   # read as a global by the functions above to enable progress output
DEV = False    # True selects the development credentials
secrets = hidden.dev_secrets() if DEV else hidden.secrets()
if DEBUG:
    print('Debug is ON')
site_URL = secrets["host"]
source = access_gemp(site_URL)

Debug is ON
adding Unknown Table: table12575 with  status
adding Playing Table: table12587
adding Playing Table: table12599
adding Playing Table: table12598
adding Playing Table: table12601
adding Playing Table: table12602
length of gemp_tables after Playing Tables: 9
adding Finished Table: table12597
adding Finished Table: table12573
length of tables after Finished Tables: 11
recorded table12597
recorded table12573
Parent window title: Game of Gemp-Swccg
sleeping for 120 seconds...
adding Playing Table: table12603
length of gemp_tables after Playing Tables: 10
length of tables after Finished Tables: 10
Parent window title: Game of Gemp-Swccg
sleeping for 120 seconds...


KeyboardInterrupt: 