In [80]:
# coding: utf8
import pandas as pd
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import unicodedata
from multiprocessing import Process
import datetime

    
class ScrapeATPStats(object):
    
    from AcquireBetBrainATPUpcomingGames import AcquireBetBrainATPUpcomingGames
    
    def __init__(self, split_list, for_current_season):
        self.split_list = split_list
        self.players_game_data = []
        self.players_stat_data = []
        self.for_current_season = for_current_season
        self.current_datetime = datetime.datetime.now()
        self.beginning_month_to_scrape_current_season_stats = 3 # We will start scraping current season stats for predictions from March
        
    def __call__(self):
        print "Scraping data from " + str(self.split_list[0]) + " to " + str(self.split_list[-1])
        for self.i in self.split_list:
            self.go_on_atp_ranking_page()
            self.go_on_player_profile_page(self.browser)
            self.go_on_player_statistics_page(self.browser)
            self.scrape_player_stat_data(self.browser, self.players_stat_data,\
                                         self.for_current_season, self.current_datetime, self.beginning_month_to_scrape_current_season_stats)
            self.go_on_player_game_data_page(self.browser, self.for_current_season)
            self.scrape_player_game_data(self.browser, self.players_game_data, self.for_current_season)
            print "---Next player---"

            self.data_to_csv(self.players_stat_data, self.players_game_data, self.split_list)
            
        if self.for_current_season == "Yes":
            print "---Scraping upco games from betbrain for merging---"
            j = self.AcquireBetBrainATPUpcomingGames()
            j()
            
            print "---Done---\n"
            
        else:
            print "---Done---\n"
            
            
    # --- Going on the core tennis website ---
    def go_on_atp_ranking_page(self):
        """Open coretennis.net in a new Chrome session and go to the ATP ranking page.

        The home page load is retried in a brand-new browser for as long as it
        times out. The resulting driver is stored in ``self.browser``.

        Raises whatever Selenium raises for non-timeout failures (for example
        ``NoSuchElementException`` when neither ranking-link locator matches);
        the browser is closed before such an error propagates.
        """
        url = "http://www.coretennis.net/"
        delay = 15  # page-load timeout, in seconds
        # Raw string so the backslashes of the Windows path are never read as
        # escape sequences ("\U..." is a syntax error on Python 3).
        chromedriver_path = r"C:\Users\jbadiabo\PycharmProjects\Sibyl\chromedriver.exe"

        while True:
            browser = webdriver.Chrome(chromedriver_path)
            try:
                browser.set_page_load_timeout(delay)
                browser.get(url)
                # ---Going on the ATP Ranking page---
                browser.maximize_window()

                try:
                    atp_ranking_button = browser.find_element_by_xpath(".//div[@class='menu1']/a[4]")
                except NoSuchElementException:
                    # Fall back to locating the link by its visible text.
                    atp_ranking_button = browser.find_element_by_partial_link_text("Tennis Rankings")
                atp_ranking_url = atp_ranking_button.get_attribute("href")
            except TimeoutException:
                print("Timeout going on atp ranking page, retrying")
                browser.quit()
                continue
            except Exception:
                # Do not leave an orphaned Chrome/chromedriver process behind
                # when the error is not a retryable timeout.
                browser.quit()
                raise
            else:
                break

        try:
            atp_ranking_button.click()
            browser.get(browser.current_url)
        except TimeoutException:
            # The click timed out: load the link target directly, retrying in
            # the same browser until it succeeds.
            while True:
                try:
                    print("Timeout opening atp ranking page, keeping the page anyway..")
                    print("opening: " + atp_ranking_url)
                    browser.get(atp_ranking_url)
                except TimeoutException:
                    print("Timeout getting atp ranking page retrying..")
                    continue
                else:
                    break

        self.browser = browser
        
    def go_on_player_profile_page(self, browser):
        """Open the profile page of the player at row ``self.i`` of the ranking table.

        browser -- driver currently showing the ranking page.

        If the navigation times out, the browser is closed and the profile URL
        is reloaded in a new Chrome session, with the page-load timeout raised
        by 10 seconds per attempt, until it succeeds. The driver in use at the
        end is stored in ``self.browser``.

        Raises ``TimeoutException`` if the timeout happened before the profile
        URL could be read, since there is then nothing to retry against.
        """
        # ---Going on player profile---
        delay = 15  # page-load timeout, in seconds
        # Raw string so the backslashes of the Windows path are never read as
        # escape sequences ("\U..." is a syntax error on Python 3).
        chromedriver_path = r"C:\Users\jbadiabo\PycharmProjects\Sibyl\chromedriver.exe"
        player_profile_url = None

        try:
            try:
                table = browser.find_element_by_id('rtable2')
            except NoSuchElementException:
                table = browser.find_elements_by_tag_name("table")[0]

            body = table.find_element_by_tag_name('tbody')
            # The first row is skipped (presumably the header row -- confirm).
            table_rows = body.find_elements_by_tag_name('tr')[1:]
            player_name_cell = table_rows[self.i].find_elements_by_tag_name('td')[1]
            player_profile_link = player_name_cell.find_element_by_tag_name('a')
            player_profile_url = player_profile_link.get_attribute('href')
            player_profile_link.click()
            browser.get(browser.current_url)
        except TimeoutException:
            if player_profile_url is None:
                # Previously this path crashed later with an unbound-variable
                # error after already replacing the browser.
                raise
            while True:
                try:
                    print("Timeout opening player page, retrying, should open player profile page..")
                    browser.quit()
                    browser = webdriver.Chrome(chromedriver_path)
                    delay += 10
                    browser.set_page_load_timeout(delay)
                    print("opening: " + player_profile_url)
                    browser.get(player_profile_url)
                    browser.maximize_window()
                except TimeoutException:
                    print("Timeout on recreeating the driver, retrying..")
                    continue
                else:
                    break

        self.browser = browser

    # ---Going on player statistics profile---
    def go_on_player_statistics_page(self, browser):
        """Follow the "Statistics" link of the player profile page.

        browser -- driver currently showing a player profile page.

        If the navigation times out, the browser is closed and the statistics
        URL is reloaded in a new Chrome session, with the page-load timeout
        raised by 10 seconds per attempt, until it succeeds. The driver in use
        at the end is stored in ``self.browser``.

        Raises ``TimeoutException`` if the timeout happened before the
        statistics URL could be read, since there is then nothing to retry
        against.
        """
        delay = 15  # page-load timeout, in seconds
        # Raw string so the backslashes of the Windows path are never read as
        # escape sequences ("\U..." is a syntax error on Python 3).
        chromedriver_path = r"C:\Users\jbadiabo\PycharmProjects\Sibyl\chromedriver.exe"
        player_statistics_url = None

        try:
            try:
                player_statistics_link = browser.find_element_by_link_text("Statistics")
            except NoSuchElementException:
                # Fall back to the link's position in the profile navigation.
                player_statistics_link = browser.find_element_by_xpath(".//div[@class='ppNav']/a[4]")
            player_statistics_url = player_statistics_link.get_attribute('href')

            player_statistics_link.click()
            browser.get(browser.current_url)
        except TimeoutException:
            if player_statistics_url is None:
                # Previously this path crashed later with an unbound-variable
                # error after already replacing the browser.
                raise
            while True:
                try:
                    print("Timeout going on stats page, retrying, should open player stat profile..")
                    browser.quit()
                    browser = webdriver.Chrome(chromedriver_path)
                    delay += 10
                    browser.set_page_load_timeout(delay)
                    print("opening: " + player_statistics_url)
                    browser.get(player_statistics_url)
                    browser.maximize_window()
                except TimeoutException:
                    continue
                else:
                    break

        self.browser = browser
        
    # ---Scraping player stats---
    def scrape_player_stat_data(self, browser, players_stat_data, for_current_season, current_datetime, beginning_month_to_scrape_current_season_stats):
        """Append the per-season footer rows of the statistics page to ``players_stat_data``.

        browser -- driver currently showing a player statistics page.
        players_stat_data -- list extended in place with one
            ``[player_name, year, <footer cell texts>...]`` row per kept season.
        for_current_season -- 'No' keeps every season; any other value keeps a
            single season (see below).
        current_datetime -- datetime whose month/year select that season.
        beginning_month_to_scrape_current_season_stats -- first month from
            which the current year is used; before it, the previous year is.

        Tables without a ``tfoot`` row are skipped. The extended list is also
        stored in ``self.players_stat_data`` and the driver in ``self.browser``.
        """
        main_div = browser.find_element_by_id("colMainContent1b")

        player_name_header = main_div.find_element_by_class_name("ppHeader")
        player_name = player_name_header.find_element_by_tag_name('h1').text.encode("ascii", "ignore")
        # Drop the last two space-separated tokens of the header text.
        player_name = player_name.rsplit(' ', 2)[0]

        table_titles = [x.text.encode('ascii', 'ignore') for x in main_div.find_elements_by_tag_name("h2")]
        stat_tables = main_div.find_elements_by_class_name("sTable")

        # Which season to keep: None means "all of them" (training data).
        if for_current_season == 'No':
            wanted_year = None
        elif current_datetime.month >= beginning_month_to_scrape_current_season_stats:
            # From the start month on, the current season is used for predictions.
            wanted_year = str(current_datetime.year)
        else:
            # Earlier in the year, the previous season is used instead.
            wanted_year = str(current_datetime.year - 1)

        year_player_data = []
        # Titles and tables are paired positionally; the year is the first
        # word of the title.
        for stat_table, table_title in zip(stat_tables, table_titles):
            year = table_title.split(" ")[0]
            if wanted_year is not None and year != wanted_year:
                continue

            try:
                table_footer = stat_table.find_element_by_tag_name("tfoot")
                table_footer_row = table_footer.find_element_by_tag_name("tr")
                footer_cells = table_footer_row.find_elements_by_tag_name("td")
                data = [player_name, year]
                data.extend(cell.text.encode('ascii', 'ignore') for cell in footer_cells)
            except NoSuchElementException:
                continue

            year_player_data.append(data)

        players_stat_data += year_player_data

        self.players_stat_data = players_stat_data

        self.browser = browser
        
    # ---Going on the player game data---
    def go_on_player_game_data_page(self, browser, for_current_season=None):
        """Follow the "Results" link of the player page navigation.

        browser -- driver currently showing a player page.
        for_current_season -- accepted and ignored; ``__call__`` passes it
            positionally, which used to raise a TypeError.

        If the navigation times out, the browser is closed and the results URL
        is reloaded in a new Chrome session, with the page-load timeout raised
        by 10 seconds per attempt, until it succeeds. The driver in use at the
        end is stored in ``self.browser``.

        Raises ``TimeoutException`` if the timeout happened before the results
        URL could be read, since there is then nothing to retry against.
        """
        delay = 15  # page-load timeout, in seconds
        # Raw string so the backslashes of the Windows path are never read as
        # escape sequences ("\U..." is a syntax error on Python 3).
        chromedriver_path = r"C:\Users\jbadiabo\PycharmProjects\Sibyl\chromedriver.exe"
        results_url = None

        try:
            main_div = browser.find_element_by_id("colMainContent1b")
            nav_div = main_div.find_element_by_class_name("ppNav")

            try:
                results_button = nav_div.find_element_by_partial_link_text("Results")
            except NoSuchElementException:
                # nav_div already IS the ppNav element, so the third link is
                # looked up relative to it; the former
                # ".//div[@class='ppNav']/a[3]" searched for a ppNav div
                # nested inside ppNav.
                results_button = nav_div.find_element_by_xpath("./a[3]")
            results_url = results_button.get_attribute("href")

            results_button.click()
            browser.get(browser.current_url)
        except TimeoutException:
            if results_url is None:
                # Previously this path crashed later with an unbound-variable
                # error after already replacing the browser.
                raise
            while True:
                try:
                    print("Timeout going on game stat page, retrying, should open player game stat data page..")
                    browser.quit()
                    browser = webdriver.Chrome(chromedriver_path)
                    delay += 10
                    browser.set_page_load_timeout(delay)
                    print("opening: " + results_url)
                    browser.get(results_url)
                    browser.maximize_window()
                except TimeoutException:
                    continue
                else:
                    break

        self.browser = browser

    # ---Scraping player game_data---
    def scrape_player_game_data(self, browser, players_game_data, for_current_season):
        """Scrape the match rows of the results page and close the browser.

        browser -- driver currently showing a player results page; it is
            always quit before this method returns or raises.
        players_game_data -- rows scraped so far; not modified in place.
        for_current_season -- 'No' scrapes every year tab; any other value
            scrapes only the first tab.

        ``self.players_game_data`` is set to ``players_game_data`` plus the new
        ``[year, date_range, player_name, ...]`` rows.
        """
        try:
            year_nav_div = browser.find_element_by_class_name("shadetabs")
            year_button_links = year_nav_div.find_elements_by_tag_name("li")

            main_div = browser.find_element_by_class_name("tabcontentstyle")
            year_divs = main_div.find_elements_by_class_name("tabcontent")

            if for_current_season != 'No':
                # Only the top tab is scraped. Slicing keeps these as lists;
                # indexing with [0] handed single WebElements to zip(), which
                # cannot iterate over them.
                year_button_links = year_button_links[:1]
                year_divs = year_divs[:1]

            player_game_data = []
            for year_button_link, year_div in zip(year_button_links, year_divs):
                year_button_link.click()

                # --- Scraping the year ---
                year = year_nav_div.find_element_by_css_selector("li.selected").text.encode("ascii", "ignore")

                # ---Scraping player name---
                header = browser.find_element_by_class_name("ppHeader")
                player_name = header.find_element_by_css_selector('h1').text.encode("ascii", "ignore")
                # Drop the last two space-separated tokens of the header text.
                player_name = player_name.rsplit(' ', 2)[0]

                # ---Scraping game data---
                for container in year_div.find_elements_by_class_name("pprContainer"):
                    player_game_data.extend(self._scrape_container_rows(container, year, player_name))

            players_game_data = players_game_data + player_game_data

            self.players_game_data = players_game_data
        finally:
            # Close the browser even when scraping fails, so no Chrome process
            # is left running.
            browser.quit()

    def _scrape_container_rows(self, container, year, player_name):
        """Return one ``[year, date_range, player_name, ...]`` list per row of a results container."""
        container_head = container.find_element_by_css_selector("div.pprHead")
        try:
            container_datetime_range = container_head.find_elements_by_tag_name("div")[0].text.encode("ascii", "ignore")
        except (IndexError, NoSuchElementException):
            # find_elements_* returns an empty list when nothing matches, so
            # the miss shows up as IndexError. The fallback must also yield
            # text, not a WebElement, for the .replace() below.
            container_datetime_range = container_head.find_element_by_class_name("plM1").text.encode("ascii", "ignore")

        container_datetime_range = container_datetime_range.replace("\n", " - ")

        skipped_cells = (0, 2, 4)
        rows = []
        for row in container.find_elements_by_css_selector("div.pprRow"):
            cell_texts = [x.text.encode("ascii", "ignore") for x in row.find_elements_by_tag_name("div")]
            kept = [text for position, text in enumerate(cell_texts) if position not in skipped_cells]
            # Move the second kept cell in front of the first one.
            kept.insert(0, kept.pop(1))
            rows.append([year, container_datetime_range, player_name] + kept)
        return rows
        
    def data_to_csv(self, players_stat_data, players_game_data, split_list, for_current_season):
        
        players_stat_data_df = pd.DataFrame(players_stat_data, 
                                            columns=['Player_name', 'Year', 'Tourn', 'Titles', 'Matches', 'Wins', 'Losses', 'PCT', '6-0', '0-6', '7-6', '6-7'])
        players_stat_data_df['Player_name'] = players_stat_data_df['Player_name'].str.rstrip().str.lstrip().str.replace('-', ' ')
        players_stat_data_df = players_stat_data_df[players_stat_data_df.Year != "2017"]
        players_stat_data_df = players_stat_data_df.sort_values(['Year'])


        # ------------------------------------------------------------

        players_game_data_df = pd.DataFrame(players_game_data, 
                                            columns=['Year', 'Week', 'Player_A', 'Player_B', 'True_Result'])
        players_game_data_df['Player_B'] = players_game_data_df['Player_B'].str[:-5]
        players_game_data_df.insert(1, 'Start', players_game_data_df['Week'].str.split(' - ').str.get(0))
        players_game_data_df.drop('Week', axis = 1, inplace = True)
        players_game_data_df.insert(1, 'Week', players_game_data_df['Year'] + ',' + players_game_data_df['Start'])
        players_game_data_df.drop('Start', axis = 1, inplace = True)
        players_game_data_df['Week'] = players_game_data_df['Week'].str.replace(' ', ',')
        players_game_data_df['Week'] = pd.to_datetime(players_game_data_df['Week'], infer_datetime_format=True)
        players_game_data_df['Player_A'] = players_game_data_df['Player_A'].str.rstrip().str.lstrip().str.replace('-', ' ')
        players_game_data_df['Player_B'] = players_game_data_df['Player_B'].str.rstrip().str.lstrip().str.replace('-', ' ')

        # Dropping duplicate rows
        players_game_data_df['Winner'] = players_game_data_df.apply(lambda x: x['Player_A'] if x['True_Result'] == 'W' else x['Player_B'], axis = 1)
        players_game_data_df['Looser'] = players_game_data_df.apply(lambda x: x['Player_A'] if x['True_Result'] == 'L' else x['Player_B'], axis = 1)
        players_game_data_df.drop_duplicates(subset=['Week', 'Winner', 'Looser'], inplace=True)
        players_game_data_df.drop(['Winner', 'Looser'], axis = 1, inplace = True)
        
        # -----------------------------------------------------------------------
        players_game_data_df = players_game_data_df[players_game_data_df.Year != "2017"]
        players_game_data_df = players_game_data_df.sort_values(['Year', 'Week'])


        # -----------------------------------------------------
        if for_current_season == "No":
            players_stat_data_df.to_csv("atp_player_stats_until_" + str(split_list[-1]) + "_.csv", mode='w+')

            players_game_data_df['Week'].to_csv("date_atp_game_stats.csv", mode='w+', header=False, index=False)
            players_game_data_df.drop('Week', axis = 1, inplace = True)
            players_game_data_df.to_csv("atp_game_stats_until_" + str(split_list[-1]) + "_.csv", index=False, mode='w+')
            
        else: # for the current season
            players_stat_data_df.to_csv("atp_player_stats_until_" + str(split_list[-1]) + "_current_season.csv", mode='w+')

            players_game_data_df['Week'].to_csv("date_atp_game_stats_current_season.csv", mode='w+', header=False, index=False)
            players_game_data_df.drop('Week', axis = 1, inplace = True)
            players_game_data_df.to_csv("atp_game_stats_until_" + str(split_list[-1]) + "_current_season.csv", index=False, mode='w+')
            
        self.players_stat_data_df = players_stat_data_df
        self.players_game_data_df = players_game_data_df
        

def _scrape_current_season_split(player_indices):
    """Build a current-season ("Yes") scraper for the given row indices and run it."""
    scraper = ScrapeATPStats(player_indices, "Yes")
    scraper()


def scraping_split_one():
    """Scrape the first split of ranking rows."""
    # NOTE(review): every other split covers ten indices, this one only
    # index 0 -- confirm whether range(1) is intended.
    _scrape_current_season_split(range(1))


def scraping_split_two():
    """Scrape ranking rows 11 to 20."""
    _scrape_current_season_split(range(11, 21))


def scraping_split_three():
    """Scrape ranking rows 21 to 30."""
    _scrape_current_season_split(range(21, 31))


def scraping_split_four():
    """Scrape ranking rows 31 to 40."""
    _scrape_current_season_split(range(31, 41))


def scraping_split_five():
    """Scrape ranking rows 41 to 50."""
    _scrape_current_season_split(range(41, 51))


def scraping_split_six():
    """Scrape ranking rows 51 to 60."""
    _scrape_current_season_split(range(51, 61))


def scraping_split_seven():
    """Scrape ranking rows 61 to 70."""
    _scrape_current_season_split(range(61, 71))


def scraping_split_eight():
    """Scrape ranking rows 71 to 80."""
    _scrape_current_season_split(range(71, 81))


def scraping_split_nine():
    """Scrape ranking rows 81 to 90."""
    _scrape_current_season_split(range(81, 91))


def scraping_split_ten():
    """Scrape ranking rows 91 to 100."""
    _scrape_current_season_split(range(91, 101))
    
def run_in_parallel(*fns):
    """Run each given callable in its own process and wait for all of them to finish."""
    workers = [Process(target=fn) for fn in fns]
    # Start everything first so the processes run concurrently...
    for worker in workers:
        worker.start()
    # ...and only then block until each one has exited.
    for worker in workers:
        worker.join()
        
# Entry point: when executed as a script (not imported), scrape only the
# first split, in a single child process.
if __name__ == '__main__':
    run_in_parallel(scraping_split_one)

In [79]:
# Notebook cell: run the first two splits at the same time, one process each.
run_in_parallel(scraping_split_one, scraping_split_two)

In [71]:
# NOTE(review): ScrapeATPStats.__init__ takes (split_list, for_current_season);
# this older notebook cell passes only the split, so it no longer matches the
# class definition above.
x = ScrapeATPStats(range(10))
x()


Scraping data from 0 to 9


error: [Errno 10054] An existing connection was forcibly closed by the remote host

In [61]:
ten = range(10)

In [62]:
str(ten[-1])

'9'

In [63]:
print "until_" + str(ten[-1])

until 9


In [5]:
current_datetime_month

1

In [10]:
current_datetime.year

False