## Importing necessary packages 

In [1]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import numpy as np
import pandas as pd
import requests
import collections
import math
from datetime import datetime, timedelta
import time

## Getting all tournament links given starting and ending dates 

In [2]:
def convert_date(date):
    """Format a date-like object as a 'YYYY-MM-DD' string.

    The year is written as-is; month and day are left-padded with a zero
    so that each is two digits wide.

    Parameters
    ----------
    date : object exposing integer ``year``, ``month`` and ``day`` attributes
        (the notebook passes ``datetime`` instances).

    Returns
    -------
    str
        For example '2016-01-05'.
    """
    return '{}-{:02d}-{:02d}'.format(date.year, date.month, date.day)
    

def get_tournament_links(start, end):
    """Collect tournament page links for tournaments between two dates.

    The date range is cut into consecutive windows that start every 200 days;
    one search request is posted per window and every anchor with an ``href``
    on the returned page is turned into a
    'http://bwf.tournamentsoftware.com/sport/<last path segment>' link.

    Parameters
    ----------
    start, end : datetime
        First and last day of the range (both inclusive in the search filter).

    Returns
    -------
    list of str
        Unique links in first-seen order. Empty when ``end`` is before
        ``start``.
    """
    window_days = 200
    days_diff = (end - start).days
    if days_diff < 0:
        return []
    # A same-day range has days_diff == 0; it still needs one window,
    # otherwise no request is made and the result is always empty.
    num_of_intervals = max(1, math.ceil(days_diff / window_days))
    starting_dates = [start + timedelta(days=window_days * x) for x in range(num_of_intervals)]
    links = []
    for y, window_start in enumerate(starting_dates):
        start_date = convert_date(window_start)
        if y != num_of_intervals - 1:
            # Every window but the last ends the day before the next one starts.
            end_date = convert_date(starting_dates[y + 1] - timedelta(days=1))
        else:
            end_date = convert_date(end)
        payload = {'Page': '10000', 'TournamentFilter.StartDate': start_date, 'TournamentFilter.EndDate': end_date}
        r = requests.post('http://bwf.tournamentsoftware.com/find/tournament/DoSearch', data=payload)
        soup = BeautifulSoup(r.content, 'html.parser')
        anchors = soup.select('a[href]')
        links += ['http://bwf.tournamentsoftware.com/sport/' + z['href'].split('/')[-1] for z in anchors]
    # dict.fromkeys removes duplicates while keeping a stable, first-seen order
    # (a set would return the links in arbitrary order).
    return list(dict.fromkeys(links))

### Example 

In [3]:
# Example: tournament links for January 2016 (this posts a search request to the site).
get_tournament_links(datetime(2016,1,1), datetime(2016,1,31))

['http://bwf.tournamentsoftware.com/sport/tournament?id=E5E22A37-A40A-45AA-BEF1-2F8A0D68AF4E',
 'http://bwf.tournamentsoftware.com/sport/tournament?id=65276591-A0C4-43C5-838B-211CB925B88E',
 'http://bwf.tournamentsoftware.com/sport/tournament?id=88F10E40-F76B-4D20-A62B-31C6DBC1E00B',
 'http://bwf.tournamentsoftware.com/sport/tournament?id=C25E58BA-9DC6-4161-B20A-16FD377EDD0D',
 'http://bwf.tournamentsoftware.com/sport/tournament?id=533DB455-B5B8-4D37-A32E-1D329872A730',
 'http://bwf.tournamentsoftware.com/sport/tournament?id=4EE0960D-D31F-4775-A32C-3370848F8781',
 'http://bwf.tournamentsoftware.com/sport/tournament?id=185A87E8-457F-4111-A262-FE510F677AFF',
 'http://bwf.tournamentsoftware.com/sport/tournament?id=A6189AA6-D097-49B9-B158-A84E7343EEC7',
 'http://bwf.tournamentsoftware.com/sport/tournament?id=89E5ADD3-26AA-43CE-8700-78358629CC2D']

## Getting all tournament data given tournament link

In [4]:
def get_tournament_data(tournament_link):
    """Download a tournament page and extract its headline fields.

    Parameters
    ----------
    tournament_link : str
        URL of the tournament page to fetch.

    Returns
    -------
    list
        ``[name, type, city, country, start_date, end_date, matches_per_day_links]``
        where ``type`` is None when no 'tag tag--mono' element exists, ``city``
        is None when the location text has no ', ' separator, the two dates are
        ``datetime.date`` objects, and the last item is a (possibly empty) list
        of absolute per-day match page URLs.
    """
    site_root = 'http://bwf.tournamentsoftware.com'
    timestamp_format = '%Y-%m-%d %H:%M'

    response = requests.get(tournament_link, timeout=None)
    soup = BeautifulSoup(response.content, 'html.parser')

    tournament_name = soup.find(class_='media__link').get_text()

    type_tags = soup.find_all(class_='tag tag--mono')
    if len(type_tags) == 0:
        tournament_type = None
    else:
        tournament_type = type_tags[0].get_text()

    # The first image inside a span carries both the location text and,
    # in its title attribute, the country.
    location_image = soup.select('span img')[0]
    location_parts = location_image.get_text().split(', ')
    tournament_city = location_parts[0] if len(location_parts) > 1 else None
    tournament_country = location_image['title']

    # The second and third <time> elements inside spans hold start and end.
    time_tags = soup.select('span time')
    tournament_start_date = datetime.strptime(time_tags[1]['datetime'], timestamp_format).date()
    tournament_end_date = datetime.strptime(time_tags[2]['datetime'], timestamp_format).date()

    matches_per_day_links = []
    navigation_labels = [entry.text for entry in soup.find_all(class_='page-nav__link')]
    if 'Matches' in navigation_labels:
        day_pattern = re.compile('/sport/matches.aspx')
        for anchor in soup.find_all(attrs={'href': day_pattern}):
            # The navigation entry itself is labelled 'Matches'; only the
            # remaining anchors are kept as per-day links.
            if anchor.text != 'Matches':
                matches_per_day_links.append(site_root + anchor['href'])

    return [tournament_name, tournament_type, tournament_city, tournament_country,
            tournament_start_date, tournament_end_date, matches_per_day_links]

### Example 

In [5]:
# Example: headline fields and per-day match links for one tournament
# (note that the URL used here is the tournament's matches.aspx page).
get_tournament_data('http://bwf.tournamentsoftware.com/sport/matches.aspx?id=443C49EE-FE58-4D92-9F80-D9732298D30E')

['Hellas Open 2019',
 'International Series',
 'Sidirokastro',
 'Greece',
 datetime.date(2019, 8, 8),
 datetime.date(2019, 8, 11),
 ['http://bwf.tournamentsoftware.com/sport/matches.aspx?id=443C49EE-FE58-4D92-9F80-D9732298D30E&d=20190808',
  'http://bwf.tournamentsoftware.com/sport/matches.aspx?id=443C49EE-FE58-4D92-9F80-D9732298D30E&d=20190809',
  'http://bwf.tournamentsoftware.com/sport/matches.aspx?id=443C49EE-FE58-4D92-9F80-D9732298D30E&d=20190810',
  'http://bwf.tournamentsoftware.com/sport/matches.aspx?id=443C49EE-FE58-4D92-9F80-D9732298D30E&d=20190811']]

## Getting all match links given a matches-per-day link

In [6]:
def get_matches_link(matches_per_day_link):
    """Return the match page URLs listed on one per-day matches page.

    Parameters
    ----------
    matches_per_day_link : str
        URL of the page listing the matches of a single day.

    Returns
    -------
    list of str
        One URL per element with class 'icon stats', in page order.
    """
    response = requests.get(matches_per_day_link, timeout=None)
    soup = BeautifulSoup(response.content, 'html.parser')
    all_matches_link = []
    for stats_anchor in soup.find_all(class_='icon stats'):
        # The first character of the href is dropped before the remainder is
        # appended to the site's '/sport' prefix.
        relative_part = stats_anchor['href'][1:]
        all_matches_link.append('http://bwf.tournamentsoftware.com/sport' + relative_part)
    return all_matches_link

### Example 

In [7]:
# Example: the first five match links found on one per-day matches page.
get_matches_link('http://bwf.tournamentsoftware.com/sport/matches.aspx?id=6614E5F7-6C30-4791-BC52-F3FF17BECA38&d=20180411')[:5]

['http://bwf.tournamentsoftware.com/sport/match.aspx?id=6614E5F7-6C30-4791-BC52-F3FF17BECA38&match=345',
 'http://bwf.tournamentsoftware.com/sport/match.aspx?id=6614E5F7-6C30-4791-BC52-F3FF17BECA38&match=347',
 'http://bwf.tournamentsoftware.com/sport/match.aspx?id=6614E5F7-6C30-4791-BC52-F3FF17BECA38&match=346',
 'http://bwf.tournamentsoftware.com/sport/match.aspx?id=6614E5F7-6C30-4791-BC52-F3FF17BECA38&match=348',
 'http://bwf.tournamentsoftware.com/sport/match.aspx?id=6614E5F7-6C30-4791-BC52-F3FF17BECA38&match=349']

## Getting all information given match soup

In [8]:
def split_into_twos(lst):
    """Split a sequence into consecutive chunks of two items.

    Plain slicing is used instead of ``np.array_split`` so that an empty
    input returns ``[]`` rather than raising, an odd-length input keeps
    strict pairs (the last chunk then holds the single leftover item), and
    the items keep their original Python types instead of becoming NumPy
    scalars, whose text form differs between NumPy versions.

    Parameters
    ----------
    lst : sequence
        Any sliceable sequence, e.g. a list of ints.

    Returns
    -------
    list of list
        ``[[lst[0], lst[1]], [lst[2], lst[3]], ...]``
    """
    return [list(lst[i:i + 2]) for i in range(0, len(lst), 2)]

def get_player_information(matchlink):
    """Resolve a player's profile link and display name.

    Two pages are fetched: ``matchlink`` itself, from which the first anchor
    whose href contains 'player-profile' is taken, and then that profile
    page, from which the 'hgroup__heading truncate' text is read as the name.

    Parameters
    ----------
    matchlink : str
        URL of the player's page within a tournament.

    Returns
    -------
    list
        ``[player_link, player_name]``

    Raises
    ------
    TypeError
        If no 'player-profile' anchor is found on the first page.
    AttributeError
        If the profile page has no name heading.
    """
    site_root = 'http://bwf.tournamentsoftware.com'

    page = requests.get(matchlink, timeout=None)
    match_soup = BeautifulSoup(page.content, 'html.parser')
    profile_href = match_soup.find(attrs={'href': re.compile('player-profile')})['href']
    player_link = site_root + profile_href

    page = requests.get(player_link, timeout=None)
    profile_soup = BeautifulSoup(page.content, 'html.parser')
    player_name = profile_soup.find(class_='hgroup__heading truncate').get_text()
    # The nationality element is deliberately not read here: its value was
    # never returned, and looking it up made the whole call fail for profiles
    # that lack the element.
    return [player_link, player_name]

def get_all_information(soup):
    """Extract the match-level fields from a parsed match page.

    Parameters
    ----------
    soup : parsed match page (the example cell builds it with
        ``BeautifulSoup(..., 'html.parser')``).

    Returns
    -------
    list
        Thirteen items, in this order:
        ``[match_discipline, match_duration, player1_country, player2_country,
        player1_score_sequence, player2_score_sequence, match_score,
        match_statistic, game_statistic, retired, winner, player1_seed,
        player2_seed]``.
        For doubles, ``player1_country`` and ``player2_country`` are
        two-element lists; for singles they are single values.

    Notes
    -----
    Nothing is caught here: a page that lacks any of the expected elements
    raises (for example IndexError from the positional lookups).
    """
    soup_text = soup.text
    # Texts of all anchors inside table cells; the first one is used as the discipline.
    temp = [a.text for a in soup.select('td a')]
    match_discipline = temp[0]
    
    # A seed is the text between '[' and ']' of any anchor text that has both
    # brackets and at least one digit.
    seed = [s[s.find('[')+1:s.find(']')] for s in temp if '[' in s and ']' in s and any(char.isdigit() for char in s)]
    player1_seed = None
    player2_seed = None
    if len(seed) == 2:
        player1_seed, player2_seed = seed[0], seed[1]
    if len(seed) == 1:
        # Only one side is seeded: it is side 1 when the second anchor text
        # (the one right after the discipline) contains a bracket.
        if '[' in temp[1]:
            player1_seed = seed[0]
        else:
            player2_seed = seed[0]
    
    # Duration comes from the last right-aligned element, expected as 'A:B'
    # and stored as 60 * A + B (presumably hours:minutes -> minutes; TODO confirm).
    temp = soup.find_all(attrs = {'align':'right'})[-1].get_text() 
    temp = temp.split(':')
    if len(temp) == 2:
        match_duration = 60 * int(temp[0]) + int(temp[1])
    else:
        match_duration = None
 
    # Running scores: the integer that follows each 'Player 1: ' / 'Player 2: '
    # marker in the page text, up to the first apostrophe.
    player1 = [int(j.partition("'")[0]) for j in soup_text.rsplit('Player 1: ', -1)[1:]]
    player2 = [int(j.partition("'")[0]) for j in soup_text.rsplit('Player 2: ', -1)[1:]]
    # A new game starts wherever both running scores are 0; the final entry
    # len(player1) closes the last game.
    start_indices = [x for x in range(len(player1)) if player1[x] + player2[x] == 0] + [len(player1)]
    start_end_indices = [(start_indices[i],start_indices[i+1]) for i in range(len(start_indices)-1)]
    player1_score_sequence = [player1[s:e] for s,e in start_end_indices]
    player2_score_sequence = [player2[s:e] for s,e in start_end_indices]
    # Drop a trailing game that consists of a single (0-0) entry only.
    if len(player1_score_sequence[-1]) == 1:
        player1_score_sequence = player1_score_sequence[:-1]
        player2_score_sequence = player2_score_sequence[:-1]
    # All table cells whose text is purely digits, in page order.
    temp = [int(x.text) for x in soup.find_all("td") if re.match( '^[0-9]+$',x.text)]
    # Despite the name, this counts the games kept above.
    number_of_matches = len(player1_score_sequence) 
    # The numeric cells are read in groups of ten: positions 8 and 9 of group i
    # give the score of game i, and the first eight cells of a group are
    # statistics (group 0 for the whole match, groups 1..n per game).
    match_score = [[temp[8+10*i], temp[9+10*i]] for i in range(number_of_matches)]
    match_statistic = split_into_twos(temp[:8]) 
    game_statistic = [split_into_twos(temp[10*i:8+10*i]) for i in range(1, number_of_matches+1)]
    
    retired = 'Retired' in soup_text
    
    # hrefs of every link containing 'player='; when the first and third are
    # the same link the match is treated as singles, otherwise as doubles.
    tempa = [a['href'] for a in soup.find_all(attrs = {'href' : re.compile('player=')})]
    number_of_players = 1 if tempa[0] == tempa[2] else 2
    
    # Flag elements; their title attribute is used as the country.
    tempx = soup.find_all(class_='intext flag')
    
    if number_of_players == 1:
        player1_matchlink = 'http://bwf.tournamentsoftware.com/sport/' + tempa[0]
        player1_country = tempx[0]['title']
        player2_country = tempx[1]['title']
        # The first link wrapped in <strong> is taken as the winner's link.
        temp = soup.select('strong a')[0]['href']
        winner = 1 if 'http://bwf.tournamentsoftware.com/sport/' + temp == player1_matchlink else 2 
    else:
        player11_matchlink = 'http://bwf.tournamentsoftware.com/sport/' + tempa[0]
        player12_matchlink = 'http://bwf.tournamentsoftware.com/sport/' + tempa[1]
        player1_country =  [tempx[0]['title'], tempx[1]['title']] 
        player1_matchlink = [player11_matchlink, player12_matchlink]
        player2_country =  [tempx[2]['title'], tempx[3]['title']] 
        # Side 1 wins when the first <strong> link belongs to either of its two players.
        temp = soup.select('strong a')[0]['href']
        winner = 1 if 'http://bwf.tournamentsoftware.com/sport/' + temp in player1_matchlink else 2
    return [match_discipline, match_duration, 
            player1_country, player2_country,
            player1_score_sequence, player2_score_sequence,
           match_score, match_statistic, game_statistic,
           retired, winner, player1_seed, player2_seed]

### Example 

In [9]:
# Example: download one match page, parse it, and extract its match-level fields.
get_all_information(BeautifulSoup(requests.get('http://bwf.tournamentsoftware.com/sport/match.aspx?id=4A992ADB-1E74-43C4-83AA-801234F2F12F&match=59').content,'html.parser'))

['MS-U19',
 23,
 'Switzerland',
 'Spain',
 [[0,
   0,
   1,
   1,
   1,
   2,
   2,
   2,
   3,
   4,
   4,
   4,
   4,
   4,
   5,
   5,
   5,
   6,
   6,
   6,
   6,
   7,
   8,
   8,
   8,
   9,
   9,
   9,
   9,
   10,
   10,
   10],
  [0,
   0,
   0,
   0,
   0,
   1,
   2,
   2,
   3,
   3,
   4,
   4,
   4,
   4,
   5,
   5,
   5,
   5,
   5,
   5,
   5,
   5,
   5,
   5,
   6,
   6,
   7,
   8,
   8,
   8]],
 [[0,
   1,
   1,
   2,
   3,
   3,
   4,
   5,
   5,
   5,
   6,
   7,
   8,
   9,
   9,
   10,
   11,
   11,
   12,
   13,
   14,
   14,
   14,
   15,
   16,
   16,
   17,
   18,
   19,
   19,
   20,
   21],
  [0,
   1,
   2,
   3,
   4,
   4,
   4,
   5,
   5,
   6,
   6,
   7,
   8,
   9,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   18,
   19,
   19,
   19,
   20,
   21]],
 [[10, 21], [8, 21]],
 [[2, 9], [0, 2], [60, 60], [18, 42]],
 [[[2, 4], [0, 1], [31, 31], [10, 21]], [[2, 9], [0, 1], [29, 29], [8, 21]]],
 False,
 2,
 None,
 '7']

## Getting player information given the match soup and the current player dictionary

In [10]:
def unseed(player_name):
    """Return the lower-cased player name without a trailing '[...]' marker.

    When the name ends with ']', everything from the first '[' onwards is
    removed together with the single character right before it; otherwise
    the name is only lower-cased.
    """
    ends_with_marker = player_name[-1] == ']'
    if not ends_with_marker:
        return player_name.lower()
    before_bracket = player_name.partition('[')[0]
    return before_bracket[:-1].lower()
    
def _resolve_player(index, names, hrefs, dictionary):
    """Return (profile_link, display_name) for the player at ``index``, fetching and caching on a miss."""
    name = names[index]
    if name in dictionary:
        cached = dictionary[name]
        return cached[0], cached[1]
    # hrefs is only indexed on a cache miss, so a fully cached match never
    # depends on how many player links the page exposes.
    fetched = get_player_information('http://bwf.tournamentsoftware.com/sport/' + hrefs[index])
    link, display_name = fetched[0], fetched[1]
    dictionary[name] = fetched
    return link, display_name


def get_players_information(soup, dictionary):
    """Look up profile links and names for the players of one match.

    Player names are taken from the table-cell anchor texts after the first
    one, skipping three-capital-letter bracketed codes such as '[ESP]' and
    texts of two characters or fewer. Each name is normalised with ``unseed``
    and used as the cache key in ``dictionary``; players not yet cached are
    fetched with ``get_player_information`` and added to it.

    Parameters
    ----------
    soup : parsed match page.
    dictionary : dict
        Cache mapping normalised name -> ``[profile_link, display_name]``.
        It is updated in place.

    Returns
    -------
    list
        ``[player1_link, player1_name, player2_link, player2_name, dictionary]``.
        With exactly two names the link/name items are single values;
        otherwise the page is treated as doubles (four names are required)
        and each of them is a two-element list, one entry per partner.
    """
    anchor_texts = [a.text for a in soup.select('td a')][1:]
    # Raw string: the pattern is unchanged, but '\[' inside a normal string
    # literal is an invalid escape sequence that newer Python versions warn about.
    country_code = re.compile(r'\[(?<![A-Z])[A-Z]{3}(?![A-Z])\]')
    names = [unseed(m) for m in anchor_texts if not country_code.search(m) and len(m) > 2]
    hrefs = [a['href'] for a in soup.find_all(attrs={'href': re.compile('player=')})]

    if len(names) == 2:
        player1_link, player1_name = _resolve_player(0, names, hrefs, dictionary)
        player2_link, player2_name = _resolve_player(1, names, hrefs, dictionary)
    else:
        # Doubles: players 0 and 1 form side 1, players 2 and 3 form side 2.
        resolved = [_resolve_player(i, names, hrefs, dictionary) for i in range(4)]
        player1_link = [resolved[0][0], resolved[1][0]]
        player1_name = [resolved[0][1], resolved[1][1]]
        player2_link = [resolved[2][0], resolved[3][0]]
        player2_name = [resolved[2][1], resolved[3][1]]
    return [player1_link, player1_name, player2_link, player2_name, dictionary]

### Example 

In [11]:
# Example: resolve both players of one match, starting from an empty player dictionary.
get_players_information(BeautifulSoup(requests.get('http://bwf.tournamentsoftware.com/sport/match.aspx?id=4A992ADB-1E74-43C4-83AA-801234F2F12F&match=59').content,'html.parser'),{})

['http://bwf.tournamentsoftware.com/player-profile/8D79D2E1-FB10-4E5A-8736-F9C5F8936C83',
 'Arthur BOUDIER',
 'http://bwf.tournamentsoftware.com/player-profile/A14F218F-DE0E-4FC3-90B6-7087F75F9F8B',
 'Joan MONROY',
 {'arthur boudier': ['http://bwf.tournamentsoftware.com/player-profile/8D79D2E1-FB10-4E5A-8736-F9C5F8936C83',
   'Arthur BOUDIER'],
  'joan monroy': ['http://bwf.tournamentsoftware.com/player-profile/A14F218F-DE0E-4FC3-90B6-7087F75F9F8B',
   'Joan MONROY']}]

## Main function!

In [12]:
def get_csv(start, end, filename, dictionary):
    """Scrape every match of every tournament in a date range into a CSV file.

    The file is created (overwriting any existing one) with a header row, and
    the rows of each tournament are appended once that tournament has been
    processed. Failures never abort the run; they are collected and returned.

    Parameters
    ----------
    start, end : datetime
        Date range passed to ``get_tournament_links``.
    filename : str
        Path of the CSV file to write.
    dictionary : dict
        Player cache passed to ``get_players_information``.

    Returns
    -------
    list
        ``[bigerror, hono_error, other_error, dictionary]`` where
        ``bigerror`` holds tournament-level and day-level failures,
        ``hono_error`` holds ``[tournament_link, match_link]`` pairs that hit
        a connection error, ``other_error`` holds any other per-match failure
        together with its exception, and ``dictionary`` is the player cache
        after the run.
    """
    retry_delay_seconds = 7
    columns = ['Player 1 Link', 'Player 1 Name',
               'Player 2 Link', 'Player 2 Name',
               'Tournament Link', 'Tournament Name',
               'Tournament Type', 'Tournament City',
               'Tournament Country', 'Tournament Start Date',
               'Tournament End Date', 'Match Link',
               'Discipline', 'Duration',
               'Player 1 Country', 'Player 2 Country',
               'Player 1 Sequence', 'Player 2 Sequence', 'Match Score',
               'Match Statistic', 'Game Statistic', 'Retired', 'Winner',
               'Player 1 Seed', 'Player 2 Seed', 'Match Date']
    bigerror = []
    hono_error = []
    other_error = []
    # Back-off counter: +1 on every connection error, -1 (not below 0) after
    # every other attempt. While it is positive, each match request is
    # preceded by a pause.
    hono_counter = 0

    # Header-only file first; rows are appended per tournament below.
    pd.DataFrame(columns=columns).to_csv(filename, index=False)

    tournament_links = get_tournament_links(start, end)
    for tournament_link in tournament_links:
        results = []
        try:
            tournament_data = get_tournament_data(tournament_link)
            matches_per_day_links = tournament_data[-1]
        except Exception as e:
            bigerror.append([tournament_link, e])
            continue
        for matches_per_day_link in matches_per_day_links:
            try:
                # The day is the value of the last '=' parameter of the link.
                match_date = datetime.strptime(matches_per_day_link.split('=')[-1], '%Y%m%d')
                match_links = get_matches_link(matches_per_day_link)
            except Exception as e:
                bigerror.append([tournament_link, matches_per_day_link, e])
                continue
            for match_link in match_links:
                if hono_counter > 0:
                    time.sleep(retry_delay_seconds)
                try:
                    page = requests.get(match_link, timeout=None)
                    soup = BeautifulSoup(page.content, 'html.parser')
                    match_information = get_all_information(soup)
                    player_information_dict = get_players_information(soup, dictionary)
                    player_information = player_information_dict[:-1]
                    dictionary = player_information_dict[-1]
                    listx = (player_information + [tournament_link] + tournament_data[:6]
                             + [match_link] + match_information + [match_date])
                    results.append(listx)
                    hono_counter = max(0, hono_counter - 1)
                except requests.exceptions.ConnectionError:
                    hono_counter += 1
                    hono_error.append([tournament_link, match_link])
                except Exception as e:
                    other_error.append([tournament_link, match_link, e])
                    hono_counter = max(0, hono_counter - 1)
        if results:
            # Appending by path lets pandas use the same encoding and newline
            # handling as for the header row written above.
            pd.DataFrame(results).to_csv(filename, mode='a', header=False, index=False)
    return [bigerror, hono_error, other_error, dictionary]

In [13]:
# Scrape all of 2018 into data_2018.csv, starting with an empty player dictionary.
# `data` is [bigerror, hono_error, other_error, dictionary] as returned by get_csv.
data = get_csv(datetime(2018,1,1),datetime(2018,12,31),"data_2018.csv",{})  