# Loading the data

In [9]:
import pandas as pd
import matplotlib as plt
import numpy as np
import glob
import os
import csv

# Root folder of the tennis data used throughout this notebook.
data_dir = "data/Tennis/JeffSackmann_data/"
# Sub-folder for the CSV "database" files; derived from data_dir.
database_dir = "{}db/".format(data_dir)

In [2]:
def load_data(directory):
    """Load every CSV file in ``directory`` into a single DataFrame.

    Parameters
    ----------
    directory : str
        Folder containing the ``*.csv`` files. A trailing path separator
        is optional.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked in file-name order. Each file keeps
        its own row labels (the index is not reset). An empty DataFrame is
        returned when the folder contains no CSV files.
    """
    # glob returns matches in arbitrary order; sort so that the row order of
    # the result is reproducible between runs and machines.
    csv_files = sorted(glob.glob(os.path.join(directory, "*.csv")))

    if not csv_files:
        # pd.concat raises ValueError when given an empty list.
        return pd.DataFrame()

    return pd.concat(
        [pd.read_csv(path, index_col=None, header=0) for path in csv_files]
    )

In [5]:
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
# Read all match CSV files into one DataFrame.
data = load_data("{}matches/".format(data_dir))

# Viewing the data

In [6]:
# Preview the first two rows of the loaded matches.
data.head(n=2)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,2016-M020,Brisbane,Hard,32,A,20160104,300,105683,4.0,,Milos Raonic,R,196.0,CAN,25.021218,14.0,2170.0,103819,1.0,,Roger Federer,R,185.0,SUI,34.406571,3.0,8265.0,6-4 6-4,3,F,87.0,6.0,6.0,60.0,34.0,28.0,14.0,10.0,1.0,1.0,7.0,3.0,61.0,34.0,25.0,14.0,10.0,3.0,5.0
1,2016-M020,Brisbane,Hard,32,A,20160104,299,103819,1.0,,Roger Federer,R,185.0,SUI,34.406571,3.0,8265.0,106233,8.0,,Dominic Thiem,R,,AUT,22.335387,20.0,1600.0,6-1 6-4,3,SF,60.0,6.0,0.0,49.0,27.0,23.0,12.0,9.0,0.0,1.0,2.0,4.0,55.0,31.0,18.0,9.0,8.0,2.0,6.0


## What the column names mean:

- tourney_id: tournament id number<br><br>
- tourney_name: tournament name<br><br>
- surface: type of surface of the playing field<br><br>
- draw_size: the number of players in the tournament<br><br>
- tourney_level: tournament skill level<br><br>
- tourney_date: date the tournament took place<br><br>
- match_num: number of the match that season<br><br>
- winner_id: the ID of the match winner<br><br>
- winner_seed: the seed of the winner. A seed is assigned to high-ranking (and thus popular) players to make sure they do not play against each other in the first rounds of the tournament, which keeps the most popular players in the tournament until the later rounds.

     For more information about seeds in tennis, look at: https://www.quora.com/Why-is-the-term-seed-used-instead-of-rank-in-tennis <br><br>
- winner_entry: The way the winner entered the tournament. If this is empty, this means that the player entered the tournament in the main draw. If this value is WC, this means the player entered with a wildcard. Finally if the value is Q, the player entered the tournament by qualifying.

     For more information on entries, look at: https://sports.stackexchange.com/questions/540/how-do-tennis-players-get-into-tournaments<br><br>
- name = the name of the winning player<br><br>
- hand = R or L, representing right-handed or left-handed play<br><br>
- ht = height of the player in cm<br><br>
- ioc = Code of the country of origin of the player<br><br>
- age = age of the player<br><br>
- rank = rank of the player<br><br>
- rank_points = ranking points the player has collected (not sure if total or this season)<br><br>
- score = final score of the match<br><br>
- best_of = the format of game (best of 3 or best of 5)<br><br>
- round = the round of the tournament<br><br>
- minutes = the duration of the match in minutes<br><br>
- ace = number of aces
    For more information on aces in tennis, look at: https://en.wikipedia.org/wiki/Ace_(tennis) <br><br>
- df = number of double faults
    For more information on double faults in tennis, look at: https://en.wikipedia.org/wiki/Serve_(tennis)#double_fault <br><br>
- svpt = number of service points<br><br>
- 1stIn = number of first serves made (i.e. that landed in)<br><br>
- 1stWon = number of first serves won <br><br>
- 2ndWon = number of second serves won <br><br>
- SvGms = number of service games<br><br>
- bpSaved = number of break points saved<br><br>
- bpFaced = number of break points faced<br><br>

## Define the data features we will use and/or create

To determine the data we input into the model, we need to think about data that we know before the match. Below is a list of data points I'm going to use / create.

### Match / tournament specific:

- venue
- weather
- temperature
- best of type
- tournament name
- tournament round
- tournament level
- play surface
- draw size

### Player specific:

- player age
- player height
- Total matches in career
- Total days in career
- player rank
- player rank-points
- player hand
- avg5 double faults
- avg5 first serves
- avg5 first serves won
- avg5 second serves won
- avg5 aces
- avg5 service games
- avg5 break points saved
- avg5 break points faced
- total double faults
- total first serves
- total first serves won
- total second serves won
- total aces
- total service games
- total break points saved
- total break points faced

# Database creation

## Players:

We'll be creating a CSV file that will act as a sort of database of players, giving each player a unique id that we will use when training the model.

In [39]:
def get_player_id(player_name):
    """Look up a player in the players database CSV.

    The database file is ``database_dir + "players_db.csv"``. If it does not
    exist yet, it (and its parent folder) is created containing only a
    header row.

    Parameters
    ----------
    player_name : str
        Value compared for exact equality against the ``name`` column.

    Returns
    -------
    int or None
        Row label of the first row whose ``name`` matches, or ``None`` when
        no row matches or the file is completely empty.

    Notes
    -----
    NOTE(review): the returned value is the DataFrame row label, not the
    value stored in the ``id`` column -- confirm this is the intended id.
    """
    filepath = database_dir + "players_db.csv"  # The database file

    if not os.path.exists(filepath):  # Create the file if it does not exist yet
        print("File does not exist yet, creating file..")

        # open() cannot create missing parent folders, so make them first.
        parent_dir = os.path.dirname(filepath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # newline="" lets the csv module control line endings itself.
        with open(filepath, "w", newline="") as file:
            field_names = ["id", "name", "birth_date", "went_pro_date", "height"]
            writer = csv.DictWriter(file, fieldnames=field_names)
            writer.writeheader()

        print("File created!")

    # Only the read can raise EmptyDataError, so keep the try body minimal.
    try:
        df = pd.read_csv(filepath)
    except pd.errors.EmptyDataError:
        print("File is empty!")
        return None

    matches = df.index[df["name"] == player_name]
    if len(matches) > 0:
        return matches[0]
    return None

In [41]:
# Smoke test: look up one player by name and show the result.
print(get_player_id(player_name="kenneth"))

0
