## Check where this runs
***

In [1]:
import sys
sys.path.append("..") # this is only to enable imports in cell below...(!)
from util.minor_funcs import Minors
m_funcs = Minors()
environment_running = m_funcs.check_environs()

# IMPORTS
***

In [2]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [3]:
import os
import time
            # data sourcing:
import csv
            # databases:
import sqlite3
            # useful other
from collections import defaultdict
            # data toolkit:
import numpy as np
import pandas as pd
import dask.dataframe as dd
            # plotting
import matplotlib.pyplot as plt
%matplotlib inline  

if not environment_running == 'laptop':
    %pip install seaborn
    # no seaborn by default in AWS SageMaker.
    
import seaborn as sns

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import FunctionTransformer
 # Visualizing Pipelines in HTML
from sklearn import set_config; set_config(display='diagram')

## Establish paths

In [36]:
from path_defs.path_definitions import ATP_paths
paths = ATP_paths()
raw_data_path = paths.get_raw_data_path()
atp_data_path = paths.get_atp_data_path()
read_data_path = paths.get_read_data_path()
cleaned_once_data_path = paths.get_cleaned_once_data_path()
matches_data_path_full = paths.get_matches_data_path_full()
staged_1_data_path = paths.get_staged_1_data_path()

# Organise for broad categories of column

In [46]:
# keep a simple dict of columns, based on type
check_completeness = {'start_date': 0, 'end_date': 0, 'player_name': 0, 
                      'duration': 0
                      }
check_key_distinguishing_columns = {'doubles': 0, 'masters': 0, 'nation': 0
                                    , 'seed': 0, 'prize_money': 0
                                    }
check_numerical_columns = {'prize_money': 0,'sets_won': 0,'games_won': 0,  
                           'games_against': 0, 'tiebreaks_won': 0,  'tiebreaks_total': 0,
                      'serve_rating': 0, 'aces': 0, 
                      'double_faults': 0, 'first_serve_made': 0, 
                      'first_serve_attempted': 0, 'first_serve_points_made': 0,  
                      'first_serve_points_attempted': 0, 'second_serve_points_made': 0, 
                      'second_serve_points_attempted': 0, 'break_points_saved': 0, 
                      'break_points_against': 0, 'service_games_won': 0, 
                      'return_rating': 0, 'first_serve_return_points_made': 0, 
                      'first_serve_return_points_attempted': 0, 'second_serve_return_points_made': 0, 
                      'second_serve_return_points_attempted': 0, 'break_points_made': 0, 
                      'break_points_attempted': 0, 'return_games_played': 0, 
                      'service_points_won': 0, 'service_points_attempted': 0, 
                      'return_points_won': 0, 'return_points_attempted': 0, 
                      'total_points_won': 0, 'total_points': 0, 
                      'duration': 0, 
                      # distinguishing column:
                      'round_num': 0
                      }
check_mixed_columns = {'seed': 0, 'currency': 0, 'player_name': 0, 'opponent_name': 0, 'duration': 0
                       }
check_boolean_columns = {'player_victory': 0, 'retirement': 0, 'won_first_set': 0,
                         'doubles': 0, }

In [47]:
# queries 
single_date_query = {"start_date > 2010": 0}
single_year_query = {"year > 2010": 0}


In [30]:
# check_numerical_columns - got to deal with these too
subset_1st_numerical = {k: check_numerical_columns[k] for k in check_numerical_columns.keys() & {'prize_money','sets_won','games_won',  
                           'games_against','tiebreaks_won','tiebreaks_total','round_num'}}
subset_2nd_numerical = {k: check_numerical_columns[k] for k in check_numerical_columns.keys() & {'serve_rating', 'aces', 'double_faults', 'first_serve_made', 'first_serve_attempted', 
                                                                                                 'first_serve_points_made', 'first_serve_points_attempted', 'second_serve_points_made', 
                                                                                                 'second_serve_points_attempted', 'break_points_saved', 'break_points_against', 
                                                                                                 'service_games_won', 'return_rating', 'first_serve_return_points_made', 
                                                                                                 'second_serve_return_points_attempted', 'first_serve_return_points_attempted', 
                                                                                                 'second_serve_return_points_made', 'break_points_made', 'break_points_attempted', 
                                                                                                 'return_games_played', 'service_points_won', 'service_points_attempted', 'return_points_won', 
                                                                                                 'return_points_attempted', 'total_points_won', 'total_points', 'duration'}}


In [31]:
# NOTED DOWN FROM ATP SITE 24 06 25 (need to understand if % or float etc) => columns in existing data
# - Aces => aces
# - 1st Serve => first_serve_made
# - 1st Serve points won => first_serve_points_made
# - 2nd Serve => second_serve_points_made
# - 2nd Serve points won => second_serve_points_made
# - Service Games won => service_games_won
# - Break Points saved => break_points_saved
# - 1st Serve Return Points won => first_serve_return_points_made
# - 2nd Serve Return Points won => second_serve_return_points_made
# - Break Points Converted => break_points_made
# - Return Games won => ???

In [31]:
atp_provided_subset = ['aces', 'first_serve_made', 'first_serve_points_made', 
                    'second_serve_points_made', 'second_serve_points_made', 
                    'service_games_won', 'break_points_saved', 
                    'first_serve_return_points_made', 'second_serve_return_points_made', 'break_points_made']

# Load 180k staged_1 data

In [6]:

schema = {'start_date': 'datetime64[ns]', 'end_date': 'datetime64[ns]'}
staged_1_csv = os.path.join(staged_1_data_path, "matches_staged_1.csv")
staged_1 = pd.read_csv(staged_1_csv, index_col=None).astype(schema)


In [9]:
staged_1.duration = pd.to_timedelta(staged_1.duration)

In [16]:
staged_1.columns

Index(['start_date', 'end_date', 'location', 'court_surface', 'prize_money',
       'currency', 'year', 'player_id', 'player_name', 'opponent_id',
       'opponent_name', 'tournament', 'round', 'num_sets', 'sets_won',
       'games_won', 'games_against', 'tiebreaks_won', 'tiebreaks_total',
       'serve_rating', 'aces', 'double_faults', 'first_serve_made',
       'first_serve_attempted', 'first_serve_points_made',
       'first_serve_points_attempted', 'second_serve_points_made',
       'second_serve_points_attempted', 'break_points_saved',
       'break_points_against', 'service_games_won', 'return_rating',
       'first_serve_return_points_made', 'first_serve_return_points_attempted',
       'second_serve_return_points_made',
       'second_serve_return_points_attempted', 'break_points_made',
       'break_points_attempted', 'return_games_played', 'service_points_won',
       'service_points_attempted', 'return_points_won',
       'return_points_attempted', 'total_points_won', 'total

In [11]:
# staged_1.dtypes

In [32]:
atp_plus_ids = atp_provided_subset
for x in ['start_date', 'end_date', 'player_id']:
    atp_plus_ids.append(x)
atp_plus_ids

['aces',
 'first_serve_made',
 'first_serve_points_made',
 'second_serve_points_made',
 'second_serve_points_made',
 'service_games_won',
 'break_points_saved',
 'first_serve_return_points_made',
 'second_serve_return_points_made',
 'break_points_made',
 'start_date',
 'end_date',
 'player_id']

In [33]:
staged_1[atp_plus_ids].head()

Unnamed: 0,aces,first_serve_made,first_serve_points_made,second_serve_points_made,second_serve_points_made.1,service_games_won,break_points_saved,first_serve_return_points_made,second_serve_return_points_made,break_points_made,start_date,end_date,player_id
0,0,36,18,5,5,8,1,8,5,0,1996-04-08,1996-04-14,albert-costa
1,1,62,43,4,4,10,6,18,12,4,1996-04-08,1996-04-14,alberto-berasategui
2,1,94,60,6,6,15,4,20,36,6,1996-04-08,1996-04-14,alberto-berasategui
3,6,42,36,21,21,13,1,12,23,6,1996-04-08,1996-04-14,bernardo-mota
4,5,37,27,12,12,10,4,5,10,2,1996-04-08,1996-04-14,bernardo-mota


In [39]:
atp_first_data_selected = staged_1[atp_plus_ids]


In [35]:
# Very easy to select just the ATP data (of course)
# Now need to see what / how the ATP data can be found to match in 

## Firstly up to 2022 data from Jeff Sackman's git repo
See data_sourcing.md in foci folder 

In [37]:
jeff_sackman_2024_path = os.path.join(raw_data_path, 'jeff_sackman', 'atp_matches_2024.csv')
js_data_2024 = pd.read_csv(jeff_sackman_2024_path, index_col=None)

In [38]:
js_data_2024.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')

In [40]:
atp_first_data_selected.columns

Index(['aces', 'first_serve_made', 'first_serve_points_made',
       'second_serve_points_made', 'second_serve_points_made',
       'service_games_won', 'break_points_saved',
       'first_serve_return_points_made', 'second_serve_return_points_made',
       'break_points_made', 'start_date', 'end_date', 'player_id'],
      dtype='object')