# Author: Jose Guzman


In [6]:
# Install the packages listed in requirements.txt
!pip install -r requirements.txt



Gather Tennis Player Rankings via Sport Radar's API

In [8]:
!echo $SPORTRADAR_API_KEY




In [9]:
import csv
import requests
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Verify the environment variable
api_key = os.getenv("SPORTRADAR_API_KEY")
if api_key:
    print('API Key is set')
else:
    print('API Key is not set')

if not api_key:
    raise ValueError("No API key provided. Please set the SPORTRADAR_API_KEY environment variable.")

# Function to extract ID segments
def extract_id(full_id):
    return full_id.split(':')[-1] if full_id else ''

# URL with hidden API key
url = f"https://api.sportradar.com/tennis/production/v3/en/competitors/sr%3Acompetitor%3A407573/profile.json?api_key={api_key}"

headers = {"accept": "application/json"}

try:
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an exception for HTTP errors
    data = response.json()
except requests.exceptions.RequestException as e:
    raise SystemExit(f"API request failed: {e}")

# Define the CSV file name
csv_file = "tennis_competitor_profile.csv"

# Check if the response contains the expected keys
if "competitor" not in data or "info" not in data or "competitor_rankings" not in data:
    raise ValueError("Response data is missing expected keys")

# Extract the data
competitor = data["competitor"]
info = data["info"]
competitor_rankings = data["competitor_rankings"][0]

# Write data to CSV
with open(csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    
    # Write headers
    headers = [
        "ID", "Name", "Country", "Abbreviation", "Gender",
        "Pro Year", "Handedness", "Highest Singles Ranking", "Highest Doubles Ranking",
        "Weight", "Height", "Date of Birth", "Highest Singles Ranking Date", "Highest Doubles Ranking Date",
        "Current Singles Rank", "Rank Movement", "Rank Points", "Competitor ID", "Rank Name", "Rank Type", "Race Ranking",
        "Year", "Surface Type", "Competitions Played (Surface)", "Competitions Won (Surface)", "Matches Played (Surface)", "Matches Won (Surface)",
        "Competitions Played (Overall)", "Competitions Won (Overall)", "Matches Played (Overall)", "Matches Won (Overall)"
    ]
    writer.writerow(headers)
    
    # Write general data
    general_data = [
        extract_id(competitor["id"]), competitor["name"], competitor["country"], competitor["abbreviation"], competitor["gender"],
        info["pro_year"], info["handedness"], info["highest_singles_ranking"], info.get("highest_doubles_ranking"),
        info["weight"], info["height"], info["date_of_birth"], info["highest_singles_ranking_date"], info.get("highest_doubles_ranking_date"),
        competitor_rankings["rank"], competitor_rankings["movement"], competitor_rankings["points"], extract_id(competitor_rankings["competitor_id"]), competitor_rankings["name"], competitor_rankings["type"], competitor_rankings["race_ranking"]
    ]
    
    # Iterate through periods
    for period in data.get("periods", []):
        year = period["year"]
        overall_stats = period["statistics"]
        
        # Iterate through surfaces
        for surface in period.get("surfaces", []):
            surface_type = surface["type"]
            surface_stats = surface["statistics"]
            
            # Combine general data with specific year and surface data
            row = general_data + [
                year, surface_type, surface_stats["competitions_played"], surface_stats["competitions_won"], surface_stats["matches_played"], surface_stats["matches_won"],
                overall_stats["competitions_played"], overall_stats["competitions_won"], overall_stats["matches_played"], overall_stats["matches_won"]
            ]
            
            writer.writerow(row)

print(f"Data written to {csv_file}")


API Key is not set


ValueError: No API key provided. Please set the SPORTRADAR_API_KEY environment variable.

# Data Importing

In [1]:
import pandas as pd

df = pd.read_csv('data/atp_tennis.csv')
print(df.head())
s

                           Tournament        Date         Series    Court  \
0  Australian Hardcourt Championships  2000-01-03  International  Outdoor   
1  Australian Hardcourt Championships  2000-01-03  International  Outdoor   
2  Australian Hardcourt Championships  2000-01-03  International  Outdoor   
3  Australian Hardcourt Championships  2000-01-03  International  Outdoor   
4  Australian Hardcourt Championships  2000-01-03  International  Outdoor   

  Surface      Round  Best of        Player_1       Player_2       Winner  \
0    Hard  1st Round        3      Dosedel S.    Ljubicic I.   Dosedel S.   
1    Hard  1st Round        3      Clement A.     Enqvist T.   Enqvist T.   
2    Hard  1st Round        3       Escude N.  Baccanello P.    Escude N.   
3    Hard  1st Round        3  Knippschild J.     Federer R.   Federer R.   
4    Hard  1st Round        3     Fromberg R.  Woodbridge T.  Fromberg R.   

   Rank_1  Rank_2  Pts_1  Pts_2  Odd_1  Odd_2        Score  
0      63    

NameError: name 's' is not defined

Data Inspection

In [2]:
print(df.info())
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63079 entries, 0 to 63078
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Tournament  63079 non-null  object 
 1   Date        63079 non-null  object 
 2   Series      63079 non-null  object 
 3   Court       63079 non-null  object 
 4   Surface     63079 non-null  object 
 5   Round       63079 non-null  object 
 6   Best of     63079 non-null  int64  
 7   Player_1    63079 non-null  object 
 8   Player_2    63079 non-null  object 
 9   Winner      63079 non-null  object 
 10  Rank_1      63079 non-null  int64  
 11  Rank_2      63079 non-null  int64  
 12  Pts_1       63079 non-null  int64  
 13  Pts_2       63079 non-null  int64  
 14  Odd_1       63079 non-null  float64
 15  Odd_2       63079 non-null  float64
 16  Score       63079 non-null  object 
dtypes: float64(2), int64(5), object(10)
memory usage: 8.2+ MB
None
            Best of        Rank_1        Ran

String Manipulation

In [3]:
def split_player_name(df, player_column):
    names = df[player_column].str.split(' ', n=1, expand=True)
    last_name_col = f"{player_column}_Last_Name"
    first_initial_col = f"{player_column}_First_Initial"
    df[last_name_col] = names[0]
    df[first_initial_col] = names[1].str[0]
    df.drop(columns=[player_column], inplace=True)
    return df

df = split_player_name(df, 'Player_1')
df = split_player_name(df, 'Player_2')


Handling Missing Values

In [6]:
def split_player_name(df, player_column):
    names = df[player_column].str.split(' ', n=1, expand=True)
    last_name_col = f"{player_column}_Last_Name"
    first_initial_col = f"{player_column}_First_Initial"
    df[last_name_col] = names[0]
    df[first_initial_col] = names[1].str[0]
    df.drop(columns=[player_column], inplace=True)
    return df

df = split_player_name(df, 'Player_1')
df = split_player_name(df, 'Player_2')


KeyError: 'Player_1'

Data Transformation

In [7]:
df['Date'] = pd.to_datetime(df['Date'])
df['Score'] = df['Score'].astype(float)


ValueError: could not convert string to float: '6-4 6-2'