## Importamos librerías necesarias

In [1]:
import json
import os
import sys
import time

import pandas as pd
import requests

# Show every column when a wide DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns", None)

## Primer dataset 
### CSV con los resultados de los partidos del circuito ATP desde el año 2000

### Carga de datos

In [2]:
# Add the parent directory to the module search path.
sys.path.append("../")
# Load the raw ATP match results; the file is parsed as semicolon-separated.
# NOTE(review): Odd_1/Odd_2 come out as object dtype and Date is left as a plain
# string (see the info() output below) — presumably they need conversion in the
# data-preparation step; confirm against the raw file.
dfResultados = pd.read_csv("../data/raw/atp_tennis.csv", sep=";")
# Preview the first rows as a sanity check of the parse.
dfResultados.head()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
0,Australian Hardcourt Championships,03/01/2000,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,63,77,-1,-1,-1.0,-1.0,6-4 6-2
1,Australian Hardcourt Championships,03/01/2000,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,56,5,-1,-1,-1.0,-1.0,3-6 3-6
2,Australian Hardcourt Championships,03/01/2000,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,40,655,-1,-1,-1.0,-1.0,6-7 7-5 6-3
3,Australian Hardcourt Championships,03/01/2000,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,87,65,-1,-1,-1.0,-1.0,1-6 4-6
4,Australian Hardcourt Championships,03/01/2000,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,81,198,-1,-1,-1.0,-1.0,7-6 5-7 6-4


In [3]:
# Column dtypes and non-null counts of the results frame.
dfResultados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61560 entries, 0 to 61559
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Tournament  61560 non-null  object
 1   Date        61560 non-null  object
 2   Series      61560 non-null  object
 3   Court       61560 non-null  object
 4   Surface     61560 non-null  object
 5   Round       61560 non-null  object
 6   Best of     61560 non-null  int64 
 7   Player_1    61560 non-null  object
 8   Player_2    61560 non-null  object
 9   Winner      61560 non-null  object
 10  Rank_1      61560 non-null  int64 
 11  Rank_2      61560 non-null  int64 
 12  Pts_1       61560 non-null  int64 
 13  Pts_2       61560 non-null  int64 
 14  Odd_1       61560 non-null  object
 15  Odd_2       61560 non-null  object
 16  Score       61560 non-null  object
dtypes: int64(5), object(12)
memory usage: 8.0+ MB


In [4]:
# Summary statistics of the numeric columns, rounded to two decimals.
dfResultados.describe().round(2)

Unnamed: 0,Best of,Rank_1,Rank_2,Pts_1,Pts_2
count,61560.0,61560.0,61560.0,61560.0,61560.0
mean,3.38,76.03,75.66,1093.31,1099.31
std,0.78,100.83,101.56,1708.44,1731.83
min,3.0,-1.0,-1.0,-1.0,-1.0
25%,3.0,25.0,24.0,-1.0,-1.0
50%,3.0,54.0,54.0,670.0,672.0
75%,3.0,92.0,92.0,1205.0,1210.0
max,5.0,3390.0,4915.0,16950.0,16950.0


In [5]:
# (rows, columns) of the results frame.
dfResultados.shape

(61560, 17)

### Primer análisis de la información

In [6]:
# Percentage of missing (NaN) values per column: the mean of the boolean
# NaN mask is the fraction of missing rows, scaled here to 0-100.
dfResultados.isna().mean().mul(100)

Tournament    0.0
Date          0.0
Series        0.0
Court         0.0
Surface       0.0
Round         0.0
Best of       0.0
Player_1      0.0
Player_2      0.0
Winner        0.0
Rank_1        0.0
Rank_2        0.0
Pts_1         0.0
Pts_2         0.0
Odd_1         0.0
Odd_2         0.0
Score         0.0
dtype: float64

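Como no hay ningún valor NaN, no es necesario imputar valores nulos. Sin embargo, `describe()` muestra un mínimo de -1 en `Rank_1`, `Rank_2`, `Pts_1` y `Pts_2` (y el percentil 25 de los puntos también es -1), lo que sugiere que los valores ausentes están codificados como -1 y deberán tratarse en la preparación de datos.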

In [7]:
# Persist the results frame, unchanged, to the processed-data folder.
# to_csv writes the index by default, so the file gains an unnamed leading column.
dfResultados.to_csv("../data/process/resultados.csv")

## Segundo dataset 
### API que contiene estadísticas de jugadores

### Carga de datos

In [8]:
# Fetch the live leaderboard from the Ultimate Tennis API (hosted on RapidAPI).
# The trailing "/50" in the path presumably limits the ranking to 50 players.
url = "https://ultimate-tennis1.p.rapidapi.com/live_leaderboard/50"

# The API key is a secret: read it from the environment instead of committing it
# to the notebook. Export RAPIDAPI_KEY before starting the kernel; a missing
# variable raises KeyError here, before any request is sent.
headers = {
	"X-RapidAPI-Key": os.environ["RAPIDAPI_KEY"],
	"X-RapidAPI-Host": "ultimate-tennis1.p.rapidapi.com"
}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on HTTP errors (invalid key, exhausted quota, ...) instead of
# hitting a confusing KeyError on todos["data"] further down.
response.raise_for_status()

todos = response.json()

# One row per leaderboard entry.
players = pd.DataFrame(todos["data"])

In [9]:
# Keep only the player name and the API id used to request per-player statistics.
ids = players[["Name","id"]]

In [11]:
# Download the 2023 statistics of every player in `ids` and stack them into a
# single frame (`dfStats`) with the player name as first column.

# The API key is a secret: read it from the environment (RAPIDAPI_KEY) rather
# than hard-coding it. The headers are identical for every request, so they are
# built once, outside the loop.
headers = {
	"X-RapidAPI-Key": os.environ["RAPIDAPI_KEY"],
	"X-RapidAPI-Host": "ultimate-tennis1.p.rapidapi.com"
}

# Collect one small frame per player and concatenate once at the end; growing a
# DataFrame with concat inside the loop re-copies all previous rows each time.
statsFrames = []

for player_id, player_name in zip(ids["id"], ids["Name"]):
	url = f"https://ultimate-tennis1.p.rapidapi.com/player_stats/atp/{player_id}/2023/all"

	response = requests.get(url, headers=headers, timeout=30)
	# Stop on HTTP errors (invalid key, exhausted quota, ...) instead of failing
	# later with a KeyError on "player_data".
	response.raise_for_status()

	playerStats = response.json()

	# Keep at most the first five rows of the player's statistics payload.
	data1 = pd.DataFrame(playerStats["player_data"]).head()

	# Prepend the player name as its own column, aligned on the row index.
	nameCol = pd.DataFrame([player_name], columns=["Name"])
	statsFrames.append(pd.concat([nameCol, data1], axis=1))

	# Pause between requests — presumably to respect the API rate limit.
	time.sleep(5)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# are no players.
dfStats = pd.concat(statsFrames, ignore_index=True) if statsFrames else pd.DataFrame()
dfStats.head()

Unnamed: 0,Name,1st Serve,1st Serve Points Won,1st Serve Return Points Won,2nd Serve Points Won,2nd Serve Return Points Won,Aces,Break Points Converted,Break Points Faced,Break Points Opportunities,Break Points Saved,Double Faults,Return Games Played,Return Games Won,Return Points Won,Service Games Played,Service Games Won,Total Points Won,Total Service Points Won
0,Novak Djokovic,64%,76%,33%,58%,54%,383,42%,261,491,67%,157,721,29%,41%,741,88%,55%,70%
1,Carlos Alcaraz,66%,72%,35%,56%,54%,302,40%,369,691,65%,161,874,32%,42%,881,85%,54%,67%
2,Daniil Medvedev,64%,75%,34%,50%,54%,547,46%,419,622,67%,314,922,31%,41%,923,85%,54%,66%
3,Jannik Sinner,60%,76%,33%,57%,54%,436,42%,368,615,69%,129,892,29%,41%,901,87%,54%,68%
4,Andrey Rublev,61%,75%,30%,51%,52%,616,39%,439,639,63%,156,1029,24%,38%,1038,84%,52%,66%


### Primer análisis de la información

In [12]:
# Column dtypes and non-null counts of the player statistics frame.
dfStats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Name                         50 non-null     object
 1   1st Serve                    50 non-null     object
 2   1st Serve Points Won         50 non-null     object
 3   1st Serve Return Points Won  50 non-null     object
 4   2nd Serve Points Won         50 non-null     object
 5   2nd Serve Return Points Won  50 non-null     object
 6   Aces                         50 non-null     object
 7   Break Points Converted       50 non-null     object
 8   Break Points Faced           50 non-null     object
 9   Break Points Opportunities   50 non-null     object
 10  Break Points Saved           50 non-null     object
 11  Double Faults                50 non-null     object
 12  Return Games Played          50 non-null     object
 13  Return Games Won             50 non-n

In [13]:
# The statistics columns are object dtype (see info() above), so describe()
# reports count / unique / top / freq rather than numeric summaries.
dfStats.describe()

Unnamed: 0,Name,1st Serve,1st Serve Points Won,1st Serve Return Points Won,2nd Serve Points Won,2nd Serve Return Points Won,Aces,Break Points Converted,Break Points Faced,Break Points Opportunities,Break Points Saved,Double Faults,Return Games Played,Return Games Won,Return Points Won,Service Games Played,Service Games Won,Total Points Won,Total Service Points Won
count,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50
unique,50,14,14,12,12,11,46,14,44,48,13,46,47,17,12,49,17,8,10
top,Novak Djokovic,64%,72%,27%,51%,51%,362,41%,335,303,65%,212,530,20%,38%,764,84%,50%,65%
freq,1,9,7,10,9,10,2,8,2,2,6,2,2,6,11,2,6,17,11


In [14]:
# Persist the player statistics to the processed-data folder.
# to_csv writes the index by default, so the file gains an unnamed leading column.
dfStats.to_csv("../data/process/estadisticas.csv")

<hr>
<div>
    <a href="./2_Preparacion_Datos.ipynb">
        <button style="float: right;">2.Preparación de datos &#8594;</button>
    </a>
</div>
<hr>