In [None]:
import pandas as pd
import re
import os

# Folders: CSVs are read from the first and written to the second
input_folder, output_folder = (
    'Tests/player_stats',
    'Tests/player_stats_con_resultado',
)
# Create the output folder up front; an already existing folder is not an error
os.makedirs(output_folder, exist_ok=True)

# Helper that parses a single set token from a score string
def clean_set_score(set_str):
    """Parse one set token such as '7-6(5)' into a pair of integers.

    Parenthesised text (e.g. a tie-break detail) and the literal 'ch' are
    removed before parsing (the meaning of 'ch' in the data is not visible
    here; TODO confirm). Square brackets around a token, as in '[10-8]', are
    stripped so the bracketed numbers are parsed like any other set instead
    of being discarded.

    Args:
        set_str: A single whitespace-free token taken from a score string.

    Returns:
        A tuple (first, second) of ints, or None when the token is not a
        string or does not consist of exactly two integers joined by '-'.
    """
    # None / NaN / numbers cannot be a set token; avoid a TypeError in re.sub
    if not isinstance(set_str, str):
        return None

    cleaned = re.sub(r'\(.*?\)', '', set_str)
    cleaned = cleaned.replace('ch', '')
    # Bracketed tokens like "[10-8]" would otherwise fail int() and be lost
    cleaned = cleaned.replace('[', '').replace(']', '').strip()

    # A token without '-' yields one part, so this also covers that case
    parts = cleaned.split('-')
    if len(parts) != 2:
        return None
    try:
        return int(parts[0]), int(parts[1])
    except ValueError:
        return None

# Win/loss logic, defined once instead of being re-created for every file
def get_winner(row, player_lastname):
    """Classify one match row as a win ('W') or a loss ('L') for a player.

    The 'match' text is split on 'd.'; if `player_lastname` occurs in the part
    before it, the first number of every set is counted as the player's,
    otherwise the second number is.

    Args:
        row: A DataFrame row providing the 'match' and 'Score' fields.
        player_lastname: Text looked up in the part of 'match' before 'd.'.

    Returns:
        'W' if the player took more sets than the opponent, 'L' if fewer, and
        None when 'match' does not split into exactly two parts on 'd.' or
        when the parsed sets are level (including no parsable set at all).
    """
    match = str(row['match'])
    score = str(row['Score'])

    parts = match.split('d.')
    # No 'd.' gives one part and several give more than two: both unusable
    if len(parts) != 2:
        return None
    is_local = player_lastname in parts[0].strip()

    sets_won = 0
    sets_lost = 0
    for set_raw in score.split():
        set_score = clean_set_score(set_raw)
        if set_score is None:
            # Tokens that do not parse as a set are ignored
            continue
        # Orient the pair so the player's games always come first
        own, opp = set_score if is_local else set_score[::-1]
        if own > opp:
            sets_won += 1
        elif own < opp:
            sets_lost += 1

    if sets_won > sets_lost:
        return 'W'
    if sets_lost > sets_won:
        return 'L'
    return None


# Process every CSV file in the input folder (sorted: os.listdir order is arbitrary)
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.csv'):
        continue
    path = os.path.join(input_folder, filename)
    df = pd.read_csv(path)

    # Player's last name from the file name: the trailing capitalised word,
    # falling back to the whole stem when the pattern does not match
    player_fullname = filename.replace('_matches_full.csv', '')
    lastname_matches = re.findall(r'[A-Z][a-z]+$', player_fullname)
    player_lastname = lastname_matches[0] if lastname_matches else player_fullname

    # Label each row; apply() forwards extra keyword arguments to the function
    df['W_or_L'] = df.apply(get_winner, axis=1, player_lastname=player_lastname)

    # Save all original columns plus W_or_L
    output_path = os.path.join(output_folder, filename.replace('.csv', '_con_WorL.csv'))
    df.to_csv(output_path, index=False)
    print(f"Procesado: {filename} → {output_path}")


Procesado: AlexanderZverev_matches_full.csv → Tests/player_stats_con_resultado\AlexanderZverev_matches_full_con_WorL.csv
Procesado: AndreyRublev_matches_full.csv → Tests/player_stats_con_resultado\AndreyRublev_matches_full_con_WorL.csv
Procesado: CarlosAlcaraz_matches_full.csv → Tests/player_stats_con_resultado\CarlosAlcaraz_matches_full_con_WorL.csv
Procesado: CasperRuud_matches_full.csv → Tests/player_stats_con_resultado\CasperRuud_matches_full_con_WorL.csv
Procesado: DaniilMedvedev_matches_full.csv → Tests/player_stats_con_resultado\DaniilMedvedev_matches_full_con_WorL.csv
Procesado: HolgerRune_matches_full.csv → Tests/player_stats_con_resultado\HolgerRune_matches_full_con_WorL.csv
Procesado: HubertHurkacz_matches_full.csv → Tests/player_stats_con_resultado\HubertHurkacz_matches_full_con_WorL.csv
Procesado: JannikSinner_matches_full.csv → Tests/player_stats_con_resultado\JannikSinner_matches_full_con_WorL.csv
Procesado: NovakDjokovic_matches_full.csv → Tests/player_stats_con_resulta

In [8]:
# Folder with the already processed CSVs
processed_folder = 'Tests/player_stats_con_resultado'

# Remove the 'More' column wherever it exists, rewriting the file in place
for filename in os.listdir(processed_folder):
    if not filename.endswith('.csv'):
        continue
    path = os.path.join(processed_folder, filename)
    df = pd.read_csv(path)

    # Files without the column are only reported, never rewritten
    if 'More' not in df.columns:
        print(f"Sin columna 'More': {filename}")
        continue

    df = df.drop(columns=['More'])
    df.to_csv(path, index=False)
    print(f"Columna 'More' eliminada de: {filename}")


Columna 'More' eliminada de: AlexanderZverev_matches_full_con_WorL.csv
Columna 'More' eliminada de: AndreyRublev_matches_full_con_WorL.csv
Columna 'More' eliminada de: CarlosAlcaraz_matches_full_con_WorL.csv
Columna 'More' eliminada de: CasperRuud_matches_full_con_WorL.csv
Columna 'More' eliminada de: DaniilMedvedev_matches_full_con_WorL.csv
Columna 'More' eliminada de: HolgerRune_matches_full_con_WorL.csv
Columna 'More' eliminada de: HubertHurkacz_matches_full_con_WorL.csv
Columna 'More' eliminada de: JannikSinner_matches_full_con_WorL.csv
Columna 'More' eliminada de: NovakDjokovic_matches_full_con_WorL.csv
Columna 'More' eliminada de: StefanosTsitsipas_matches_full_con_WorL.csv


In [9]:

# Columns inspected when counting missing values per row
columnas_objetivo = [
    'DR', 'A%', 'DF%',
    '1stIn', '1st%', '2nd%',
    'BPSvd', 'Time', 'TPW', 'RPW',
    'vA%', 'v1st%', 'v2nd%',
    'BPCnv', 'TP', 'Aces', 'DFs',
    'SP', '1SP', '2SP', 'vA',
]

# Rewrite each CSV keeping only rows with at most 3 nulls in those columns
for filename in os.listdir(processed_folder):
    if not filename.endswith('.csv'):
        continue
    path = os.path.join(processed_folder, filename)
    df = pd.read_csv(path)

    # Only the target columns actually present in this file are checked
    cols_existentes = [col for col in columnas_objetivo if col in df.columns]

    # Per-row null count over that subset; rows above the limit are dropped
    nulos_por_fila = df[cols_existentes].isna().sum(axis=1)
    df_filtrado = df[nulos_por_fila <= 3]
    antes, despues = len(df), len(df_filtrado)

    df_filtrado.to_csv(path, index=False)
    print(f"{filename}: filas eliminadas = {antes - despues}")


AlexanderZverev_matches_full_con_WorL.csv: filas eliminadas = 59
AndreyRublev_matches_full_con_WorL.csv: filas eliminadas = 81
CarlosAlcaraz_matches_full_con_WorL.csv: filas eliminadas = 35
CasperRuud_matches_full_con_WorL.csv: filas eliminadas = 86
DaniilMedvedev_matches_full_con_WorL.csv: filas eliminadas = 113
HolgerRune_matches_full_con_WorL.csv: filas eliminadas = 69
HubertHurkacz_matches_full_con_WorL.csv: filas eliminadas = 86
JannikSinner_matches_full_con_WorL.csv: filas eliminadas = 57
NovakDjokovic_matches_full_con_WorL.csv: filas eliminadas = 145
StefanosTsitsipas_matches_full_con_WorL.csv: filas eliminadas = 121


In [None]:


# Print the number of nulls per column for every CSV in the processed folder
for filename in os.listdir(processed_folder):
    if not filename.endswith('.csv'):
        continue
    path = os.path.join(processed_folder, filename)
    df = pd.read_csv(path)

    print(f"Nulos en columnas de {filename}:")
    print(df.isna().sum())
    print("-" * 40)

Nulos en columnas de AlexanderZverev_matches_full_con_WorL.csv:
Date           0
Tournament     0
Surface        0
Rd             0
Rk             1
vRk            3
match          0
Score          0
DR             0
A%             0
DF%            0
1stIn          0
1st%           0
2nd%           0
BPSvd          0
Time          46
TPW            0
RPW            0
vA%            0
v1st%          0
v2nd%          0
BPCnv          0
TP             0
Aces           0
DFs            0
SP             0
1SP            0
2SP            0
vA             0
W_or_L         2
dtype: int64
----------------------------------------
Nulos en columnas de AndreyRublev_matches_full_con_WorL.csv:
Date           0
Tournament     0
Surface        0
Rd             0
Rk             0
vRk            1
match          0
Score          0
DR             0
A%             0
DF%            0
1stIn          0
1st%           0
2nd%           0
BPSvd          0
Time          28
TPW            0
RPW            0
vA%  

In [10]:
# Rows with a null 'W_or_L' are attributed to tournaments with a unique,
# non-standardised format, so we choose to drop them.

# Keep only rows whose 'W_or_L' is set, rewriting each CSV in place
for filename in os.listdir(processed_folder):
    if not filename.endswith('.csv'):
        continue
    path = os.path.join(processed_folder, filename)
    df = pd.read_csv(path)

    # Boolean mask of rows that have a result label
    tiene_resultado = df['W_or_L'].notna()
    df_filtrado = df[tiene_resultado]
    antes, despues = len(df), len(df_filtrado)

    df_filtrado.to_csv(path, index=False)
    print(f"{filename}: filas eliminadas (nulos en 'W_or_L') = {antes - despues}")

# Print the number of nulls per column for every file after the clean-up
for filename in os.listdir(processed_folder):
    if not filename.endswith('.csv'):
        continue
    path = os.path.join(processed_folder, filename)
    df = pd.read_csv(path)

    print(f"Nulos en columnas de {filename}:")
    print(df.isna().sum())
    print("-" * 40)

AlexanderZverev_matches_full_con_WorL.csv: filas eliminadas (nulos en 'W_or_L') = 2
AndreyRublev_matches_full_con_WorL.csv: filas eliminadas (nulos en 'W_or_L') = 0
CarlosAlcaraz_matches_full_con_WorL.csv: filas eliminadas (nulos en 'W_or_L') = 3
CasperRuud_matches_full_con_WorL.csv: filas eliminadas (nulos en 'W_or_L') = 0
DaniilMedvedev_matches_full_con_WorL.csv: filas eliminadas (nulos en 'W_or_L') = 4
HolgerRune_matches_full_con_WorL.csv: filas eliminadas (nulos en 'W_or_L') = 0
HubertHurkacz_matches_full_con_WorL.csv: filas eliminadas (nulos en 'W_or_L') = 0
JannikSinner_matches_full_con_WorL.csv: filas eliminadas (nulos en 'W_or_L') = 0
NovakDjokovic_matches_full_con_WorL.csv: filas eliminadas (nulos en 'W_or_L') = 2
StefanosTsitsipas_matches_full_con_WorL.csv: filas eliminadas (nulos en 'W_or_L') = 3
Nulos en columnas de AlexanderZverev_matches_full_con_WorL.csv:
Date           0
Tournament     0
Surface        0
Rd             0
Rk             1
vRk            3
match          