# Imports y funciones

In [1]:
# # 📦 Importar librerías necesarias
import pyspark
import pandas as pd
import numpy as np
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import trim
from pyspark.sql.types import *

import os

In [2]:
import kagglehub

# Descargar el dataset desde Kaggle Hub
path = kagglehub.dataset_download("davidcariboo/player-scores")

print("Path to dataset files:", path)

# Lista los archivos descargados
for root, _, files in os.walk(path):
    for file in files:
        print(os.path.join(root, file))

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/bigdata/.cache/kagglehub/datasets/davidcariboo/player-scores/versions/602
/home/bigdata/.cache/kagglehub/datasets/davidcariboo/player-scores/versions/602/club_games.csv
/home/bigdata/.cache/kagglehub/datasets/davidcariboo/player-scores/versions/602/game_events.csv
/home/bigdata/.cache/kagglehub/datasets/davidcariboo/player-scores/versions/602/game_lineups.csv
/home/bigdata/.cache/kagglehub/datasets/davidcariboo/player-scores/versions/602/clubs.csv
/home/bigdata/.cache/kagglehub/datasets/davidcariboo/player-scores/versions/602/player_valuations.csv
/home/bigdata/.cache/kagglehub/datasets/davidcariboo/player-scores/versions/602/appearances.csv
/home/bigdata/.cache/kagglehub/datasets/davidcariboo/player-scores/versions/602/games.csv
/home/bigdata/.cache/kagglehub/datasets/davidcariboo/player-scores/versions/602/transfers.csv
/home/bigdata/.cache/kagglehub/datasets/davidcariboo/player-scores/versions/602/competitions.csv
/home/bigdata/.cache/kagglehub/datasets/

In [3]:
spark = SparkSession.builder.appName('tfm_2025').config("spark.driver.memory", "16g").config("spark.executor.memory", "16g").getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/13 12:52:03 WARN Utils: Your hostname, DESKTOP-JCU85UN, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/06/13 12:52:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/13 12:52:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Funciones utiles para la limpieza

In [4]:
def mostrar_sumario(df):
    """
    Muestra un sumario del dataset.
    Explicacion:
        - Usa la funcion 'describe()' para mostrar las siguientes metricas:
            - El conteo de filas por columna.
            - La media de los valores de las columnas numericas (muestra NULL si no es numerica).
            - La desviacion  estandar de los valores de las columnas numericas (muestra NULL si no es numerica).
            - El valor minimo de cada columna.
            - El valor maximo de cada columna.
        - Devuelve el DataFrame con las metricas.
    Args:
        df: DataFrame a analizar

    Returns:
        Un DataFrame con las métricas del dataset.
    """
    return df.summary()

In [5]:
def contar_nulos_por_columna(df):
    """
    Calcula el numero de valores nulos en cada columna del DataFrame.
    Explicacion:
        - Itera sobre cada columna del DataFrame utilizando 'df.columns'.
        - Usa la funcion 'expr()' que utiliza una expresion SQL para contar los valores nulos en cada columna.
        - Asigna '1' si el valor es nulo y '0' si no es nulo.
        - Renombra la columna con '.alias(c)' para mantener el nombre original.
        - Selecciona y aplica la transformacion en todas las columnas dinamicamente con 'df.select([...])'.
        - Devuelve el DataFrame con los conteos de nulos.
    Args:
        df: DataFrame a analizar

    Returns:
        Un DataFrame con las mismas columnas del DataFrame original
        y una fila que indica el numero de valores nulos por columna.
    """
    return df.select([expr(f"sum(case when {c} is null then 1 else 0 end)").alias(c) for c in df.columns])

In [6]:
def duplicados_por_columna(df, columna):
    """
    Muestra los valores duplicados y su conteo para una columna especifica.
    Explicacion:
        - Agrupa el DataFrame por la columna especificada.
        - Cuenta el numero de ocurrencias de cada valor en la columna.
        - Filtra para ver los valores que aparecen mas de una vez.
    Args:
        df: DataFrame a analizar.
        columna: Nombre de la columna a verificar.
    """
    df.groupBy(columna).count().filter(col("count") > 1).show(truncate=False)

# Archivo "clubs.csv"

In [7]:
# Cargar el csv
df_clubs = spark.read.csv(path + "/clubs.csv", header=True, inferSchema=True)

# Ver las primeras filas
df_clubs.show(10)
df_clubs.printSchema()
df_clubs.columns

+-------+-----------------+--------------------+-----------------------+------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|club_id|        club_code|                name|domestic_competition_id|total_market_value|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|        stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|            filename|                 url|
+-------+-----------------+--------------------+-----------------------+------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|    105|  sv-darmstadt-98|     SV Darmstadt 98|                     L1|              NULL|        27|       

['club_id',
 'club_code',
 'name',
 'domestic_competition_id',
 'total_market_value',
 'squad_size',
 'average_age',
 'foreigners_number',
 'foreigners_percentage',
 'national_team_players',
 'stadium_name',
 'stadium_seats',
 'net_transfer_record',
 'coach_name',
 'last_season',
 'filename',
 'url']

In [8]:
mostrar_sumario(df_clubs).show()

25/06/12 11:48:18 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 3:>                                                          (0 + 1) / 1]

+-------+------------------+--------------------+--------------------+-----------------------+------------------+------------------+------------------+------------------+---------------------+---------------------+--------------------+------------------+-------------------+----------+------------------+--------------------+--------------------+
|summary|           club_id|           club_code|                name|domestic_competition_id|total_market_value|        squad_size|       average_age| foreigners_number|foreigners_percentage|national_team_players|        stadium_name|     stadium_seats|net_transfer_record|coach_name|       last_season|            filename|                 url|
+-------+------------------+--------------------+--------------------+-----------------------+------------------+------------------+------------------+------------------+---------------------+---------------------+--------------------+------------------+-------------------+----------+------------------+--

                                                                                

In [9]:
contar_nulos_por_columna(df_clubs).show()

+-------+---------+----+-----------------------+------------------+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+
|club_id|club_code|name|domestic_competition_id|total_market_value|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|filename|url|
+-------+---------+----+-----------------------+------------------+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+
|      0|        0|   0|                      0|               439|         0|         38|                0|                   49|                    0|           0|            0|                  0|       439|          0|       0|  0|
+-------+---------+----+-----------------------+--------

In [10]:
duplicados_por_columna(df_clubs, "club_id")

+-------+-----+
|club_id|count|
+-------+-----+
+-------+-----+



In [11]:
# Comparar numero de filas totales con numero de nulos en 'total_market_value'
print(f"Filas totales: {df_clubs.count()}")
print(f"Nulos en la columna 'total_market_value': {contar_nulos_por_columna(df_clubs).select('total_market_value').first()[0]}")

Filas totales: 439
Nulos en la columna 'total_market_value': 439


In [12]:
# La columna 'total_market_value' esta completamente vacia, podemos eliminarla
df_clubs = df_clubs.drop("total_market_value")
df_clubs.show()

+-------+-------------------+--------------------+-----------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|club_id|          club_code|                name|domestic_competition_id|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|        stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|            filename|                 url|
+-------+-------------------+--------------------+-----------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|    105|    sv-darmstadt-98|     SV Darmstadt 98|                     L1|        27|       25.6|               13|                 48.1|                    1|M

In [13]:
# Ver si hay registros en la columna 'name' con caracteres especiales
df_clubs.filter(col('name').rlike('[^a-zA-Z0-9- ]')).show()

+-------+--------------------+--------------------+-----------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|club_id|           club_code|                name|domestic_competition_id|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|        stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|            filename|                 url|
+-------+--------------------+--------------------+-----------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|    114|   besiktas-istanbul|Beşiktaş Jimnasti...|                    TR1|        30|       26.6|               15|                 50.0|                   

In [14]:
# Varios nombres de clubes usan caracteres de otros alfabetos, decidimos dejar esos nombres como estan
# pero eliminar otros caracteres como numeros o parentesis.
# La expresion '[^\\p{L} ]' acepta letras de diferentes alfabetos y espacios en blanco
df_clubs = df_clubs.withColumn("name", regexp_replace(col("name"), "[^\\p{L} ]", ""))
# Ver nombres de clubs tras la sustitucion
df_clubs.select("name").distinct().show(truncate=False)

+-------------------------------------+
|name                                 |
+-------------------------------------+
| Fußballclub Union Berlin            |
|PAS Lamia                            |
|FC Schalke                           |
|SV Darmstadt                         |
|Kieler Sportvereinigung Holstein von |
|FC Sochi                             |
|Beerschot AC                         |
|FC Ingulets Petrove                  |
|Gaziantep Futbol Kulübü AŞ           |
|FK Krasnodar                         |
|Lyngby Boldklubben af                |
|FK Kolos Kovalivka                   |
|Koninklijke Atletiek Associatie Gent |
|GFC Ajaccio                          |
|SKA Khabarovsk                       |
|SC Dnipro                            |
|Samsunspor                           |
|FußballClub Augsburg                 |
|Adana Demirspor Kulübü               |
|Dundee Football Club                 |
+-------------------------------------+
only showing top 20 rows


In [15]:
# Ver nulos de la columna 'average_age'
df_clubs.filter(col('average_age').isNull()).show()

+-------+--------------------+--------------------+-----------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|club_id|           club_code|                name|domestic_competition_id|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|        stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|            filename|                 url|
+-------+--------------------+--------------------+-----------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|  21957|        ao-platanias|        AO Platanias|                    GR1|         0|       NULL|                0|                 NULL|                   

In [16]:
# Calcular la media de 'average_age' por liga (excluyendo nulos) y redondear a 1 decimal
avg_age_by_league_df = df_clubs.filter(col("average_age").isNotNull()).groupBy("domestic_competition_id").agg(round(avg("average_age"), 1).alias("avg_age_by_league"))

# Unir la nueva columna al dataset
df_clubs = df_clubs.join(avg_age_by_league_df, on="domestic_competition_id", how="left_outer")

# Ver nueva columna
df_clubs.select("domestic_competition_id", "avg_age_by_league").show()

+-----------------------+-----------------+
|domestic_competition_id|avg_age_by_league|
+-----------------------+-----------------+
|                     L1|             25.8|
|                    RU1|             25.9|
|                    TR1|             25.4|
|                    IT1|             25.8|
|                    GB1|             26.1|
|                    BE1|             24.5|
|                    DK1|             24.8|
|                    DK1|             24.8|
|                    NL1|             24.5|
|                    TR1|             25.4|
|                    GR1|             26.4|
|                    GR1|             26.4|
|                    DK1|             24.8|
|                    PO1|             25.6|
|                    SC1|             26.2|
|                     L1|             25.8|
|                    BE1|             24.5|
|                    BE1|             24.5|
|                    PO1|             25.6|
|                    BE1|       

In [17]:
# Sustituir nulos por 'avg_age_by_league'
df_clubs = df_clubs.withColumn(
    "average_age",
    when(col("average_age").isNull(), col("avg_age_by_league")).otherwise(col("average_age"))
)

# Eliminar columna de media por liga
df_clubs = df_clubs.drop("avg_age_by_league")

# Comprobar de nuevo los nulos despues de la sustitucion
df_clubs.filter(col('average_age').isNull()).show()

+-----------------------+-------+---------+----+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+
|domestic_competition_id|club_id|club_code|name|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|filename|url|
+-----------------------+-------+---------+----+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+
+-----------------------+-------+---------+----+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+



In [18]:
# Ver nulos de la columna 'foreigners_percentage'
df_clubs.filter(col('foreigners_percentage').isNull()).show()

+-----------------------+-------+------------------+-------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|domestic_competition_id|club_id|         club_code|               name|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|        stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|            filename|                 url|
+-----------------------+-------+------------------+-------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|                    GR1|  21957|      ao-platanias|       AO Platanias|         0|       26.4|                0|                 NULL|                    0|Dimotiko 

In [19]:
# Calcular la media de 'foreigners_percentage' por liga (excluyendo nulos) y redondear a 1 decimal
avg_foreigners_percentage_df = df_clubs.filter(col("foreigners_percentage").isNotNull()).groupBy("domestic_competition_id").agg(round(avg("foreigners_percentage"), 1).alias("avg_foreigners_percentage_by_league"))

# Unir la nueva columna al dataset
df_clubs = df_clubs.join(avg_foreigners_percentage_df, on="domestic_competition_id", how="left_outer")

# Ver nueva columna
df_clubs.select("domestic_competition_id", "avg_foreigners_percentage_by_league").show()

+-----------------------+-----------------------------------+
|domestic_competition_id|avg_foreigners_percentage_by_league|
+-----------------------+-----------------------------------+
|                     L1|                               46.7|
|                    RU1|                               32.0|
|                    TR1|                               34.7|
|                    IT1|                               50.6|
|                    GB1|                               61.2|
|                    BE1|                               59.5|
|                    DK1|                               42.3|
|                    DK1|                               42.3|
|                    NL1|                               42.6|
|                    TR1|                               34.7|
|                    GR1|                               46.6|
|                    GR1|                               46.6|
|                    DK1|                               42.3|
|       

In [20]:
# Sustituir nulos por 'avg_foreigners_percentage_by_league'
df_clubs = df_clubs.withColumn(
    "foreigners_percentage",
    when(col("foreigners_percentage").isNull(), col("avg_foreigners_percentage_by_league")).otherwise(col("foreigners_percentage"))
)

# Eliminar tabla de media por liga
df_clubs = df_clubs.drop("avg_foreigners_percentage_by_league")

# Comprobar de nuevo los nulos despues de la sustitucion
contar_nulos_por_columna(df_clubs).show()

+-----------------------+-------+---------+----+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+
|domestic_competition_id|club_id|club_code|name|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|filename|url|
+-----------------------+-------+---------+----+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+
|                      0|      0|        0|   0|         0|          0|                0|                    0|                    0|           0|            0|                  0|       439|          0|       0|  0|
+-----------------------+-------+---------+----+----------+-----------+-----------------+---------------------+---------------------

In [21]:
# Eliminamos las columnas 'filename' y 'url' porque no son relevantes para nuestro analisis
# Tambien eliminamos la columna 'coach_name' porque esa informacion ya esta en otra tabla
df_clubs = df_clubs.drop("filename", "url", "coach_name")
df_clubs.show(10)

+-----------------------+-------+-----------------+--------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+-----------+
|domestic_competition_id|club_id|        club_code|                name|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|        stadium_name|stadium_seats|net_transfer_record|last_season|
+-----------------------+-------+-----------------+--------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+-----------+
|                     L1|    105|  sv-darmstadt-98|       SV Darmstadt |        27|       25.6|               13|                 48.1|                    1|Merck-Stadion am ...|        17810|            +€3.05m|       2023|
|                    RU1|  11127|ural-ekaterinburg|  Ural Yekaterinburg|        30|       26.5|     

# Archivo "competitions.csv"

In [22]:
# Cargar el csv
df_competitions = spark.read.csv(path + "/competitions.csv", header=True, inferSchema=True)

# Ver las primeras filas
df_competitions.show(10)
df_competitions.printSchema()
df_competitions.columns

+--------------+--------------------+--------------------+------------------+-----------------+----------+------------+--------------------+-------------+--------------------+------------------------+
|competition_id|    competition_code|                name|          sub_type|             type|country_id|country_name|domestic_league_code|confederation|                 url|is_major_national_league|
+--------------+--------------------+--------------------+------------------+-----------------+----------+------------+--------------------+-------------+--------------------+------------------------+
|           CIT|           italy-cup|           italy-cup|      domestic_cup|     domestic_cup|        75|       Italy|                 IT1|       europa|https://www.trans...|                   false|
|          NLSC|johan-cruijff-schaal|johan-cruijff-schaal|domestic_super_cup|            other|       122| Netherlands|                 NL1|       europa|https://www.trans...|                   fa

['competition_id',
 'competition_code',
 'name',
 'sub_type',
 'type',
 'country_id',
 'country_name',
 'domestic_league_code',
 'confederation',
 'url',
 'is_major_national_league']

In [23]:
mostrar_sumario(df_competitions).show()

+-------+--------------+-------------------+-------------------+--------------+------------+-----------------+------------+--------------------+-------------+--------------------+
|summary|competition_id|   competition_code|               name|      sub_type|        type|       country_id|country_name|domestic_league_code|confederation|                 url|
+-------+--------------+-------------------+-------------------+--------------+------------+-----------------+------------+--------------------+-------------+--------------------+
|  count|            44|                 44|                 44|            44|          44|               44|          36|                  36|           44|                  44|
|   mean|          NULL|               NULL|               NULL|          NULL|        NULL|94.86363636363636|        NULL|                NULL|         NULL|                NULL|
| stddev|          NULL|               NULL|               NULL|          NULL|        NULL|70.51887

In [24]:
contar_nulos_por_columna(df_competitions).show()

+--------------+----------------+----+--------+----+----------+------------+--------------------+-------------+---+------------------------+
|competition_id|competition_code|name|sub_type|type|country_id|country_name|domestic_league_code|confederation|url|is_major_national_league|
+--------------+----------------+----+--------+----+----------+------------+--------------------+-------------+---+------------------------+
|             0|               0|   0|       0|   0|         0|           8|                   8|            0|  0|                       0|
+--------------+----------------+----+--------+----+----------+------------+--------------------+-------------+---+------------------------+



In [25]:
duplicados_por_columna(df_competitions, "competition_id")

+--------------+-----+
|competition_id|count|
+--------------+-----+
+--------------+-----+



In [26]:
# Ver nulos en 'country_name'
df_competitions.filter(col('country_name').isNull()).show()

+--------------+--------------------+--------------------+--------------------+-----------------+----------+------------+--------------------+-------------+--------------------+------------------------+
|competition_id|    competition_code|                name|            sub_type|             type|country_id|country_name|domestic_league_code|confederation|                 url|is_major_national_league|
+--------------+--------------------+--------------------+--------------------+-----------------+----------+------------+--------------------+-------------+--------------------+------------------------+
|           USC|      uefa-super-cup|      uefa-super-cup|      uefa_super_cup|international_cup|        -1|        NULL|                NULL|       europa|https://www.trans...|                   false|
|            EL|       europa-league|       europa-league|       europa_league|international_cup|        -1|        NULL|                NULL|       europa|https://www.trans...|           

In [27]:
# Las filas con 'country_id'= -1 y 'country_name'= null son de competiciones internacionales
# Sustituimos 'country_name' por 'International', 'country_id' por '1' y 'domestic_league_code' por 'INT'
df_competitions = df_competitions.withColumn(
    "country_name",
    when(col("country_id") == -1, "International").otherwise(col("country_name"))
)

df_competitions = df_competitions.withColumn(
    "country_id",
    when(col("country_id") == -1, 1).otherwise(col("country_id"))
)

df_competitions = df_competitions.withColumn(
    "domestic_league_code",
    when(col("country_id") == 1, "INT").otherwise(col("domestic_league_code"))
)

contar_nulos_por_columna(df_competitions).show()

+--------------+----------------+----+--------+----+----------+------------+--------------------+-------------+---+------------------------+
|competition_id|competition_code|name|sub_type|type|country_id|country_name|domestic_league_code|confederation|url|is_major_national_league|
+--------------+----------------+----+--------+----+----------+------------+--------------------+-------------+---+------------------------+
|             0|               0|   0|       0|   0|         0|           0|                   0|            0|  0|                       0|
+--------------+----------------+----+--------+----+----------+------------+--------------------+-------------+---+------------------------+



In [28]:
# La columna 'is_major_national_league' es de tipo booleano
# Convertimos la columna a numerico para poder usarla al entrenar nuestro modelo
df_competitions = df_competitions.withColumn(
    "is_major_national_league",
    col("is_major_national_league").cast(IntegerType())
)
# Ver si el cambio se hizo correctamente
df_competitions.printSchema()
df_competitions.show()

root
 |-- competition_id: string (nullable = true)
 |-- competition_code: string (nullable = true)
 |-- name: string (nullable = true)
 |-- sub_type: string (nullable = true)
 |-- type: string (nullable = true)
 |-- country_id: integer (nullable = true)
 |-- country_name: string (nullable = true)
 |-- domestic_league_code: string (nullable = true)
 |-- confederation: string (nullable = true)
 |-- url: string (nullable = true)
 |-- is_major_national_league: integer (nullable = true)

+--------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+--------------------+-------------+--------------------+------------------------+
|competition_id|    competition_code|                name|            sub_type|             type|country_id| country_name|domestic_league_code|confederation|                 url|is_major_national_league|
+--------------+--------------------+--------------------+--------------------+-----------------+-------

In [29]:
# Eliminar la columna 'url' porque no es relevante para nuestro analisis
df_competitions = df_competitions.drop("url")
df_competitions.show()

+--------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+--------------------+-------------+------------------------+
|competition_id|    competition_code|                name|            sub_type|             type|country_id| country_name|domestic_league_code|confederation|is_major_national_league|
+--------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+--------------------+-------------+------------------------+
|           CIT|           italy-cup|           italy-cup|        domestic_cup|     domestic_cup|        75|        Italy|                 IT1|       europa|                       0|
|          NLSC|johan-cruijff-schaal|johan-cruijff-schaal|  domestic_super_cup|            other|       122|  Netherlands|                 NL1|       europa|                       0|
|           GRP|     kypello-elladas|     kypello-elladas|        domestic_cup|     d

# Archivo "game_events.csv"

In [27]:
# Cargar el csv
df_game_events = spark.read.csv(path + "/game_events.csv", header=True, inferSchema=True)

# Ver las primeras filas
df_game_events.show(10)
df_game_events.printSchema()
df_game_events.columns

+--------------------+----------+-------+------+-------------+-------+---------+--------------------+------------+----------------+
|       game_event_id|      date|game_id|minute|         type|club_id|player_id|         description|player_in_id|player_assist_id|
+--------------------+----------+-------+------+-------------+-------+---------+--------------------+------------+----------------+
|2f41da30c471492e7...|2012-08-05|2211607|    77|        Cards|    610|     4425|1. Yellow card  ,...|        NULL|            NULL|
|a72f7186d132775f2...|2012-08-05|2211607|    77|        Cards|    383|    33210|1. Yellow card  ,...|        NULL|            NULL|
|b2d721eaed4692a5c...|2012-08-05|2211607|     3|        Goals|    383|    36500|, Header, 1. Tour...|        NULL|           56416|
|aef768899cedac0c9...|2012-08-05|2211607|    53|        Goals|    383|    36500|, Right-footed sh...|        NULL|          146258|
|5d6d9533023057b66...|2012-08-05|2211607|    74|Substitutions|    383|    36

['game_event_id',
 'date',
 'game_id',
 'minute',
 'type',
 'club_id',
 'player_id',
 'description',
 'player_in_id',
 'player_assist_id']

In [28]:
mostrar_sumario(df_game_events).show()

[Stage 74:>                                                         (0 + 1) / 1]

+-------+--------------------+------------------+-----------------+-------------+------------------+------------------+--------------------+------------------+-----------------+
|summary|       game_event_id|           game_id|           minute|         type|           club_id|         player_id|         description|      player_in_id| player_assist_id|
+-------+--------------------+------------------+-----------------+-------------+------------------+------------------+--------------------+------------------+-----------------+
|  count|             1035043|           1035043|          1035043|      1035043|           1035043|           1035043|              947716|            497678|           156759|
|   mean|            Infinity|3232166.6426525274|61.82031857613645|         NULL| 4956.375424982344|234649.75911049105|                NULL| 281129.1353606147|211605.5213544358|
| stddev|                 NaN| 680074.1692634815|23.58611694335326|         NULL|11822.249636871984| 212023.86

                                                                                

In [29]:
contar_nulos_por_columna(df_game_events).show()

+-------------+----+-------+------+----+-------+---------+-----------+------------+----------------+
|game_event_id|date|game_id|minute|type|club_id|player_id|description|player_in_id|player_assist_id|
+-------------+----+-------+------+----+-------+---------+-----------+------------+----------------+
|            0|   0|      0|     0|   0|      0|        0|      87327|      537365|          878284|
+-------------+----+-------+------+----+-------+---------+-----------+------------+----------------+



In [30]:
duplicados_por_columna(df_game_events, "game_event_id")

+-------------+-----+
|game_event_id|count|
+-------------+-----+
+-------------+-----+



In [31]:
# Mostrar los diferentes tipos de eventos
df_game_events.select("type").distinct().show()

+-------------+
|         type|
+-------------+
|        Goals|
|        Cards|
|Substitutions|
|     Shootout|
+-------------+



In [32]:
# Ver nulos de la columna 'description'
df_game_events.filter(col('description').isNull()).show()

+--------------------+----------+-------+------+-------------+-------+---------+-----------+------------+----------------+
|       game_event_id|      date|game_id|minute|         type|club_id|player_id|description|player_in_id|player_assist_id|
+--------------------+----------+-------+------+-------------+-------+---------+-----------+------------+----------------+
|e4d512124476ec941...|2012-08-22|2235964|    81|Substitutions|  10468|   121235|       NULL|      121027|            NULL|
|ac641aeaaf272da2f...|2012-08-22|2235964|    66|Substitutions|  10468|   201284|       NULL|      113699|            NULL|
|9e4d13d444aa9315f...|2013-02-17|2240152|    81|Substitutions|   1465|    17352|       NULL|      125700|            NULL|
|4bc0bb78995d14303...|2013-02-17|2240152|    62|Substitutions|   1301|    24880|       NULL|      237663|            NULL|
|b08ec33cbbcb9be73...|2013-02-17|2240152|    65|Substitutions|   1465|    42303|       NULL|      150535|            NULL|
|2c3f355afc76f8a

In [33]:
# Ver valores unicos de 'description'
df_game_events.select("description").distinct().show(truncate=False)

+------------------------------------------------------------------------------------------+
|description                                                                               |
+------------------------------------------------------------------------------------------+
|, Own-goal Assist: , Header, 3. Assist of the Season                                      |
|, Header, 12. Goal of the Season Assist: , Cross, 3. Assist of the Season                 |
|, Header, 9. Goal of the Season Assist: , Shot on goal, 7. Assist of the Season           |
|, Right-footed shot, 13. Goal of the Season Assist: , Pass, 11. Assist of the Season      |
|, Header, 10. Goal of the Season Assist: , Cross, 3. Assist of the Season                 |
|, Left-footed shot, 9. Goal of the Season Assist: , Cross, 5. Assist of the Season        |
|, Tap-in, 1. Goal of the Season Assist: , Pass, 1. Assist of the Season                   |
|, Tap-in, 3. Goal of the Season Assist: , Header, 3. Assist of the Se

In [34]:
# Debido a la gran cantidad de valores nulos y formatos confusos,
# consideramos que es mejor eliminar la columna 'description', pues ya tenemos el tipo de jugada en la columna 'type'
df_game_events = df_game_events.drop("description")
df_game_events.show()

+--------------------+----------+-------+------+-------------+-------+---------+------------+----------------+
|       game_event_id|      date|game_id|minute|         type|club_id|player_id|player_in_id|player_assist_id|
+--------------------+----------+-------+------+-------------+-------+---------+------------+----------------+
|2f41da30c471492e7...|2012-08-05|2211607|    77|        Cards|    610|     4425|        NULL|            NULL|
|a72f7186d132775f2...|2012-08-05|2211607|    77|        Cards|    383|    33210|        NULL|            NULL|
|b2d721eaed4692a5c...|2012-08-05|2211607|     3|        Goals|    383|    36500|        NULL|           56416|
|aef768899cedac0c9...|2012-08-05|2211607|    53|        Goals|    383|    36500|        NULL|          146258|
|5d6d9533023057b66...|2012-08-05|2211607|    74|Substitutions|    383|    36500|       49499|            NULL|
|eef9c46dd75c3aa4c...|2012-08-05|2211607|    11|        Goals|    383|    38497|        NULL|           33210|
|

Los valores vacíos en las columnas 'player_in_id' y 'player_assist_id' tienen sentido ya que dependen del tipo de evento; si el evento es 'Substitutions', sólo hace falta el 'player_in_id'; si el evento es 'Goals', sólo hace falta el 'player_assist_id'.

# Archivo "game_lineups.csv"

In [7]:
# Este archivo tiene problemas cargando el esquema, decidimos crear un esquema personalizado para este archivo
custom_schema = StructType([
    StructField("game_lineups_id", StringType(), True),
    StructField("date", DateType(), True),
    StructField("game_id", IntegerType(), True),
    StructField("player_id", IntegerType(), True),
    StructField("club_id", IntegerType(), True),
    StructField("player_name", StringType(), True),
    StructField("type", StringType(), True),
    StructField("position", StringType(), True),
    StructField("number", StringType(), True),
    StructField("team_captain", IntegerType(), True)
])
# Cargar el csv
df_game_lineups = spark.read.csv(path + "/game_lineups.csv", header=True, schema=custom_schema)

# Ver las primeras filas
df_game_lineups.show(10)
df_game_lineups.printSchema()
df_game_lineups.columns

+--------------------+----------+-------+---------+-------+-----------------+---------------+------------------+------+------------+
|     game_lineups_id|      date|game_id|player_id|club_id|      player_name|           type|          position|number|team_captain|
+--------------------+----------+-------+---------+-------+-----------------+---------------+------------------+------+------------+
|b2dbe01c3656b06c8...|2013-07-27|2317258|     1443|    610|Christian Poulsen|    substitutes|Defensive Midfield|     5|           0|
|b50a3ec6d52fd1490...|2013-07-27|2317258|     5017|    610| Niklas Moisander|starting_lineup|       Centre-Back|     4|           0|
|7d890e6d0ff8af84b...|2013-07-27|2317258|     9602|   1090|  Maarten Martens|    substitutes|       Left Winger|    11|           0|
|8c355268678b9bbc7...|2013-07-27|2317258|    12282|    610|      Daley Blind|starting_lineup|         Left-Back|    17|           0|
|76193074d549e5fdc...|2013-07-27|2317258|    25427|   1090|      Roy 

['game_lineups_id',
 'date',
 'game_id',
 'player_id',
 'club_id',
 'player_name',
 'type',
 'position',
 'number',
 'team_captain']

In [None]:
mostrar_sumario(df_game_lineups).show()

25/06/12 11:57:42 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 1:>                                                        (0 + 12) / 12]

In [None]:
contar_nulos_por_columna(df_game_lineups).show()

+---------------+-----+-------+---------+-------+-----------+-----+--------+------+------------+
|game_lineups_id| date|game_id|player_id|club_id|player_name| type|position|number|team_captain|
+---------------+-----+-------+---------+-------+-----------+-----+--------+------+------------+
|              0|93378|  93378|    93378|  93378|      93378|93378|   93381| 93378|      140067|
+---------------+-----+-------+---------+-------+-----------+-----+--------+------+------------+



In [38]:
duplicados_por_columna(df_game_lineups, "game_lineups_id")

+--------------------+-----+
|game_lineups_id     |count|
+--------------------+-----+
|                  77|401  |
|                  92|55   |
|                  91|132  |
|                  96|18   |
|                  74|37   |
|                  45|217  |
|                  69|57   |
|                  99|386  |
|                  68|40   |
|                  54|79   |
|                  29|799  |
|                  26|700  |
|                  55|195  |
|                  37|341  |
|                  17|1430 |
|                  24|880  |
|                  44|369  |
|                  90|149  |
|                  38|262  |
|                  63|59   |
+--------------------+-----+
only showing top 20 rows


In [None]:
df_game_lineups.filter(trim(col("game_lineups_id")) == "77").show()

+--------------------+----+-------+---------+-------+-----------+----+--------+------+------------+
|     game_lineups_id|date|game_id|player_id|club_id|player_name|type|position|number|team_captain|
+--------------------+----+-------+---------+-------+-----------+----+--------+------+------------+
|                  77|NULL|   NULL|     NULL|   NULL|       NULL|NULL|    NULL|  NULL|        NULL|
|                  77|NULL|   NULL|     NULL|   NULL|       NULL|NULL|    NULL|  NULL|        NULL|
|                  77|NULL|   NULL|     NULL|   NULL|       NULL|NULL|    NULL|  NULL|        NULL|
|                  77|NULL|   NULL|     NULL|   NULL|       NULL|NULL|    NULL|  NULL|        NULL|
|                  77|NULL|   NULL|     NULL|   NULL|       NULL|NULL|    NULL|  NULL|        NULL|
|                  77|NULL|   NULL|     NULL|   NULL|       NULL|NULL|    NULL|  NULL|        NULL|
|                  77|NULL|   NULL|     NULL|   NULL|       NULL|NULL|    NULL|  NULL|        NULL|


In [40]:
# Parece que algunas filas se introdujeron incorrectamente y no tienen valores
# Eliminaremos esas filas

duplicated_ids_df = df_game_lineups.groupBy("game_lineups_id").count().filter(col("count") > 1).select("game_lineups_id")

# Convertir el DataFrame de IDs duplicados a una lista de strings
# Esto es necesario para usarlo en una operacion de filtrado
duplicated_ids_list = [row.game_lineups_id for row in duplicated_ids_df.collect()]

# Eliminar las filas con IDs duplicados del DataFrame original
df_game_lineups_cleaned = df_game_lineups.filter(~col("game_lineups_id").isin(duplicated_ids_list))

# Mostrar el conteo de filas antes y despues para verificar la eliminacion
print(f"Filas antes de eliminar duplicados: {df_game_lineups.count()}")
print(f"Filas despues de eliminar duplicados: {df_game_lineups_cleaned.count()}")

Filas antes de eliminar duplicados: 2285289
Filas despues de eliminar duplicados: 2191911


In [41]:
# Ver nulos despues de eliminar duplicados
contar_nulos_por_columna(df_game_lineups_cleaned).show()

+---------------+----+-------+---------+-------+-----------+----+--------+------+------------+
|game_lineups_id|date|game_id|player_id|club_id|player_name|type|position|number|team_captain|
+---------------+----+-------+---------+-------+-----------+----+--------+------+------------+
|              0|   0|      0|        0|      0|          0|   0|       3|     0|       46689|
+---------------+----+-------+---------+-------+-----------+----+--------+------+------------+



In [None]:
# Ver si hay caracteres especiales en la columna 'player_name'
df_game_lineups_cleaned.filter(col('player_name').rlike('[^a-zA-Z- ]')).show()

No parece que haya números ni signos de puntuación en los nombres de los jugadores

In [42]:
# Ver nulos de la columna 'team_captain'
df_game_lineups_cleaned.filter(col('team_captain').isNull()).show()

+--------------------+----------+-------+---------+-------+--------------------+---------------+------------------+------+------------+
|     game_lineups_id|      date|game_id|player_id|club_id|         player_name|           type|          position|number|team_captain|
+--------------------+----------+-------+---------+-------+--------------------+---------------+------------------+------+------------+
|4615138f1f610cad5...|2024-02-18|4090243|    23523|    206|    Thomas Mikkelsen|    substitutes|        Goalkeeper|     "|        NULL|
|6e93292ea722d3665...|2024-02-18|4090243|    48870|    865|         Jonas Lössl|starting_lineup|        Goalkeeper|     "|        NULL|
|b010f792720e1c3cd...|2024-02-18|4090243|    63463|    206|         Daniel Wass|starting_lineup|  Central Midfield|     "|        NULL|
|e5751e0f3439a7a57...|2024-02-18|4090243|    68983|    206|        Kevin Mensah|    substitutes|        Right-Back|     "|        NULL|
|f886eb307ccf0a92d...|2024-02-18|4090243|    933

In [43]:
# ver valores vacios de number
df_game_lineups_cleaned.filter(col('number') == '"').show()

+--------------------+----------+-------+---------+-------+--------------------+---------------+------------------+------+------------+
|     game_lineups_id|      date|game_id|player_id|club_id|         player_name|           type|          position|number|team_captain|
+--------------------+----------+-------+---------+-------+--------------------+---------------+------------------+------+------------+
|4615138f1f610cad5...|2024-02-18|4090243|    23523|    206|    Thomas Mikkelsen|    substitutes|        Goalkeeper|     "|        NULL|
|6e93292ea722d3665...|2024-02-18|4090243|    48870|    865|         Jonas Lössl|starting_lineup|        Goalkeeper|     "|        NULL|
|b010f792720e1c3cd...|2024-02-18|4090243|    63463|    206|         Daniel Wass|starting_lineup|  Central Midfield|     "|        NULL|
|e5751e0f3439a7a57...|2024-02-18|4090243|    68983|    206|        Kevin Mensah|    substitutes|        Right-Back|     "|        NULL|
|f886eb307ccf0a92d...|2024-02-18|4090243|    933

In [None]:
# Filtrar valores no numéricos (con guiones, letras, vacíos, etc.)
df_no_convertibles = df_game_lineups_cleaned.filter(
    ~trim(col("number")).rlike("^[0-9]+$") | col("number").isNull() | isnan(col("number"))
)

df_no_convertibles.select("number").distinct().show(100, truncate=False)

+------+
|number|
+------+
|-     |
|"     |
+------+



In [None]:
# Algunos valores de 'team_captain' estan nulos y 'number' tiene varios vacios
# Sustituimos los nulos de 'team_captain' y los vacios de 'number' por 0
df_game_lineups_cleaned = df_game_lineups_cleaned.withColumn(
    "team_captain",
    when(col("team_captain").isNull(), 0).otherwise(col("team_captain"))
)

df_game_lineups_cleaned = df_game_lineups_cleaned.withColumn(
    "number",
    when(trim(col("number")).rlike("^[0-9]+$"), col("number").cast("int")).otherwise(0)
)

# Ver nulos de nuevo
contar_nulos_por_columna(df_game_lineups_cleaned).show()

+---------------+----+-------+---------+-------+-----------+----+--------+------+------------+
|game_lineups_id|date|game_id|player_id|club_id|player_name|type|position|number|team_captain|
+---------------+----+-------+---------+-------+-----------+----+--------+------+------------+
|              0|   0|      0|        0|      0|          0|   0|       3|     0|           0|
+---------------+----+-------+---------+-------+-----------+----+--------+------+------------+



In [46]:
# Ver nulos en la columna 'position'
df_game_lineups_cleaned.filter(col('position').isNull()).show()

+--------------------+----------+-------+---------+-------+----------------+---------------+--------+------+------------+
|     game_lineups_id|      date|game_id|player_id|club_id|     player_name|           type|position|number|team_captain|
+--------------------+----------+-------+---------+-------+----------------+---------------+--------+------+------------+
|efbdeff16a683e068...|2014-08-16|2469561|   336850|  25418|    Shaun Dunion|starting_lineup|    NULL|     0|           0|
|8e0d3d1774f12f52c...|2015-08-26|2597514|   415060|   6601|   Jimmy Theelen|starting_lineup|    NULL|     0|           0|
|22014173b77618f43...|2015-08-26|2597514|   415063|   6601|Bart Zeevenhoven|    substitutes|    NULL|     0|           0|
+--------------------+----------+-------+---------+-------+----------------+---------------+--------+------+------------+



In [47]:
# Sustituir posiciones nulas por 'Unkown'
df_game_lineups_cleaned = df_game_lineups_cleaned.withColumn(
    "position",
    when(col("position").isNull(), "Unknown").otherwise(col("position"))
)
df_game_lineups_cleaned.show()

+--------------------+----------+-------+---------+-------+--------------------+---------------+------------------+------+------------+
|     game_lineups_id|      date|game_id|player_id|club_id|         player_name|           type|          position|number|team_captain|
+--------------------+----------+-------+---------+-------+--------------------+---------------+------------------+------+------------+
|b2dbe01c3656b06c8...|2013-07-27|2317258|     1443|    610|   Christian Poulsen|    substitutes|Defensive Midfield|     5|           0|
|b50a3ec6d52fd1490...|2013-07-27|2317258|     5017|    610|    Niklas Moisander|starting_lineup|       Centre-Back|     4|           0|
|7d890e6d0ff8af84b...|2013-07-27|2317258|     9602|   1090|     Maarten Martens|    substitutes|       Left Winger|    11|           0|
|8c355268678b9bbc7...|2013-07-27|2317258|    12282|    610|         Daley Blind|starting_lineup|         Left-Back|    17|           0|
|76193074d549e5fdc...|2013-07-27|2317258|    254

# Archivo "appearances.csv"

In [48]:
# Cargar el csv
df_appearances = spark.read.csv(path + "/appearances.csv", header=True, inferSchema=True)

# Ver las primeras filas
df_appearances.show(10)
df_appearances.printSchema()
df_appearances.columns

+--------------+-------+---------+--------------+----------------------+----------+----------------+--------------+------------+---------+-----+-------+--------------+
| appearance_id|game_id|player_id|player_club_id|player_current_club_id|      date|     player_name|competition_id|yellow_cards|red_cards|goals|assists|minutes_played|
+--------------+-------+---------+--------------+----------------------+----------+----------------+--------------+------------+---------+-----+-------+--------------+
| 2231978_38004|2231978|    38004|           853|                   235|2012-07-03|Aurélien Joachim|           CLQ|           0|        0|    2|      0|            90|
| 2233748_79232|2233748|    79232|          8841|                  2698|2012-07-05|  Ruslan Abyshov|           ELQ|           0|        0|    0|      0|            90|
| 2234413_42792|2234413|    42792|          6251|                   465|2012-07-05|     Sander Puri|           ELQ|           0|        0|    0|      0|        

['appearance_id',
 'game_id',
 'player_id',
 'player_club_id',
 'player_current_club_id',
 'date',
 'player_name',
 'competition_id',
 'yellow_cards',
 'red_cards',
 'goals',
 'assists',
 'minutes_played']

In [49]:
mostrar_sumario(df_appearances).show()

[Stage 150:>                                                        (0 + 1) / 1]

+-------+--------------+-----------------+------------------+------------------+----------------------+-----------+--------------+-------------------+--------------------+-------------------+-------------------+-----------------+
|summary| appearance_id|          game_id|         player_id|    player_club_id|player_current_club_id|player_name|competition_id|       yellow_cards|           red_cards|              goals|            assists|   minutes_played|
+-------+--------------+-----------------+------------------+------------------+----------------------+-----------+--------------+-------------------+--------------------+-------------------+-------------------+-----------------+
|  count|       1706806|          1706806|           1706806|           1706806|               1706806|    1706800|       1706806|            1706806|             1706806|            1706806|            1706806|          1706806|
|   mean|          NULL|3166096.010632726|209974.54851283625|3138.2895976461296|

                                                                                

In [50]:
contar_nulos_por_columna(df_appearances).show()

+-------------+-------+---------+--------------+----------------------+----+-----------+--------------+------------+---------+-----+-------+--------------+
|appearance_id|game_id|player_id|player_club_id|player_current_club_id|date|player_name|competition_id|yellow_cards|red_cards|goals|assists|minutes_played|
+-------------+-------+---------+--------------+----------------------+----+-----------+--------------+------------+---------+-----+-------+--------------+
|            0|      0|        0|             0|                     0|   0|          6|             0|           0|        0|    0|      0|             0|
+-------------+-------+---------+--------------+----------------------+----+-----------+--------------+------------+---------+-----+-------+--------------+



In [51]:
duplicados_por_columna(df_appearances, "appearance_id")

+-------------+-----+
|appearance_id|count|
+-------------+-----+
+-------------+-----+



In [52]:
# Ver nulos de la columna 'player_name'
df_appearances.filter(col('player_name').isNull()).show()

+--------------+-------+---------+--------------+----------------------+----------+-----------+--------------+------------+---------+-----+-------+--------------+
| appearance_id|game_id|player_id|player_club_id|player_current_club_id|      date|player_name|competition_id|yellow_cards|red_cards|goals|assists|minutes_played|
+--------------+-------+---------+--------------+----------------------+----------+-----------+--------------+------------+---------+-----+-------+--------------+
|3084062_380365|3084062|   380365|         16486|                    -1|2018-09-05|       NULL|           CDR|           1|        0|    0|      0|            90|
|3084059_411294|3084059|   411294|          3302|                    -1|2018-09-11|       NULL|           CDR|           0|        0|    0|      0|            90|
|3084057_255495|3084057|   255495|         11596|                    -1|2018-09-12|       NULL|           CDR|           0|        0|    0|      0|            90|
|3102749_380365|310274

In [53]:
# Ver valores en 'player_current_club_id'= -1
df_appearances.filter(col('player_current_club_id') == -1).show()

+--------------+-------+---------+--------------+----------------------+----------+-----------+--------------+------------+---------+-----+-------+--------------+
| appearance_id|game_id|player_id|player_club_id|player_current_club_id|      date|player_name|competition_id|yellow_cards|red_cards|goals|assists|minutes_played|
+--------------+-------+---------+--------------+----------------------+----------+-----------+--------------+------------+---------+-----+-------+--------------+
|3084062_380365|3084062|   380365|         16486|                    -1|2018-09-05|       NULL|           CDR|           1|        0|    0|      0|            90|
|3084059_411294|3084059|   411294|          3302|                    -1|2018-09-11|       NULL|           CDR|           0|        0|    0|      0|            90|
|3084057_255495|3084057|   255495|         11596|                    -1|2018-09-12|       NULL|           CDR|           0|        0|    0|      0|            90|
|3102749_380365|310274

In [54]:
# Existen 6 jugadores sin nombre y con 'player_current_club_id'= -1, son los unicos valores con este id
# Creemos que es un fallo y decidimos eliminarlos
df_appearances = df_appearances.filter(col('player_current_club_id') != -1)
# Ver nulos de nuevo
contar_nulos_por_columna(df_appearances).show()
df_appearances.show()

+-------------+-------+---------+--------------+----------------------+----+-----------+--------------+------------+---------+-----+-------+--------------+
|appearance_id|game_id|player_id|player_club_id|player_current_club_id|date|player_name|competition_id|yellow_cards|red_cards|goals|assists|minutes_played|
+-------------+-------+---------+--------------+----------------------+----+-----------+--------------+------------+---------+-----+-------+--------------+
|            0|      0|        0|             0|                     0|   0|          0|             0|           0|        0|    0|      0|             0|
+-------------+-------+---------+--------------+----------------------+----+-----------+--------------+------------+---------+-----+-------+--------------+

+--------------+-------+---------+--------------+----------------------+----------+----------------+--------------+------------+---------+-----+-------+--------------+
| appearance_id|game_id|player_id|player_club_id|pl

# Archivo "player_valuations.csv"

In [55]:
# Cargar el csv
df_player_valuations = spark.read.csv(path + "/player_valuations.csv", header=True, inferSchema=True)

# Ver las primeras filas
df_player_valuations.show(10)
df_player_valuations.printSchema()
df_player_valuations.columns

+---------+----------+-------------------+---------------+-----------------------------------+
|player_id|      date|market_value_in_eur|current_club_id|player_club_domestic_competition_id|
+---------+----------+-------------------+---------------+-----------------------------------+
|   405973|2000-01-20|             150000|           3057|                                BE1|
|   342216|2001-07-20|             100000|           1241|                                SC1|
|     3132|2003-12-09|             400000|            126|                                TR1|
|     6893|2003-12-15|             900000|            984|                                GB1|
|       10|2004-10-04|            7000000|            398|                                IT1|
|       26|2004-10-04|            1500000|             16|                                 L1|
|       65|2004-10-04|            8000000|           1091|                                GR1|
|       77|2004-10-04|           13000000|        

['player_id',
 'date',
 'market_value_in_eur',
 'current_club_id',
 'player_club_domestic_competition_id']

In [56]:
mostrar_sumario(df_player_valuations).show()

[Stage 176:>                                                        (0 + 4) / 4]

+-------+-----------------+-------------------+-----------------+-----------------------------------+
|summary|        player_id|market_value_in_eur|  current_club_id|player_club_domestic_competition_id|
+-------+-----------------+-------------------+-----------------+-----------------------------------+
|  count|           496606|             496606|           496606|                             496606|
|   mean| 224411.569751473|  2471144.917699343| 4428.74504134062|                               NULL|
| stddev|207007.2529541088|  6983759.321324877|10940.78286880771|                               NULL|
|    min|               10|                  0|                3|                                BE1|
|    25%|            58957|             200000|              369|                               NULL|
|    50%|           164897|             500000|             1025|                               NULL|
|    75%|           334042|            1700000|             2995|                 

                                                                                

In [57]:
contar_nulos_por_columna(df_player_valuations).show()

+---------+----+-------------------+---------------+-----------------------------------+
|player_id|date|market_value_in_eur|current_club_id|player_club_domestic_competition_id|
+---------+----+-------------------+---------------+-----------------------------------+
|        0|   0|                  0|              0|                                  0|
+---------+----+-------------------+---------------+-----------------------------------+



In [58]:
duplicados_por_columna(df_player_valuations, "date")

+----------+-----+
|date      |count|
+----------+-----+
|2005-01-16|69   |
|2005-06-06|2    |
|2006-05-21|13   |
|2007-04-20|5    |
|2007-11-23|12   |
|2012-04-17|2    |
|2012-10-06|9    |
|2013-01-22|230  |
|2013-03-26|6    |
|2013-05-21|2    |
|2013-09-09|2    |
|2014-09-26|15   |
|2006-04-23|6    |
|2007-04-19|41   |
|2008-07-15|98   |
|2009-06-28|17   |
|2009-10-19|61   |
|2011-01-29|3    |
|2011-01-30|5    |
|2012-07-17|25   |
+----------+-----+
only showing top 20 rows


# Archivo "games.csv"

In [8]:
# Cargar el csv
df_games = spark.read.csv(path + "/games.csv", header=True, inferSchema=True)

# Ver las primeras filas
df_games.show(10)
df_games.printSchema()
df_games.columns

                                                                                

+-------+--------------+------+------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+---------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+----------------+
|game_id|competition_id|season|       round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|        referee|                 url|home_club_formation|away_club_formation|      home_club_name|      away_club_name|          aggregate|competition_type|
+-------+--------------+------+------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------

['game_id',
 'competition_id',
 'season',
 'round',
 'date',
 'home_club_id',
 'away_club_id',
 'home_club_goals',
 'away_club_goals',
 'home_club_position',
 'away_club_position',
 'home_club_manager_name',
 'away_club_manager_name',
 'stadium',
 'attendance',
 'referee',
 'url',
 'home_club_formation',
 'away_club_formation',
 'home_club_name',
 'away_club_name',
 'aggregate',
 'competition_type']

In [9]:
mostrar_sumario(df_games).show()

25/06/13 12:54:21 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 8:>                                                          (0 + 1) / 1]

+-------+-----------------+--------------+------------------+-----------+------------------+------------------+------------------+------------------+------------------+------------------+----------------------+----------------------+--------------------+------------------+---------------+--------------------+-------------------+--------------------+--------------------+--------------------+----------------+
|summary|          game_id|competition_id|            season|      round|      home_club_id|      away_club_id|   home_club_goals|   away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|        attendance|        referee|                 url|home_club_formation| away_club_formation|      home_club_name|      away_club_name|competition_type|
+-------+-----------------+--------------+------------------+-----------+------------------+------------------+------------------+------------------+------------------+----------

                                                                                

In [61]:
contar_nulos_por_columna(df_games).show()

+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|game_id|competition_id|season|round|date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|stadium|attendance|referee|url|home_club_formation|away_club_formation|home_club_name|away_club_name|aggregate|competition_type|
+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|      0|             0|     0|    0|   0|           9|      

In [62]:
duplicados_por_columna(df_games, "game_id")

+-------+-----+
|game_id|count|
+-------+-----+
+-------+-----+



In [10]:
# Ver nulos de la tablas 'home_club_id' y 'away_club_id'
df_games.filter(col('home_club_id').isNull()).show()
df_games.filter(col('away_club_id').isNull()).show()

+-------+--------------+------+--------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+-------+--------------------+-------------------+-------------------+--------------+--------------+---------+----------------+
|game_id|competition_id|season|         round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|referee|                 url|home_club_formation|away_club_formation|home_club_name|away_club_name|aggregate|competition_type|
+-------+--------------+------+--------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+-------+--------------------+-------------------+----

In [11]:
# Parece que los partidos sin club_id son de fechas futuras, deberian de eliminarse
df_games = df_games.withColumn('date', to_date(col('date'), 'yyyy-MM-dd')).filter(col('date') <= current_date())
# Ver nulos de nuevo
contar_nulos_por_columna(df_games).show()

+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|game_id|competition_id|season|round|date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|stadium|attendance|referee|url|home_club_formation|away_club_formation|home_club_name|away_club_name|aggregate|competition_type|
+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|      0|             0|     0|    0|   0|           0|      

In [12]:
# Ver nulos columnas de 'club_position'
df_games.filter(col('home_club_position').isNull()).show()
df_games.filter(col('away_club_position').isNull()).show()

+-------+--------------+------+--------------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-----------------+
|game_id|competition_id|season|               round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|             referee|                 url|home_club_formation|away_club_formation|      home_club_name|      away_club_name|          aggregate| competition_type|
+-------+--------------+------+--------------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+------------------

In [13]:
# Parece que cuando la competicion es algun tipo de copa, los valores de posicion estan nulos porque las copas no usan ese tipo de ranking

# Convertimos las columnas a string y le añadimos un valor 'Sin Ranking' para estos casos
df_games = df_games.withColumn("home_club_position", col("home_club_position").cast(StringType()))
df_games = df_games.withColumn("away_club_position", col("away_club_position").cast(StringType()))

# La logica toma los casos en los que la position sea nulo y la competicion contenga 'cup' sin tener en cuenta mayusculas, o sea de tipo 'other'
df_games = df_games.withColumn(
    "home_club_position",
    when((lower(col("competition_type")).contains("cup") | (col("competition_type") == "other")) & col("home_club_position").isNull(), "Sin Ranking")
    .otherwise(col("home_club_position"))
)
df_games = df_games.withColumn(
    "away_club_position",
    when((lower(col("competition_type")).contains("cup") | (col("competition_type") == "other")) & col("away_club_position").isNull(), "Sin Ranking")
    .otherwise(col("away_club_position"))
)

# Ver nulos despues de la sustitucion
df_games.filter(col('home_club_position').isNull()).show()
df_games.filter(col('away_club_position').isNull()).show()

                                                                                

+-------+--------------+------+------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+----------------+
|game_id|competition_id|season|       round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|             referee|                 url|home_club_formation|away_club_formation|      home_club_name|      away_club_name|          aggregate|competition_type|
+-------+--------------+------+------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+

[Stage 23:>                                                         (0 + 2) / 2]

+-------+--------------+------+------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+----------------+
|game_id|competition_id|season|       round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|             referee|                 url|home_club_formation|away_club_formation|      home_club_name|      away_club_name|          aggregate|competition_type|
+-------+--------------+------+------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+

                                                                                

In [14]:
# Parece que aun quedan nulos en position que no son de copas
# Sustituimos los valores nulos restantes por 'Unkown'
df_games = df_games.withColumn(
    "home_club_position",
    when(col("home_club_position").isNull(), "Unknown").otherwise(col("home_club_position"))
)
df_games = df_games.withColumn(
    "away_club_position",
    when(col("away_club_position").isNull(), "Unknown").otherwise(col("away_club_position"))
)
# Ver todos los nulos
contar_nulos_por_columna(df_games).show()

+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|game_id|competition_id|season|round|date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|stadium|attendance|referee|url|home_club_formation|away_club_formation|home_club_name|away_club_name|aggregate|competition_type|
+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|      0|             0|     0|    0|   0|           0|      

In [15]:
# Sustituir valores nulos en 'home_club_manager_name', 'away_club_manager_name', 'stadium' y 'referee' por 'Unkown'
df_games = df_games.fillna({
    "home_club_manager_name": "Unknown",
    "away_club_manager_name": "Unknown",
    "stadium": "Unknown",
    "referee": "Unknown"
})
# Comprobar de nuevo los nulos despues de la sustitucion
contar_nulos_por_columna(df_games).show()

+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|game_id|competition_id|season|round|date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|stadium|attendance|referee|url|home_club_formation|away_club_formation|home_club_name|away_club_name|aggregate|competition_type|
+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|      0|             0|     0|    0|   0|           0|      

In [16]:
# Ver nulos en columnas 'attendance'
df_games.filter(col('attendance').isNull()).show()

+-------+--------------+------+--------------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+-------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-----------------+
|game_id|competition_id|season|               round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|            referee|                 url|home_club_formation|away_club_formation|      home_club_name|      away_club_name|          aggregate| competition_type|
+-------+--------------+------+--------------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+--------------------

In [17]:
# Sustituir los valores nulos por la capacidad del estadio
# La capacidad del estadio esta en 'df_clubs'
df_clubs_capacity = df_clubs.select("stadium_name", "stadium_seats")

# Unir la nueva columna al dataset
df_games = df_games.join(df_clubs_capacity, df_games.stadium == df_clubs_capacity.stadium_name, "left")

# Mostrar dataframe despues de la union
df_games.show()

+-------+--------------+------+------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+-----------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+----------------+--------------------+-------------+
|game_id|competition_id|season|       round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|          referee|                 url|home_club_formation|away_club_formation|      home_club_name|      away_club_name|          aggregate|competition_type|        stadium_name|stadium_seats|
+-------+--------------+------+------------+----------+------------+------------+---------------+---------------+------------------+------------------+---

In [18]:
# Rellenar los nulos en 'attendance' usando la capacidad del estadio correspondiente
df_games = df_games.withColumn(
    "attendance",
    when(col("attendance").isNull(), col("stadium_seats")).otherwise(col("attendance"))
)

# Eliminar las columna temporales
df_games = df_games.drop("stadium_name", "stadium_seats")

# Comprobar los nulos en 'attendance' despues de la sustitucion
df_games.filter(col('attendance').isNull()).show()

+-------+--------------+------+-------------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+-------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+----------------+
|game_id|competition_id|season|              round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|            referee|                 url|home_club_formation|away_club_formation|      home_club_name|      away_club_name|          aggregate|competition_type|
+-------+--------------+------+-------------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--

In [19]:
# Aun quedan valores nulos en 'attendance', probablemente sean de estadios que tengan de nombre 'Unkown'
# Sustituir esos valores de 'attendance' por 0
df_games = df_games.withColumn(
    "attendance",
    when(col("attendance").isNull(), 0).otherwise(col("attendance"))
)
# Ver nulos en todas las columnas
contar_nulos_por_columna(df_games).show()

+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|game_id|competition_id|season|round|date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|stadium|attendance|referee|url|home_club_formation|away_club_formation|home_club_name|away_club_name|aggregate|competition_type|
+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|      0|             0|     0|    0|   0|           0|      

In [20]:
# Ver valores nulos en las columnas de formaciones
df_games.filter(col('home_club_formation').isNull()).show()
df_games.filter(col('away_club_formation').isNull()).show()

+-------+--------------+------+------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+----------------+
|game_id|competition_id|season|       round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|             referee|                 url|home_club_formation|away_club_formation|      home_club_name|      away_club_name|          aggregate|competition_type|
+-------+--------------+------+------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+

In [21]:
# Ver valores unicos de las columnas de formaciones
df_games.select("home_club_formation").distinct().show(truncate=False)
df_games.select("away_club_formation").distinct().show(truncate=False)

+-------------------+
|home_club_formation|
+-------------------+
|4-5-1              |
|4-2-4              |
|2-4-4              |
|4-1-5              |
|6-1-3              |
|5-4-1              |
|4-5-1 flat         |
|3-5-2              |
|3-3-3-1            |
|3-4-2-1            |
|2-5-3              |
|1-9-0              |
|4-1-3-2            |
|4-3-3              |
|3-4-3              |
|5-3-2              |
|4-4-2 Diamond      |
|3-6-1              |
|5-2-3              |
|3-4-1-2            |
+-------------------+
only showing top 20 rows
+-------------------------+
|away_club_formation      |
+-------------------------+
|4-2-4                    |
|4-5-1                    |
|2-4-4                    |
|4-1-5                    |
|6-1-3                    |
|Starting Line-up: 4-1-4-1|
|5-4-1                    |
|2-8-0                    |
|3-5-2                    |
|4-5-1 flat               |
|3-3-3-1                  |
|3-4-2-1                  |
|0-10-0                   |

Las columnas de 'home_club_formation' y 'home_club_formation' tienen muchos nulos y no tienen todas el formato correcto.

Decidimos definir una funcion para aplicarles a todas un formato uniforme.

In [22]:
def formatear_formaciones(column_name):
    """
    Construye una expresion Spark SQL para limpiar, validar el formato, y formatear una columna de formacion,
    reemplazando con "Unknown" si no cumple el formato.
    Explicacion:
      - Limpia caracteres no deseados en los valores (solo admite numeros y guiones).
      - Usa regexp_extract() para verificar el formato: de 3 a 4 digitos separados por guiones.
      - Si el patron coincide, usa la cadena limpiada; de lo contrario, usa "Unknown".
    Args:
        column_name (str): Nombre de la columna a formatear.

    Returns:
        Expresion Spark SQL para formatear la columna.
    """

    cleaned_formation = regexp_replace(col(column_name), "[^0-9-]", "")

    pattern = r"^(\d+)-(\d+)-(\d+)$|^(\d+)-(\d+)-(\d+)-(\d+)$"
    matches_pattern = regexp_extract(cleaned_formation, pattern, 0) != ""

    final_formation = when(matches_pattern, cleaned_formation).otherwise("Unknown")

    return final_formation

In [23]:
# Aplicar la funcion de formateo
df_games = df_games.withColumn("home_club_formation", formatear_formaciones("home_club_formation"))
df_games = df_games.withColumn("away_club_formation", formatear_formaciones("away_club_formation"))

# Verificar nulos de nuevo
contar_nulos_por_columna(df_games).show()

# Ver si el formato se aplico correctamente
df_games.select("home_club_formation").distinct().show(truncate=False)
df_games.select("away_club_formation").distinct().show(truncate=False)

                                                                                

+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|game_id|competition_id|season|round|date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|stadium|attendance|referee|url|home_club_formation|away_club_formation|home_club_name|away_club_name|aggregate|competition_type|
+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|      0|             0|     0|    0|   0|           0|      

In [24]:
# Ver valores unicos de la columna 'round'
df_games.select("round").distinct().show(truncate=False)

+--------------------+
|round               |
+--------------------+
|39. Matchday        |
|3. Matchday         |
|17. Matchday        |
|21. Matchday        |
|4th round 2nd leg   |
|Group 1             |
|42. Matchday        |
|1. Matchday         |
|3rd round 1st leg   |
|Final               |
|Third Place Play-Off|
|intermediate stage  |
|Group H             |
|41. Matchday        |
|Semi-Finals 1st Leg |
|30. Matchday        |
|group L             |
|2. Matchday         |
|group I             |
|20. Matchday        |
+--------------------+
only showing top 20 rows


In [25]:
# Aplicar un formato a la columna 'round' para que los valores empiecen por mayuscula y no tengan caracteres especiales
df_games = df_games.withColumn(
    "round",
    initcap(regexp_replace(col("round"), "[^a-zA-Z0-9-. ]", ""))
)
# Ver si el formato se aplico correctamente
df_games.select("round").distinct().show(truncate=False)



+-------------------+
|round              |
+-------------------+
|39. Matchday       |
|3. Matchday        |
|17. Matchday       |
|21. Matchday       |
|Group 1            |
|42. Matchday       |
|1. Matchday        |
|Last 16 2nd Leg    |
|Final              |
|Group H            |
|41. Matchday       |
|Last 16 1st Leg    |
|Round Of 16 Replay |
|30. Matchday       |
|2. Matchday        |
|20. Matchday       |
|Group 5            |
|Second Round Replay|
|6th Round Deciders |
|4th Round 1st Leg  |
+-------------------+
only showing top 20 rows


                                                                                

In [77]:
# Hay valores nulos en 'home_club_name' y 'away_club_name', como ya tenemos el 'club_id' decidimos eliminar los nombres de esta tabla
df_games = df_games.drop("home_club_name", "away_club_name")
contar_nulos_por_columna(df_games).show()

+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+---------+----------------+
|game_id|competition_id|season|round|date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|stadium|attendance|referee|url|home_club_formation|away_club_formation|aggregate|competition_type|
+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+---------+----------------+
|      0|             0|     0|    0|   0|           0|           0|              0|              0|                 0|                 0|             

In [78]:
# Eliminamos tambien las columnas 'url' y 'aggregate' porque no son relevantes para nuestro analisis
df_games = df_games.drop("url", "aggregate")
df_games.show()

+-------+--------------+------+------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+-----------------+-------------------+-------------------+----------------+
|game_id|competition_id|season|       round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|          referee|home_club_formation|away_club_formation|competition_type|
+-------+--------------+------+------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+-----------------+-------------------+-------------------+----------------+
|2321027|            L1|  2013| 1. Matchday|2013-08-11|          33|          41|    

# Archivo "players.csv"

In [79]:
# Cargar el csv
df_players = spark.read.csv(path + "/players.csv", header=True, inferSchema=True)

# Ver las primeras filas
df_players.show(10)
df_players.printSchema()
df_players.columns

+---------+----------+------------+-------------------+-----------+---------------+-------------------+------------------+--------------+----------------------+-------------------+------------------+----------+-----+------------+------------------------+--------------------+--------------------+--------------------+------------------------------------+--------------------+-------------------+---------------------------+
|player_id|first_name|   last_name|               name|last_season|current_club_id|        player_code|  country_of_birth| city_of_birth|country_of_citizenship|      date_of_birth|      sub_position|  position| foot|height_in_cm|contract_expiration_date|          agent_name|           image_url|                 url|current_club_domestic_competition_id|   current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+------------+-------------------+-----------+---------------+-------------------+------------------+--------------+---------------

['player_id',
 'first_name',
 'last_name',
 'name',
 'last_season',
 'current_club_id',
 'player_code',
 'country_of_birth',
 'city_of_birth',
 'country_of_citizenship',
 'date_of_birth',
 'sub_position',
 'position',
 'foot',
 'height_in_cm',
 'contract_expiration_date',
 'agent_name',
 'image_url',
 'url',
 'current_club_domestic_competition_id',
 'current_club_name',
 'market_value_in_eur',
 'highest_market_value_in_eur']

In [80]:
mostrar_sumario(df_players).show()



+-------+-----------------+----------+------------------+-----------+-----------------+------------------+-----------------+----------------+----------------+----------------------+------------------+--------+-----+------------------+------------+--------------------+--------------------+------------------------------------+--------------------+-------------------+---------------------------+
|summary|        player_id|first_name|         last_name|       name|      last_season|   current_club_id|      player_code|country_of_birth|   city_of_birth|country_of_citizenship|      sub_position|position| foot|      height_in_cm|  agent_name|           image_url|                 url|current_club_domestic_competition_id|   current_club_name|market_value_in_eur|highest_market_value_in_eur|
+-------+-----------------+----------+------------------+-----------+-----------------+------------------+-----------------+----------------+----------------+----------------------+------------------+--------

                                                                                

In [81]:
contar_nulos_por_columna(df_players).show()

+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+------------------------+----------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|player_id|first_name|last_name|name|last_season|current_club_id|player_code|country_of_birth|city_of_birth|country_of_citizenship|date_of_birth|sub_position|position|foot|height_in_cm|contract_expiration_date|agent_name|image_url|url|current_club_domestic_competition_id|current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+------------------------+----------+---------+---+------------------------------------+-----------------+-------------------+------------

In [82]:
duplicados_por_columna(df_players, "player_id")

+---------+-----+
|player_id|count|
+---------+-----+
+---------+-----+



In [83]:
# Ver nulos de la columna 'contract_expiration_date'
df_players.filter(col('contract_expiration_date').isNull()).show()

+---------+----------+--------------+--------------------+-----------+---------------+--------------------+------------------+------------------+----------------------+-------------------+------------------+----------+-----+------------+------------------------+--------------------+--------------------+--------------------+------------------------------------+--------------------+-------------------+---------------------------+
|player_id|first_name|     last_name|                name|last_season|current_club_id|         player_code|  country_of_birth|     city_of_birth|country_of_citizenship|      date_of_birth|      sub_position|  position| foot|height_in_cm|contract_expiration_date|          agent_name|           image_url|                 url|current_club_domestic_competition_id|   current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+--------------+--------------------+-----------+---------------+--------------------+------------------+----------

In [84]:
# Ver valores unicos de la columna 'last_season'
df_players.select('last_season').distinct().show()

+-----------+
|last_season|
+-----------+
|       2018|
|       2015|
|       2023|
|       2022|
|       2013|
|       2014|
|       2019|
|       2020|
|       2012|
|       2016|
|       2024|
|       2017|
|       2021|
+-----------+



In [85]:
# A primera vista, parece que los valores nulos de la columna 'contract_expiration_date' representan contratos que siguen en vigor
# Pero tras corroborar con Internet, vimos que tambien hay nulos contratos que ya finalizaron
# Decidimos eliminar la columna 'contract_expiration_date' por no tener informacion fiable y tambien la columna 'agent_name' que esta relacionada con ella
df_players = df_players.drop("contract_expiration_date")
df_players = df_players.drop("agent_name")
df_players.show(10)

+---------+----------+------------+-------------------+-----------+---------------+-------------------+------------------+--------------+----------------------+-------------------+------------------+----------+-----+------------+--------------------+--------------------+------------------------------------+--------------------+-------------------+---------------------------+
|player_id|first_name|   last_name|               name|last_season|current_club_id|        player_code|  country_of_birth| city_of_birth|country_of_citizenship|      date_of_birth|      sub_position|  position| foot|height_in_cm|           image_url|                 url|current_club_domestic_competition_id|   current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+------------+-------------------+-----------+---------------+-------------------+------------------+--------------+----------------------+-------------------+------------------+----------+-----+------------+---------------

In [86]:
# Ver todos los valores de la columna 'foot'
df_players.select('foot').distinct().show()

+-----+
| foot|
+-----+
| left|
| both|
|right|
| NULL|
+-----+



In [87]:
# Sustituir valores nulos de 'foot' por 'sin especificar'
df_players = df_players.withColumn(
    "foot",
    when(col("foot").isNull(), "sin especificar").otherwise(col("foot"))
)

In [88]:
# Ver valores nulos de la columna 'market_value_in_eur'
df_players.filter(col('market_value_in_eur').isNull()).show()

+---------+----------+----------+------------------+-----------+---------------+------------------+----------------+--------------------+----------------------+-------------------+------------------+----------+---------------+------------+--------------------+--------------------+------------------------------------+--------------------+-------------------+---------------------------+
|player_id|first_name| last_name|              name|last_season|current_club_id|       player_code|country_of_birth|       city_of_birth|country_of_citizenship|      date_of_birth|      sub_position|  position|           foot|height_in_cm|           image_url|                 url|current_club_domestic_competition_id|   current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+----------+------------------+-----------+---------------+------------------+----------------+--------------------+----------------------+-------------------+------------------+----------+--------------

In [89]:
# Calcular la mediana del valor de mercado (excluyendo nulos y ceros)
mediana_market_value = df_players.filter(
    col("market_value_in_eur").isNotNull() & (col("market_value_in_eur") > 0)
).approxQuantile("market_value_in_eur", [0.5], 0.01)[0]  # 0.01 es la precisión

print(f"Mediana del valor de mercado (excluyendo valores 0): {mediana_market_value}")

# Sustituir nulos y ceros por 'mediana_market_value'
df_players = df_players.withColumn(
    "market_value_in_eur",
    when(col("market_value_in_eur").isNull() | (col("market_value_in_eur") == 0), mediana_market_value)
    .otherwise(col("market_value_in_eur"))
)

# Comprobar de nuevo los nulos y valores minimos despues de la sustitucion
contar_nulos_por_columna(df_players).show()
df_players.select('market_value_in_eur').summary('min').show()

Mediana del valor de mercado (excluyendo valores 0): 250000.0
+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|player_id|first_name|last_name|name|last_season|current_club_id|player_code|country_of_birth|city_of_birth|country_of_citizenship|date_of_birth|sub_position|position|foot|height_in_cm|image_url|url|current_club_domestic_competition_id|current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|        0|      2062|       

In [90]:
# Repetimos el proceso para la columna 'highest_market_value_in_eur'

# Calcular la mediana del valor de mercado (excluyendo nulos y ceros)
mediana_market_value = df_players.filter(
    col("highest_market_value_in_eur").isNotNull() & (col("highest_market_value_in_eur") > 0)
).approxQuantile("highest_market_value_in_eur", [0.5], 0.01)[0]  # 0.01 es la precisión

print(f"Mediana del valor máximo de mercado (excluyendo valores 0): {mediana_market_value}")

# Sustituir nulos y ceros por 'mediana_market_value'
df_players = df_players.withColumn(
    "highest_market_value_in_eur",
    when(col("highest_market_value_in_eur").isNull() | (col("highest_market_value_in_eur") == 0), mediana_market_value)
    .otherwise(col("highest_market_value_in_eur"))
)

# Comprobar de nuevo los nulos y valores minimos despues de la sustitucion
contar_nulos_por_columna(df_players).show()
df_players.select('highest_market_value_in_eur').summary('min').show()

Mediana del valor máximo de mercado (excluyendo valores 0): 800000.0
+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|player_id|first_name|last_name|name|last_season|current_club_id|player_code|country_of_birth|city_of_birth|country_of_citizenship|date_of_birth|sub_position|position|foot|height_in_cm|image_url|url|current_club_domestic_competition_id|current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|        0|      2062|

In [91]:
# Ver nulos en la columna 'height_in_cm'
df_players.filter(col('height_in_cm').isNull()).show()

+---------+----------+----------+-------------------+-----------+---------------+-------------------+------------------+-----------------+----------------------+-------------------+------------------+----------+---------------+------------+--------------------+--------------------+------------------------------------+--------------------+-------------------+---------------------------+
|player_id|first_name| last_name|               name|last_season|current_club_id|        player_code|  country_of_birth|    city_of_birth|country_of_citizenship|      date_of_birth|      sub_position|  position|           foot|height_in_cm|           image_url|                 url|current_club_domestic_competition_id|   current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+----------+-------------------+-----------+---------------+-------------------+------------------+-----------------+----------------------+-------------------+------------------+----------+-----------

In [92]:
# Ver el valor minimo de la columna 'height_in_cm'
df_players.select('height_in_cm').summary('min').show()

+-------+------------+
|summary|height_in_cm|
+-------+------------+
|    min|          17|
+-------+------------+



In [93]:
# El valor minimo de altura es 17cm, lo cual es imposible
# Sustituimos los valores nulos y minimos por la media de altura

# Calcular la media de altura (excluyendo nulos y valores <= 100)
avg_height = df_players.filter(col("height_in_cm").isNotNull()).filter(col("height_in_cm") > 100).agg(avg("height_in_cm")).collect()[0][0]
print(f"Media de la altura (excluyendo valores <= 100): {avg_height}")

# Sustituimos los valores nulos y minimos por 'avg_height'
df_players = df_players.withColumn(
    "height_in_cm",
    when(col("height_in_cm").isNull() | (col("height_in_cm") <= 100), avg_height).otherwise(col("height_in_cm"))
)

# Comprobar de nuevo los nulos y valores minimos despues de la sustitucion
contar_nulos_por_columna(df_players).show()
df_players.select('height_in_cm').summary('min').show()

Media de la altura (excluyendo valores <= 100): 182.31905075807515
+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|player_id|first_name|last_name|name|last_season|current_club_id|player_code|country_of_birth|city_of_birth|country_of_citizenship|date_of_birth|sub_position|position|foot|height_in_cm|image_url|url|current_club_domestic_competition_id|current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|        0|      2062|  

In [None]:
# Ver si hay caracteres especiales en la columna 'name'
df_players.filter(col('name').rlike('[^\\p{L} ]')).show()

In [94]:
# Ver nulos en la columna 'first_name'
df_players.filter(col('first_name').isNull()).show()

+---------+----------+------------+------------+-----------+---------------+------------+----------------+--------------------+----------------------+-------------------+------------------+----------+---------------+------------------+--------------------+--------------------+------------------------------------+--------------------+-------------------+---------------------------+
|player_id|first_name|   last_name|        name|last_season|current_club_id| player_code|country_of_birth|       city_of_birth|country_of_citizenship|      date_of_birth|      sub_position|  position|           foot|      height_in_cm|           image_url|                 url|current_club_domestic_competition_id|   current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+------------+------------+-----------+---------------+------------+----------------+--------------------+----------------------+-------------------+------------------+----------+---------------+----------------

In [95]:
# Parece que los nulos de la columna 'first_name' se deben a que el jugador aparece con su apodo
# Sustituir los first_name nulos por 'Desconocido'

df_players = df_players.withColumn(
    "first_name",
    when(col("first_name").isNull(), "desconocido").otherwise(col("first_name"))
)
# Ver nulos de nuevo
contar_nulos_por_columna(df_players).show()

+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|player_id|first_name|last_name|name|last_season|current_club_id|player_code|country_of_birth|city_of_birth|country_of_citizenship|date_of_birth|sub_position|position|foot|height_in_cm|image_url|url|current_club_domestic_competition_id|current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|        0|         0|        0|   0|          0|              0|          0|            27

In [96]:
# Ver nulos de la columna 'date_of_birth'
df_players.filter(col('date_of_birth').isNull()).show()

+---------+----------+------------+-------------------+-----------+---------------+-------------------+----------------+--------------------+----------------------+-------------+------------------+----------+-----+------------+--------------------+--------------------+------------------------------------+--------------------+-------------------+---------------------------+
|player_id|first_name|   last_name|               name|last_season|current_club_id|        player_code|country_of_birth|       city_of_birth|country_of_citizenship|date_of_birth|      sub_position|  position| foot|height_in_cm|           image_url|                 url|current_club_domestic_competition_id|   current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+------------+-------------------+-----------+---------------+-------------------+----------------+--------------------+----------------------+-------------+------------------+----------+-----+------------+--------------------+

In [97]:
# Sustituimos las fechas de nacimiento nulas por un valor por defecto
# para indicar que la fecha es desconocida (no deberia ser una fecha que pueda aparecer en otros registros)
default_timestamp = to_timestamp(lit('1900-01-01 00:00:00'))

df_players = df_players.withColumn(
    "date_of_birth",
    when(col("date_of_birth").isNull(), default_timestamp).otherwise(col("date_of_birth"))
)
# Ver nulos de nuevo
contar_nulos_por_columna(df_players).show()

+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|player_id|first_name|last_name|name|last_season|current_club_id|player_code|country_of_birth|city_of_birth|country_of_citizenship|date_of_birth|sub_position|position|foot|height_in_cm|image_url|url|current_club_domestic_competition_id|current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|        0|         0|        0|   0|          0|              0|          0|            27

In [98]:
# Ver nulos de la columna 'sub_position'
df_players.filter(col('sub_position').isNull()).show()

+---------+-----------+-----------+--------------------+-----------+---------------+--------------------+--------------------+-------------+----------------------+-------------------+------------+--------+---------------+------------------+--------------------+--------------------+------------------------------------+--------------------+-------------------+---------------------------+
|player_id| first_name|  last_name|                name|last_season|current_club_id|         player_code|    country_of_birth|city_of_birth|country_of_citizenship|      date_of_birth|sub_position|position|           foot|      height_in_cm|           image_url|                 url|current_club_domestic_competition_id|   current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+-----------+-----------+--------------------+-----------+---------------+--------------------+--------------------+-------------+----------------------+-------------------+------------+--------+---------------+-

In [99]:
# En la columna 'position' ya existe un valor 'Missing' que podemos usar para corregir los nulos
df_players = df_players.withColumn(
    "sub_position",
    when(col("sub_position").isNull(), "Missing").otherwise(col("sub_position"))
)
# Ver nulos de nuevo
contar_nulos_por_columna(df_players).show()

+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|player_id|first_name|last_name|name|last_season|current_club_id|player_code|country_of_birth|city_of_birth|country_of_citizenship|date_of_birth|sub_position|position|foot|height_in_cm|image_url|url|current_club_domestic_competition_id|current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+---------+----+-----------+---------------+-----------+----------------+-------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|        0|         0|        0|   0|          0|              0|          0|            27

In [100]:
# Ver nulos de las columnas 'country_of_birth' y 'city_of_birth'
df_players.filter(col('country_of_birth').isNull()).show()
df_players.filter(col('city_of_birth').isNull()).show()

+---------+----------+------------------+--------------------+-----------+---------------+--------------------+----------------+--------------------+----------------------+-------------------+------------------+----------+---------------+------------------+--------------------+--------------------+------------------------------------+--------------------+-------------------+---------------------------+
|player_id|first_name|         last_name|                name|last_season|current_club_id|         player_code|country_of_birth|       city_of_birth|country_of_citizenship|      date_of_birth|      sub_position|  position|           foot|      height_in_cm|           image_url|                 url|current_club_domestic_competition_id|   current_club_name|market_value_in_eur|highest_market_value_in_eur|
+---------+----------+------------------+--------------------+-----------+---------------+--------------------+----------------+--------------------+----------------------+----------------

En las columnas 'country_of_birth' y 'city_of_birth' hay varios casos donde no hay valor para el país pero si para la ciudad. Para recuperar esos datos perdidos, definimos una función que mapea ciudad a país usando `geopy`

In [None]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import time

# Instanciamos el geolocalizador
geolocator = Nominatim(user_agent="city_to_country_mapper")

# Funcion para mapear ciudad a pais
def get_country(city):
    """
    Mapea el nombre de una ciudad a su pais utilizando geolocalizacion.
    Explicacion:
      - Intenta obtener la informacion de geolocalizacion para la ciudad proporcionada.
      - Si la geolocalizacion es exitosa y se obtiene una direccion, parsea
        la direccion para extraer el nombre del pais.
      - Maneja posibles errores de tiempo de espera o del servicio del geolocalizador.

    Args:
        city (str): El nombre de la ciudad a mapear.

    Returns:
        part: El nombre del pais si se puede mapear la ciudad, de lo contrario, None.
    """
    try:
        location = geolocator.geocode(city, language="en", timeout=10)
        if location and location.address:
            address_parts = location.address.split(", ")
            for part in reversed(address_parts):
                if len(part) > 2 and not any(char.isdigit() for char in part):
                    return part
        return None
    except (GeocoderTimedOut, GeocoderServiceError) as e:
        print(f"Error geocoding {city}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error for {city}: {e}")
        return None

# Crea una UDF para usar en Spark
get_country_udf = udf(get_country, StringType())

# Filtra las ciudades donde no hay pais pero hay ciudad
cities_df = df_players.filter(col("country_of_birth").isNull() & col("city_of_birth").isNotNull()) \
                   .select("city_of_birth").distinct()

# Collect para traerlas al driver (Cuidado si hay muchas)
cities = [row['city_of_birth'] for row in cities_df.collect() if row['city_of_birth'] is not None]

# Diccionario ciudad-pais
city_country_map = {}

for city in cities:
    country = get_country(city)
    if country:
        city_country_map[city] = country
        print(f"Mapped {city} to {country}")
    else:
        city_country_map[city] = None
        print(f"Could not map {city}")
    time.sleep(1)  # No bombardees el server como un cavernicola

# Transforma el diccionario a un DataFrame para hacer join
mapping_rows = [(k, v) for k, v in city_country_map.items()]
mapping_df = spark.createDataFrame(mapping_rows, ["city_of_birth", "mapped_country"])

# Une el mapping con el DataFrame original
df_players_mapped = df_players.join(mapping_df, on="city_of_birth", how="left")

# Actualiza la columna 'country_of_birth' donde sea NULL
df_players_mapped = df_players_mapped.withColumn("country_of_birth",
    when(col("country_of_birth").isNull() & col("mapped_country").isNotNull(), col("mapped_country"))
    .otherwise(col("country_of_birth"))
)

# Quita la columna de mapeo
df_players_mapped = df_players_mapped.drop("mapped_country")

Mapped Thessaloniki to Greece
Mapped Kirovograd to Ukraine
Mapped Bryansk to Russia
Mapped Konin to Poland
Mapped Salzburg to Austria
Mapped St.-Katelijne-Waver to Belgium
Mapped Tarnów to Poland
Mapped Castellammare di Stabia to Italy
Mapped Mytilini to Greece
Mapped Tamworth to Australia
Mapped Almaty to Kazakhstan
Mapped Paysandú to Uruguay
Mapped Pforzheim to Germany
Mapped Rogatica to Bosnia and Herzegovina
Mapped Marseille to France
Mapped Pontedeume to Spain
Mapped Brugg to Switzerland
Mapped Bilbao to Spain
Mapped Rotterdam to Netherlands
Could not map Bratsk, Irkutsk Region
Mapped London to United Kingdom
Mapped Brampton, Ontario to Canada
Mapped Oldham to United States
Mapped Kavala to Greece
Mapped Dresden to Germany
Mapped Düsseldorf to Germany
Mapped Hoorn to Netherlands
Mapped Korçë to Albania
Mapped Sundby to Denmark
Mapped Osasco to Brazil
Mapped Ivry-sur-Seine to France
Mapped Naberezhnye Chelny to Russia
Mapped Prilep to North Macedonia
Mapped Blagoevgrad to Bulgaria


In [102]:
# Comprobar si el mapeo se ha realizado correctamente
print(f"Valores nulos en 'country_of_birth' antes del mapeo:", df_players.filter(col('country_of_birth').isNull()).count())
print(f"Valores nulos en 'country_of_birth' despues del mapeo:", df_players_mapped.filter(col('country_of_birth').isNull()).count())

Valores nulos en 'country_of_birth' antes del mapeo: 2799
Valores nulos en 'country_of_birth' despues del mapeo: 2460


                                                                                

In [103]:
# El resto de valores nulos son de filas donde no hay ni pais ni ciudad
# Sustituimos esos valores por un valor 'Desconocido'
df_players_mapped = df_players_mapped.withColumn(
    "country_of_birth",
    when(col("country_of_birth").isNull(), "Desconocido").otherwise(col("country_of_birth"))
)
df_players_mapped = df_players_mapped.withColumn(
    "city_of_birth",
    when(col("city_of_birth").isNull(), "Desconocida").otherwise(col("city_of_birth"))
)
# Ver nulos de nuevo
contar_nulos_por_columna(df_players_mapped).show()

+-------------+---------+----------+---------+----+-----------+---------------+-----------+----------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|city_of_birth|player_id|first_name|last_name|name|last_season|current_club_id|player_code|country_of_birth|country_of_citizenship|date_of_birth|sub_position|position|foot|height_in_cm|image_url|url|current_club_domestic_competition_id|current_club_name|market_value_in_eur|highest_market_value_in_eur|
+-------------+---------+----------+---------+----+-----------+---------------+-----------+----------------+----------------------+-------------+------------+--------+----+------------+---------+---+------------------------------------+-----------------+-------------------+---------------------------+
|            0|        0|         0|        0|   0|          0|              0|          0|

In [104]:
# Eliminamos las columnas 'country_of_citizenship', 'image_url' y 'url' porque no son utiles para nuestro analisis
df_players_mapped = df_players_mapped.drop("country_of_citizenship","image_url", "url")
df_players_mapped.show()

+-------------+---------+----------+------------------+--------------------+-----------+---------------+--------------------+----------------+-------------------+------------------+----------+---------------+------------------+------------------------------------+--------------------+-------------------+---------------------------+
|city_of_birth|player_id|first_name|         last_name|                name|last_season|current_club_id|         player_code|country_of_birth|      date_of_birth|      sub_position|  position|           foot|      height_in_cm|current_club_domestic_competition_id|   current_club_name|market_value_in_eur|highest_market_value_in_eur|
+-------------+---------+----------+------------------+--------------------+-----------+---------------+--------------------+----------------+-------------------+------------------+----------+---------------+------------------+------------------------------------+--------------------+-------------------+---------------------------

# Archivo "transfers.csv"

In [105]:
# Cargar el csv
df_transfers = spark.read.csv(path + "/transfers.csv", header=True, inferSchema=True)

# Ver las primeras filas
df_transfers.show(10)
df_transfers.printSchema()
df_transfers.columns

+---------+-------------+---------------+------------+----------+--------------+---------------+------------+-------------------+----------------+
|player_id|transfer_date|transfer_season|from_club_id|to_club_id|from_club_name|   to_club_name|transfer_fee|market_value_in_eur|     player_name|
+---------+-------------+---------------+------------+----------+--------------+---------------+------------+-------------------+----------------+
|    16136|   2026-07-01|          26/27|         417|       123|      OGC Nice|        Retired|        NULL|           500000.0|           Dante|
|  1138758|   2026-07-01|          26/27|         336|       631|   Sporting CP|        Chelsea|     5.214E7|              4.5E7|  Geovany Quenda|
|   195778|   2026-06-30|          25/26|          79|        27| VfB Stuttgart|  Bayern Munich|         0.0|              1.2E7| Alexander Nübel|
|   569033|   2026-06-30|          25/26|          39|        27|1.FSV Mainz 05|  Bayern Munich|         0.0|         

['player_id',
 'transfer_date',
 'transfer_season',
 'from_club_id',
 'to_club_id',
 'from_club_name',
 'to_club_name',
 'transfer_fee',
 'market_value_in_eur',
 'player_name']

In [106]:
mostrar_sumario(df_transfers).show()



+-------+------------------+---------------+------------------+------------------+--------------+--------------+-----------------+-------------------+------------------+
|summary|         player_id|transfer_season|      from_club_id|        to_club_id|from_club_name|  to_club_name|     transfer_fee|market_value_in_eur|       player_name|
+-------+------------------+---------------+------------------+------------------+--------------+--------------+-----------------+-------------------+------------------+
|  count|             79646|          79646|             79646|             79646|         79646|         79646|            51931|              49330|             79646|
|   mean|423242.62649725034|           NULL| 17158.68822037516|12902.337556186123|          NULL|          NULL|1115650.110184668|  2488055.483478613|              NULL|
| stddev|269454.25715372444|           NULL|23567.526887485714| 20482.85586144404|          NULL|          NULL|5258424.365466746|  5901402.705113297|

                                                                                

In [107]:
contar_nulos_por_columna(df_transfers).show()

+---------+-------------+---------------+------------+----------+--------------+------------+------------+-------------------+-----------+
|player_id|transfer_date|transfer_season|from_club_id|to_club_id|from_club_name|to_club_name|transfer_fee|market_value_in_eur|player_name|
+---------+-------------+---------------+------------+----------+--------------+------------+------------+-------------------+-----------+
|        0|            0|              0|           0|         0|             0|           0|       27715|              30316|          0|
+---------+-------------+---------------+------------+----------+--------------+------------+------------+-------------------+-----------+



In [108]:
duplicados_por_columna(df_transfers, "player_id")

+---------+-----+
|player_id|count|
+---------+-----+
|481688   |7    |
|724035   |5    |
|655838   |5    |
|1007972  |2    |
|926952   |5    |
|359796   |11   |
|647534   |8    |
|396561   |6    |
|904712   |7    |
|100986   |7    |
|566723   |6    |
|222556   |17   |
|156941   |17   |
|259849   |11   |
|262389   |14   |
|320408   |13   |
|455661   |7    |
|474838   |5    |
|575716   |7    |
|607226   |6    |
+---------+-----+
only showing top 20 rows


En esta tabla la clave primaria es formada por las columnas 'player_id' y 'transfer_date', por lo que no es problema que aparezca un jugador varias veces.

In [109]:
# Sustituir los nulos de la columna transfer_fee por 0

df_transfers = df_transfers.withColumn(
    "transfer_fee",
    when(col("transfer_fee").isNull(), 0).otherwise(col("transfer_fee"))
)

# Comprobar de nuevo los nulos despues de la sustitucion
contar_nulos_por_columna(df_transfers).show()


+---------+-------------+---------------+------------+----------+--------------+------------+------------+-------------------+-----------+
|player_id|transfer_date|transfer_season|from_club_id|to_club_id|from_club_name|to_club_name|transfer_fee|market_value_in_eur|player_name|
+---------+-------------+---------------+------------+----------+--------------+------------+------------+-------------------+-----------+
|        0|            0|              0|           0|         0|             0|           0|           0|              30316|          0|
+---------+-------------+---------------+------------+----------+--------------+------------+------------+-------------------+-----------+



In [110]:
# Sustituir los nulos de market_value_in_eur por la mediana del equipo que procede

# Calcular la mediana del valor de mercado por jugador (excluyendo nulos y ceros)
mediana_market_value_por_jugador = df_transfers.filter(
    col("market_value_in_eur").isNotNull() & (col("market_value_in_eur") > 0)
).groupBy("player_id").agg(
    approx_percentile("market_value_in_eur", 0.5).alias("mediana_mv_por_jugador")
)

# Unir la mediana por club al DataFrame original
df_transfers = df_transfers.join(mediana_market_value_por_jugador, on="player_id", how="left_outer")

# Sustituir nulos en 'market_value_in_eur' por la mediana del jugador
df_transfers = df_transfers.withColumn(
    "market_value_in_eur",
    when(col("market_value_in_eur").isNull(), col("mediana_mv_por_jugador")).otherwise(col("market_value_in_eur"))
)

# Eliminar la columna temporal de la mediana por club
df_transfers = df_transfers.drop("mediana_mv_por_jugador")

# Comprobar de nuevo los nulos despues de la sustitucion
contar_nulos_por_columna(df_transfers).show()
df_transfers.select("player_id", "market_value_in_eur", "transfer_fee").show(20)

+---------+-------------+---------------+------------+----------+--------------+------------+------------+-------------------+-----------+
|player_id|transfer_date|transfer_season|from_club_id|to_club_id|from_club_name|to_club_name|transfer_fee|market_value_in_eur|player_name|
+---------+-------------+---------------+------------+----------+--------------+------------+------------+-------------------+-----------+
|        0|            0|              0|           0|         0|             0|           0|           0|               4394|          0|
+---------+-------------+---------------+------------+----------+--------------+------------+------------+-------------------+-----------+

+---------+-------------------+------------+
|player_id|market_value_in_eur|transfer_fee|
+---------+-------------------+------------+
|    16136|           500000.0|         0.0|
|  1138758|              4.5E7|     5.214E7|
|   195778|              1.2E7|         0.0|
|   569033|          4000000.0|   

In [111]:
# Aun quedan nulos en 'market_value_in_eur', estos son de jugadores que no estaban en 'df_players' o no tenían market_value > 0
# Podemos rellenarlos con la mediana general
mediana_market_value = df_transfers.filter(
    col("market_value_in_eur").isNotNull()
).agg(approx_percentile("market_value_in_eur", 0.5)).collect()[0][0]

df_transfers = df_transfers.withColumn(
    "market_value_in_eur",
    when(col("market_value_in_eur").isNull(), mediana_market_value).otherwise(col("market_value_in_eur"))
)
# Comprobar de nuevo los nulos despues de la sustitucion
contar_nulos_por_columna(df_transfers).show()
df_transfers.select("player_id", "market_value_in_eur", "transfer_fee").show(20)

+---------+-------------+---------------+------------+----------+--------------+------------+------------+-------------------+-----------+
|player_id|transfer_date|transfer_season|from_club_id|to_club_id|from_club_name|to_club_name|transfer_fee|market_value_in_eur|player_name|
+---------+-------------+---------------+------------+----------+--------------+------------+------------+-------------------+-----------+
|        0|            0|              0|           0|         0|             0|           0|           0|                  0|          0|
+---------+-------------+---------------+------------+----------+--------------+------------+------------+-------------------+-----------+

+---------+-------------------+------------+
|player_id|market_value_in_eur|transfer_fee|
+---------+-------------------+------------+
|    16136|           500000.0|         0.0|
|  1138758|              4.5E7|     5.214E7|
|   195778|              1.2E7|         0.0|
|   569033|          4000000.0|   

# Guardar las tablas limpias

In [112]:
import shutil
from pathlib import Path

# Ruta donde guardar el archivo CSV
output_path = "../Data"

# Diccionario con los DataFrames y los nombres deseados
dataframes = {
    "clubs": df_clubs,
    "competitions": df_competitions,
    "game_events": df_game_events,
    "game_lineups": df_game_lineups_cleaned,
    "appearances": df_appearances,
    "player_valuations": df_player_valuations,
    "games": df_games,
    "players": df_players_mapped,
    "transfers": df_transfers,
}

# Carpeta de salida general
output = Path(output_path)

for name, df in dataframes.items():
    # Ruta temporal y final
    temp_dir = output / f"{name}_temp"
    final_file = output / f"{name}.csv"

    # 1. Guardar en carpeta temporal
    df.repartition(1).write.csv(str(temp_dir), header=True, mode="overwrite")

    # 2. Buscar archivo part-*.csv
    part_file = next(temp_dir.glob("part-*.csv"))

    # 3. Mover y renombrar
    shutil.move(str(part_file), str(final_file))

    # 4. Eliminar carpeta temporal
    shutil.rmtree(temp_dir)

                                                                                

In [113]:
# Importante cerrar la sesion de spark
spark.stop()