In [1]:
# # 📦 Importar librerías necesarias
import pyspark
import pandas as pd
import numpy as np
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

from google.colab import drive
import os

# Mount Google Drive only when the mount point does not exist yet, so that
# re-running this cell does not try to mount it a second time.
if os.path.exists('/content/drive'):
    print('Drive already mounted')
else:
    drive.mount('/content/drive')

Mounted at /content/drive


## GUÍA DE COMANDOS SPARK PARA LA LIMPIEZA

In [None]:
# # 🚀 Crear sesión Spark
# spark = SparkSession.builder.appName("LimpiezaDatos").getOrCreate()

# # 📥 Cargar datos desde CSV
# df = spark.read.csv("archivo.csv", header=True, inferSchema=True)

# # 👁️ Visualizar primeras filas y esquema
# df.show(5)                 # Primeras 5 filas
# df.printSchema()           # Tipos de columnas
# df.columns                 # Lista de nombres de columnas

# # 📊 Resumen estadístico
# df.describe().show()       # count, mean, stddev, min, max
# df.summary().show()        # count, mean, stddev, min, 25%, 50%, 75%, max

# # 🧼 Limpieza de datos

# ## 1. Verificar nulos
# df.select([count(when(col(c).isNull() | isnan(c), c)).alias(c) for c in df.columns]).show()

# ## 2. Eliminar filas con nulos
# df = df.dropna()

# ## 3. Rellenar nulos con valores por defecto
# df = df.fillna({
#     "columna_numerica": 0,
#     "columna_categorica": "desconocido"
# })

# ## 4. Reemplazar valores nulos en una columna
# df = df.withColumn("columna", when(col("columna").isNull(), "valor_defecto").otherwise(col("columna")))

# ## 5. Eliminar duplicados
# df = df.dropDuplicates()  # O: df.dropDuplicates(["columna_clave"])

# ## 6. Renombrar columnas
# df = df.withColumnRenamed("Col Antiguo", "col_nuevo")

# ## 7. Conversión de tipos
# df = df.withColumn("edad", col("edad").cast("int"))
# df = df.withColumn("fecha", col("fecha").cast("date"))

# ## 8. Limpiar y modificar texto
# df = df.withColumn("col_trim", trim(col("col_texto")))
# df = df.withColumn("col_upper", upper(col("col_texto")))
# df = df.withColumn("col_lower", lower(col("col_texto")))

# ## 9. Crear o modificar columnas
# df = df.withColumn("suma", col("col1") + col("col2"))
# df = df.withColumn("categoria_edad", when(col("edad") >= 18, "adulto").otherwise("menor"))

# ## 10. Filtrar datos
# df = df.filter(col("ingresos") > 1000)
# df = df.filter(col("pais") == "España")

# ## 11. Eliminar columnas innecesarias
# df = df.drop("columna_innecesaria")  # Una
# df = df.drop(*["col1", "col2"])      # Varias

# ## 12. Reemplazar valores
# df = df.replace("desconocido", "otros", subset=["columna"])

# ## 13. Ver valores únicos y categorías
# df.select("columna").distinct().show()
# df.groupBy("categoria").count().show()

# # 📈 Visualización básica (limitada en Spark)
# df.show(10)                       # Muestra 10 filas
# df.groupBy("columna").count().show()  # Conteo por categoría

# # Si quieres graficar: convertir a Pandas (⚠️ cuidado con datasets grandes)
# df_pd = df.limit(1000).toPandas()

# # Ejemplo: usar matplotlib o seaborn
# import matplotlib.pyplot as plt
# import seaborn as sns

# sns.histplot(df_pd["edad"], bins=20)
# plt.title("Distribución de edad")
# plt.show()

# sns.countplot(data=df_pd, x="categoria")
# plt.title("Conteo por categoría")
# plt.xticks(rotation=45)
# plt.show()


In [2]:
# Create the Spark session used by every cell below (or reuse the one that
# already exists, as getOrCreate() does).
spark = (
    SparkSession.builder
    .appName('tfm_2025')
    .getOrCreate()
)

Funciones útiles para la limpieza

In [3]:
def mostrar_sumario(df):
    """
    Build a statistical summary of a DataFrame.

    This is a thin wrapper around ``df.summary()`` called without arguments.
    With the default statistics the result has one row per metric
    (count, mean, stddev, min, 25%, 50%, 75%, max) and one column per
    column of ``df``; metrics that do not apply to a column show NULL.

    Nothing is printed here: call ``.show()`` on the returned DataFrame.

    Args:
        df: DataFrame to analyse.

    Returns:
        The DataFrame returned by ``df.summary()``.
    """
    resumen = df.summary()
    return resumen

In [4]:
def contar_nulos_por_columna(df):
    """
    Count the null values in every column of a DataFrame.

    For each name in ``df.columns`` a SQL aggregate is built with ``expr()``:
    ``count(case when <column> is null then 1 end)``. The CASE yields 1 for
    null cells and NULL otherwise, and COUNT only counts non-null values, so
    the result is the number of nulls. Unlike a SUM, COUNT returns 0 (not
    NULL) when the DataFrame has no rows.

    Column names are wrapped in backticks so that names containing spaces,
    dots or reserved words are read as a single identifier instead of
    breaking the SQL expression.

    Only SQL NULLs are counted; NaN values are not treated as nulls.

    Args:
        df: DataFrame to analyse.

    Returns:
        A one-row DataFrame with the same column names as ``df``, where each
        value is the number of nulls found in that column.
    """
    def _quoted(name):
        # Backtick-delimit the identifier; a literal backtick inside the
        # name is escaped by doubling it.
        return "`" + name.replace("`", "``") + "`"

    return df.select([
        expr(f"count(case when {_quoted(c)} is null then 1 end)").alias(c)
        for c in df.columns
    ])

# Archivo "clubs.csv"

In [None]:
# Read clubs.csv from the mounted Drive: the first row is the header and the
# column types are inferred from the data.
df_clubs = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/BD-IA 2024-25/NTTDATA/proyecto_tfm_2025/Data/clubs.csv")
)

# Quick look: first 10 rows, the inferred schema and the column names
# (the bare last expression is what the cell displays).
df_clubs.show(10)
df_clubs.printSchema()
df_clubs.columns

+-------+-----------------+--------------------+-----------------------+------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|club_id|        club_code|                name|domestic_competition_id|total_market_value|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|        stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|            filename|                 url|
+-------+-----------------+--------------------+-----------------------+------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|    105|  sv-darmstadt-98|     SV Darmstadt 98|                     L1|              NULL|        27|       

['club_id',
 'club_code',
 'name',
 'domestic_competition_id',
 'total_market_value',
 'squad_size',
 'average_age',
 'foreigners_number',
 'foreigners_percentage',
 'national_team_players',
 'stadium_name',
 'stadium_seats',
 'net_transfer_record',
 'coach_name',
 'last_season',
 'filename',
 'url']

In [None]:
# Print the summary statistics of the clubs table.
mostrar_sumario(df_clubs).show()

In [None]:
# Print the number of nulls per column of the clubs table.
contar_nulos_por_columna(df_clubs).show()

In [None]:
# Inspect the club rows whose 'average_age' is null.
df_clubs.where(col('average_age').isNull()).show()

+-------+--------------------+--------------------+-----------------------+------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|club_id|           club_code|                name|domestic_competition_id|total_market_value|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|        stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|            filename|                 url|
+-------+--------------------+--------------------+-----------------------+------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|  21957|        ao-platanias|        AO Platanias|                    GR1|              NULL|      

In [None]:
# Mean 'average_age' per league, computed over the non-null values only and
# rounded to 1 decimal. `round` and `avg` here are applied to Spark columns
# (they come from the pyspark.sql.functions star import, not the builtins).
avg_age_by_league_df = (
    df_clubs
    .where(col("average_age").isNotNull())
    .groupBy("domestic_competition_id")
    .agg(round(avg("average_age"), 1).alias("avg_age_by_league"))
)

# Attach the league mean to every club row; the left join keeps clubs whose
# league has no mean available.
# NOTE(review): this reassigns df_clubs, so re-running the cell before the
# helper column is dropped would presumably add the column twice.
df_clubs = df_clubs.join(
    avg_age_by_league_df,
    on="domestic_competition_id",
    how="left_outer",
)

# Check the new column.
df_clubs.select("domestic_competition_id", "avg_age_by_league").show()

+-----------------------+-----------------+
|domestic_competition_id|avg_age_by_league|
+-----------------------+-----------------+
|                     L1|             25.8|
|                    RU1|             25.9|
|                    TR1|             25.4|
|                    IT1|             25.8|
|                    GB1|             26.1|
|                    BE1|             24.5|
|                    DK1|             24.8|
|                    DK1|             24.8|
|                    NL1|             24.5|
|                    TR1|             25.4|
|                    GR1|             26.4|
|                    GR1|             26.4|
|                    DK1|             24.8|
|                    PO1|             25.6|
|                    SC1|             26.2|
|                     L1|             25.8|
|                    BE1|             24.5|
|                    BE1|             24.5|
|                    PO1|             25.6|
|                    BE1|       

In [None]:
# Fill the null 'average_age' values with the league mean (coalesce returns
# its first non-null argument), then drop the helper column.
df_clubs = (
    df_clubs
    .withColumn("average_age", coalesce(col("average_age"), col("avg_age_by_league")))
    .drop("avg_age_by_league")
)

# Re-check: this should list no rows if every null was replaced.
df_clubs.where(col('average_age').isNull()).show()

+-----------------------+-------+---------+----+------------------+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+
|domestic_competition_id|club_id|club_code|name|total_market_value|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|filename|url|
+-----------------------+-------+---------+----+------------------+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+
+-----------------------+-------+---------+----+------------------+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+



In [None]:
# Inspect the club rows whose 'foreigners_percentage' is null.
df_clubs.where(col('foreigners_percentage').isNull()).show()

+-----------------------+-------+------------------+--------------------+------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|domestic_competition_id|club_id|         club_code|                name|total_market_value|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|        stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|            filename|                 url|
+-----------------------+-------+------------------+--------------------+------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+----------+-----------+--------------------+--------------------+
|                    GR1|  21957|      ao-platanias|        AO Platanias|              NULL|         0|   

In [None]:
# Mean 'foreigners_percentage' per league, computed over the non-null values
# only and rounded to 1 decimal.
avg_foreigners_percentage_df = (
    df_clubs
    .where(col("foreigners_percentage").isNotNull())
    .groupBy("domestic_competition_id")
    .agg(
        round(avg("foreigners_percentage"), 1)
        .alias("avg_foreigners_percentage_by_league")
    )
)

# Attach the league mean to every club row (left join keeps all clubs).
# NOTE(review): this reassigns df_clubs, so re-running the cell before the
# helper column is dropped would presumably add the column twice.
df_clubs = df_clubs.join(
    avg_foreigners_percentage_df,
    on="domestic_competition_id",
    how="left_outer",
)

# Check the new column.
df_clubs.select("domestic_competition_id", "avg_foreigners_percentage_by_league").show()

+-----------------------+-----------------------------------+
|domestic_competition_id|avg_foreigners_percentage_by_league|
+-----------------------+-----------------------------------+
|                     L1|                               46.7|
|                    RU1|                               32.0|
|                    TR1|                               34.7|
|                    IT1|                               50.6|
|                    GB1|                               61.2|
|                    BE1|                               59.5|
|                    DK1|                               42.3|
|                    DK1|                               42.3|
|                    NL1|                               42.6|
|                    TR1|                               34.7|
|                    GR1|                               46.6|
|                    GR1|                               46.6|
|                    DK1|                               42.3|
|       

In [None]:
# Fill the null 'foreigners_percentage' values with the league mean
# (coalesce returns its first non-null argument), then drop the helper column.
df_clubs = (
    df_clubs
    .withColumn(
        "foreigners_percentage",
        coalesce(col("foreigners_percentage"), col("avg_foreigners_percentage_by_league")),
    )
    .drop("avg_foreigners_percentage_by_league")
)

# Re-check: this should list no rows if every null was replaced.
df_clubs.where(col('foreigners_percentage').isNull()).show()

+-----------------------+-------+---------+----+------------------+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+
|domestic_competition_id|club_id|club_code|name|total_market_value|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|stadium_name|stadium_seats|net_transfer_record|coach_name|last_season|filename|url|
+-----------------------+-------+---------+----+------------------+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+
+-----------------------+-------+---------+----+------------------+----------+-----------+-----------------+---------------------+---------------------+------------+-------------+-------------------+----------+-----------+--------+---+



In [None]:
# Drop 'filename' and 'url' (not relevant for the analysis) and 'coach_name'
# (that information already lives in another table).
df_clubs = df_clubs.drop(*["filename", "url", "coach_name"])
df_clubs.show(10)

+-----------------------+-------+-----------------+--------------------+------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+-----------+
|domestic_competition_id|club_id|        club_code|                name|total_market_value|squad_size|average_age|foreigners_number|foreigners_percentage|national_team_players|        stadium_name|stadium_seats|net_transfer_record|last_season|
+-----------------------+-------+-----------------+--------------------+------------------+----------+-----------+-----------------+---------------------+---------------------+--------------------+-------------+-------------------+-----------+
|                     L1|    105|  sv-darmstadt-98|     SV Darmstadt 98|              NULL|        27|       25.6|               13|                 48.1|                    1|Merck-Stadion am ...|        17810|            +€3.05m|       2023|
|                    RU1

# Archivo "competitions.csv"

In [None]:
# Read competitions.csv from the mounted Drive: the first row is the header
# and the column types are inferred from the data.
df_competitions = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/BD-IA 2024-25/NTTDATA/proyecto_tfm_2025/Data/competitions.csv")
)

# Quick look: first 10 rows, the inferred schema and the column names
# (the bare last expression is what the cell displays).
df_competitions.show(10)
df_competitions.printSchema()
df_competitions.columns

+--------------+--------------------+--------------------+------------------+-----------------+----------+------------+--------------------+-------------+--------------------+------------------------+
|competition_id|    competition_code|                name|          sub_type|             type|country_id|country_name|domestic_league_code|confederation|                 url|is_major_national_league|
+--------------+--------------------+--------------------+------------------+-----------------+----------+------------+--------------------+-------------+--------------------+------------------------+
|           CIT|           italy-cup|           italy-cup|      domestic_cup|     domestic_cup|        75|       Italy|                 IT1|       europa|https://www.trans...|                   false|
|          NLSC|johan-cruijff-schaal|johan-cruijff-schaal|domestic_super_cup|            other|       122| Netherlands|                 NL1|       europa|https://www.trans...|                   fa

['competition_id',
 'competition_code',
 'name',
 'sub_type',
 'type',
 'country_id',
 'country_name',
 'domestic_league_code',
 'confederation',
 'url',
 'is_major_national_league']

In [None]:
# Print the summary statistics of the competitions table.
mostrar_sumario(df_competitions).show()

+-------+--------------+-------------------+-------------------+--------------+------------+-----------------+------------+--------------------+-------------+--------------------+
|summary|competition_id|   competition_code|               name|      sub_type|        type|       country_id|country_name|domestic_league_code|confederation|                 url|
+-------+--------------+-------------------+-------------------+--------------+------------+-----------------+------------+--------------------+-------------+--------------------+
|  count|            44|                 44|                 44|            44|          44|               44|          36|                  36|           44|                  44|
|   mean|          NULL|               NULL|               NULL|          NULL|        NULL|94.86363636363636|        NULL|                NULL|         NULL|                NULL|
| stddev|          NULL|               NULL|               NULL|          NULL|        NULL|70.51887

In [None]:
# Print the number of nulls per column of the competitions table.
contar_nulos_por_columna(df_competitions).show()

+--------------+----------------+----+--------+----+----------+------------+--------------------+-------------+---+------------------------+
|competition_id|competition_code|name|sub_type|type|country_id|country_name|domestic_league_code|confederation|url|is_major_national_league|
+--------------+----------------+----+--------+----+----------+------------+--------------------+-------------+---+------------------------+
|             0|               0|   0|       0|   0|         0|           8|                   8|            0|  0|                       0|
+--------------+----------------+----+--------+----+----------+------------+--------------------+-------------+---+------------------------+



In [None]:
# Inspect the competitions whose 'country_name' is null.
df_competitions.where(col('country_name').isNull()).show()

+--------------+--------------------+--------------------+--------------------+-----------------+----------+------------+--------------------+-------------+--------------------+------------------------+
|competition_id|    competition_code|                name|            sub_type|             type|country_id|country_name|domestic_league_code|confederation|                 url|is_major_national_league|
+--------------+--------------------+--------------------+--------------------+-----------------+----------+------------+--------------------+-------------+--------------------+------------------------+
|           USC|      uefa-super-cup|      uefa-super-cup|      uefa_super_cup|international_cup|        -1|        NULL|                NULL|       europa|https://www.trans...|                   false|
|            EL|       europa-league|       europa-league|       europa_league|international_cup|        -1|        NULL|                NULL|       europa|https://www.trans...|           

In [None]:
# Rows with country_id == -1 (and a null country_name) are international
# competitions. For those rows we set country_name to 'International',
# domestic_league_code to 'INT' and country_id to 1.
#
# All three updates key on the ORIGINAL marker country_id == -1. The previous
# version tested `country_name == 1` for the league code, which never matches
# a text column, so 'INT' was never written.
is_international = col("country_id") == -1

df_competitions = df_competitions.withColumn(
    "country_name",
    when(is_international, "International").otherwise(col("country_name"))
)

df_competitions = df_competitions.withColumn(
    "domestic_league_code",
    when(is_international, "INT").otherwise(col("domestic_league_code"))
)

# country_id is rewritten last: the condition is evaluated against the current
# value of the column, so replacing -1 earlier would make the updates above
# match nothing.
# NOTE(review): confirm that no real country already uses country_id 1.
df_competitions = df_competitions.withColumn(
    "country_id",
    when(is_international, 1).otherwise(col("country_id"))
)

df_competitions.show()

In [None]:
# Drop the 'url' column because it is not relevant for our analysis,
# then display the resulting table.
df_competitions = df_competitions.drop("url")
df_competitions.show()

# Archivo "game_events.csv"

In [5]:
# Read game_events.csv from the mounted Drive: the first row is the header
# and the column types are inferred from the data.
df_game_events = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/BD-IA 2024-25/NTTDATA/proyecto_tfm_2025/Data/game_events.csv")
)

# Quick look: first 10 rows, the inferred schema and the column names
# (the bare last expression is what the cell displays).
df_game_events.show(10)
df_game_events.printSchema()
df_game_events.columns

+--------------------+----------+-------+------+-------------+-------+---------+--------------------+------------+----------------+
|       game_event_id|      date|game_id|minute|         type|club_id|player_id|         description|player_in_id|player_assist_id|
+--------------------+----------+-------+------+-------------+-------+---------+--------------------+------------+----------------+
|2f41da30c471492e7...|2012-08-05|2211607|    77|        Cards|    610|     4425|1. Yellow card  ,...|        NULL|            NULL|
|a72f7186d132775f2...|2012-08-05|2211607|    77|        Cards|    383|    33210|1. Yellow card  ,...|        NULL|            NULL|
|b2d721eaed4692a5c...|2012-08-05|2211607|     3|        Goals|    383|    36500|, Header, 1. Tour...|        NULL|           56416|
|aef768899cedac0c9...|2012-08-05|2211607|    53|        Goals|    383|    36500|, Right-footed sh...|        NULL|          146258|
|5d6d9533023057b66...|2012-08-05|2211607|    74|Substitutions|    383|    36

['game_event_id',
 'date',
 'game_id',
 'minute',
 'type',
 'club_id',
 'player_id',
 'description',
 'player_in_id',
 'player_assist_id']

In [6]:
# Print the summary statistics of the game events table.
mostrar_sumario(df_game_events).show()

+-------+--------------------+------------------+------------------+-------------+------------------+------------------+--------------------+-----------------+------------------+
|summary|       game_event_id|           game_id|            minute|         type|           club_id|         player_id|         description|     player_in_id|  player_assist_id|
+-------+--------------------+------------------+------------------+-------------+------------------+------------------+--------------------+-----------------+------------------+
|  count|             1035043|           1035043|           1035043|      1035043|           1035043|           1035043|              947716|           497678|            156759|
|   mean|            Infinity|3232166.6426525274| 61.82031857613645|         NULL| 4956.375424982344|234649.75911049105|                NULL|281129.1353606147| 211605.5213544358|
| stddev|                 NaN| 680074.1692635092|23.586116943353236|         NULL|11822.249636871891|2120

In [7]:
# Print the number of nulls per column of the game events table.
contar_nulos_por_columna(df_game_events).show()

+-------------+----+-------+------+----+-------+---------+-----------+------------+----------------+
|game_event_id|date|game_id|minute|type|club_id|player_id|description|player_in_id|player_assist_id|
+-------------+----+-------+------+----+-------+---------+-----------+------------+----------------+
|            0|   0|      0|     0|   0|      0|        0|      87327|      537365|          878284|
+-------------+----+-------+------+----+-------+---------+-----------+------------+----------------+



In [10]:
# List the distinct event types present in the table.
df_game_events.select(col("type")).distinct().show()

+-------------+
|         type|
+-------------+
|        Goals|
|     Shootout|
|        Cards|
|Substitutions|
+-------------+



In [8]:
# Inspect the events whose 'description' is null.
df_game_events.where(col('description').isNull()).show()

+--------------------+----------+-------+------+-------------+-------+---------+-----------+------------+----------------+
|       game_event_id|      date|game_id|minute|         type|club_id|player_id|description|player_in_id|player_assist_id|
+--------------------+----------+-------+------+-------------+-------+---------+-----------+------------+----------------+
|e4d512124476ec941...|2012-08-22|2235964|    81|Substitutions|  10468|   121235|       NULL|      121027|            NULL|
|ac641aeaaf272da2f...|2012-08-22|2235964|    66|Substitutions|  10468|   201284|       NULL|      113699|            NULL|
|9e4d13d444aa9315f...|2013-02-17|2240152|    81|Substitutions|   1465|    17352|       NULL|      125700|            NULL|
|4bc0bb78995d14303...|2013-02-17|2240152|    62|Substitutions|   1301|    24880|       NULL|      237663|            NULL|
|b08ec33cbbcb9be73...|2013-02-17|2240152|    65|Substitutions|   1465|    42303|       NULL|      150535|            NULL|
|2c3f355afc76f8a

In [9]:
# Replace null descriptions with the literal 'Unknown' (coalesce returns its
# first non-null argument).
df_game_events = df_game_events.withColumn(
    "description",
    coalesce(col("description"), lit("Unknown")),
)

# Re-check: this should list no rows.
df_game_events.where(col('description').isNull()).show()

+-------------+----+-------+------+----+-------+---------+-----------+------------+----------------+
|game_event_id|date|game_id|minute|type|club_id|player_id|description|player_in_id|player_assist_id|
+-------------+----+-------+------+----+-------+---------+-----------+------------+----------------+
+-------------+----+-------+------+----+-------+---------+-----------+------------+----------------+



Los valores vacíos en las columnas 'player_in_id' y 'player_assist_id' tienen sentido ya que dependen del tipo de evento; si el evento es 'Substitutions', sólo hace falta el 'player_in_id'; si el evento es 'Goals', sólo hace falta el 'player_assist_id'.

# Archivo "game_lineups.csv"

In [12]:
# Read game_lineups.csv from the mounted Drive: the first row is the header
# and the column types are inferred from the data.
# NOTE(review): the statistics printed for this table look misaligned (for
# example numeric mean/stddev under 'date', and different row counts per
# column). Possibly some records are split or shifted while parsing; the
# reader's `multiLine` / `escape` options may be worth checking. TODO confirm.
df_game_lineups = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/BD-IA 2024-25/NTTDATA/proyecto_tfm_2025/Data/game_lineups.csv")
)

# Quick look: first 10 rows, the inferred schema and the column names
# (the bare last expression is what the cell displays).
df_game_lineups.show(10)
df_game_lineups.printSchema()
df_game_lineups.columns

+--------------------+----------+-------+---------+-------+-----------------+---------------+------------------+------+------------+
|     game_lineups_id|      date|game_id|player_id|club_id|      player_name|           type|          position|number|team_captain|
+--------------------+----------+-------+---------+-------+-----------------+---------------+------------------+------+------------+
|b2dbe01c3656b06c8...|2013-07-27|2317258|     1443|    610|Christian Poulsen|    substitutes|Defensive Midfield|     5|           0|
|b50a3ec6d52fd1490...|2013-07-27|2317258|     5017|    610| Niklas Moisander|starting_lineup|       Centre-Back|     4|           0|
|7d890e6d0ff8af84b...|2013-07-27|2317258|     9602|   1090|  Maarten Martens|    substitutes|       Left Winger|    11|           0|
|8c355268678b9bbc7...|2013-07-27|2317258|    12282|    610|      Daley Blind|starting_lineup|         Left-Back|    17|           0|
|76193074d549e5fdc...|2013-07-27|2317258|    25427|   1090|      Roy 

['game_lineups_id',
 'date',
 'game_id',
 'player_id',
 'club_id',
 'player_name',
 'type',
 'position',
 'number',
 'team_captain']

In [14]:
# Print the summary statistics of the game lineups table.
mostrar_sumario(df_game_lineups).show()

+-------+--------------------+--------------------+-----------------+------------------+------------------+-----------+---------------+--------+-----------------+-------------------+
|summary|     game_lineups_id|                date|          game_id|         player_id|           club_id|player_name|           type|position|           number|       team_captain|
+-------+--------------------+--------------------+-----------------+------------------+------------------+-----------+---------------+--------+-----------------+-------------------+
|  count|             2285289|             2238600|          2191911|           2191911|           2191911|    2191911|        2191911| 2191908|          2191911|            2145222|
|   mean|8.497272360538228E48|0.048105549487031204|3079740.595638235|213736.30504979444| 4394.989995031732|       NULL|           NULL|    NULL|20.89627818164177|0.04718579242614517|
| stddev|1.833895644341343E51| 0.21399155682321522|511107.1982517088|188289.509899011

In [15]:
# Print the number of nulls per column of the game lineups table.
contar_nulos_por_columna(df_game_lineups).show()

+---------------+-----+-------+---------+-------+-----------+-----+--------+------+------------+
|game_lineups_id| date|game_id|player_id|club_id|player_name| type|position|number|team_captain|
+---------------+-----+-------+---------+-------+-----------+-----+--------+------+------------+
|              0|46689|  93378|    93378|  93378|      93378|93378|   93381| 93378|      140067|
+---------------+-----+-------+---------+-------+-----------+-----+--------+------+------------+



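**TODO:** Esta tabla no debería tener tantos nulos. Casi todos los recuentos de nulos son múltiplos exactos de 46 689 (46 689, 93 378, 140 067) y el sumario muestra estadísticas numéricas en la columna 'date', lo que apunta a filas desalineadas o partidas al leer el CSV más que a datos realmente ausentes. Pendiente de comentarlo mañana con Iago.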

# Archivo "games.csv"

In [None]:
# Read games.csv from the mounted Drive: the first row is the header and the
# column types are inferred from the data.
df_games = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/BD-IA 2024-25/NTTDATA/proyecto_tfm_2025/Data/games.csv")
)

# Quick look: first 10 rows, the inferred schema and the column names
# (the bare last expression is what the cell displays).
df_games.show(10)
df_games.printSchema()
df_games.columns

In [None]:
# Print the summary statistics of the games table.
mostrar_sumario(df_games).show()

+-------+-----------------+--------------+------------------+-----------+------------------+-----------------+------------------+------------------+------------------+------------------+----------------------+----------------------+--------------------+------------------+---------------+--------------------+-------------------+--------------------+--------------------+--------------------+----------------+
|summary|          game_id|competition_id|            season|      round|      home_club_id|     away_club_id|   home_club_goals|   away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|        attendance|        referee|                 url|home_club_formation| away_club_formation|      home_club_name|      away_club_name|competition_type|
+-------+-----------------+--------------+------------------+-----------+------------------+-----------------+------------------+------------------+------------------+-------------

In [None]:
# Print the number of nulls per column of the games table.
contar_nulos_por_columna(df_games).show()

+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|game_id|competition_id|season|round|date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|stadium|attendance|referee|url|home_club_formation|away_club_formation|home_club_name|away_club_name|aggregate|competition_type|
+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|      0|             0|     0|    0|   0|           9|      

In [None]:
# Inspect the games that are missing a club id: first the rows with no
# 'home_club_id', then the rows with no 'away_club_id'.
df_games.where(col('home_club_id').isNull()).show()
df_games.where(col('away_club_id').isNull()).show()

+-------+--------------+------+--------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+-------+--------------------+-------------------+-------------------+--------------+--------------+---------+----------------+
|game_id|competition_id|season|         round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|referee|                 url|home_club_formation|away_club_formation|home_club_name|away_club_name|aggregate|competition_type|
+-------+--------------+------+--------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+-------+--------------------+-------------------+----

In [None]:
# The games without a club id appear to be fixtures dated in the future, so they
# are removed: parse 'date' as a yyyy-MM-dd date and keep only the rows dated
# today or earlier. Note that the cut-off is `current_date()`, so the result
# depends on the day the notebook is run.
df_games = (
    df_games
    .withColumn('date', to_date(col('date'), 'yyyy-MM-dd'))
    .where(col('date') <= current_date())
)
# Check the per-column counts again after the filter
contar_nulos_por_columna(df_games).show()

+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|game_id|competition_id|season|round|date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|stadium|attendance|referee|url|home_club_formation|away_club_formation|home_club_name|away_club_name|aggregate|competition_type|
+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|      0|             0|     0|    0|   0|           0|      

In [None]:
# Inspect the games with a missing league position: first the rows with no
# 'home_club_position', then the rows with no 'away_club_position'.
df_games.where(col('home_club_position').isNull()).show()
df_games.where(col('away_club_position').isNull()).show()

+-------+--------------+------+--------------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-----------------+
|game_id|competition_id|season|               round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|             referee|                 url|home_club_formation|away_club_formation|      home_club_name|      away_club_name|          aggregate| competition_type|
+-------+--------------+------+--------------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+------------------

In [None]:
# Parece ser que, cuando la competición es una copa o un torneo, no hay valores de 'club_position' porque esas competiciones no usan el mismo sistema de ranking




In [None]:
# Replace the nulls in 'home_club_manager_name', 'away_club_manager_name',
# 'stadium' and 'referee' with the placeholder 'Unknown'.
df_games = df_games.fillna(dict.fromkeys(
    ("home_club_manager_name", "away_club_manager_name", "stadium", "referee"),
    "Unknown",
))
# Check the per-column counts again after the replacement
contar_nulos_por_columna(df_games).show()

+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|game_id|competition_id|season|round|date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|stadium|attendance|referee|url|home_club_formation|away_club_formation|home_club_name|away_club_name|aggregate|competition_type|
+-------+--------------+------+-----+----+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-------+----------+-------+---+-------------------+-------------------+--------------+--------------+---------+----------------+
|      0|             0|     0|    0|   0|           0|      

In [None]:
# Inspect the games with a missing club name: first the rows with no
# 'home_club_name', then the rows with no 'away_club_name'.
df_games.where(col('home_club_name').isNull()).show()
df_games.where(col('away_club_name').isNull()).show()

+-------+--------------+------+--------------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+--------------------+----------+--------------------+--------------------+-------------------+--------------------+--------------+--------------------+-------------------+-----------------+
|game_id|competition_id|season|               round|      date|home_club_id|away_club_id|home_club_goals|away_club_goals|home_club_position|away_club_position|home_club_manager_name|away_club_manager_name|             stadium|attendance|             referee|                 url|home_club_formation| away_club_formation|home_club_name|      away_club_name|          aggregate| competition_type|
+-------+--------------+------+--------------------+----------+------------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+-----

In [None]:
# TODO (pendiente de implementar): rellenar los valores nulos de 'home_club_name' y 'away_club_name' buscando el nombre correcto a partir de su club_id


In [None]:
# 'url' and 'aggregate' are not relevant to our analysis, so both columns are
# dropped before previewing the first 10 rows of the result.
df_games = df_games.drop("url").drop("aggregate")
df_games.show(n=10)

In [None]:
# Output directory for the cleaned CSV.
# NOTE(review): this is a relative path, not a location under the mounted
# /content/drive — confirm the notebook's working directory so the file really
# ends up in Google Drive.
output_path = "../Data/Football_cleaned"

# Save the cleaned games DataFrame as CSV (with a header row, overwriting any
# previous export). `df_games` is the DataFrame cleaned in the cells above; the
# previous code wrote `df`, which is not the frame this section cleans.
# Spark writes one part-file per partition inside `output_path`, so the data is
# repartitioned to a single partition first to get a single CSV part-file.
df_games.repartition(1).write.csv(output_path, header=True, mode="overwrite")

In [None]:
# Important: stop the Spark session once the work is done. No Spark operation
# can be run in cells after this one without creating a new session.
spark.stop()