In [3]:
#CARGA DE DATOS 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
import joblib
from datetime import datetime, timedelta
import re

# Notebook-wide setup: silence library warnings and fix the plotting theme/palette
# used by every figure below.
warnings.simplefilter('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Title banner followed by a 60-character rule.
print(" Análisis Predictivo Premier League - TFM Big Data", "=" * 60, sep="\n")

# 2. Descripción del Dataset

## 2.1 Carga y Exploración Inicial

# Load the raw match file (expected in the working directory): ';'-separated, Latin-1 encoded.
df = pd.read_csv(
    "Premier League Estadisticas 1993-2025.csv",
    sep=';',
    encoding='latin1',
)
# Header cells may carry stray whitespace; strip it before any column lookup.
df.columns = [nombre.strip() for nombre in df.columns]

n_partidos, n_columnas = df.shape
print("Dataset cargado ")
print(f"Total de partidos registrados: {n_partidos:,}")
print(f"Total de columnas (variables): {n_columnas}")
print("\n Columnas disponibles:")
print(list(df.columns))


In [None]:
## 2.2 Análisis de Calidad de Datos

# Per-column count and percentage of missing values, most incomplete columns first.
print("\n Análisis de valores nulos:")
null_counts = df.isnull().sum()
null_percentages = (null_counts / len(df)) * 100
null_analysis = pd.DataFrame({
    'Valores Nulos': null_counts,
    'Porcentaje (%)': null_percentages.round(2)
}).sort_values('Valores Nulos', ascending=False)
# Only columns that actually have gaps are shown.
print(null_analysis[null_analysis['Valores Nulos'] > 0])

# Basic dataset information.
print("\n📈 Información del dataset:")
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
df.info()

## 2.3 Corrección de Fechas Ambiguas

def fix_ambiguous_dates(date_str, pivot_year=25):
    """Parse one raw date value, treating European day-first order as the default.

    Slash-separated values are read as DD/MM/YY or DD/MM/YYYY. When the second
    field cannot be a month (greater than 12) but the first can, the value is
    taken to be MM/DD and the two fields are swapped. Anything else is handed
    to pandas' automatic parser.

    Parameters
    ----------
    date_str : object
        Raw cell value (normally a string). Missing values are accepted.
    pivot_year : int, default 25
        Two-digit years less than or equal to this value are read as 20YY,
        larger ones as 19YY.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        NaT for missing or unparseable input (for example 31/02/2020).
    """
    if pd.isna(date_str):
        return pd.NaT

    try:
        # Strip padding so values such as " 13/01/99 " are parsed instead of discarded.
        parts = [piece.strip() for piece in str(date_str).split('/')]
        if len(parts) == 3:
            day, month, year = parts

            # Expand a two-digit year to four digits around the pivot.
            if len(year) == 2:
                year = f"20{year}" if int(year) <= pivot_year else f"19{year}"

            day_int, month_int = int(day), int(month)
            if 1 <= day_int <= 31 and 1 <= month_int <= 12:
                return pd.to_datetime(f"{day}/{month}/{year}", format="%d/%m/%Y")
            if 1 <= month_int <= 31 and 1 <= day_int <= 12:
                # Month-first input detected: the second field is really the day.
                return pd.to_datetime(f"{month}/{day}/{year}", format="%d/%m/%Y")

        # Not a recognisable D/M/Y triple: let pandas infer the format.
        return pd.to_datetime(date_str)
    except (ValueError, TypeError, OverflowError):
        # Parsing failures map to NaT; unrelated exceptions (e.g. KeyboardInterrupt)
        # are no longer swallowed as they were by the previous bare `except:`.
        return pd.NaT


# Parse every raw date with the day-first parser defined above.
df['Date'] = df['Date'].apply(fix_ambiguous_dates)

# Sanity check of the conversion.
print(f"Fechas válidas: {df['Date'].notna().sum():,} de {len(df):,}")
print(f"Rango temporal: {df['Date'].min()} a {df['Date'].max()}")

# Temporal helper columns.
# 'Season' is the year in which the football season STARTS: matches played before
# August are assigned to the season that began the previous year. Using the plain
# calendar year would split every season across two values.
# Rows with a missing date stay missing (NaN).
df['Season'] = df['Date'].dt.year - (df['Date'].dt.month < 8).astype(int)
df['Month'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.day_name()

# Función para corregir fechas ambiguas
def corregir_fecha(fecha_str):
    """Parse a date written with a four-digit year, trying day-first before month-first.

    The value is first read as dd/mm/yyyy; only if that fails is it read as
    mm/dd/yyyy. A date that is valid in both orders (e.g. 03/04/2020) is
    therefore always resolved as day-first.

    Parameters
    ----------
    fecha_str : object
        Raw date value; an existing Timestamp is returned unchanged by pandas.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        NaT when the value is missing or matches neither format.
    """
    try:
        for formato in ('%d/%m/%Y', '%m/%d/%Y'):
            fecha = pd.to_datetime(fecha_str, format=formato, errors='coerce')
            if not pd.isna(fecha):
                return fecha
        # Neither layout matched (this also covers None, which pandas passes through as None).
        return pd.NaT
    except (ValueError, TypeError, OverflowError):
        # Only conversion errors are treated as "not a date".
        return pd.NaT

# Run the two-format parser over the 'Date' column and store the result back.
fechas_corregidas = df['Date'].apply(corregir_fecha)
df['Date'] = fechas_corregidas

# Report how many dates could not be converted.
n_fechas_invalidas = fechas_corregidas.isna().sum()
print(f" Fechas no convertidas: {n_fechas_invalidas}")
print(" Fechas corregidas y convertidas a formato datetime.")

In [None]:
# Preview 10 random matches; the fixed seed makes the preview identical on every re-run.
df.sample(10, random_state=42)

EDA

In [None]:
# Keep only the int64/float64 columns for the summary.
tipos_numericos = ['int64', 'float64']
numericas = df.select_dtypes(include=tipos_numericos)

# Descriptive statistics, one row per variable.
print(" Estadísticas descriptivas generales:")
numericas.describe().T

### 3.2.1 Número de partidos según los goles totales y matriz de goles local vs. visitante, para ver los resultados que más se repiten en el fútbol.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Total goals per match (home + away).
df['Total Goles'] = df['Full Time Home Goals'] + df['Full Time Away Goals']

# How many matches ended with each goal total, and what share of all matches that is.
frecuencias = df['Total Goles'].value_counts().sort_index()
valor_mas_frecuente = frecuencias.idxmax()
cantidad_mas_frecuente = frecuencias.max()
total_partidos = len(df)
porcentajes = (frecuencias / total_partidos * 100).round(2)

# Scoreline matrix: rows = home goals, columns = away goals, cells = number of matches.
tabla_goles = pd.crosstab(df['Full Time Home Goals'], df['Full Time Away Goals'])

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: matches per goal total.
sns.barplot(x=frecuencias.index, y=frecuencias.values, color='steelblue', ax=axes[0])
axes[0].set_title("Número de partidos por cantidad total de goles", fontsize=14)
axes[0].set_xlabel("Goles totales (local + visitante)", fontsize=12)
axes[0].set_ylabel("Número de partidos", fontsize=12)
# barplot draws on a categorical axis: the i-th bar sits at x = i and is already
# labelled with its goal total. The ticks are therefore left untouched; forcing
# them to range(min, max + 1) would misplace them whenever a goal total is absent.

# Percentage label above each bar (x is the bar position, not the goal total).
for posicion, (goles_totales, n_partidos) in enumerate(frecuencias.items()):
    axes[0].text(posicion, n_partidos + 5, f"{porcentajes[goles_totales]}%",
                 ha='center', va='bottom', fontsize=10, color='black')

# Right panel: heatmap of exact scorelines.
sns.heatmap(tabla_goles, cmap='YlGnBu', annot=True, fmt='d', ax=axes[1])
axes[1].set_title("Matriz de goles: Local vs Visitante", fontsize=14)
axes[1].set_xlabel("Goles Visitante", fontsize=12)
axes[1].set_ylabel("Goles Local", fontsize=12)

# Caption under the figure with the most frequent goal total.
explicacion = (
    f"Total de partidos analizados: {total_partidos}\n"
    f"El número de goles más frecuente fue {valor_mas_frecuente}, "
    f"con {cantidad_mas_frecuente} partidos ({porcentajes[valor_mas_frecuente]}%)"
)
fig.text(0.5, -0.05, explicacion, wrap=True, ha='center', fontsize=11)

plt.tight_layout()
plt.show()

### 3.2.2 Distribución de goles por equipo Local y Visitante.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Two side-by-side histograms (shared y axis): goals scored by the home and the away side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

paneles_goles = [
    # (column, bar colour, panel title)
    ('Full Time Home Goals', 'royalblue', "Distribución de goles del equipo local"),
    ('Full Time Away Goals', 'darkorange', "Distribución de goles del equipo visitante"),
]

for ax, (columna, color, titulo) in zip(axes, paneles_goles):
    max_goles = df[columna].max()
    # One bin per integer goal count, from 0 up to the observed maximum.
    sns.histplot(df[columna], bins=range(0, max_goles + 2),
                 color=color, edgecolor='black', ax=ax)
    ax.set_title(titulo)
    ax.set_xlabel("Goles")
    ax.set_ylabel("Número de partidos")
    ax.set_xticks(range(0, max_goles + 1))
    ax.grid(axis='y', linestyle='--', alpha=0.4)

plt.tight_layout()
plt.show()

In [None]:
### 3.2.3 Distribución del resultado final por partido, para ver el porcentaje histórico de victorias locales, victorias visitantes y empates.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Crear la columna 'Resultado'
def resultado(row):
    """Classify a match from its full-time goals.

    Reads 'Full Time Home Goals' and 'Full Time Away Goals' from `row` and
    returns 'Victoria Local', 'Victoria Visitante' or 'Empate'.
    """
    goles_local = row['Full Time Home Goals']
    goles_visitante = row['Full Time Away Goals']
    if goles_local > goles_visitante:
        return 'Victoria Local'
    if goles_local < goles_visitante:
        return 'Victoria Visitante'
    # Neither side scored more than the other.
    return 'Empate'

# Label every match with its outcome.
df['Resultado'] = df.apply(resultado, axis=1)

# Bar chart with one bar per outcome.
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=df, x='Resultado', palette='Set2')

# Write each bar's share of all matches just above it.
n_partidos = len(df)
for barra in ax.patches:
    altura = barra.get_height()
    centro_x = barra.get_x() + barra.get_width() / 2
    etiqueta = f'{100 * int(altura) / n_partidos:.1f}%'
    ax.text(centro_x, altura + 1, etiqueta,
            ha='center', va='bottom', fontsize=11, fontweight='bold')

# Labels and layout.
plt.title("Distribución de resultados del partido")
plt.xlabel("Resultado")
plt.ylabel("Número de partidos")
plt.tight_layout()
plt.show()

In [None]:
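# Cambio de nombre de columnas para facilitar el análisis y la codificación.
# NOTA: esta celda no contiene el código de renombrado. Las celdas anteriores usan
# 'Full Time Home Goals' / 'Full Time Away Goals' y las siguientes usan 'FTHG' / 'FTAG' / 'FTR',
# por lo que el renombrado debe añadirse aquí para que el notebook funcione con "Restart & Run All".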

In [None]:
### 3.2.4 Evolución de goles por temporada , resultados por temporada y de goles por mes

In [None]:
# Basic goal statistics per match.
goals_stats = df[['FTHG', 'FTAG']].describe()
print("\n Estadísticas de Goles por Partido:")
print(goals_stats)

# Display names for the FTR codes; an unknown code is shown as-is instead of raising KeyError.
nombres_resultado = {'H': 'Victoria Local', 'D': 'Empate', 'A': 'Victoria Visitante'}

# Share of each result over the whole dataset.
result_dist = df['FTR'].value_counts(normalize=True) * 100
print("\n Distribución de Resultados:")
for result, pct in result_dist.items():
    print(f"{nombres_resultado.get(result, result)}: {pct:.1f}%")

# One row, three panels.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# 1. Mean home/away goals per season.
seasonal_goals = df.groupby('Season')[['FTHG', 'FTAG']].mean()
seasonal_goals.plot(ax=axes[0], title='Evolución de Goles por Temporada')
axes[0].set_xlabel('Temporada')
axes[0].set_ylabel('Goles Promedio')
axes[0].legend(['Goles Local', 'Goles Visitante'])

# 2. Result shares per season (each row sums to 100 %).
result_by_season = df.groupby(['Season', 'FTR']).size().unstack(fill_value=0)
result_by_season_pct = result_by_season.div(result_by_season.sum(axis=1), axis=0) * 100
result_by_season_pct.plot(ax=axes[1], title='Evolución de Resultados por Temporada (%)')
axes[1].set_xlabel('Temporada')
axes[1].set_ylabel('Porcentaje')
# Legend entries follow the actual column order of the plotted frame, so a label
# can never be attached to the wrong line.
axes[1].legend([nombres_resultado.get(codigo, codigo) for codigo in result_by_season_pct.columns])

# 3. Total goals per calendar month.
monthly_goals = df.groupby('Month')[['FTHG', 'FTAG']].sum()
monthly_goals.plot(kind='bar', ax=axes[2], title='Goles Totales por Mes')
axes[2].set_xlabel('Mes')
axes[2].set_ylabel('Goles Totales')
# Label only the months that are present: a fixed list of 12 labels fails (or
# mislabels bars) when some month has no matches.
nombres_meses = ['Ene', 'Feb', 'Mar', 'Abr', 'May', 'Jun',
                 'Jul', 'Ago', 'Sep', 'Oct', 'Nov', 'Dic']
axes[2].set_xticklabels(
    [nombres_meses[int(mes) - 1] for mes in monthly_goals.index],
    rotation=45
)

plt.tight_layout()
plt.show()

In [None]:
# 'Season' = year in which the football season starts. Matches played before August
# belong to the season that began the previous year; grouping by plain calendar year
# would mix two seasons in every group and distort the per-season averages below.
fechas_partido = df['Date']
df['Season'] = fechas_partido.dt.year - (fechas_partido.dt.month < 8).astype(int)

# Average number of matches per season.
partidos_por_temporada = df.groupby('Season').size()
promedio_partidos = partidos_por_temporada.mean()
print(f"Promedio de partidos por temporada: {promedio_partidos:.2f}")

# Mean goals per match within each season, then averaged across seasons.
goles_por_temporada = df.groupby('Season')['Total Goles'].mean()
promedio_goles = goles_por_temporada.mean()
print(f"Promedio de goles por partido (promediado por temporada): {promedio_goles:.2f}")

In [None]:
### 3.2.5 Gráficos de dispersión tiros vs tiros a puerta y tiros a puerta vs goles con sus respectivos coeficientes de correlación de pearson

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Four scatter plots with regression line and Pearson r in the title:
# top row = shots vs shots on target, bottom row = shots on target vs goals;
# left column = home side, right column = away side.
paneles_dispersion = [
    # (x column, y column, grid position, point colour, title prefix, x label, y label)
    ('HS',  'HST',  (0, 0), 'royalblue',  "Local: Disparos vs Tiros a puerta",     "Tiros",          "Tiros a puerta"),
    ('AS',  'AST',  (0, 1), 'darkorange', "Visitante: Disparos vs Tiros a puerta", "Tiros",          "Tiros a puerta"),
    ('HST', 'FTHG', (1, 0), 'seagreen',   "Local: Tiros a puerta vs Goles",        "Tiros a puerta", "Goles"),
    ('AST', 'FTAG', (1, 1), 'firebrick',  "Visitante: Tiros a puerta vs Goles",    "Tiros a puerta", "Goles"),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for col_x, col_y, posicion, color, titulo, etiqueta_x, etiqueta_y in paneles_dispersion:
    # Each pair is cleaned on its own so a gap in one variable does not drop rows from the others.
    datos = df[[col_x, col_y]].dropna()
    coef_r, _ = pearsonr(datos[col_x], datos[col_y])

    ax = axes[posicion]
    sns.regplot(
        x=col_x, y=col_y, data=datos, ax=ax,
        scatter_kws={'color': color}, line_kws={'color': 'black'}
    )
    ax.set_title(f"{titulo} (r = {coef_r:.2f})")
    ax.set_xlabel(etiqueta_x)
    ax.set_ylabel(etiqueta_y)

plt.tight_layout()
plt.show()

In [None]:
### 3.2.6 Correlaciones entre goles y estadísticas

In [None]:
# Build one frame per side with identical, side-neutral column names so the two
# correlation matrices are directly comparable.
nombres_metricas = ['Goals', 'Shots', 'Shots_On_Target', 'Fouls', 'Corners', 'Yellow_Cards', 'Red_Cards']

df_local = df[['FTHG', 'HS', 'HST', 'HF', 'HC', 'HY', 'HR']].copy()
df_local.columns = nombres_metricas

df_visitante = df[['FTAG', 'AS', 'AST', 'AF', 'AC', 'AY', 'AR']].copy()
df_visitante.columns = nombres_metricas

# Correlation matrices.
corr_local = df_local.corr()
corr_visitante = df_visitante.corr()

# Side-by-side heatmaps.
fig, axes = plt.subplots(1, 2, figsize=(18, 7))
mapas = [
    (axes[0], corr_local, 'Greens', " Correlaciones - Equipo Local"),
    (axes[1], corr_visitante, 'Reds', " Correlaciones - Equipo Visitante"),
]
for ax, matriz, paleta, titulo in mapas:
    sns.heatmap(matriz, annot=True, cmap=paleta, center=0, ax=ax, square=True, linewidths=0.5)
    ax.set_title(titulo)

plt.tight_layout()
plt.show()

# Text summary: correlation of every metric with goals, strongest first.
for lado, matriz in (("LOCAL", corr_local), ("VISITANTE", corr_visitante)):
    print(f"\n RESUMEN DE CORRELACIONES CON GOLES - EQUIPO {lado}:")
    con_goles = matriz['Goals'].drop('Goals')
    print(con_goles.sort_values(ascending=False).round(3).to_string())

In [None]:
### 3.2.7 Estadísticas de los 10 mejores equipos

In [None]:
import pandas as pd

# Print wide tables in full.
for opcion in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(opcion, None)

# Group once per role; every per-team mean below reuses these two groupers.
por_local = df.groupby('HomeTeam')
por_visitante = df.groupby('AwayTeam')

# Per-team means, split by whether the team played at home or away.
# "Recibidos/Recibidas" columns are the opponent's figures in those same matches.
goles = pd.DataFrame({
    'Goles Local': por_local['FTHG'].mean(),
    'Goles Visitante': por_visitante['FTAG'].mean(),
    'Goles Recibidos Local': por_local['FTAG'].mean(),
    'Goles Recibidos Visitante': por_visitante['FTHG'].mean(),
})

tiros = pd.DataFrame({
    'Tiros Local': por_local['HS'].mean(),
    'Tiros Recibidos Local': por_local['AS'].mean(),
    'Tiros Visitante': por_visitante['AS'].mean(),
    'Tiros Recibidos Visitante': por_visitante['HS'].mean(),
})

tiros_puerta = pd.DataFrame({
    'Tiros a Puerta Local': por_local['HST'].mean(),
    'Tiros a Puerta Recibidos Local': por_local['AST'].mean(),
    'Tiros a Puerta Visitante': por_visitante['AST'].mean(),
    'Tiros a Puerta Recibidos Visitante': por_visitante['HST'].mean(),
})

corners = pd.DataFrame({
    'Corners Local': por_local['HC'].mean(),
    'Corners Recibidos Local': por_local['AC'].mean(),
    'Corners Visitante': por_visitante['AC'].mean(),
    'Corners Recibidos Visitante': por_visitante['HC'].mean(),
})

faltas = pd.DataFrame({
    'Faltas Local': por_local['HF'].mean(),
    'Faltas Recibidas Local': por_local['AF'].mean(),
    'Faltas Visitante': por_visitante['AF'].mean(),
    'Faltas Recibidas Visitante': por_visitante['HF'].mean(),
})

amarillas = pd.DataFrame({
    'Amarillas Local': por_local['HY'].mean(),
    'Amarillas Visitante': por_visitante['AY'].mean(),
})

rojas = pd.DataFrame({
    'Rojas Local': por_local['HR'].mean(),
    'Rojas Visitante': por_visitante['AR'].mean(),
})

# Home win percentage; teams with no home win get 0 instead of NaN.
victorias_local = df[df['FTR'] == 'H'].groupby('HomeTeam').size()
partidos_local = por_local.size()
ratio_local = (victorias_local / partidos_local).fillna(0) * 100

# Away win percentage, same treatment.
victorias_visitante = df[df['FTR'] == 'A'].groupby('AwayTeam').size()
partidos_visitante = por_visitante.size()
ratio_visitante = (victorias_visitante / partidos_visitante).fillna(0) * 100

ratios = pd.DataFrame({
    '% Victorias Local': ratio_local,
    '% Victorias Visitante': ratio_visitante,
})

# One wide table (teams as rows), rounded and ordered by mean home goals.
bloques_restantes = [tiros, tiros_puerta, corners, faltas, amarillas, rojas, ratios]
tabla_legible = (
    goles.join(bloques_restantes)
    .round(2)
    .sort_values('Goles Local', ascending=False)
)

# Statistics as rows, teams as columns.
tabla_transpuesta = tabla_legible.T

# Keep the first 10 teams of the ordered table.
equipos_destacados = tabla_legible.head(10).index.tolist()
tabla_filtrada = tabla_transpuesta[equipos_destacados]

print("\n Estadisticas 10 principales equipos:")
print(tabla_filtrada.to_string())

# Export the comparison to Excel.
tabla_filtrada.to_excel("comparativa_por_estadistica.xlsx")

In [None]:
### 3.2.8 Gráficos de barras de tiros, tiros a puerta y corners para local y visitantes.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Total shots per match (home + away).
df['Total Shots'] = df['HS'] + df['AS']

# Summary statistics for home, away and total shots ('50%' is shown as 'median').
filas_resumen = ['mean', '50%', 'min', 'max', 'std']
series_tiros = {'Local': df['HS'], 'Visitante': df['AS'], 'Total': df['Total Shots']}
estadisticas = (
    pd.DataFrame({nombre: serie.describe()[filas_resumen] for nombre, serie in series_tiros.items()})
    .rename(index={'50%': 'median'})
    .round(2)
)

# Three histograms: home, away, total. Only the first panel carries a y label.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
paneles_tiros = [
    ('HS', 'dodgerblue', "Tiros equipo local", "Número de partidos"),
    ('AS', 'tomato', "Tiros equipo visitante", ""),
    ('Total Shots', 'mediumseagreen', "Tiros totales por partido", ""),
]
for ax, (columna, color, titulo, etiqueta_y) in zip(axes, paneles_tiros):
    sns.histplot(df[columna], bins=20, color=color, edgecolor='black', ax=ax)
    ax.set_title(titulo)
    ax.set_xlabel("Tiros")
    ax.set_ylabel(etiqueta_y)

plt.tight_layout()

# Second figure: the summary table rendered as an image.
plt.figure(figsize=(8, 2))
plt.axis('off')
tabla = plt.table(
    cellText=estadisticas.values,
    rowLabels=estadisticas.index,
    colLabels=estadisticas.columns,
    cellLoc='center',
    loc='center',
)
tabla.scale(1.2, 1.2)
plt.title("Estadísticas de tiros", fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Total shots on target per match (home + away).
df['Total Shots on Target'] = df['HST'] + df['AST']

# Summary statistics for home, away and total ('50%' is shown as 'median').
filas_resumen = ['mean', '50%', 'min', 'max', 'std']
series_puerta = {'Local': df['HST'], 'Visitante': df['AST'], 'Total': df['Total Shots on Target']}
estadisticas = (
    pd.DataFrame({nombre: serie.describe()[filas_resumen] for nombre, serie in series_puerta.items()})
    .rename(index={'50%': 'median'})
    .round(2)
)

# Three histograms: home, away, total. Only the first panel carries a y label.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
paneles_puerta = [
    ('HST', 'navy', "Tiros a puerta equipo local", "Número de partidos"),
    ('AST', 'firebrick', "Tiros a puerta equipo visitante", ""),
    ('Total Shots on Target', 'darkcyan', "Tiros a puerta totales por partido", ""),
]
for ax, (columna, color, titulo, etiqueta_y) in zip(axes, paneles_puerta):
    sns.histplot(df[columna], bins=15, color=color, edgecolor='black', ax=ax)
    ax.set_title(titulo)
    ax.set_xlabel("Tiros a puerta")
    ax.set_ylabel(etiqueta_y)

plt.tight_layout()
plt.show()

# Summary table rendered as its own figure.
fig, ax1 = plt.subplots(figsize=(10, 2.5))
ax1.axis('off')
tabla1 = ax1.table(
    cellText=estadisticas.values,
    rowLabels=estadisticas.index,
    colLabels=estadisticas.columns,
    cellLoc='center',
    loc='center',
)
tabla1.scale(1.2, 1.2)
ax1.set_title("Estadísticas de tiros a puerta", fontsize=12)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Total corners per match (home + away).
df['Total Corners'] = df['HC'] + df['AC']

# Summary statistics for home, away and total ('50%' is shown as 'median').
filas_resumen = ['mean', '50%', 'min', 'max', 'std']
series_corners = {'Local': df['HC'], 'Visitante': df['AC'], 'Total': df['Total Corners']}
estadisticas_corners = (
    pd.DataFrame({nombre: serie.describe()[filas_resumen] for nombre, serie in series_corners.items()})
    .rename(index={'50%': 'median'})
    .round(2)
)

# Three histograms: home, away, total. Only the first panel carries a y label.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
paneles_corners = [
    ('HC', 'slateblue', "Corners equipo local", "Número de partidos"),
    ('AC', 'orangered', "Corners equipo visitante", ""),
    ('Total Corners', 'seagreen', "Corners totales por partido", ""),
]
for ax, (columna, color, titulo, etiqueta_y) in zip(axes, paneles_corners):
    sns.histplot(df[columna], bins=15, color=color, edgecolor='black', ax=ax)
    ax.set_title(titulo)
    ax.set_xlabel("Corners")
    ax.set_ylabel(etiqueta_y)

plt.tight_layout()
plt.show()

# Summary table rendered as its own figure.
fig, ax1 = plt.subplots(figsize=(10, 2.5))
ax1.axis('off')
tabla1 = ax1.table(
    cellText=estadisticas_corners.values,
    rowLabels=estadisticas_corners.index,
    colLabels=estadisticas_corners.columns,
    cellLoc='center',
    loc='center',
)
tabla1.scale(1.2, 1.2)
ax1.set_title("Estadísticas de corners", fontsize=12)
plt.show()

### 3.2.9 Gráficos de barras de faltas, tarjetas amarillas y rojas, y tabla de información sobre los 10 árbitros que más tarjetas han mostrado en los últimos 10 años

In [None]:
# Normalise headers and guarantee a datetime 'Date' column (unparseable values become NaT).
df.columns = df.columns.str.strip()
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Keep the last 10 years of matches, counted back from the most recent match in the data.
# Anchoring on the dataset instead of pd.Timestamp.now() keeps the selection identical
# whenever the notebook is re-run. .copy() avoids SettingWithCopyWarning below.
fecha_corte = df['Date'].max() - pd.DateOffset(years=10)
df_10y = df[df['Date'] >= fecha_corte].copy()

# Card totals per match.
df_10y['Total Yellow Cards'] = df_10y['HY'] + df_10y['AY']
df_10y['Total Red Cards'] = df_10y['HR'] + df_10y['AR']
df_10y['Total Cards'] = df_10y['Total Yellow Cards'] + df_10y['Total Red Cards']

# Matches officiated by each referee inside the window.
partidos_por_arbitro = df_10y['Referee'].value_counts()

# Per-referee totals of fouls and cards.
arbitros = df_10y.groupby('Referee').agg({
    'HF': 'sum',
    'AF': 'sum',
    'HY': 'sum',
    'AY': 'sum',
    'HR': 'sum',
    'AR': 'sum',
    'Total Cards': 'sum'
})
arbitros['Partidos'] = partidos_por_arbitro

# Per-match averages: total divided by matches officiated.
promedios_por_partido = {
    'Faltas Local': 'HF',
    'Faltas Visitante': 'AF',
    'Amarillas Local': 'HY',
    'Amarillas Visitante': 'AY',
    'Rojas Local': 'HR',
    'Rojas Visitante': 'AR',
}
for nombre_promedio, columna_total in promedios_por_partido.items():
    arbitros[nombre_promedio] = arbitros[columna_total] / arbitros['Partidos']

# Top 10 referees by total cards shown.
top_arbitros = arbitros.sort_values('Total Cards', ascending=False).head(10)
labels = top_arbitros.index.tolist()

# One panel per metric, with home and away bars side by side for each referee.
fig, axes = plt.subplots(1, 3, figsize=(20, 6), sharey=False)

sns.set_style("whitegrid")
bar_width = 0.4
x = np.arange(len(labels))

paneles_arbitros = [
    # (metric prefix, title, home colour, away colour, y label or None)
    ('Faltas', "Promedio de faltas por partido", 'royalblue', 'darkorange', "Faltas por partido"),
    ('Amarillas', "Promedio de tarjetas amarillas", 'gold', 'darkgoldenrod', None),
    ('Rojas', "Promedio de tarjetas rojas", 'lightcoral', 'firebrick', None),
]
for ax, (metrica, titulo, color_local, color_visitante, etiqueta_y) in zip(axes, paneles_arbitros):
    ax.bar(x, top_arbitros[f'{metrica} Local'].values,
           width=bar_width, label='Local', color=color_local)
    ax.bar(x + bar_width, top_arbitros[f'{metrica} Visitante'].values,
           width=bar_width, label='Visitante', color=color_visitante)
    ax.set_title(titulo)
    # Centre each tick between the two bars of a referee.
    ax.set_xticks(x + bar_width / 2)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    if etiqueta_y is not None:
        ax.set_ylabel(etiqueta_y)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Per-match averages of the selected referees, rounded to two decimals.
columnas_resumen = [
    'Faltas Local',
    'Faltas Visitante',
    'Amarillas Local',
    'Amarillas Visitante',
    'Rojas Local',
    'Rojas Visitante',
]
tabla_resumen = top_arbitros[columnas_resumen].round(2)

# Let pandas print every column on a single line.
for opcion, valor in (('display.max_columns', None), ('display.width', 1000)):
    pd.set_option(opcion, valor)

print(" Tabla resumen de promedios por árbitro (últimos 10 años):")
print(tabla_resumen)

In [None]:
# Debug aid: list every column currently in df, including the derived columns
# added by the earlier cells (e.g. 'Total Goles', 'Total Shots').
print(df.columns.tolist())

In [None]:
#outliers y valores faltantes

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Numeric match metrics to inspect, listed as home/away pairs.
metricas = [
    'FTHG', 'FTAG',
    'HTHG', 'HTAG',
    'HS', 'AS',
    'HST', 'AST',
    'HF', 'AF',
    'HC', 'AC',
    'HY', 'AY',
    'HR', 'AR',
]

# Long format: one row per (metric, value) so a single boxplot call can draw them all.
df_long = pd.melt(df[metricas], var_name='Métrica', value_name='Valor')

# One box per metric on a shared value axis.
plt.figure(figsize=(14, 8))
sns.boxplot(x='Métrica', y='Valor', data=df_long, palette='pastel')
plt.xticks(rotation=90)
plt.title('Detección de Outliers en Variables Numéricas (Premier League 1993–2025)')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Missing-value report: count and percentage per column, most incomplete first.
print("\n Análisis de valores nulos:")
null_counts = df.isna().sum()
null_percentages = null_counts.div(len(df)).mul(100)
null_analysis = pd.DataFrame(
    {'Valores Nulos': null_counts, 'Porcentaje (%)': null_percentages.round(2)}
).sort_values('Valores Nulos', ascending=False)
# Show only the columns that have at least one gap.
tiene_nulos = null_analysis['Valores Nulos'].gt(0)
print(null_analysis[tiene_nulos])

In [None]:
#IMPUTACION DE VALORES FALTANTES Y LIMPIEZA. 

import pandas as pd
import numpy as np



# Work on a new frame without the unused 'Div' and 'Time' columns
# (a column that is not present is simply skipped).
df = df.drop(columns=['Div', 'Time'], errors='ignore')

# Numeric columns whose gaps will be imputed.
num_cols = [
    'HTHG', 'HTAG',
    'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC',
    'HY', 'AY', 'HR', 'AR',
    'Total Shots', 'Total Shots on Target', 'Total Corners',
]

# Dataset-wide medians, used as the last-resort fill value.
global_meds = df[num_cols].median()

# Columns that describe the home side; every other column falls back to the away-team median.
_HOME_STAT_COLS = frozenset({'HTHG', 'HS', 'HST', 'HF', 'HC', 'HY', 'HR'})


def impute_numeric(row, col, data=None, fallback=None):
    """Return a value for `row[col]`, imputing it when it is missing.

    The first available option is used:
      1. the existing value, if present;
      2. the median of `col` over the other matches with the same home and away team;
      3. the median of `col` for the relevant team: the home team's home matches
         for home-side columns, otherwise the away team's away matches
         (match totals such as 'Total Shots' also take this away-team branch);
      4. the dataset-wide median.

    Parameters
    ----------
    row : pandas.Series
        One match; must expose 'HomeTeam', 'AwayTeam', `col` and its index label as `.name`.
    col : str
        Column to impute.
    data : pandas.DataFrame, optional
        Frame used to compute medians. Defaults to the notebook-level `df`.
    fallback : mapping, optional
        Per-column last-resort values. Defaults to the notebook-level `global_meds`.
    """
    if pd.notna(row[col]):
        return row[col]

    data = df if data is None else data
    fallback = global_meds if fallback is None else fallback

    # Other meetings of the same fixture (the row itself is excluded).
    same_fixture = (
        (data['HomeTeam'] == row['HomeTeam'])
        & (data['AwayTeam'] == row['AwayTeam'])
        & (data.index != row.name)
    )
    vals = data.loc[same_fixture, col].dropna()
    if not vals.empty:
        return vals.median()

    # Team-level median. The previous test `col.startswith('Home')` never matched
    # any column name, so home statistics were always filled from the away team.
    if col in _HOME_STAT_COLS:
        candidate = data.loc[data['HomeTeam'] == row['HomeTeam'], col].median()
    else:
        candidate = data.loc[data['AwayTeam'] == row['AwayTeam'], col].median()
    if pd.notna(candidate):
        return candidate

    # Last resort: dataset-wide median.
    return fallback[col]

# Fill the gaps column by column; each row is passed to impute_numeric with the column name.
for columna in num_cols:
    df[columna] = df.apply(impute_numeric, axis=1, args=(columna,))

# Matches without a recorded referee get an explicit placeholder.
df['Referee'] = df['Referee'].fillna(value='Unknown')

# Lógica de imputación para HTR usando goles al descanso
def impute_htr(row):
    """Return the half-time result for a match, deriving it from half-time goals if missing.

    An existing 'HTR' value is kept. Otherwise the result is 'H', 'D' or 'A'
    according to 'HTHG' versus 'HTAG'. If either goal count is itself missing,
    the result stays missing rather than defaulting to an away lead.
    """
    if pd.notna(row['HTR']):
        return row['HTR']

    goles_local, goles_visitante = row['HTHG'], row['HTAG']
    if pd.isna(goles_local) or pd.isna(goles_visitante):
        # Comparisons with NaN are all False, which previously fell through to 'A'.
        return row['HTR']

    if goles_local > goles_visitante:
        return 'H'
    if goles_local < goles_visitante:
        return 'A'
    return 'D'

# Fill the missing half-time results row by row.
htr_imputado = df.apply(impute_htr, axis=1)
df['HTR'] = htr_imputado

# Final check: remaining missing values per column.
print("Valores nulos tras imputación:")
print(df.isna().sum())

In [None]:
#feature engineering

In [None]:
# Persist the cleaned dataset with the same separator and encoding as the raw file.
ruta_csv_limpio = "Premier League Estadisticas 1993-2025_clean.csv"
df.to_csv(ruta_csv_limpio, index=False, sep=";", encoding="latin1")

print(f"CSV limpio generado: {ruta_csv_limpio}")

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

# 0. Artefact folder (scaler and models are written here)
os.makedirs("models", exist_ok=True)

# 1. Load the cleaned history written by the export cell
df = pd.read_csv(
    "Premier League Estadisticas 1993-2025_clean.csv",
    parse_dates=["Date"],
    sep=";",
    encoding="latin1"
)

# Normalise team names (trim + upper case)
df["HomeTeam"] = df["HomeTeam"].str.strip().str.upper()
df["AwayTeam"] = df["AwayTeam"].str.strip().str.upper()

# 2. Basic pre-match variables
# League points earned in each match: 3 win / 1 draw / 0 loss
df["home_points"] = np.where(df.FTHG > df.FTAG, 3,
                     np.where(df.FTHG == df.FTAG, 1, 0))
df["away_points"] = np.where(df.FTAG > df.FTHG, 3,
                     np.where(df.FTAG == df.FTHG, 1, 0))

# Expanding means that exclude the current match (shift() drops it).
# Home features are computed over a team's home matches only, away features
# over its away matches only.
# NOTE(review): shift()/expanding() follow the frame's row order; this assumes
# the rows are already chronological within each team - TODO confirm.
for side, g, s, sot, c in [
    ("home","FTHG","HS","HST","HC"),
    ("away","FTAG","AS","AST","AC")
]:
    team = f"{side.capitalize()}Team"
    df[f"{side}_avg_goals"] = df.groupby(team)[g] \
        .transform(lambda x: x.shift().expanding().mean())
    df[f"{side}_avg_shots"] = df.groupby(team)[s] \
        .transform(lambda x: x.shift().expanding().mean())
    df[f"{side}_avg_sot"]   = df.groupby(team)[sot] \
        .transform(lambda x: x.shift().expanding().mean())
    df[f"{side}_avg_corners"] = df.groupby(team)[c] \
        .transform(lambda x: x.shift().expanding().mean())

# Form: mean points over the previous 5 matches (window 5, min_periods 1)
df["form_home_5"] = df.groupby("HomeTeam")["home_points"] \
    .transform(lambda x: x.shift().rolling(5,1).mean())
df["form_away_5"] = df.groupby("AwayTeam")["away_points"] \
    .transform(lambda x: x.shift().rolling(5,1).mean())

# Win flags used for the winning-streak features
df["win_home"] = (df.FTHG > df.FTAG).astype(int)
df["win_away"] = (df.FTAG > df.FTHG).astype(int)
def calc_streak(wins):
    """Return, for every match, the winning run that preceded it.

    ``wins`` is an iterable of 0/1 flags in match order. The i-th output is
    the number of consecutive wins immediately before match i, so the match's
    own result never leaks into its value.
    """
    prior_runs = []
    running = 0
    for outcome in wins:
        prior_runs.append(running)
        if outcome == 1:
            running += 1
        else:
            running = 0
    return prior_runs
# Winning streak before each match, per team, over home (resp. away) matches only
df["streak_home"] = df.groupby("HomeTeam")["win_home"].transform(calc_streak)
df["streak_away"] = df.groupby("AwayTeam")["win_away"].transform(calc_streak)

# Head-to-head: median goal difference of earlier meetings
def h2h_med_diff(row):
    """Median of (FTHG - FTAG) over earlier matches of the same fixture.

    "Same fixture" means the same home team hosting the same away team, on a
    date strictly before ``row.Date``. Returns 0 when no such match exists.
    Reads the module-level ``df``.
    """
    same_fixture = (df.HomeTeam == row.HomeTeam) & (df.AwayTeam == row.AwayTeam)
    played_before = df.Date < row.Date
    previous = df.loc[same_fixture & played_before, ["FTHG", "FTAG"]]
    if previous.empty:
        return 0
    return (previous["FTHG"] - previous["FTAG"]).median()
# Row-wise head-to-head feature (one full-frame scan per row)
df["h2h_goal_diff_med"] = df.apply(h2h_med_diff, axis=1)

# Home-field indicator (constant 1 for every row)
df["is_home"] = 1

# Historical strength score: cumulative goals, wins and points before each
# match (shift() excludes the current match)
for side, g, w, p in [
    ("home","FTHG","win_home","home_points"),
    ("away","FTAG","win_away","away_points"),
]:
    team = f"{side.capitalize()}Team"
    df[f"{side}_total_goals"] = df.groupby(team)[g] \
        .transform(lambda x: x.shift().expanding().sum())
    df[f"{side}_total_wins"]  = df.groupby(team)[w] \
        .transform(lambda x: x.shift().expanding().sum())
    df[f"{side}_total_pts"]   = df.groupby(team)[p] \
        .transform(lambda x: x.shift().expanding().sum())

# Weighted sum: wins x2, goals x1, points x1.5; rows with no history get 0
df["home_strength_score"] = df.home_total_wins*2 + df.home_total_goals + df.home_total_pts*1.5
df["away_strength_score"] = df.away_total_wins*2 + df.away_total_goals + df.away_total_pts*1.5
df[["home_strength_score","away_strength_score"]] = \
    df[["home_strength_score","away_strength_score"]].fillna(0)

# >>> Improvement: additional pre-match features
# Average points per match before the current one
df["home_avg_points"] = df.groupby("HomeTeam")["home_points"] \
    .transform(lambda x: x.shift().expanding().mean())
df["away_avg_points"] = df.groupby("AwayTeam")["away_points"] \
    .transform(lambda x: x.shift().expanding().mean())

# Average goal difference before the current match
df["home_goal_diff"] = df.FTHG - df.FTAG
df["away_goal_diff"] = df.FTAG - df.FTHG
df["home_avg_goal_diff"] = df.groupby("HomeTeam")["home_goal_diff"] \
    .transform(lambda x: x.shift().expanding().mean())
df["away_avg_goal_diff"] = df.groupby("AwayTeam")["away_goal_diff"] \
    .transform(lambda x: x.shift().expanding().mean())

# Shots-on-target ratio and its running average
# NOTE(review): a match with HS == 0 (or AS == 0) yields NaN/inf here - confirm
# the data never contains zero-shot rows.
df["home_sot_ratio"] = df.HST/df.HS
df["away_sot_ratio"] = df.AST/df.AS
df["home_avg_sot_ratio"] = df.groupby("HomeTeam")["home_sot_ratio"] \
    .transform(lambda x: x.shift().expanding().mean())
df["away_avg_sot_ratio"] = df.groupby("AwayTeam")["away_sot_ratio"] \
    .transform(lambda x: x.shift().expanding().mean())

# Mean shots over the previous 5 matches (window 5, min_periods 1)
df["shots_home_5"] = df.groupby("HomeTeam")["HS"] \
    .transform(lambda x: x.shift().rolling(5,1).mean())
df["shots_away_5"] = df.groupby("AwayTeam")["AS"] \
    .transform(lambda x: x.shift().rolling(5,1).mean())

# Rest-day features: per team, the gap in days between consecutive home
# (resp. away) fixtures, shifted one row within the group.
df["home_rest_days"] = df.groupby("HomeTeam")["Date"] \
    .transform(lambda x: x.diff().dt.days.shift())
df["away_rest_days"] = df.groupby("AwayTeam")["Date"] \
    .transform(lambda x: x.diff().dt.days.shift())
# Fill the gaps by explicit assignment. The previous
# `df.home_rest_days.fillna(..., inplace=True)` mutated an intermediate Series:
# pandas flags it as chained assignment and, under Copy-on-Write, it leaves
# `df` untouched.
df["home_rest_days"] = df["home_rest_days"].fillna(df["home_rest_days"].median())
df["away_rest_days"] = df["away_rest_days"].fillna(df["away_rest_days"].median())

# Home-minus-away differences of the running averages
df["diff_avg_goals"]   = df.home_avg_goals - df.away_avg_goals
df["diff_avg_shots"]   = df.home_avg_shots - df.away_avg_shots
df["diff_avg_corners"] = df.home_avg_corners - df.away_avg_corners
df["diff_strength"]    = df.home_strength_score - df.away_strength_score

# Same differences under the names the training cell uses. These are computed
# on the raw values and end up only in df_hist_raw.csv; they are not part of
# FEATURES_PM below.
df["goal_diff"]   = df.home_avg_goals - df.away_avg_goals
df["shot_diff"]   = df.home_avg_shots - df.away_avg_shots
df["sot_diff"]    = df.home_avg_sot - df.away_avg_sot
df["corner_diff"] = df.home_avg_corners - df.away_avg_corners

# 3. Save the enriched history and the feature matrix
df.to_csv("df_hist_raw.csv", sep=";", index=False)
print("df_hist_raw.csv guardado")

# The 30 pre-match features the scaler is fitted on (order matters)
FEATURES_PM = [
    "home_avg_goals","away_avg_goals",
    "home_avg_shots","away_avg_shots",
    "home_avg_sot","away_avg_sot",
    "home_avg_corners","away_avg_corners",
    "form_home_5","form_away_5",
    "streak_home","streak_away",
    "h2h_goal_diff_med","is_home",
    "home_strength_score","away_strength_score",
    "home_avg_points","away_avg_points",
    "home_avg_goal_diff","away_avg_goal_diff",
    "home_avg_sot_ratio","away_avg_sot_ratio",
    "shots_home_5","shots_away_5",
    "home_rest_days","away_rest_days",
    "diff_avg_goals","diff_avg_shots",
    "diff_avg_corners","diff_strength"
]

# Missing features (e.g. a team's first match) become 0 before scaling.
# NOTE(review): the scaler is fitted on the full history, before the
# train/test split done in the training cell - confirm this is acceptable.
X_pm = df[FEATURES_PM].fillna(0)
scaler = StandardScaler().fit(X_pm)
joblib.dump(scaler, "models/scaler_pm.joblib")
print("scaler_pm.joblib guardado")

# Model-ready table: scaled features next to the unscaled targets
df_scaled = pd.DataFrame(scaler.transform(X_pm), columns=FEATURES_PM)
df_model_ready = pd.concat([
    df_scaled,
    df[["FTR","FTHG","FTAG","HS","AS","HST","AST","HC","AC"]]
], axis=1)
df_model_ready.to_csv("df_model_ready.csv", index=False)
print("df_model_ready.csv guardado")

In [None]:
#entrenamiento

import os
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, log_loss,
    classification_report, confusion_matrix,
    mean_absolute_error, mean_squared_error, r2_score
)
from sklearn.calibration import CalibratedClassifierCV

# 1. Load data (scaled features + raw targets written by the feature cell)
df = pd.read_csv("df_model_ready.csv")

# 2. Additional derived features
# The differences are taken on the standardised columns stored in the CSV.
df["goal_diff"]   = df.home_avg_goals   - df.away_avg_goals
df["shot_diff"]   = df.home_avg_shots   - df.away_avg_shots
df["sot_diff"]    = df.home_avg_sot     - df.away_avg_sot
df["corner_diff"] = df.home_avg_corners - df.away_avg_corners

# The 20 features the models are trained on: the first 16 scaled features
# plus the four differences above
FEATURES_PM = [
    "home_avg_goals","away_avg_goals",
    "home_avg_shots","away_avg_shots",
    "home_avg_sot","away_avg_sot",
    "home_avg_corners","away_avg_corners",
    "form_home_5","form_away_5",
    "streak_home","streak_away",
    "h2h_goal_diff_med","is_home",
    "home_strength_score","away_strength_score",
    "goal_diff","shot_diff","sot_diff","corner_diff"
]

X = df[FEATURES_PM]
# Class encoding: home win 0, draw 1, away win 2
y_clf = df.FTR.map({"H":0,"D":1,"A":2})
y_fthg = df.FTHG
y_ftag = df.FTAG

# 3. Train/test split (70/30, stratified on the match result)
# NOTE(review): this is a random split over time-ordered matches - confirm a
# chronological split is not required.
X_train, X_test, y_train_clf, y_test_clf = train_test_split(
    X, y_clf, test_size=0.3, random_state=42, stratify=y_clf
)
# The regressors reuse the same rows; targets are aligned by index
Xr_train, Xr_test = X_train, X_test
y_train_fthg = y_fthg.loc[Xr_train.index]
y_test_fthg  = y_fthg.loc[Xr_test.index]
y_train_ftag = y_ftag.loc[Xr_train.index]
y_test_ftag  = y_ftag.loc[Xr_test.index]

os.makedirs("models", exist_ok=True)

# 4. Baselines: most-frequent class and mean goals
base_clf = DummyClassifier(strategy="most_frequent", random_state=42)
base_clf.fit(X_train, y_train_clf)
joblib.dump(base_clf, "models/baseline_clf.joblib")

base_reg_fthg = DummyRegressor(strategy="mean")
base_reg_fthg.fit(Xr_train, y_train_fthg)
joblib.dump(base_reg_fthg, "models/baseline_reg_FTHG.joblib")

base_reg_ftag = DummyRegressor(strategy="mean")
base_reg_ftag.fit(Xr_train, y_train_ftag)
joblib.dump(base_reg_ftag, "models/baseline_reg_FTAG.joblib")

# 5. Classifiers
# NOTE(review): `multi_class` (scikit-learn) and `use_label_encoder` (xgboost)
# appear to be deprecated in recent releases - confirm against the installed
# versions.
classifiers = {
    "LogReg": LogisticRegression(
        multi_class='multinomial',
        class_weight='balanced',
        max_iter=1000,
        random_state=42
    ),
    "RF": RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42
    ),
    "XGB": XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    )
}

# Tune, (optionally) calibrate, persist and evaluate every classifier.
results_clf = {}
for name, model in classifiers.items():
    # Search space per model family
    if name=="LogReg":
        params={"C":[0.01,0.1,1,10,100]}
    elif name=="RF":
        params={"n_estimators":[100,200,300],"max_depth":[None,10,20],"min_samples_split":[2,5,10]}
    else:
        params={"n_estimators":[100,200],"max_depth":[3,6,10],"learning_rate":[0.01,0.1,0.2]}

    search = RandomizedSearchCV(model, params, n_iter=5, cv=3,
                                scoring='accuracy', random_state=42, n_jobs=-1)
    search.fit(X_train, y_train_clf)
    # With the default refit=True the best estimator is already trained on the
    # whole of X_train, so it does not need to be fitted again.
    tuned = search.best_estimator_
    print(f" Mejores params {name}:", search.best_params_)

    if name=="XGB":
        # Calibrate XGB probabilities. CalibratedClassifierCV fits clones, so
        # `tuned` stays fitted and can still report feature importances below.
        best = CalibratedClassifierCV(tuned, cv=3)
        best.fit(X_train, y_train_clf)
    else:
        best = tuned
    joblib.dump(best, f"models/{name}_clf.joblib")

    preds = best.predict(X_test)
    probs = best.predict_proba(X_test)
    results_clf[name] = {
        "acc": accuracy_score(y_test_clf, preds),
        "f1":  f1_score(y_test_clf, preds, average='weighted'),
        "roc": roc_auc_score(pd.get_dummies(y_test_clf), probs, average='weighted'),
        "ll":  log_loss(y_test_clf, probs)
    }

    print(classification_report(y_test_clf, preds, target_names=["Home","Draw","Away"]))
    print(confusion_matrix(y_test_clf, preds))

    # 5-fold cross-validated accuracy on the training split
    cv_scores = cross_val_score(best, X_train, y_train_clf, cv=5, scoring='accuracy')
    print(f" Accuracy promedio CV para {name}: {cv_scores.mean():.3f}")

    # Feature importances (RF and XGB). They are read from the tuned estimator:
    # the calibrated XGB wrapper exposes no `feature_importances_`, so checking
    # `best` silently skipped XGB.
    if hasattr(tuned, "feature_importances_"):
        imp = pd.Series(tuned.feature_importances_, index=FEATURES_PM)
        print(f" Top 5 features para clasificación {name}:")
        print(imp.nlargest(5))

# Baseline metrics (no probabilities are evaluated for the dummy model)
pred_base = base_clf.predict(X_test)
results_clf["Baseline"] = {
    "acc": accuracy_score(y_test_clf, pred_base),
    "f1":  f1_score(y_test_clf, pred_base, average='weighted'),
    "roc": None, "ll": None
}
print(pd.DataFrame(results_clf).T)

# 6. Regression of match statistics: one RF and one XGB model per target
stat_targets = ["FTHG","FTAG","HS","AS","HST","AST","HC","AC"]
regressors = {}
results_stats = {}

for stat in stat_targets:
    regressors[f"RF_{stat}"]  = RandomForestRegressor(n_estimators=100, random_state=42)
    regressors[f"XGB_{stat}"] = XGBRegressor(random_state=42)

for name, model in regressors.items():
    # Key format is "<family>_<target>"; recover the target column
    stat = name.split("_",1)[1]
    y_tr = df[stat].loc[Xr_train.index]
    y_te = df[stat].loc[Xr_test.index]

    params = {"n_estimators":[100,200],"max_depth":[None,10]}
    if "XGB" in name: params["learning_rate"]=[0.05,0.1]

    search = RandomizedSearchCV(model, params, n_iter=3, cv=3,
                                scoring='neg_mean_absolute_error',
                                random_state=42, n_jobs=-1)
    search.fit(Xr_train, y_tr)
    best = search.best_estimator_
    print(f" Params {name}:", search.best_params_)
    # Fit again on the training split and persist under models/<name>.joblib
    best.fit(Xr_train, y_tr)
    joblib.dump(best, f"models/{name}.joblib")

    preds = best.predict(Xr_test)
    mse = mean_squared_error(y_te, preds)
    results_stats[name] = {
        "mae": mean_absolute_error(y_te, preds),
        "rmse": np.sqrt(mse),
        "r2": r2_score(y_te, preds)
    }

    # --- Added improvements ---
    # Cross-validation (scores are negated MAE, hence the sign flip)
    cv_scores = cross_val_score(best, Xr_train, y_tr, cv=5, scoring='neg_mean_absolute_error')
    print(f" MAE promedio CV para {name}: {-cv_scores.mean():.3f}")

    # Feature importances
    if hasattr(best, "feature_importances_"):
        imp = pd.Series(best.feature_importances_, index=FEATURES_PM)
        print(f" Top 5 features para regresión {name}:")
        print(imp.nlargest(5))

print(pd.DataFrame(results_stats).T)

In [None]:
#importancia de clasificadores 

#!/usr/bin/env python3


import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, RFE

# 1. Load data and rebuild the derived differences (same as the training cell)
df = pd.read_csv("df_model_ready.csv")

df["goal_diff"]   = df.home_avg_goals   - df.away_avg_goals
df["shot_diff"]   = df.home_avg_shots   - df.away_avg_shots
df["sot_diff"]    = df.home_avg_sot     - df.away_avg_sot
df["corner_diff"] = df.home_avg_corners - df.away_avg_corners

# Must list the same features, in the same order, as the training cell
FEATURES_PM = [
    "home_avg_goals","away_avg_goals",
    "home_avg_shots","away_avg_shots",
    "home_avg_sot","away_avg_sot",
    "home_avg_corners","away_avg_corners",
    "form_home_5","form_away_5",
    "streak_home","streak_away",
    "h2h_goal_diff_med","is_home",
    "home_strength_score","away_strength_score",
    "goal_diff","shot_diff","sot_diff","corner_diff"
]

X = df[FEATURES_PM]
# Class encoding: home win 0, draw 1, away win 2
y_clf = df.FTR.map({"H":0, "D":1, "A":2})

# 2. Same split as the training cell (same test_size, seed and stratification);
#    only the training part is used here
X_train, X_unused, y_train_clf, y_unused = train_test_split(
    X, y_clf, test_size=0.3, random_state=42, stratify=y_clf
)

# 3. Load the trained classifiers saved by the training cell
models_clf = {
    "LogReg": joblib.load("models/LogReg_clf.joblib"),
    "RF":     joblib.load("models/RF_clf.joblib"),
    "XGB":    joblib.load("models/XGB_clf.joblib")
}

# 4. Importance report
print("\n\n=== Importancia en Clasificadores ===\n")

# Univariate ANOVA F depends only on the training data, not on the model, so
# it is fitted and printed once instead of once per classifier.
skb = SelectKBest(f_classif, k=10).fit(X_train, y_train_clf)
sk  = pd.Series(skb.scores_, index=FEATURES_PM).sort_values(ascending=False)
print("Top 10 univariate ANOVA F:")
print(sk.head(10), "\n")

for name, model in models_clf.items():
    print(f"--- {name} ---")
    reported = False

    # 1) Mean absolute coefficient per feature (linear models only)
    if hasattr(model, "coef_"):
        coef = model.coef_
        if coef.ndim > 1:
            # One row per class: average the magnitudes across classes
            vals = np.mean(np.abs(coef), axis=0)
        else:
            vals = np.abs(coef).ravel()
        imp = pd.Series(vals, index=FEATURES_PM).sort_values(ascending=False)
        print("Top 10 coeficientes absolutos promedio (LogReg):")
        print(imp.head(10), "\n")
        reported = True

    # 2) feature_importances_ (tree ensembles)
    if hasattr(model, "feature_importances_"):
        fi = pd.Series(model.feature_importances_, index=FEATURES_PM) \
               .sort_values(ascending=False)
        print("Top 10 feature_importances_:")
        print(fi.head(10), "\n")
        reported = True

    # The calibrated XGB wrapper exposes neither attribute; say so instead of
    # printing an empty section.
    if not reported:
        print("Sin coef_ ni feature_importances_ expuestos por este modelo.\n")

    # 3) Recursive feature elimination, RandomForest only
    if name == "RF":
        rfe = RFE(estimator=model, n_features_to_select=10, step=1) \
                  .fit(X_train, y_train_clf)
        selected = pd.Series(rfe.support_, index=FEATURES_PM)
        print("Features seleccionadas por RFE (RF):")
        print(selected[selected].index.tolist(), "\n")



In [None]:
#funcion de prediccion 

import os
import pandas as pd
import numpy as np
import joblib
import difflib
from datetime import datetime

# 0. Base path of the artefacts: the script's folder when run as a file,
#    the working directory when __file__ is undefined
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()

# 0.1 Enriched history written by the feature-engineering cell
df_hist = pd.read_csv(
    os.path.join(BASE_DIR, "df_hist_raw.csv"),
    parse_dates=["Date"],
    sep=";"
)
df_hist["HomeTeam"] = df_hist["HomeTeam"].str.strip().str.upper()
df_hist["AwayTeam"] = df_hist["AwayTeam"].str.strip().str.upper()

# 0.2 Load the scaler and the models
scaler      = joblib.load(os.path.join(BASE_DIR, "models", "scaler_pm.joblib"))

# Classifier name -> artefact file name
clf_files = {
    "LogReg":  "LogReg_clf.joblib",
    "RF":      "RF_clf.joblib",
    "XGB":     "XGB_clf.joblib",
    "Baseline":"baseline_clf.joblib"
}
clf_models = {
    name: joblib.load(os.path.join(BASE_DIR, "models", fname))
    for name, fname in clf_files.items()
}

# One RF and one XGB regressor per statistic, keyed "<family>_<target>"
stat_targets = ["FTHG","FTAG","HS","AS","HST","AST","HC","AC"]
reg_models = {}
for prefix in ["RF","XGB"]:
    for tgt in stat_targets:
        key = f"{prefix}_{tgt}"
        reg_models[key] = joblib.load(
            os.path.join(BASE_DIR, "models", f"{key}.joblib")
        )

# 1. Feature lists
# First 30 entries: the features the scaler was fitted on, in the same order.
EXT_FEATURES_PM = [
    "home_avg_goals","away_avg_goals",
    "home_avg_shots","away_avg_shots",
    "home_avg_sot","away_avg_sot",
    "home_avg_corners","away_avg_corners",
    "form_home_5","form_away_5",
    "streak_home","streak_away",
    "h2h_goal_diff_med","is_home",
    "home_strength_score","away_strength_score",
    "home_avg_points","away_avg_points",
    "home_avg_goal_diff","away_avg_goal_diff",
    "home_avg_sot_ratio","away_avg_sot_ratio",
    "shots_home_5","shots_away_5",
    "home_rest_days","away_rest_days",
    "diff_avg_goals","diff_avg_shots",
    "diff_avg_corners","diff_strength",
    # the 4 final differences
    "goal_diff","shot_diff","sot_diff","corner_diff"
]
# Features the models consume: the first 16 plus the 4 final differences
# (the same 20 names, in the same order, as the training cell's FEATURES_PM)
MODEL_FEATURES = EXT_FEATURES_PM[:16] + EXT_FEATURES_PM[30:]

# 2. Helpers
def get_consecutive_wins(win_list):
    """Count the leading run of wins in ``win_list``.

    ``win_list`` holds 0/1 flags, most recent match first. The result is the
    number of entries equal to 1 before the first other value (0 when the
    list is empty or does not start with a win).
    """
    seen = 0
    for seen, won in enumerate(win_list, start=1):
        if won != 1:
            # The run ended on the previous entry
            return seen - 1
    return seen

def resolve_team(name, col):
    """Map a user-supplied team name to a name present in ``df_hist[col]``.

    The name is trimmed and upper-cased. An exact match is returned as is;
    otherwise the closest name by difflib (cutoff 0.6) is used and the mapping
    is printed.

    Raises
    ------
    ValueError
        If no known team is close enough to ``name``.
    """
    wanted = name.strip().upper()
    known = df_hist[col].unique().tolist()
    if wanted not in known:
        closest = difflib.get_close_matches(wanted, known, n=1, cutoff=0.6)
        if not closest:
            raise ValueError(f"Equipo no encontrado: '{name}'")
        print(f"Mapeo '{name}'→'{closest[0]}'")
        return closest[0]
    return wanted

def _last_or_nan(values):
    """Return the final element of ``values``, or NaN when it is empty."""
    return values.iloc[-1] if len(values) else np.nan


def build_pre_match_features(home, away, match_date=None):
    """Build the one-row, unscaled pre-match feature frame for a fixture.

    Parameters
    ----------
    home, away : team names; resolved through ``resolve_team``.
    match_date : optional datetime; when given, only matches strictly before
        it are used.

    Returns
    -------
    DataFrame with one row and the columns of ``EXT_FEATURES_PM``; features
    that cannot be computed (no history) are 0.

    Raises
    ------
    ValueError
        Propagated from ``resolve_team`` for an unknown team.

    Every match in the filtered history precedes the fixture being predicted,
    so each feature aggregates the whole history. This mirrors the training
    features, where ``shift()`` removes only the current match. The previous
    ``...shift().iloc[-1]`` expressions dropped the latest match as well and
    raised IndexError for a team without history before ``match_date``.
    """
    df_temp = df_hist.copy()
    if match_date:
        df_temp = df_temp[df_temp.Date < match_date]

    hk = resolve_team(home, "HomeTeam")
    ak = resolve_team(away, "AwayTeam")

    # Home team's home matches and away team's away matches, oldest first
    hdf = df_temp[df_temp.HomeTeam == hk].sort_values("Date")
    adf = df_temp[df_temp.AwayTeam == ak].sort_values("Date")

    # Points (3/1/0) and win flags per match; float Series so that empty
    # histories aggregate to NaN/0 without warnings
    hpts = pd.Series(
        np.where(hdf.FTHG > hdf.FTAG, 3, np.where(hdf.FTHG == hdf.FTAG, 1, 0)),
        dtype=float,
    )
    apts = pd.Series(
        np.where(adf.FTAG > adf.FTHG, 3, np.where(adf.FTAG == adf.FTHG, 1, 0)),
        dtype=float,
    )
    hw = (hdf.FTHG > hdf.FTAG).astype(int)
    aw = (adf.FTAG > adf.FTHG).astype(int)

    # Earlier meetings of this exact fixture
    h2h = df_temp[(df_temp.HomeTeam == hk) & (df_temp.AwayTeam == ak)]

    # Strength score: wins x2 + goals + points x1.5
    home_strength = hw.sum()*2 + hdf.FTHG.sum() + hpts.sum()*1.5
    away_strength = aw.sum()*2 + adf.FTAG.sum() + apts.sum()*1.5

    # Shots-on-target ratios; a zero-shot match must not turn the mean into inf
    home_sot_ratio = hdf.HST.div(hdf.HS).replace([np.inf, -np.inf], np.nan)
    away_sot_ratio = adf.AST.div(adf.AS).replace([np.inf, -np.inf], np.nan)

    data = {
        "home_avg_goals":    hdf.FTHG.mean(),
        "away_avg_goals":    adf.FTAG.mean(),
        "home_avg_shots":    hdf.HS.mean(),
        "away_avg_shots":    adf.AS.mean(),
        "home_avg_sot":      hdf.HST.mean(),
        "away_avg_sot":      adf.AST.mean(),
        "home_avg_corners":  hdf.HC.mean(),
        "away_avg_corners":  adf.AC.mean(),
        # Form: mean points of the 5 most recent matches
        "form_home_5":       hpts.tail(5).mean(),
        "form_away_5":       apts.tail(5).mean(),
        # Current winning run, counted from the most recent match backwards
        "streak_home":       get_consecutive_wins(hw[::-1].tolist()),
        "streak_away":       get_consecutive_wins(aw[::-1].tolist()),
        "h2h_goal_diff_med": (h2h["FTHG"] - h2h["FTAG"]).median(),
        "is_home":           1,
        "home_strength_score": home_strength,
        "away_strength_score": away_strength,
        "home_avg_points":   hpts.mean(),
        "away_avg_points":   apts.mean(),
        "home_avg_goal_diff": hdf.FTHG.sub(hdf.FTAG).mean(),
        "away_avg_goal_diff": adf.FTAG.sub(adf.FTHG).mean(),
        "home_avg_sot_ratio": home_sot_ratio.mean(),
        "away_avg_sot_ratio": away_sot_ratio.mean(),
        # Mean shots of the last 5 matches (fewer if the history is shorter)
        "shots_home_5":      hdf.HS.tail(5).mean(),
        "shots_away_5":      adf.AS.tail(5).mean(),
        # Gap in days between the two most recent matches
        "home_rest_days":    _last_or_nan(hdf.Date.diff().dt.days),
        "away_rest_days":    _last_or_nan(adf.Date.diff().dt.days),
        "diff_avg_goals":    hdf.FTHG.mean() - adf.FTAG.mean(),
        "diff_avg_shots":    hdf.HS.mean()   - adf.AS.mean(),
        "diff_avg_corners":  hdf.HC.mean()   - adf.AC.mean(),
        "diff_strength":     home_strength - away_strength,
        "goal_diff":         hdf.FTHG.mean() - adf.FTAG.mean(),
        "shot_diff":         hdf.HS.mean()   - adf.AS.mean(),
        "sot_diff":          hdf.HST.mean()  - adf.AST.mean(),
        "corner_diff":       hdf.HC.mean()   - adf.AC.mean()
    }

    return pd.DataFrame([data], columns=EXT_FEATURES_PM).fillna(0)

# 3. Predicción
def predict_match(home, away, date_str=None):
    """Predict the result and the match statistics for ``home`` vs ``away``.

    Parameters
    ----------
    home, away : team names.
    date_str : optional "YYYY-MM-DD"; only matches before it feed the features.

    Returns
    -------
    (ftr, stats)
        ``ftr[model]`` is ``{"label": "H"|"D"|"A", "proba": {...} or None}``
        (no probabilities for the baseline). ``stats[target][family]`` is
        ``{"pred": float, "interval": (lo, hi) or None}``; the interval is
        present only for models exposing ``estimators_``.

    Raises
    ------
    ValueError
        For a malformed date, or propagated from feature construction.
    """
    match_date = None
    if date_str:
        try:
            match_date = datetime.strptime(date_str, "%Y-%m-%d")
        except (ValueError, TypeError) as exc:
            # Only parsing failures are translated; anything else propagates
            raise ValueError("Fecha inválida, usa YYYY-MM-DD") from exc

    # Raw (unscaled) features
    X_ext = build_pre_match_features(home, away, match_date)

    # The scaler knows exactly the first 30 names of EXT_FEATURES_PM
    scaler_features = EXT_FEATURES_PM[:30]

    # Neutralise inf/NaN before scaling: the scaler rejects non-finite input
    raw = X_ext[scaler_features].replace([np.inf, -np.inf], np.nan).fillna(0)
    X_scaled = scaler.transform(raw)
    Xs_df = pd.DataFrame(
        np.nan_to_num(X_scaled, nan=0, posinf=0, neginf=0),
        columns=scaler_features
    )

    # Rebuild the 4 differences in scaled space, as the training cell does
    Xs_df["goal_diff"]   = Xs_df.home_avg_goals   - Xs_df.away_avg_goals
    Xs_df["shot_diff"]   = Xs_df.home_avg_shots   - Xs_df.away_avg_shots
    Xs_df["sot_diff"]    = Xs_df.home_avg_sot     - Xs_df.away_avg_sot
    Xs_df["corner_diff"] = Xs_df.home_avg_corners - Xs_df.away_avg_corners

    # Final feature matrix for the models
    Xm = Xs_df[MODEL_FEATURES]

    # Classification: class index -> result code
    inv = {0: "H", 1: "D", 2: "A"}
    ftr, stats = {}, {}

    for name, clf in clf_models.items():
        if name == "Baseline":
            label_idx = int(clf.predict(Xm)[0])
            proba = None
        else:
            proba = clf.predict_proba(Xm)[0]
            label_idx = int(np.argmax(proba))
        ftr[name] = {
            "label": inv[label_idx],
            "proba": None if proba is None else {
                "H": proba[0],
                "D": proba[1],
                "A": proba[2]
            }
        }

    # Regression of the match statistics
    for key, model in reg_models.items():
        prefix, tgt = key.split("_", 1)
        pred = model.predict(Xm)[0]
        lo = hi = None
        if hasattr(model, "estimators_"):
            # Spread of the individual estimators. The report labels this band
            # "IC95%", so take the central 95% (2.5-97.5), not 5-95 (a 90% band).
            arr = np.array([est.predict(Xm)[0] for est in model.estimators_])
            lo, hi = np.percentile(arr, 2.5), np.percentile(arr, 97.5)
        stats.setdefault(tgt, {})[prefix] = {
            "pred": round(pred, 2),
            "interval": None if lo is None else (round(lo, 2), round(hi, 2))
        }

    return ftr, stats

def predict_match_formatted(home, away, date_str=None):
    """Return a printable report of the prediction for ``home`` vs ``away``.

    Identical teams, or a ValueError raised by ``predict_match``, produce an
    "Error: ..." string instead of a report. The headline model is whichever
    of RF/XGB assigns the highest probability to its own predicted label.
    """
    if home.strip().upper() == away.strip().upper():
        return "Error: local y visitante iguales."
    try:
        ftr, stats = predict_match(home, away, date_str)
    except ValueError as e:
        return f"Error: {e}"

    def own_label_probability(model_name):
        # Probability the model gives to the label it predicts
        entry = ftr[model_name]
        return entry["proba"][entry["label"]]

    def describe(prefix, entry):
        # "<prefix><pred>" followed by " [lo–hi]" when an interval exists
        text = f"{prefix}{entry['pred']:.2f}"
        if entry['interval']:
            text += f" [{entry['interval'][0]:.2f}–{entry['interval'][1]:.2f}]"
        return text

    # Headline model: best of RF/XGB among those reporting probabilities
    contenders = [n for n, r in ftr.items() if r["proba"] and n in ["RF", "XGB"]]
    best = max(contenders, key=own_label_probability)
    lbl = ftr[best]["label"]
    conf = ftr[best]["proba"][lbl] * 100

    lines = [
        f"Predicción: {home} vs {away}",
        "="*40,
        f"Modelo: {best} → {lbl} ({conf:.1f}%)",
        "",
        "Estadísticas esperadas:",
    ]
    for t in stat_targets:
        chosen = stats[t][best]
        line = f"  - {t}: {chosen['pred']:.2f}"
        if chosen["interval"]:
            line += f" IC95%[{chosen['interval'][0]:.2f}–{chosen['interval'][1]:.2f}]"
        lines.append(line)

    lines.extend(["", "Resumen completo:", "  Clasificación:"])
    for n, r in ftr.items():
        probs = r["proba"]
        if probs:
            lines.append(
                f"    - {n}: {r['label']} "
                f"(H {probs['H']*100:.1f}%, D {probs['D']*100:.1f}%, A {probs['A']*100:.1f}%)"
            )
        else:
            lines.append(f"    - {n}: {r['label']}")

    lines.append("  Estadísticas (RF vs XGB):")
    for t in stat_targets:
        rf_txt = describe("RF  ", stats[t]["RF"])
        xg_txt = describe("XGB ", stats[t]["XGB"])
        lines.append(f"    - {t}: {rf_txt}, {xg_txt}")

    return "\n".join(lines)

# Interactive entry point: ask for both teams and print the formatted report.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__" as
# well, so this prompts whenever the cell runs - confirm that is intended.
if __name__ == "__main__":
    home = input("Local: ").strip()
    away = input("Visitante: ").strip()
    print(predict_match_formatted(home, away))



In [None]:
#EQUIPOS DISPONIBLES EN EL MODELO 

# After df_hist has been loaded and normalised (prediction cell)
teams_home = df_hist["HomeTeam"].unique()
teams_away = df_hist["AwayTeam"].unique()

# Union of both arrays, sorted alphabetically
all_teams = sorted(set(teams_home).union(teams_away))

# Print the team count followed by one team per line
print(f"Tienes {len(all_teams)} equipos registrados:")
for t in all_teams:
    print(" -", t)