<a href="https://colab.research.google.com/github/josevargas1229/PrySoccerPredict/blob/Jose/Copia_de_futbol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
urls = [
    "https://raw.githubusercontent.com/josevargas1229/pruebasPython/main/futbol%20datasets/E0%2019-20.csv",
    "https://raw.githubusercontent.com/josevargas1229/pruebasPython/main/futbol%20datasets/E0%2020-21.csv",
    "https://raw.githubusercontent.com/josevargas1229/pruebasPython/main/futbol%20datasets/E0%2021-22.csv",
    "https://raw.githubusercontent.com/josevargas1229/pruebasPython/main/futbol%20datasets/E0%2022-23.csv",
    "https://raw.githubusercontent.com/josevargas1229/pruebasPython/main/futbol%20datasets/E0%2023-24.csv"
]
# Read one DataFrame per season from its CSV URL.
df_list = [pd.read_csv(url) for url in urls]
# Stack all seasons into a single DataFrame with a fresh 0..n-1 index.
combined_df = pd.concat(df_list, ignore_index=True)
# Keep only the match columns under study, from 'HomeTeam' through 'AR'.
# Selecting by label instead of by position (3:24) keeps the selection correct
# even if the leading columns of the CSVs change. `.copy()` makes `df` an
# independent frame, so later column assignments do not act on a slice of
# `combined_df` (which triggers SettingWithCopyWarning).
df = combined_df.loc[:, 'HomeTeam':'AR'].copy()
df.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,Liverpool,Norwich,4,1,H,4,0,H,M Oliver,15,...,7,5,9,9,11,2,0,2,0,0
1,West Ham,Man City,0,5,A,0,1,A,M Dean,5,...,3,9,6,13,1,1,2,2,0,0
2,Bournemouth,Sheffield United,1,1,D,0,0,D,K Friend,13,...,3,3,10,19,3,4,2,1,0,0
3,Burnley,Southampton,3,0,H,0,0,D,G Scott,10,...,4,3,6,12,2,7,0,0,0,0
4,Crystal Palace,Everton,0,0,D,0,0,D,J Moss,6,...,2,3,16,14,6,2,2,1,0,1


In [None]:
# Encode the categorical team columns through each team's historical results.
def calculate_team_performance(df, team_col, result_col):
    """Return, per team, the share of its matches that ended in each result.

    Parameters:
        df: DataFrame holding one row per match.
        team_col: name of the column identifying the team to group by.
        result_col: name of the column holding the match result.

    Returns:
        DataFrame indexed by team with one column per distinct result value.
        Each cell is the fraction of that team's rows with that result, and
        is 0 where the team never had that result.
    """
    results_by_team = df.groupby(team_col)[result_col]
    # normalize=True turns the per-team counts into proportions.
    result_shares = results_by_team.value_counts(normalize=True)
    # Pivot the result values into columns; missing combinations become NaN.
    shares_wide = result_shares.unstack()
    return shares_wide.fillna(0)

# Encode the full-time result numerically: home win = 1, draw = 0, away win = -1.
result_mapping = {'H': 1, 'D': 0, 'A': -1}
ftr_encoded = df['FTR'].map(result_mapping)
# `.map` silently turns any value outside the mapping into NaN (this also
# happens if the cell is run a second time on already-encoded data), so fail
# loudly instead of training on a corrupted target.
if ftr_encoded.isna().any():
    raise ValueError("FTR contains values other than 'H', 'D' or 'A'; reload the data before encoding.")
# `assign` builds a new frame, so no column is written onto a slice of
# `combined_df` (avoids SettingWithCopyWarning).
df = df.assign(FTR=ftr_encoded)

# Per-team historical result rates, separately for home and away appearances.
# NOTE(review): these rates are computed over every match, including each row's
# own result and the rows that are later split off as test data, so they leak
# the target into the features. Computing them on training rows only would
# give an honest evaluation.
home_performance = calculate_team_performance(df, 'HomeTeam', 'FTR')
away_performance = calculate_team_performance(df, 'AwayTeam', 'FTR')

# Suffix the result columns so the home and away tables do not collide.
home_performance.columns = [f'{col}_home' for col in home_performance.columns]
away_performance.columns = [f'{col}_away' for col in away_performance.columns]

# Attach the rates to every match. The columns were renamed above, so no names
# overlap and no merge suffixes are needed.
df = df.merge(home_performance, left_on='HomeTeam', right_index=True, how='left')
df = df.merge(away_performance, left_on='AwayTeam', right_index=True, how='left')

# The team names are now represented by their rates; drop the raw columns.
df = df.drop(columns=['HomeTeam', 'AwayTeam'])


In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1900 entries, 0 to 1899
Data columns (total 25 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   FTHG     1900 non-null   int64  
 1   FTAG     1900 non-null   int64  
 2   FTR      1900 non-null   int64  
 3   HTHG     1900 non-null   int64  
 4   HTAG     1900 non-null   int64  
 5   HTR      1900 non-null   object 
 6   Referee  1900 non-null   object 
 7   HS       1900 non-null   int64  
 8   AS       1900 non-null   int64  
 9   HST      1900 non-null   int64  
 10  AST      1900 non-null   int64  
 11  HF       1900 non-null   int64  
 12  AF       1900 non-null   int64  
 13  HC       1900 non-null   int64  
 14  AC       1900 non-null   int64  
 15  HY       1900 non-null   int64  
 16  AY       1900 non-null   int64  
 17  HR       1900 non-null   int64  
 18  AR       1900 non-null   int64  
 19  -1_home  1900 non-null   float64
 20  0_home   1900 non-null   float64
 21  1_home   1900 

In [None]:
# Drop the columns that will not be used by the model.
# `errors='ignore'` keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(
    columns=['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HTR', 'Referee'],
    errors='ignore',
)
# Spanish names previously noted for columns that are not kept (reference only):
#     'FTHG': 'Goles_local',
#     'FTAG': 'Goles_visitante',
#     'HTHG': 'Goles_local_medio',
#     'HTAG': 'Goles_visitante_medio',
#     'HTR': 'Resultados_medio',
#     'HomeTeam': 'Equipo_local',
#     'AwayTeam': 'Equipo_visitante',
#     'Referee': 'Arbitro',

# Rename the remaining columns to descriptive Spanish names.
df = df.rename(columns={
    'FTR': 'Resultados',
    'HS': 'Disparos_local',
    'AS': 'Disparos_visitantes',
    'HST': 'Disparos_local_porteria',
    'AST': 'Disparos_visitantes_porteria',
    'HF': 'Faltas_local',
    'AF': 'Faltas_visitantes',
    'HC': 'Esquinas_local',
    'AC': 'Esquinas_visitantes',
    'HY': 'Tar_amarilla_local',
    'AY': 'Tar_amarilla_visitantes',
    'HR': 'Tar_roja_local',
    'AR': 'Tar_roja_visitantes',
})
df.head()

Unnamed: 0,Resultados,Disparos_local,Disparos_visitantes,Disparos_local_porteria,Disparos_visitantes_porteria,Faltas_local,Faltas_visitantes,Esquinas_local,Esquinas_visitantes,Tar_amarilla_local,Tar_amarilla_visitantes,Tar_roja_local,Tar_roja_visitantes,-1_home,0_home,1_home,-1_away,0_away,1_away
0,1,15,12,7,5,9,9,11,2,0,2,0,0,0.084211,0.168421,0.747368,0.078947,0.184211,0.736842
1,-1,5,14,3,9,6,13,1,1,2,2,0,0,0.315789,0.263158,0.421053,0.673684,0.147368,0.178947
2,0,13,8,3,3,10,19,3,4,2,1,0,0,0.403509,0.280702,0.315789,0.122807,0.22807,0.649123
3,1,10,11,4,3,6,12,2,7,0,0,0,0,0.486842,0.263158,0.25,0.263158,0.210526,0.526316
4,0,6,10,2,3,16,14,6,2,2,1,0,1,0.336842,0.305263,0.357895,0.263158,0.263158,0.473684


In [None]:
# Target vector: the encoded match result column.
target_column = 'Resultados'
Y = df[target_column]
Y

0       1
1      -1
2       0
3       1
4       0
       ..
1895    1
1896    1
1897   -1
1898    1
1899   -1
Name: Resultados, Length: 1900, dtype: int64

In [None]:
# Feature matrix: every column of the frame except the target.
X = df.drop('Resultados', axis='columns')
X

Unnamed: 0,Disparos_local,Disparos_visitantes,Disparos_local_porteria,Disparos_visitantes_porteria,Faltas_local,Faltas_visitantes,Esquinas_local,Esquinas_visitantes,Tar_amarilla_local,Tar_amarilla_visitantes,Tar_roja_local,Tar_roja_visitantes,-1_home,0_home,1_home,-1_away,0_away,1_away
0,15,12,7,5,9,9,11,2,0,2,0,0,0.084211,0.168421,0.747368,0.078947,0.184211,0.736842
1,5,14,3,9,6,13,1,1,2,2,0,0,0.315789,0.263158,0.421053,0.673684,0.147368,0.178947
2,13,8,3,3,10,19,3,4,2,1,0,0,0.403509,0.280702,0.315789,0.122807,0.228070,0.649123
3,10,11,4,3,6,12,2,7,0,0,0,0,0.486842,0.263158,0.250000,0.263158,0.210526,0.526316
4,6,10,2,3,16,14,6,2,2,1,0,1,0.336842,0.305263,0.357895,0.263158,0.263158,0.473684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,15,8,9,2,10,8,2,4,1,4,0,0,0.336842,0.305263,0.357895,0.336842,0.189474,0.473684
1896,36,4,14,3,14,11,10,2,1,1,0,1,0.084211,0.168421,0.747368,0.284211,0.252632,0.463158
1897,15,15,6,7,15,20,4,4,5,4,0,0,0.578947,0.210526,0.210526,0.245614,0.298246,0.456140
1898,28,3,12,2,3,12,11,2,0,1,0,0,0.094737,0.126316,0.778947,0.315789,0.178947,0.505263


In [None]:
# Standardize every feature to zero mean and unit variance.
# The result is wrapped in a new float DataFrame rather than written back with
# `X[:] = ...`: the match-statistic columns are int64, and assigning floats
# into them in place is an incompatible-dtype set that recent pandas deprecates.
# NOTE(review): the scaler is fit on all rows before the later train/test
# split, so test-set statistics influence the scaling.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X

Unnamed: 0,Disparos_local,Disparos_visitantes,Disparos_local_porteria,Disparos_visitantes_porteria,Faltas_local,Faltas_visitantes,Esquinas_local,Esquinas_visitantes,Tar_amarilla_local,Tar_amarilla_visitantes,Tar_roja_local,Tar_roja_visitantes,-1_home,0_home,1_home,-1_away,0_away,1_away
0,0.196677,0.077416,0.823827,0.366931,-0.468563,-0.494276,1.686182,-0.971123,-1.307301,0.132786,-0.231406,-0.244917,-1.855303,-0.863987,1.957264,-1.890682,-0.865109,2.252612
1,-1.541398,0.462976,-0.691073,2.015086,-1.335708,0.607982,-1.518846,-1.328984,0.278318,0.132786,-0.231406,-0.244917,-0.118424,0.524564,-0.127794,2.560299,-1.603617,-1.996087
2,-0.150938,-0.693703,-0.691073,-0.457146,-0.179514,2.261370,-0.877840,-0.255400,0.278318,-0.636400,-0.231406,-0.244917,0.539485,0.781703,-0.800393,-1.562439,0.014067,1.584578
3,-0.672360,-0.115364,-0.312348,-0.457146,-1.335708,0.332418,-1.198343,0.818184,-1.307301,-1.405586,-0.231406,-0.244917,1.164499,0.524564,-1.220768,-0.512060,-0.337604,0.649329
4,-1.367590,-0.308144,-1.069798,-0.457146,1.554776,0.883547,0.083668,-0.971123,0.278318,-0.636400,-0.231406,3.801531,0.039475,1.141698,-0.531353,-0.512060,0.717408,0.248509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,0.196677,-0.693703,1.581277,-0.869185,-0.179514,-0.769840,-1.198343,-0.255400,-0.514491,1.671157,-0.231406,-0.244917,0.039475,1.141698,-0.531353,0.039389,-0.759608,0.248509
1896,3.846633,-1.464823,3.474902,-0.457146,0.976679,0.056853,1.365679,-0.971123,-0.514491,-0.636400,-0.231406,3.801531,-1.855303,-0.863987,1.957264,-0.354503,0.506405,0.168345
1897,0.196677,0.655756,0.445102,1.191008,1.265728,2.536935,-0.557337,-0.255400,2.656745,1.671157,-0.231406,-0.244917,1.855303,-0.246854,-1.472992,-0.643357,1.420748,0.114902
1898,2.456174,-1.657603,2.717452,-0.869185,-2.202853,0.332418,1.686182,-0.971123,-1.307301,-0.636400,-0.231406,-0.244917,-1.776354,-1.481121,2.159044,-0.118168,-0.970610,0.489001


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [None]:
# Search state for the feature-selection loop: best feature count so far,
# its accuracy, and the mask of selected features.
best_num_features, best_accuracy, best_features = 0, 0, None

In [None]:
# Start from a clean search state so this cell gives the same answer when it is
# re-run on its own, without depending on another cell having reset it.
best_num_features = 0
best_accuracy = 0
best_features = None

# Logistic regression ranks the features for RFE. `multi_class` is no longer
# passed: 'auto' was already its default and the parameter is deprecated in
# recent scikit-learn releases.
modelo_logistico = LogisticRegression(solver='lbfgs', max_iter=1000)

# Try every possible feature count instead of a hard-coded upper bound.
for n_features in range(1, X.shape[1] + 1):
    rfe = RFE(estimator=modelo_logistico, n_features_to_select=n_features)
    modelo_svm = SVC()

    # Put RFE inside the cross-validated pipeline so the features are
    # re-selected on each training fold. Selecting on all rows first would let
    # the validation folds influence the selection and inflate the score.
    candidate = make_pipeline(rfe, modelo_svm)
    accuracies = cross_val_score(candidate, X, Y, cv=5, scoring='accuracy')

    # Mean accuracy over the five folds.
    mean_accuracy = np.mean(accuracies)

    print(f"Características: {n_features}, Precisión promedio: {mean_accuracy}")

    # Keep the best configuration seen so far.
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_num_features = n_features
        # cross_val_score fits clones, so `rfe` itself is still unfitted here;
        # fit it on all rows to obtain the final selection mask.
        best_features = rfe.fit(X, Y).support_

Características: 1, Precisión promedio: 0.5536842105263158
Características: 2, Precisión promedio: 0.6005263157894737
Características: 3, Precisión promedio: 0.611578947368421
Características: 4, Precisión promedio: 0.6121052631578948
Características: 5, Precisión promedio: 0.6163157894736842
Características: 6, Precisión promedio: 0.6226315789473684
Características: 7, Precisión promedio: 0.6268421052631579
Características: 8, Precisión promedio: 0.6226315789473684
Características: 9, Precisión promedio: 0.6273684210526316
Características: 10, Precisión promedio: 0.6284210526315789
Características: 11, Precisión promedio: 0.6273684210526316
Características: 12, Precisión promedio: 0.6205263157894737
Características: 13, Precisión promedio: 0.6189473684210526
Características: 14, Precisión promedio: 0.6168421052631579
Características: 15, Precisión promedio: 0.6189473684210526


In [None]:
# Turn the boolean selection mask into column names and report the search result.
selected_features = X.columns[best_features]
resumen = (
    ("Mejor número de características:", best_num_features),
    ("Mejores características:", selected_features),
    ("Mejor precisión promedio:", best_accuracy),
)
for etiqueta, valor in resumen:
    print(etiqueta, valor)

Mejor número de características: 10
Mejores características: Index(['Disparos_local', 'Disparos_visitantes', 'Disparos_local_porteria',
       'Disparos_visitantes_porteria', 'Esquinas_local', 'Esquinas_visitantes',
       '-1_home', '0_home', '-1_away', '1_away'],
      dtype='object')
Mejor precisión promedio: 0.6284210526315789


In [None]:
# Keep only the features chosen by the selection step.
X_selected = X[selected_features]
# Hold out 20% of the matches for testing. A fixed `random_state` makes the
# split (and every accuracy reported below) reproducible across runs, and
# `stratify=Y` keeps the home/draw/away proportions equal in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_selected, Y, test_size=0.2, random_state=42, stratify=Y
)

In [None]:
# Candidate classifiers, keyed by the display name used when reporting results.
# The tree-based models are randomized; a fixed `random_state` makes their
# scores reproducible from run to run.
modelos = {
        'SVC': SVC(),
        'KNN': KNeighborsClassifier(),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
    }


In [None]:
# State for the model comparison: best fitted model so far and its accuracy.
best_model, best_accuracy = None, 0

In [None]:
# Reset the comparison state here so the cell gives the same answer when it is
# re-run on its own. `best_accuracy` is also the name used by the
# feature-selection search; from this point on it holds the best test accuracy.
best_model = None
best_accuracy = 0

# Fixed label order so every confusion matrix has rows/columns for
# away win (-1), draw (0) and home win (1), even if a class is never predicted.
class_labels = [-1, 0, 1]

# Train each candidate on the training split and score it on the held-out split.
# NOTE(review): choosing the best model on the test split makes its reported
# accuracy optimistic; a separate validation split would avoid that.
for name, modelo in modelos.items():
    modelo.fit(X_train, Y_train)
    Y_pred = modelo.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = modelo
    conf_matrix = confusion_matrix(Y_test, Y_pred, labels=class_labels)
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print("\n" + "-"*50 + "\n")

Model: SVC
Accuracy: 0.6631578947368421
Confusion Matrix:
[[104   6  22]
 [ 26  13  48]
 [ 17   9 135]]

--------------------------------------------------

Model: KNN
Accuracy: 0.5657894736842105
Confusion Matrix:
[[ 96  11  25]
 [ 40  14  33]
 [ 29  27 105]]

--------------------------------------------------

Model: Decision Tree
Accuracy: 0.5184210526315789
Confusion Matrix:
[[ 66  31  35]
 [ 27  21  39]
 [ 22  29 110]]

--------------------------------------------------

Model: Random Forest
Accuracy: 0.6552631578947369
Confusion Matrix:
[[102   7  23]
 [ 28  15  44]
 [ 17  12 132]]

--------------------------------------------------



In [None]:
def predict_match_result(home_team, away_team, model, home_perf, away_perf, scaler=None):
    """Predict the result of one fixture from the teams' historical result rates.

    Parameters:
        home_team: name of the home team, looked up in ``home_perf.index``.
        away_team: name of the away team, looked up in ``away_perf.index``.
        model: fitted estimator exposing ``predict``. If it has a
            ``feature_names_in_`` attribute, the input row is aligned to it.
        home_perf: DataFrame indexed by team with the home result-rate columns.
        away_perf: DataFrame indexed by team with the away result-rate columns.
        scaler: optional fitted scaler exposing ``feature_names_in_`` and
            ``transform``. When given, the result-rate columns are scaled with
            it before prediction. Pass the scaler used on the training
            features if the model was trained on scaled data.

    Returns:
        The first element of ``model.predict`` for the constructed row.
    """
    # Match statistics are unknown before the game, so they are set to 0.
    # For a model trained on standardized features, 0 is the training mean.
    match_data = {
        'Disparos_local': 0, 'Disparos_visitantes': 0, 'Disparos_local_porteria': 0, 'Disparos_visitantes_porteria': 0,
        'Esquinas_local': 0, 'Esquinas_visitantes': 0, 'Tar_amarilla_local': 0, 'Tar_amarilla_visitantes': 0,
        'Tar_roja_local': 0, 'Tar_roja_visitantes': 0
    }

    # Add each side's historical rates; a team that is not in the table gets 0
    # for every rate column.
    for team, perf in ((home_team, home_perf), (away_team, away_perf)):
        team_is_known = team in perf.index
        for col in perf.columns:
            match_data[col] = perf.loc[team, col] if team_is_known else 0

    match_df = pd.DataFrame([match_data]).astype(float)

    if scaler is not None:
        # Scale a full row in the scaler's own column order, then copy back only
        # the rate columns: the statistic placeholders must stay at 0 (the
        # mean), not become "zero shots" in standardized units.
        scaler_cols = list(scaler.feature_names_in_)
        scaled_values = scaler.transform(match_df.reindex(columns=scaler_cols, fill_value=0.0))
        scaled_row = pd.DataFrame(scaled_values, columns=scaler_cols, index=match_df.index)
        for col in list(home_perf.columns) + list(away_perf.columns):
            if col in scaled_row.columns:
                match_df[col] = scaled_row[col]

    # Estimators fitted on a DataFrame reject input whose columns differ from
    # the fitted ones, so keep exactly those columns, in the fitted order.
    expected_cols = getattr(model, 'feature_names_in_', None)
    if expected_cols is not None:
        match_df = match_df.reindex(columns=list(expected_cols), fill_value=0.0)

    prediction = model.predict(match_df)
    return prediction[0]

In [None]:
# Teams of the specific fixture to predict.
home_team, away_team = "Liverpool", "Norwich"

In [None]:
# Realizar la predicción
# result = predict_match_result(home_team, away_team, best_model, home_performance, away_performance)
# result_mapping_inverse = {1: 'Home Win', 0: 'Draw', -1: 'Away Win'}

# print(f"Predicted result for {home_team} vs {away_team}: {result_mapping_inverse[result]}")