In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import glob

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# for notebook
%matplotlib inline

## Datasets

LaLigaCompleteDataset : 29 temporadas

In [2]:
# https://www.kaggle.com/datasets/kishan305/la-liga-results-19952020?rvi=1

la_liga = pd.read_csv('../data/LaLigaCompleteDataset/LaLiga_Matches.csv', sep=',')
df = la_liga.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10883 entries, 0 to 10882
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Season    10883 non-null  object 
 1   Date      10883 non-null  object 
 2   HomeTeam  10883 non-null  object 
 3   AwayTeam  10883 non-null  object 
 4   FTHG      10883 non-null  int64  
 5   FTAG      10883 non-null  int64  
 6   FTR       10883 non-null  object 
 7   HTHG      10881 non-null  float64
 8   HTAG      10881 non-null  float64
 9   HTR       10881 non-null  object 
dtypes: float64(2), int64(2), object(6)
memory usage: 850.4+ KB


In [3]:
df.head(20)

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR
0,1995-96,02-09-1995,La Coruna,Valencia,3,0,H,2.0,0.0,H
1,1995-96,02-09-1995,Sp Gijon,Albacete,3,0,H,3.0,0.0,H
2,1995-96,03-09-1995,Ath Bilbao,Santander,4,0,H,2.0,0.0,H
3,1995-96,03-09-1995,Ath Madrid,Sociedad,4,1,H,1.0,1.0,D
4,1995-96,03-09-1995,Celta,Compostela,0,1,A,0.0,0.0,D
5,1995-96,03-09-1995,Espanol,Salamanca,3,1,H,2.0,1.0,H
6,1995-96,03-09-1995,Merida,Betis,1,1,D,1.0,1.0,D
7,1995-96,03-09-1995,Sevilla,Tenerife,0,1,A,0.0,0.0,D
8,1995-96,03-09-1995,Valladolid,Barcelona,0,2,A,0.0,0.0,D
9,1995-96,03-09-1995,Vallecano,Real Madrid,1,5,A,0.0,1.0,A


In [4]:
df.tail(20)

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR
10863,2023-24,26-09-2023,Sevilla,Almeria,5,1,H,3.0,0.0,H
10864,2023-24,26-09-2023,Mallorca,Barcelona,2,2,D,2.0,1.0,H
10865,2023-24,27-09-2023,Ath Bilbao,Getafe,2,2,D,1.0,0.0,H
10866,2023-24,27-09-2023,Real Madrid,Las Palmas,2,0,H,1.0,0.0,H
10867,2023-24,27-09-2023,Villarreal,Girona,1,2,A,0.0,0.0,D
10868,2023-24,27-09-2023,Cadiz,Vallecano,0,0,D,0.0,0.0,D
10869,2023-24,27-09-2023,Valencia,Sociedad,0,1,A,0.0,1.0,A
10870,2023-24,28-09-2023,Celta,Alaves,1,1,D,1.0,0.0,H
10871,2023-24,28-09-2023,Granada,Betis,1,1,D,0.0,0.0,D
10872,2023-24,28-09-2023,Osasuna,Ath Madrid,0,2,A,0.0,1.0,A


In [5]:
df.columns

Index(['Season', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG',
       'HTAG', 'HTR'],
      dtype='object')

In [6]:
df = df[['Season', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']] 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10883 entries, 0 to 10882
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Season    10883 non-null  object
 1   HomeTeam  10883 non-null  object
 2   AwayTeam  10883 non-null  object
 3   FTHG      10883 non-null  int64 
 4   FTAG      10883 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 425.2+ KB


In [7]:
df.rename(columns = {
    'Season':'year', 
    'HomeTeam':'home', 
    'AwayTeam':'away', 
    'FTHG':'home_goals', 
    'FTAG':'away_goals'
    }, inplace = True) 


In [8]:
df = df[['year','home','away','home_goals','away_goals']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10883 entries, 0 to 10882
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   year        10883 non-null  object
 1   home        10883 non-null  object
 2   away        10883 non-null  object
 3   home_goals  10883 non-null  int64 
 4   away_goals  10883 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 425.2+ KB


In [9]:
df = df.dropna(how='any',axis=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10883 entries, 0 to 10882
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   year        10883 non-null  object
 1   home        10883 non-null  object
 2   away        10883 non-null  object
 3   home_goals  10883 non-null  int64 
 4   away_goals  10883 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 425.2+ KB


In [10]:
print(sorted(df['year'].unique()))
print("#temporadas = ", len(df['year'].unique()))

['1995-96', '1996-97', '1997-98', '1998-99', '1999-2000', '2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24']
#temporadas =  29


In [11]:
#temp = ['2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23']
#df = df.loc[df['year'].isin(temp)].reset_index()

In [12]:
vhome = list(df['home'].unique())
print(len(vhome))

vaway = list(df['away'].unique())
print(len(vaway))

vertice = set(vhome) | set(vaway)
print(len(vertice))
print(list(vertice))

48
48
48
['Santander', 'Real Madrid', 'Huesca', 'Ath Bilbao', 'Las Palmas', 'Numancia', 'Sp Gijon', 'Vallecano', 'Malaga', 'Ath Madrid', 'Tenerife', 'Merida', 'Valencia', 'Gimnastic', 'Elche', 'La Coruna', 'Zaragoza', 'Compostela', 'Hercules', 'Xerez', 'Betis', 'Villarreal', 'Cordoba', 'Levante', 'Albacete', 'Almeria', 'Celta', 'Recreativo', 'Murcia', 'Eibar', 'Oviedo', 'Granada', 'Extremadura', 'Mallorca', 'Villareal', 'Sociedad', 'Osasuna', 'Cadiz', 'Leganes', 'Barcelona', 'Salamanca', 'Valladolid', 'Sevilla', 'Alaves', 'Espanol', 'Getafe', 'Logrones', 'Girona']


In [13]:
df['home_win'] = 0.0
df['away_win'] = 0.0
df['draw'] = 0.0

df.loc[df['home_goals'] > df['away_goals'], 'home_win'] = 1.0
df.loc[df['home_goals'] < df['away_goals'], 'away_win'] = 1.0
df.loc[df['home_goals'] == df['away_goals'], 'draw'] = 1.0

df.head()

Unnamed: 0,year,home,away,home_goals,away_goals,home_win,away_win,draw
0,1995-96,La Coruna,Valencia,3,0,1.0,0.0,0.0
1,1995-96,Sp Gijon,Albacete,3,0,1.0,0.0,0.0
2,1995-96,Ath Bilbao,Santander,4,0,1.0,0.0,0.0
3,1995-96,Ath Madrid,Sociedad,4,1,1.0,0.0,0.0
4,1995-96,Celta,Compostela,0,1,0.0,1.0,0.0


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10883 entries, 0 to 10882
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   year        10883 non-null  object 
 1   home        10883 non-null  object 
 2   away        10883 non-null  object 
 3   home_goals  10883 non-null  int64  
 4   away_goals  10883 non-null  int64  
 5   home_win    10883 non-null  float64
 6   away_win    10883 non-null  float64
 7   draw        10883 non-null  float64
dtypes: float64(3), int64(2), object(3)
memory usage: 680.3+ KB


In [15]:
df2 = df.groupby(
    ['home', 'away'],
    as_index=False,
    sort=False).agg({'home_goals':'sum', 'away_goals':'sum', 'home_win':'sum', 'away_win':'sum', 'draw' : 'sum'})

df2.head(20)

Unnamed: 0,home,away,home_goals,away_goals,home_win,away_win,draw
0,La Coruna,Valencia,27,27,8.0,9.0,4.0
1,Sp Gijon,Albacete,3,0,1.0,0.0,0.0
2,Ath Bilbao,Santander,32,17,9.0,1.0,6.0
3,Ath Madrid,Sociedad,46,18,14.0,2.0,7.0
4,Celta,Compostela,4,6,0.0,2.0,1.0
5,Espanol,Salamanca,10,1,3.0,0.0,0.0
6,Merida,Betis,2,4,0.0,1.0,1.0
7,Sevilla,Tenerife,7,2,3.0,1.0,0.0
8,Valladolid,Barcelona,16,30,4.0,12.0,2.0
9,Vallecano,Real Madrid,14,29,3.0,11.0,0.0


In [16]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1651 entries, 0 to 1650
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   home        1651 non-null   object 
 1   away        1651 non-null   object 
 2   home_goals  1651 non-null   int64  
 3   away_goals  1651 non-null   int64  
 4   home_win    1651 non-null   float64
 5   away_win    1651 non-null   float64
 6   draw        1651 non-null   float64
dtypes: float64(3), int64(2), object(2)
memory usage: 90.4+ KB


In [17]:
lista = []
for keys, infos in df2.iterrows():
    time1 = infos.home
    time2 = infos.away
    t = df2.loc[(df2['home'] == time2) & (df2['away'] == time1)][['home','away']]
    
    if len(t) == 0:
        lista.append(time1)
        print(time2,time1)

print(len(lista))

Girona Granada
1


In [18]:
time1 = "Girona"
time2 = "Granada"

t = df2.loc[(df2['home'] == time1) & (df2['away'] == time2)][['home','away','home_goals','away_goals','home_win', 'away_win', 'draw']]
print(t.values)

t = df2.loc[(df2['home'] == time2) & (df2['away'] == time1)][['home','away','home_goals','away_goals','home_win', 'away_win', 'draw']]
print(t.values)

[]
[['Granada' 'Girona' 2 4 0.0 1.0 0.0]]


In [19]:
df2.head(20)

Unnamed: 0,home,away,home_goals,away_goals,home_win,away_win,draw
0,La Coruna,Valencia,27,27,8.0,9.0,4.0
1,Sp Gijon,Albacete,3,0,1.0,0.0,0.0
2,Ath Bilbao,Santander,32,17,9.0,1.0,6.0
3,Ath Madrid,Sociedad,46,18,14.0,2.0,7.0
4,Celta,Compostela,4,6,0.0,2.0,1.0
5,Espanol,Salamanca,10,1,3.0,0.0,0.0
6,Merida,Betis,2,4,0.0,1.0,1.0
7,Sevilla,Tenerife,7,2,3.0,1.0,0.0
8,Valladolid,Barcelona,16,30,4.0,12.0,2.0
9,Vallecano,Real Madrid,14,29,3.0,11.0,0.0


In [20]:
df2.tail(20)

Unnamed: 0,home,away,home_goals,away_goals,home_win,away_win,draw
1631,Elche,Alaves,3,3,1.0,1.0,0.0
1632,Cadiz,Elche,5,4,1.0,1.0,1.0
1633,Levante,Cadiz,2,4,0.0,1.0,1.0
1634,Vallecano,Cadiz,8,2,2.0,0.0,0.0
1635,Mallorca,Elche,2,3,0.0,1.0,1.0
1636,Cadiz,Vallecano,3,0,2.0,0.0,1.0
1637,Elche,Mallorca,4,1,1.0,0.0,1.0
1638,Mallorca,Girona,1,1,0.0,0.0,1.0
1639,Girona,Cadiz,1,1,0.0,0.0,1.0
1640,Almeria,Girona,3,2,1.0,0.0,0.0


In [21]:
df2["weight1"] = 0.0
df2["weight2"] = 0.0

In [22]:
for keys, infos in df2.iterrows():
    time1 = infos.home
    time2 = infos.away

    t1 =  df2.loc[(df2['home'] == time1) & (df2['away'] == time2)][['away_goals','away_win','draw']].values
    t2 =  df2.loc[(df2['away'] == time1) & (df2['home'] == time2)][['home_goals','home_win','draw']].values

    g1 = p1 = 0
    for i1 in t1:
        g1 = i1[0]
        p1 = 3*i1[1] + 1*i1[2]

    g2 = p2 = 0
    for i2 in t2:
        g2 = i2[0]
        p2 = 3*i2[1] + 1*i2[2]

    df2.loc[keys,'weight1'] = g1 + g2
    df2.loc[keys,'weight2'] = p1 + p2

In [23]:
df2.head(20)

Unnamed: 0,home,away,home_goals,away_goals,home_win,away_win,draw,weight1,weight2
0,La Coruna,Valencia,27,27,8.0,9.0,4.0,64.0,73.0
1,Sp Gijon,Albacete,3,0,1.0,0.0,0.0,1.0,0.0
2,Ath Bilbao,Santander,32,17,9.0,1.0,6.0,38.0,25.0
3,Ath Madrid,Sociedad,46,18,14.0,2.0,7.0,52.0,52.0
4,Celta,Compostela,4,6,0.0,2.0,1.0,9.0,12.0
5,Espanol,Salamanca,10,1,3.0,0.0,0.0,7.0,4.0
6,Merida,Betis,2,4,0.0,1.0,1.0,6.0,7.0
7,Sevilla,Tenerife,7,2,3.0,1.0,0.0,8.0,7.0
8,Valladolid,Barcelona,16,30,4.0,12.0,2.0,82.0,83.0
9,Vallecano,Real Madrid,14,29,3.0,11.0,0.0,73.0,70.0


In [24]:
df2.tail(20)

Unnamed: 0,home,away,home_goals,away_goals,home_win,away_win,draw,weight1,weight2
1631,Elche,Alaves,3,3,1.0,1.0,0.0,4.0,6.0
1632,Cadiz,Elche,5,4,1.0,1.0,1.0,9.0,9.0
1633,Levante,Cadiz,2,4,0.0,1.0,1.0,7.0,6.0
1634,Vallecano,Cadiz,8,2,2.0,0.0,0.0,5.0,7.0
1635,Mallorca,Elche,2,3,0.0,1.0,1.0,7.0,8.0
1636,Cadiz,Vallecano,3,0,2.0,0.0,1.0,8.0,7.0
1637,Elche,Mallorca,4,1,1.0,0.0,1.0,3.0,2.0
1638,Mallorca,Girona,1,1,0.0,0.0,1.0,8.0,7.0
1639,Girona,Cadiz,1,1,0.0,0.0,1.0,3.0,4.0
1640,Almeria,Girona,3,2,1.0,0.0,0.0,8.0,3.0


In [25]:
for keys, infos in df2.iterrows():
    time1 = infos.home
    time2 = infos.away

    t2 = df2.loc[(df2['home'] == time2) & (df2['away'] == time1)]
    
    if len(t2) == 0:
        t1 =  df2.loc[(df2['home'] == time1) & (df2['away'] == time2)][['home_goals','home_win','draw']].values

        w1 = w2 = 0
        for i1 in t1:
            w1 = i1[0]
            w2 = 3*i1[1] + 1*i1[2]

        new = [time2, time1,np.nan,np.nan,0,0,0,w1,w2]

        new = pd.DataFrame(columns=df2.columns, data=[new])
        
        df2 = pd.concat([df2, new], axis=0)

In [26]:
df2.head(20)

Unnamed: 0,home,away,home_goals,away_goals,home_win,away_win,draw,weight1,weight2
0,La Coruna,Valencia,27.0,27.0,8.0,9.0,4.0,64.0,73.0
1,Sp Gijon,Albacete,3.0,0.0,1.0,0.0,0.0,1.0,0.0
2,Ath Bilbao,Santander,32.0,17.0,9.0,1.0,6.0,38.0,25.0
3,Ath Madrid,Sociedad,46.0,18.0,14.0,2.0,7.0,52.0,52.0
4,Celta,Compostela,4.0,6.0,0.0,2.0,1.0,9.0,12.0
5,Espanol,Salamanca,10.0,1.0,3.0,0.0,0.0,7.0,4.0
6,Merida,Betis,2.0,4.0,0.0,1.0,1.0,6.0,7.0
7,Sevilla,Tenerife,7.0,2.0,3.0,1.0,0.0,8.0,7.0
8,Valladolid,Barcelona,16.0,30.0,4.0,12.0,2.0,82.0,83.0
9,Vallecano,Real Madrid,14.0,29.0,3.0,11.0,0.0,73.0,70.0


In [27]:
df2.tail(20)

Unnamed: 0,home,away,home_goals,away_goals,home_win,away_win,draw,weight1,weight2
1632,Cadiz,Elche,5.0,4.0,1.0,1.0,1.0,9.0,9.0
1633,Levante,Cadiz,2.0,4.0,0.0,1.0,1.0,7.0,6.0
1634,Vallecano,Cadiz,8.0,2.0,2.0,0.0,0.0,5.0,7.0
1635,Mallorca,Elche,2.0,3.0,0.0,1.0,1.0,7.0,8.0
1636,Cadiz,Vallecano,3.0,0.0,2.0,0.0,1.0,8.0,7.0
1637,Elche,Mallorca,4.0,1.0,1.0,0.0,1.0,3.0,2.0
1638,Mallorca,Girona,1.0,1.0,0.0,0.0,1.0,8.0,7.0
1639,Girona,Cadiz,1.0,1.0,0.0,0.0,1.0,3.0,4.0
1640,Almeria,Girona,3.0,2.0,1.0,0.0,0.0,8.0,3.0
1641,Girona,Osasuna,1.0,1.0,0.0,0.0,1.0,3.0,4.0


In [28]:
time1 = "Girona"
time2 = "Granada"

t = df2.loc[(df2['home'] == time1) & (df2['away'] == time2)][['home','away','home_goals','away_goals','home_win', 'away_win', 'draw','weight1','weight2']]
print(t.values)

t = df2.loc[(df2['home'] == time2) & (df2['away'] == time1)][['home','away','home_goals','away_goals','home_win', 'away_win', 'draw','weight1','weight2']]
print(t.values)

[['Girona' 'Granada' nan nan 0.0 0.0 0.0 2.0 0.0]]
[['Granada' 'Girona' 2.0 4.0 0.0 1.0 0.0 4.0 3.0]]


In [30]:
df3 = df2[['home','away','weight1','weight2']]

df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1652 entries, 0 to 0
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   home     1652 non-null   object 
 1   away     1652 non-null   object 
 2   weight1  1652 non-null   float64
 3   weight2  1652 non-null   float64
dtypes: float64(2), object(2)
memory usage: 64.5+ KB


In [31]:
df3.head(20)

Unnamed: 0,home,away,weight1,weight2
0,La Coruna,Valencia,64.0,73.0
1,Sp Gijon,Albacete,1.0,0.0
2,Ath Bilbao,Santander,38.0,25.0
3,Ath Madrid,Sociedad,52.0,52.0
4,Celta,Compostela,9.0,12.0
5,Espanol,Salamanca,7.0,4.0
6,Merida,Betis,6.0,7.0
7,Sevilla,Tenerife,8.0,7.0
8,Valladolid,Barcelona,82.0,83.0
9,Vallecano,Real Madrid,73.0,70.0


In [32]:
df3.tail(20)

Unnamed: 0,home,away,weight1,weight2
1632,Cadiz,Elche,9.0,9.0
1633,Levante,Cadiz,7.0,6.0
1634,Vallecano,Cadiz,5.0,7.0
1635,Mallorca,Elche,7.0,8.0
1636,Cadiz,Vallecano,8.0,7.0
1637,Elche,Mallorca,3.0,2.0
1638,Mallorca,Girona,8.0,7.0
1639,Girona,Cadiz,3.0,4.0
1640,Almeria,Girona,8.0,3.0
1641,Girona,Osasuna,3.0,4.0


In [34]:
df3.to_csv('../data/work1/laliga_1995_2023.csv', index=False, sep=';')