In [1]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
with sqlite3.connect("../laliga.sqlite") as conn:
    df = pd.read_sql("SELECT * FROM Matches", con = conn)

df = df.loc[-df.score.isnull()]


In [3]:
def parse_score(score_as_string):
        if score_as_string is None: 
            return None
        home_goals, away_goals = score_as_string.split(":")
        home_goals = int(home_goals)
        away_goals = int(away_goals)
        if home_goals > away_goals: 
            return "1"
        if away_goals > home_goals: 
            return "2" 
        else: 
            return "X"
        

In [4]:
df['result'] = df['score'].apply(parse_score)

In [5]:
df[['home_goals','away_goals']] = df['score'].str.split(":",expand=True)
df = df.astype({"home_goals": int, "away_goals": int})

In [6]:
df['Difference_goals'] = abs(df['home_goals'] - df['away_goals'])

In [62]:
# We collect the information from the point of view of the home team and the away team, then we put all together
df3 = df[['season','division','home_team','result','home_goals','away_goals']]
SeasonStanding = df3.groupby(['season','division','home_team']).agg(
    GF_home = ('home_goals', sum),
    GA_home = ('away_goals', sum),
    W = ('result', lambda x: x.eq('1').sum()),
    L = ('result', lambda x: x.eq('2').sum()),
    T = ('result', lambda x: x.eq('X').sum())

).reset_index()
SeasonStanding.rename({'home_team':'team'}, axis=1, inplace=True)
SeasonStanding.sort_values(['season','division','team'])

df3_1 = df[["season","division","away_team","result","home_goals","away_goals"]]
df3_1 = df3_1.groupby(["season",'division',"away_team"], as_index=False).agg(
    GF_away = ("away_goals", sum),
    GA_away = ("home_goals", sum),
    W_away = ('result', lambda x: x.eq('2').sum()),
    L_away = ('result', lambda x: x.eq('1').sum()),
    T_away = ('result', lambda x: x.eq('X').sum())
)
df3_1.sort_values(['season','division','away_team'])

SeasonStanding['GF_away'] = df3_1['GF_away']
SeasonStanding['GA_away'] = df3_1['GA_away']
SeasonStanding['W_away'] = df3_1['W_away']
SeasonStanding['L_away'] = df3_1['L_away']
SeasonStanding['T_away'] = df3_1['T_away']

# Creating all the features
SeasonStanding['W'] = SeasonStanding['W'] + SeasonStanding['W_away']
SeasonStanding['L'] = SeasonStanding['L'] + SeasonStanding['L_away']
SeasonStanding['T'] = SeasonStanding['T'] + SeasonStanding['T_away']
SeasonStanding['GF'] = SeasonStanding['GF_home'] + SeasonStanding['GF_away']
SeasonStanding['GA'] = SeasonStanding['GA_home'] + SeasonStanding['GA_away']
SeasonStanding['GD'] = SeasonStanding['GF'] - SeasonStanding['GA']
SeasonStanding = SeasonStanding.drop(['GF_home', 'GF_away', 'GA_home', 'GA_away', 'W_away', 'L_away', 'T_away'], axis=1)
SeasonStanding['Pts'] = 3*SeasonStanding['W'] + SeasonStanding['T']

# Creating the rank for each season
team_count = SeasonStanding.groupby(['season','division'])['team'].count().tolist()
rank_column = []
for i in team_count :
    j = list(range(1,i+1,1))
    rank_column += j

SeasonStanding = SeasonStanding.sort_values(['season','division', 'Pts', 'GD', 'GF'], ascending=[True,True, False, False, False])
SeasonStanding['rank'] = rank_column



In [63]:
# Splitting season to compare it later
SeasonStanding['season2'] = SeasonStanding['season']
SeasonStanding = SeasonStanding.astype({"season": str})
SeasonStanding[['season','season2']] = SeasonStanding['season'].str.split("-",expand=True)
SeasonStanding = SeasonStanding.astype({"season": int})
SeasonStanding = SeasonStanding.drop('season2', axis=1)

Unnamed: 0,season,division,team,W,L,T,GF,GA,GD,Pts,rank
3,1928,1,Barcelona,11,4,3,37,23,14,36,1
8,1928,1,Real Madrid,11,6,1,40,27,13,34,2
1,1928,1,Athletic,8,6,4,43,33,10,28,3
5,1928,1,Donostia,8,6,4,46,41,5,28,4
0,1928,1,Arenas Club,8,7,3,32,39,-7,27,5
2,1928,1,Athletic Madrid,8,8,2,43,41,2,26,6
6,1928,1,Espanyol,7,7,4,32,38,-6,25,7
4,1928,1,Catalunya,6,8,4,45,49,-4,22,8
9,1928,1,Real Unión,5,11,2,40,42,-2,17,9
7,1928,1,Racing,3,12,3,25,50,-25,12,10


In [7]:
def isNaN(obj):
    return obj != obj

In [8]:
def last_results(row, row_1, row_2, row_3, row_4, row_5, row_6, row_7, row_8, row_9, row_10):
    if isNaN(row_1):
        return [row]
    elif isNaN(row_2):
        return [row, row_1]
    elif isNaN(row_3):
        return [row, row_1, row_2]
    elif isNaN(row_4):
        return [row, row_1, row_2, row_3]
    elif isNaN(row_5):
        return [row, row_1, row_2, row_3, row_4]
    elif isNaN(row_6):
        return [row, row_1, row_2, row_3, row_4, row_5]
    elif isNaN(row_7):
        return [row, row_1, row_2, row_3, row_4, row_5, row_6]
    elif isNaN(row_8):
        return [row, row_1, row_2, row_3, row_4, row_5, row_6, row_7]
    elif isNaN(row_9):
        return [row, row_1, row_2, row_3, row_4, row_5, row_6, row_7, row_8]
    elif isNaN(row_10):
        return [row, row_1, row_2, row_3, row_4, row_5, row_6, row_7, row_8, row_9] 
    else:
        return [row,row_1,row_2,row_3,row_4, row_5, row_6, row_7, row_8, row_9, row_10]

In [9]:
def identity_col(col):
    return col

In [10]:
def result(row):
    if row['W']==1:
        return 'W'
    elif row['L']==1:
        return 'L'
    elif row['T']==1:
        return 'T'
    else:
        return 'Fail'

# DATA
Datos que tiene que tener la tabla, se agrupa por temporada, division, jornada y equipo. El pilar principal es el equipo y las caracteristicas son:
$$
\begin{itemize}
    \item Equipo local si o no. // Yo creo que no haria falta    --HECHO--
    \item Equipo adversario.  --HECHO--
    \item Resultado del partido, W,L o T.  --HECHO--
    \item GD del partido.  --HECHO--
    \item GD acomulada de la jornada.  --HECHO--
    \item Ranking de la jornada  --HECHO--
    \item Ultimos rankings temporadas previas
    \item Ultimos resultados partidos  --MEDIO HECHO--
\end{itemize}
$$

In [130]:
df4 = df[['season','division','home_team','result','matchday','home_goals','away_team','away_goals']]
MatchdayStanding = df4.groupby(['season','division','matchday','home_team']).agg(
    GF = ('home_goals', sum),
    Away_team = ('away_team',identity_col),
    GA = ('away_goals', sum),
    W = ('result', lambda x: x.eq('1').sum()),
    L = ('result', lambda x: x.eq('2').sum()),
    T = ('result', lambda x: x.eq('X').sum()),
).reset_index()
MatchdayStanding.rename({'home_team':'team'}, axis=1, inplace=True)
MatchdayStanding.sort_values(['season','division','team','matchday'])

df4_1 = df[["season","division","away_team","result",'matchday',"home_goals",'home_team',"away_goals"]]
df4_1 = df4_1.groupby(["season",'division','matchday',"away_team"], as_index=False).agg(
    GF = ("away_goals", sum),
    GA = ("home_goals", sum),
    Away_team = ('home_team',identity_col),
    W_away = ('result', lambda x: x.eq('2').sum()),
    L_away = ('result', lambda x: x.eq('1').sum()),
    T_away = ('result', lambda x: x.eq('X').sum())
)
df4_1.sort_values(['season','division','away_team','matchday'])
df4_1.rename({'away_team': 'team', 'W_away': 'W', 'L_away': 'L', 'T_away': 'T'}, axis=1, inplace=True) 

MatchdayStanding = MatchdayStanding.append(df4_1).sort_values(['season','division','matchday'])
MatchdayStanding.reset_index()
MatchdayStanding.sort_values(['season','division','matchday'])
MatchdayStanding.reset_index()
MatchdayStanding['result'] = MatchdayStanding.apply(lambda row: result(row), axis=1)


MatchdayStanding['GD'] = MatchdayStanding['GF'] - MatchdayStanding['GA']

# Cummulative results
MatchdayStanding['GF'] = MatchdayStanding.groupby(["season",'division','team'], as_index=False)['GF'].cumsum()
MatchdayStanding['GA'] = MatchdayStanding.groupby(["season",'division','team'], as_index=False)['GA'].cumsum()
MatchdayStanding['W'] = MatchdayStanding.groupby(["season",'division','team'], as_index=False)['W'].cumsum()
MatchdayStanding['L'] = MatchdayStanding.groupby(["season",'division','team'], as_index=False)['L'].cumsum()
MatchdayStanding['T'] = MatchdayStanding.groupby(["season",'division','team'], as_index=False)['T'].cumsum()


MatchdayStanding['GD_cum'] = MatchdayStanding['GF'] - MatchdayStanding['GA']
MatchdayStanding['Pts'] = 3*MatchdayStanding['W'] + MatchdayStanding['T']

# Raking by matchday
team_count = MatchdayStanding.groupby(['season','division','matchday'])['team'].count().tolist()
rank_column = []
for i in team_count :
    j = list(range(1,i+1,1))
    rank_column += j

MatchdayStanding = MatchdayStanding.sort_values(['season','division','matchday', 'Pts', 'GD', 'GF'], ascending=[True,True, True, False, False, False])
MatchdayStanding['rank'] = rank_column

# Shifted rows
MatchdayStanding = MatchdayStanding.sort_values(['season','division','team'], ascending=[True,True,True])
MatchdayStanding['result1'] = MatchdayStanding.groupby(['season','division','team'])['result'].shift()
MatchdayStanding['result2'] = MatchdayStanding.groupby(['season','division','team'])['result1'].shift()
MatchdayStanding['result3'] = MatchdayStanding.groupby(['season','division','team'])['result2'].shift()
MatchdayStanding['result4'] = MatchdayStanding.groupby(['season','division','team'])['result3'].shift()
MatchdayStanding['result5'] = MatchdayStanding.groupby(['season','division','team'])['result4'].shift()
MatchdayStanding['result6'] = MatchdayStanding.groupby(['season','division','team'])['result5'].shift()
MatchdayStanding['result7'] = MatchdayStanding.groupby(['season','division','team'])['result6'].shift()
MatchdayStanding['result8'] = MatchdayStanding.groupby(['season','division','team'])['result7'].shift()
MatchdayStanding['result9'] = MatchdayStanding.groupby(['season','division','team'])['result8'].shift()
MatchdayStanding['result10'] = MatchdayStanding.groupby(['season','division','team'])['result9'].shift()

MatchdayStanding['last_results'] = MatchdayStanding.apply(lambda row: last_results(row['result'],row['result1'],row['result2'],row['result3'],row['result4'],row['result5'],row['result6'],row['result7'],row['result8'],row['result9'],row['result10']), axis=1)

MatchdayStanding = MatchdayStanding.drop(['GF','Pts','GA','result1','result2','result3','result4','result5','result6','result7','result8','result9','result10'], axis=1)


In [131]:
# Splitting season to compare it later
MatchdayStanding['season2'] = MatchdayStanding['season']
MatchdayStanding = MatchdayStanding.astype({"season": str})
MatchdayStanding[['season','season2']] = MatchdayStanding['season'].str.split("-",expand=True)
MatchdayStanding = MatchdayStanding.astype({"season": int})
MatchdayStanding = MatchdayStanding.drop('season2', axis=1)

In [132]:
rank_col = []
for name, group in MatchdayStanding.groupby(['season','division','matchday','team','GD']):    
        concrete_rank = SeasonStanding['rank'].loc[(SeasonStanding['season'] <= name[0]) & (SeasonStanding['team'] == name[3])]
        rank_col.append(concrete_rank.values)

In [133]:
MatchdayStanding.sort_values(['season','division','matchday','team','GD'])
MatchdayStanding['last_ranks'] = rank_col
MatchdayStanding.loc[(MatchdayStanding['season']==2002)&(MatchdayStanding['division']==1)&(MatchdayStanding['team']=='Barcelona')]

Unnamed: 0,season,division,matchday,team,Away_team,W,L,T,result,GD,GD_cum,rank,last_results,last_ranks
31940,2002,1,1,Barcelona,Atlético Madrid,0,0,1,T,0,0,8,[T],"[1, 14, 13, 4, 6, 3, 13, 1, 8, 5, 10, 10, 13, ..."
31949,2002,1,2,Barcelona,Athletic,1,0,1,W,2,2,6,"[W, T]","[10, 4, 3, 10, 10, 8, 9, 12, 10, 9, 8, 8, 15, ..."
31960,2002,1,3,Barcelona,Espanyol,2,0,1,W,2,4,3,"[W, W, T]","[13, 18, 15, 15, 13, 8, 4, 13, 6, 19, 12, 21, ..."
31969,2002,1,4,Barcelona,Real Betis,2,1,1,L,-3,1,9,"[L, W, W, T]","[9, 5, 10, 2, 4, 7, 2, 3, 10, 1, 5, 5, 8, 9, 2..."
31980,2002,1,5,Barcelona,CA Osasuna,2,1,2,T,0,1,8,"[T, L, W, W, T]","[8, 9, 9, 8, 8, 7, 5, 1, 8, 1, 6, 6, 1, 2, 2, ..."
31989,2002,1,6,Barcelona,Real Valladolid,2,2,2,L,-1,0,12,"[L, T, L, W, W, T]","[16, 17, 12, 10, 12, 15, 11, 4, 18, 3, 7, 15, 15]"
32000,2002,1,7,Barcelona,Alavés,3,2,2,W,5,5,8,"[W, L, T, L, W, W, T]","[9, 9, 10, 10, 14, 5, 7, 14, 14, 8, 16, 15, 10..."
32009,2002,1,8,Barcelona,Racing,3,2,3,T,0,5,8,"[T, W, L, T, L, W, W, T]","[3, 1, 1, 2, 2, 1, 4, 1, 3, 2, 6, 1, 9, 3, 3, ..."
32021,2002,1,9,Barcelona,Villarreal,4,2,3,W,1,6,6,"[W, T, W, L, T, L, W, W, T]","[3, 4, 1, 1, 4, 7, 11, 8, 5, 5, 2, 6, 5, 2, 3,..."
32030,2002,1,10,Barcelona,Dep. La Coruña,4,3,3,L,-2,4,10,"[L, W, T, W, L, T, L, W, W, T]","[1, 2, 4, 3, 4, 7, 6, 5, 10, 3, 12, 3, 7, 2, 2..."


In [115]:
concrete_rank = SeasonStanding['rank'].loc[(SeasonStanding['season'] <= 2000) & (SeasonStanding['team'] == 'Barcelona')]

In [123]:
concrete_rank.to_numpy()
concrete_rank

3       1
13      2
23      4
33      3
43      4
       ..
1696    2
1738    1
1781    1
1823    2
1864    4
Name: rank, Length: 70, dtype: int64