## Notebook for creating the training table 

In [1]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
with sqlite3.connect("../laliga.sqlite") as conn:
    df = pd.read_sql("SELECT * FROM Matches", con = conn)

df = df.loc[-df.score.isnull()]

In [3]:
def parse_score(score_as_string):
        if score_as_string is None: 
            return None
        home_goals, away_goals = score_as_string.split(":")
        home_goals = int(home_goals)
        away_goals = int(away_goals)
        if home_goals > away_goals: 
            return "1"
        if away_goals > home_goals: 
            return "2" 
        else: 
            return "X"

In [4]:
def identity_col(col):
    return col

def id_int(col):
    return col.astype(int)

In [5]:
def result(row): 
    if row["W"] == 1 : 
        return "W"
    if row["L"] == 1 :
        return "L"
    if row["T"] == 1: 
        return "T"
    else: 
        return "NaN" 

def opposite_result(rows):
    if rows[10] == 1 : 
        rows['L_away'] = 1
        rows['T_away'] = 0
        rows['W_away'] = 0
    if rows[11] == 1 :
        rows['T_away'] = 1
        rows['L_away'] = 0
        rows['W_away'] = 0
    if rows[12] == 1: 
        rows['W_away'] = 1
        rows['T_away'] = 0
        rows['L_away'] = 0
    return rows 

In [6]:
df['result'] = df['score'].apply(parse_score)
df[['home_goals','away_goals']] = df['score'].str.split(":",expand=True)
df = df.astype({"home_goals": int, "away_goals": int})
df['Difference_goals'] = abs(df['home_goals'] - df['away_goals'])

# We decided to create a unique id_match to avoid repetitions
index_match = []
for i in range(len(df)):
    index_match.append(i)

df["id_match"] = index_match

In [8]:
# Information from point of view of home team
df4 = df[['season','division','home_team','result','matchday','home_goals','away_team','away_goals','id_match','date','time']]
MatchdayStanding = df4.groupby(['season','division','matchday','home_team']).agg(
    GF = ('home_goals', sum),
    Away_team = ('away_team',identity_col),
    date = ('date',identity_col),
    time = ('time',identity_col),
    index_match = ('id_match', identity_col),
    GA = ('away_goals', sum),
    W = ('result', lambda x: x.eq('1').sum()),
    L = ('result', lambda x: x.eq('2').sum()),
    T = ('result', lambda x: x.eq('X').sum()),
).reset_index()
MatchdayStanding.rename({'home_team':'team'}, axis=1, inplace=True)
MatchdayStanding.sort_values(['season','division','team','matchday'])

# Information from point of view of away team
df4_1 = df[["season","division","away_team","result",'matchday',"home_goals",'home_team',"away_goals",'id_match']]
df4_1 = df4_1.groupby(["season",'division','matchday',"away_team"], as_index=False).agg(
    GF = ("away_goals", sum),
    GA = ("home_goals", sum),
    index_match = ('id_match', identity_col),
    Away_team = ('home_team',identity_col),
    W_away = ('result', lambda x: x.eq('2').sum()),
    L_away = ('result', lambda x: x.eq('1').sum()),
    T_away = ('result', lambda x: x.eq('X').sum())
)
df4_1.sort_values(['season','division','away_team','matchday'])
df4_1.rename({'away_team': 'team', 'W_away': 'W', 'L_away': 'L', 'T_away': 'T'}, axis=1, inplace=True) 

# Adding information together and computing new features
MatchdayStanding = MatchdayStanding.append(df4_1).sort_values(['season','division','matchday'])
MatchdayStanding.reset_index()
MatchdayStanding.sort_values(['season','division','matchday'])
MatchdayStanding.reset_index()
MatchdayStanding['result_home'] = MatchdayStanding.apply(lambda row: result(row), axis=1)
MatchdayStanding = MatchdayStanding.apply(opposite_result, axis=1)
MatchdayStanding['GD_home'] = MatchdayStanding['GF'] - MatchdayStanding['GA']

# Cummulative results
for col in ['GD_home','GF','GA','W','T','L','W_away','T_away','L_away']:    
    MatchdayStanding[col] = MatchdayStanding.groupby(["season",'division','team'], as_index=False)[col].cumsum()

# Points at each matchday
MatchdayStanding['Pts_home'] = 3*MatchdayStanding['W'] + MatchdayStanding['T']
MatchdayStanding['Pts_away'] = 3*MatchdayStanding['W_away'] + MatchdayStanding['T_away']
MatchdayStanding['Pts_difference'] = MatchdayStanding['Pts_home'] - MatchdayStanding['Pts_away']

# Raking by matchday
team_count_home = MatchdayStanding.groupby(['season','division','matchday'])['team'].count().tolist()
rank_column_home = []
for i in team_count_home :
    j = list(range(1,i+1,1))
    rank_column_home += j
    
MatchdayStanding = MatchdayStanding.sort_values(['season','division','matchday', 'Pts_home', 'GD_home', 'GF'], ascending=[True,True, True, False, False, False])
MatchdayStanding['home_rank'] = rank_column_home

In [10]:
# Dropping unnecessary features
MatchdayStanding = MatchdayStanding.drop(['date','time','W','W_away','L','L_away','T','T_away','GA','GF','Pts_home','Pts_away'], axis=1)

# We only want the Gol Difference from previous results
MatchdayStanding['GD_home'] = MatchdayStanding.groupby(['season','division','team'])['GD_home'].shift()
   
# Deleting match repetitions
MatchdayStanding = MatchdayStanding.sort_values(['index_match'], ascending=[True])
MatchdayStanding = MatchdayStanding.iloc[::2]
MatchdayStanding = MatchdayStanding.drop(['index_match'], axis=1)

In [18]:
# We set to all Nan values to 0
MatchdayStanding['GD_home'] = MatchdayStanding['GD_home'].fillna(0)

In [19]:
MatchdayStanding.loc[(MatchdayStanding['season'] == '2000-2001') & (MatchdayStanding['division'] == 1) & (MatchdayStanding['matchday'] == 1)]

Unnamed: 0,Away_team,division,matchday,result_home,season,team,GD_home,Pts_difference,home_rank
30260,Real Sociedad,1,1,T,2000-2001,Racing,0.0,-2.0,10
30263,Espanyol,1,1,L,2000-2001,Real Zaragoza,0.0,-1.0,13
30259,Barcelona,1,1,L,2000-2001,Málaga CF,0.0,-1.0,14
30256,Dep. La Coruña,1,1,L,2000-2001,Athletic,0.0,-1.0,18
30261,Valencia,1,1,W,2000-2001,Real Madrid,0.0,3.0,6
30260,Real Valladolid,1,1,T,2000-2001,RCD Mallorca,0.0,-2.0,11
30262,CD Numancia,1,1,L,2000-2001,Real Oviedo,0.0,-1.0,16
30257,Celta de Vigo,1,1,L,2000-2001,CA Osasuna,0.0,-1.0,17
30264,Alavés,1,1,L,2000-2001,UD Las Palmas,0.0,-1.0,19
30265,Rayo Vallecano,1,1,L,2000-2001,Villarreal,0.0,-1.0,20


In [21]:
import openpyxl

# Save as an excel file
MatchdayStanding.to_csv(r'..\reports\Definitive_ML_Data.csv')