In [1]:
import pandas as pd
import numpy as np
import sqlite3

In [2]:
con = sqlite3.connect("./database.sqlite")

In [3]:
teams = pd.read_sql_query("SELECT * FROM Team", con=con)
teams_attributes = pd.read_sql_query("SELECT * FROM Team_Attributes", con=con)

In [4]:
teams

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB
...,...,...,...,...,...
294,49479,10190,898.0,FC St. Gallen,GAL
295,49837,10191,1715.0,FC Thun,THU
296,50201,9777,324.0,Servette FC,SER
297,50204,7730,1862.0,FC Lausanne-Sports,LAU


Drop some columns from 'teams' dataframe that are not necessary for our analysis.

In [5]:
teams.drop(columns=["id", "team_fifa_api_id", "team_short_name"], inplace=True)
teams

Unnamed: 0,team_api_id,team_long_name
0,9987,KRC Genk
1,9993,Beerschot AC
2,10000,SV Zulte-Waregem
3,9994,Sporting Lokeren
4,9984,KSV Cercle Brugge
...,...,...
294,10190,FC St. Gallen
295,10191,FC Thun
296,9777,Servette FC
297,7730,FC Lausanne-Sports


Check the NA values and uniqueness for this dataset

In [6]:
teams.isna().sum()

team_api_id       0
team_long_name    0
dtype: int64

With this we verify that we are working with a dataset with non-null values.

Let's check the uniqueness of the values. For this dataset it is important to verify the uniqueness since we are working with IDs and obviously all of them should be different in the first place.

In [7]:
teams.apply(lambda x: print(f"'{x.name}' is unique: {x.is_unique}"))

'team_api_id' is unique: True
'team_long_name' is unique: False


team_api_id       None
team_long_name    None
dtype: object

Given the results we can see the IDs are different but we have duplicated team's names, which means some teams were stored with different IDs. Let's check that out!

In [8]:
teams.loc[teams["team_long_name"].duplicated(), :]

Unnamed: 0,team_api_id,team_long_name
24,274581,Royal Excel Mouscron
183,8020,Polonia Bytom
199,8024,Widzew Łódź


We've found the teams that are duplicated. Let's analyze their data in the 'teams_attributes' dataframe.

First, we're going to add a column with the team's name to 'teams_attributes' dataframe.

In [9]:
teams_attributes

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22 00:00:00,60,Balanced,,Little,50,Mixed,...,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19 00:00:00,52,Balanced,48.0,Normal,56,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10 00:00:00,47,Balanced,41.0,Normal,54,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,4,77,8485,2010-02-22 00:00:00,70,Fast,,Little,70,Long,...,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,5,77,8485,2011-02-22 00:00:00,47,Balanced,,Little,52,Mixed,...,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,1454,15005,10000,2011-02-22 00:00:00,52,Balanced,,Little,52,Mixed,...,53,Normal,Organised,46,Medium,48,Press,53,Normal,Cover
1454,1455,15005,10000,2012-02-22 00:00:00,54,Balanced,,Little,51,Mixed,...,50,Normal,Organised,44,Medium,55,Press,53,Normal,Cover
1455,1456,15005,10000,2013-09-20 00:00:00,54,Balanced,,Little,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover
1456,1457,15005,10000,2014-09-19 00:00:00,54,Balanced,42.0,Normal,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover


In [10]:
teams_attributes = pd.merge(
    teams,
    teams_attributes,
    on=["team_api_id"],
)
teams_attributes

Unnamed: 0,team_api_id,team_long_name,id,team_fifa_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,9987,KRC Genk,485,673,2010-02-22 00:00:00,45,Balanced,,Little,45,...,60,Normal,Organised,70,High,65,Press,70,Wide,Cover
1,9987,KRC Genk,486,673,2011-02-22 00:00:00,66,Balanced,,Little,52,...,51,Normal,Organised,48,Medium,47,Press,54,Normal,Offside Trap
2,9987,KRC Genk,487,673,2012-02-22 00:00:00,53,Balanced,,Little,55,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
3,9987,KRC Genk,488,673,2013-09-20 00:00:00,58,Balanced,,Little,38,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
4,9987,KRC Genk,489,673,2014-09-19 00:00:00,58,Balanced,52.0,Normal,38,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,9777,Servette FC,1156,324,2013-09-20 00:00:00,48,Balanced,,Little,51,...,45,Normal,Organised,49,Medium,45,Press,49,Normal,Cover
1454,7730,FC Lausanne-Sports,672,1862,2010-02-22 00:00:00,30,Slow,,Little,60,...,60,Normal,Organised,55,Medium,60,Press,50,Normal,Cover
1455,7730,FC Lausanne-Sports,673,1862,2012-02-22 00:00:00,37,Balanced,,Little,49,...,48,Normal,Organised,43,Medium,43,Press,55,Normal,Cover
1456,7730,FC Lausanne-Sports,674,1862,2013-09-20 00:00:00,51,Balanced,,Little,49,...,48,Normal,Organised,43,Medium,43,Press,55,Normal,Cover


We can rename the column 'team_long_name' to just 'team' for easy management.

In [11]:
teams_attributes.rename(columns={"team_long_name": "team"}, inplace=True)
teams_attributes

Unnamed: 0,team_api_id,team,id,team_fifa_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,9987,KRC Genk,485,673,2010-02-22 00:00:00,45,Balanced,,Little,45,...,60,Normal,Organised,70,High,65,Press,70,Wide,Cover
1,9987,KRC Genk,486,673,2011-02-22 00:00:00,66,Balanced,,Little,52,...,51,Normal,Organised,48,Medium,47,Press,54,Normal,Offside Trap
2,9987,KRC Genk,487,673,2012-02-22 00:00:00,53,Balanced,,Little,55,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
3,9987,KRC Genk,488,673,2013-09-20 00:00:00,58,Balanced,,Little,38,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
4,9987,KRC Genk,489,673,2014-09-19 00:00:00,58,Balanced,52.0,Normal,38,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,9777,Servette FC,1156,324,2013-09-20 00:00:00,48,Balanced,,Little,51,...,45,Normal,Organised,49,Medium,45,Press,49,Normal,Cover
1454,7730,FC Lausanne-Sports,672,1862,2010-02-22 00:00:00,30,Slow,,Little,60,...,60,Normal,Organised,55,Medium,60,Press,50,Normal,Cover
1455,7730,FC Lausanne-Sports,673,1862,2012-02-22 00:00:00,37,Balanced,,Little,49,...,48,Normal,Organised,43,Medium,43,Press,55,Normal,Cover
1456,7730,FC Lausanne-Sports,674,1862,2013-09-20 00:00:00,51,Balanced,,Little,49,...,48,Normal,Organised,43,Medium,43,Press,55,Normal,Cover


In order to make easy to analyze data, first, we can check the cleaness of the 'teams_attributes' dataset.
We can start removing some columns like 'id' and 'team_fifa_api_id' because these columns do not contain relevant information.

In [12]:
teams_attributes.drop(columns=["id", "team_fifa_api_id"], inplace=True)
teams_attributes

Unnamed: 0,team_api_id,team,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,9987,KRC Genk,2010-02-22 00:00:00,45,Balanced,,Little,45,Mixed,Organised,...,60,Normal,Organised,70,High,65,Press,70,Wide,Cover
1,9987,KRC Genk,2011-02-22 00:00:00,66,Balanced,,Little,52,Mixed,Organised,...,51,Normal,Organised,48,Medium,47,Press,54,Normal,Offside Trap
2,9987,KRC Genk,2012-02-22 00:00:00,53,Balanced,,Little,55,Mixed,Organised,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
3,9987,KRC Genk,2013-09-20 00:00:00,58,Balanced,,Little,38,Mixed,Organised,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
4,9987,KRC Genk,2014-09-19 00:00:00,58,Balanced,52.0,Normal,38,Mixed,Organised,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,9777,Servette FC,2013-09-20 00:00:00,48,Balanced,,Little,51,Mixed,Organised,...,45,Normal,Organised,49,Medium,45,Press,49,Normal,Cover
1454,7730,FC Lausanne-Sports,2010-02-22 00:00:00,30,Slow,,Little,60,Mixed,Organised,...,60,Normal,Organised,55,Medium,60,Press,50,Normal,Cover
1455,7730,FC Lausanne-Sports,2012-02-22 00:00:00,37,Balanced,,Little,49,Mixed,Organised,...,48,Normal,Organised,43,Medium,43,Press,55,Normal,Cover
1456,7730,FC Lausanne-Sports,2013-09-20 00:00:00,51,Balanced,,Little,49,Mixed,Organised,...,48,Normal,Organised,43,Medium,43,Press,55,Normal,Cover


Let's check the null values of each column.

In [13]:
teams_attributes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1458 entries, 0 to 1457
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   team_api_id                     1458 non-null   int64  
 1   team                            1458 non-null   object 
 2   date                            1458 non-null   object 
 3   buildUpPlaySpeed                1458 non-null   int64  
 4   buildUpPlaySpeedClass           1458 non-null   object 
 5   buildUpPlayDribbling            489 non-null    float64
 6   buildUpPlayDribblingClass       1458 non-null   object 
 7   buildUpPlayPassing              1458 non-null   int64  
 8   buildUpPlayPassingClass         1458 non-null   object 
 9   buildUpPlayPositioningClass     1458 non-null   object 
 10  chanceCreationPassing           1458 non-null   int64  
 11  chanceCreationPassingClass      1458 non-null   object 
 12  chanceCreationCrossing          14

After displaying 'teams_attributes' dataframe info we can see column 'buildUpPlayDribbling' has just 489 non-null values, so there is not enough information in that particular column and it will be dismissed in the analysis.

In [14]:
teams_attributes.drop(columns=["buildUpPlayDribbling"], inplace=True)
teams_attributes.shape

(1458, 23)

In [15]:
teams_attributes

Unnamed: 0,team_api_id,team,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,chanceCreationPassing,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,9987,KRC Genk,2010-02-22 00:00:00,45,Balanced,Little,45,Mixed,Organised,50,...,60,Normal,Organised,70,High,65,Press,70,Wide,Cover
1,9987,KRC Genk,2011-02-22 00:00:00,66,Balanced,Little,52,Mixed,Organised,65,...,51,Normal,Organised,48,Medium,47,Press,54,Normal,Offside Trap
2,9987,KRC Genk,2012-02-22 00:00:00,53,Balanced,Little,55,Mixed,Organised,55,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
3,9987,KRC Genk,2013-09-20 00:00:00,58,Balanced,Little,38,Mixed,Organised,67,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
4,9987,KRC Genk,2014-09-19 00:00:00,58,Balanced,Normal,38,Mixed,Organised,67,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,9777,Servette FC,2013-09-20 00:00:00,48,Balanced,Little,51,Mixed,Organised,53,...,45,Normal,Organised,49,Medium,45,Press,49,Normal,Cover
1454,7730,FC Lausanne-Sports,2010-02-22 00:00:00,30,Slow,Little,60,Mixed,Organised,40,...,60,Normal,Organised,55,Medium,60,Press,50,Normal,Cover
1455,7730,FC Lausanne-Sports,2012-02-22 00:00:00,37,Balanced,Little,49,Mixed,Organised,52,...,48,Normal,Organised,43,Medium,43,Press,55,Normal,Cover
1456,7730,FC Lausanne-Sports,2013-09-20 00:00:00,51,Balanced,Little,49,Mixed,Organised,52,...,48,Normal,Organised,43,Medium,43,Press,55,Normal,Cover


Now, go back to the analysis of the duplicated teams' names. The first team found duplicated was 'Royal Excel Mouscron'.

In [16]:
teams_attributes[teams_attributes["team"] == "Royal Excel Mouscron"]

Unnamed: 0,team_api_id,team,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,chanceCreationPassing,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
68,9996,Royal Excel Mouscron,2015-09-10 00:00:00,50,Balanced,Normal,50,Mixed,Organised,50,...,50,Normal,Organised,45,Medium,45,Press,50,Normal,Cover
69,9996,Royal Excel Mouscron,2015-09-10 00:00:00,50,Balanced,Normal,50,Mixed,Organised,50,...,50,Normal,Organised,45,Medium,45,Press,50,Normal,Cover
96,274581,Royal Excel Mouscron,2015-09-10 00:00:00,50,Balanced,Normal,50,Mixed,Organised,50,...,50,Normal,Organised,45,Medium,45,Press,50,Normal,Cover


Verify if the whole three rows have the same values except witht the column 'team_api_id':

In [17]:
teams_attributes \
    .loc[:, teams_attributes.columns[1:]] \
        .loc[teams_attributes["team"] == "Royal Excel Mouscron"] \
            .duplicated()

68    False
69     True
96     True
dtype: bool

Let's remove the rows that are duplicated except for the column 'team_api_id'.

In [18]:
teams_attributes.drop_duplicates(subset=teams_attributes.columns[1:], inplace=True)
teams_attributes

Unnamed: 0,team_api_id,team,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,chanceCreationPassing,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,9987,KRC Genk,2010-02-22 00:00:00,45,Balanced,Little,45,Mixed,Organised,50,...,60,Normal,Organised,70,High,65,Press,70,Wide,Cover
1,9987,KRC Genk,2011-02-22 00:00:00,66,Balanced,Little,52,Mixed,Organised,65,...,51,Normal,Organised,48,Medium,47,Press,54,Normal,Offside Trap
2,9987,KRC Genk,2012-02-22 00:00:00,53,Balanced,Little,55,Mixed,Organised,55,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
3,9987,KRC Genk,2013-09-20 00:00:00,58,Balanced,Little,38,Mixed,Organised,67,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
4,9987,KRC Genk,2014-09-19 00:00:00,58,Balanced,Normal,38,Mixed,Organised,67,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,9777,Servette FC,2013-09-20 00:00:00,48,Balanced,Little,51,Mixed,Organised,53,...,45,Normal,Organised,49,Medium,45,Press,49,Normal,Cover
1454,7730,FC Lausanne-Sports,2010-02-22 00:00:00,30,Slow,Little,60,Mixed,Organised,40,...,60,Normal,Organised,55,Medium,60,Press,50,Normal,Cover
1455,7730,FC Lausanne-Sports,2012-02-22 00:00:00,37,Balanced,Little,49,Mixed,Organised,52,...,48,Normal,Organised,43,Medium,43,Press,55,Normal,Cover
1456,7730,FC Lausanne-Sports,2013-09-20 00:00:00,51,Balanced,Little,49,Mixed,Organised,52,...,48,Normal,Organised,43,Medium,43,Press,55,Normal,Cover


We verify that we have the same unique teams' IDs and teams' names.

In [19]:
teams_attributes["team_api_id"].nunique()

285

In [20]:
teams_attributes["team"].nunique()

285

In [21]:
print("mean observations of attributes:", teams_attributes["team"].value_counts().mean())
print("max observations of attributes:", teams_attributes["team"].value_counts().max())
print("min observations of attributes:", teams_attributes["team"].value_counts().min())

mean observations of attributes: 5.087719298245614
max observations of attributes: 6
min observations of attributes: 1


In [22]:
teams_attributes["date"] = pd.to_datetime(teams_attributes["date"], format="%Y-%m-%d")
teams_attributes.dtypes

team_api_id                                int64
team                                      object
date                              datetime64[ns]
buildUpPlaySpeed                           int64
buildUpPlaySpeedClass                     object
buildUpPlayDribblingClass                 object
buildUpPlayPassing                         int64
buildUpPlayPassingClass                   object
buildUpPlayPositioningClass               object
chanceCreationPassing                      int64
chanceCreationPassingClass                object
chanceCreationCrossing                     int64
chanceCreationCrossingClass               object
chanceCreationShooting                     int64
chanceCreationShootingClass               object
chanceCreationPositioningClass            object
defencePressure                            int64
defencePressureClass                      object
defenceAggression                          int64
defenceAggressionClass                    object
defenceTeamWidth    

For what period of time we have attributes of teams?

In [23]:
teams_attributes["date"].min(), teams_attributes["date"].max()

(Timestamp('2010-02-22 00:00:00'), Timestamp('2015-09-10 00:00:00'))

In [24]:
matches = pd.read_csv("./matches.csv")
matches

Unnamed: 0,season,date,home_team_goal,away_team_goal,country,league,home_team,away_team
0,2008/2009,2008-08-17,1,1,Belgium,Belgium Jupiler League,KRC Genk,Beerschot AC
1,2008/2009,2008-08-16,0,0,Belgium,Belgium Jupiler League,SV Zulte-Waregem,Sporting Lokeren
2,2008/2009,2008-08-16,0,3,Belgium,Belgium Jupiler League,KSV Cercle Brugge,RSC Anderlecht
3,2008/2009,2008-08-17,5,0,Belgium,Belgium Jupiler League,KAA Gent,RAEC Mons
4,2008/2009,2008-08-16,1,3,Belgium,Belgium Jupiler League,FCV Dender EH,Standard de Liège
...,...,...,...,...,...,...,...,...
25974,2015/2016,2015-09-22,1,0,Switzerland,Switzerland Super League,FC St. Gallen,FC Thun
25975,2015/2016,2015-09-23,1,2,Switzerland,Switzerland Super League,FC Vaduz,FC Luzern
25976,2015/2016,2015-09-23,2,0,Switzerland,Switzerland Super League,Grasshopper Club Zürich,FC Sion
25977,2015/2016,2015-09-22,0,0,Switzerland,Switzerland Super League,Lugano,FC Zürich


In [25]:
matches["date"] = pd.to_datetime(matches["date"], format="%Y-%m-%d")
seasons = matches.groupby(["season", "country"])["date"].agg(["min", "max"]).reset_index()
seasons

Unnamed: 0,season,country,min,max
0,2008/2009,Belgium,2008-08-16,2009-05-16
1,2008/2009,England,2008-08-16,2009-05-24
2,2008/2009,France,2008-08-09,2009-05-30
3,2008/2009,Germany,2008-08-15,2009-05-23
4,2008/2009,Italy,2008-08-30,2009-05-31
...,...,...,...,...
83,2015/2016,Poland,2015-07-17,2016-04-09
84,2015/2016,Portugal,2015-08-14,2016-05-15
85,2015/2016,Scotland,2015-08-01,2016-05-15
86,2015/2016,Spain,2015-08-21,2016-05-15


Read the table from csv to get the seasons and countries we have available.
<!-- Read the table from csv and remove those teams that did not played the 8 seasons. -->

In [26]:
table = pd.read_csv("./table.csv")
table

Unnamed: 0,season,team,country,drawn,lost,won,played,for,against,gd,points
0,2008/2009,1. FC Köln,Germany,6.0,17.0,11.0,34.0,35,50,-15,39.0
1,2008/2009,AC Bellinzona,Switzerland,10.0,15.0,11.0,36.0,44,51,-7,43.0
2,2008/2009,ADO Den Haag,Netherlands,8.0,18.0,8.0,34.0,41,58,-17,32.0
3,2008/2009,AJ Auxerre,France,7.0,15.0,16.0,38.0,35,35,0,55.0
4,2008/2009,AS Monaco,France,12.0,15.0,11.0,38.0,41,45,-4,45.0
...,...,...,...,...,...,...,...,...,...,...,...
1473,2015/2016,West Ham United,England,14.0,8.0,16.0,38.0,65,51,14,62.0
1474,2015/2016,Willem II,Netherlands,11.0,17.0,6.0,34.0,35,53,-18,29.0
1475,2015/2016,Wisła Kraków,Poland,13.0,9.0,8.0,30.0,45,35,10,37.0
1476,2015/2016,Zagłębie Lubin,Poland,9.0,9.0,12.0,30.0,41,37,4,45.0


In [27]:
teams_labels = table["team"].unique()
teams_attributes = teams_attributes[teams_attributes["team"].isin(teams_labels)]
teams_attributes.reset_index(drop=True, inplace=True)
teams_attributes

Unnamed: 0,team_api_id,team,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,chanceCreationPassing,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,9987,KRC Genk,2010-02-22,45,Balanced,Little,45,Mixed,Organised,50,...,60,Normal,Organised,70,High,65,Press,70,Wide,Cover
1,9987,KRC Genk,2011-02-22,66,Balanced,Little,52,Mixed,Organised,65,...,51,Normal,Organised,48,Medium,47,Press,54,Normal,Offside Trap
2,9987,KRC Genk,2012-02-22,53,Balanced,Little,55,Mixed,Organised,55,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
3,9987,KRC Genk,2013-09-20,58,Balanced,Little,38,Mixed,Organised,67,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
4,9987,KRC Genk,2014-09-19,58,Balanced,Normal,38,Mixed,Organised,67,...,56,Normal,Organised,47,Medium,45,Press,55,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1445,9777,Servette FC,2013-09-20,48,Balanced,Little,51,Mixed,Organised,53,...,45,Normal,Organised,49,Medium,45,Press,49,Normal,Cover
1446,7730,FC Lausanne-Sports,2010-02-22,30,Slow,Little,60,Mixed,Organised,40,...,60,Normal,Organised,55,Medium,60,Press,50,Normal,Cover
1447,7730,FC Lausanne-Sports,2012-02-22,37,Balanced,Little,49,Mixed,Organised,52,...,48,Normal,Organised,43,Medium,43,Press,55,Normal,Cover
1448,7730,FC Lausanne-Sports,2013-09-20,51,Balanced,Little,49,Mixed,Organised,52,...,48,Normal,Organised,43,Medium,43,Press,55,Normal,Cover


In [28]:
table["team"].nunique()

296

In [29]:
teams_attributes["team"].nunique()

285

In [30]:
teams_attributes = pd.merge(
    table[["team", "country"]].drop_duplicates().reset_index(drop=True),
    teams_attributes,
    on=["team"],
)
teams_attributes

Unnamed: 0,team,country,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1. FC Köln,Germany,8722,2010-02-22,55,Balanced,Little,65,Mixed,Organised,...,60,Normal,Organised,45,Medium,55,Press,70,Wide,Cover
1,1. FC Köln,Germany,8722,2011-02-22,58,Balanced,Little,71,Long,Organised,...,50,Normal,Organised,40,Medium,40,Press,56,Normal,Cover
2,1. FC Köln,Germany,8722,2012-02-22,53,Balanced,Little,53,Mixed,Organised,...,57,Normal,Organised,43,Medium,55,Press,67,Wide,Cover
3,1. FC Köln,Germany,8722,2013-09-20,60,Balanced,Little,56,Mixed,Organised,...,57,Normal,Organised,43,Medium,55,Press,67,Wide,Cover
4,1. FC Köln,Germany,8722,2014-09-19,59,Balanced,Normal,57,Mixed,Organised,...,59,Normal,Organised,57,Medium,46,Press,54,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1445,Watford,England,9817,2011-02-22,64,Balanced,Little,53,Mixed,Organised,...,48,Normal,Organised,44,Medium,47,Press,66,Normal,Cover
1446,Watford,England,9817,2012-02-22,66,Balanced,Little,66,Mixed,Organised,...,60,Normal,Organised,48,Medium,50,Press,45,Normal,Cover
1447,Watford,England,9817,2013-09-20,67,Fast,Little,45,Mixed,Organised,...,53,Normal,Free Form,38,Medium,52,Press,45,Normal,Cover
1448,Watford,England,9817,2014-09-19,61,Balanced,Normal,56,Mixed,Organised,...,55,Normal,Free Form,38,Medium,52,Press,45,Normal,Cover


Verify that in this merge operation we don't have null values.

In [31]:
teams_attributes.isna().sum()

team                              0
country                           0
team_api_id                       0
date                              0
buildUpPlaySpeed                  0
buildUpPlaySpeedClass             0
buildUpPlayDribblingClass         0
buildUpPlayPassing                0
buildUpPlayPassingClass           0
buildUpPlayPositioningClass       0
chanceCreationPassing             0
chanceCreationPassingClass        0
chanceCreationCrossing            0
chanceCreationCrossingClass       0
chanceCreationShooting            0
chanceCreationShootingClass       0
chanceCreationPositioningClass    0
defencePressure                   0
defencePressureClass              0
defenceAggression                 0
defenceAggressionClass            0
defenceTeamWidth                  0
defenceTeamWidthClass             0
defenceDefenderLineClass          0
dtype: int64

In [32]:
def get_season(country, date):
    mask = (seasons["min"] <= date) & (date <= seasons["max"]) & (seasons["country"] == country)
    season = seasons.loc[mask, "season"]
    return season.iloc[0] if len(season) else np.nan

In [33]:
teams_attributes["season"] = teams_attributes.apply(lambda x: get_season(x["country"], x["date"]), axis=1)
teams_attributes

Unnamed: 0,team,country,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,...,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass,season
0,1. FC Köln,Germany,8722,2010-02-22,55,Balanced,Little,65,Mixed,Organised,...,Normal,Organised,45,Medium,55,Press,70,Wide,Cover,2009/2010
1,1. FC Köln,Germany,8722,2011-02-22,58,Balanced,Little,71,Long,Organised,...,Normal,Organised,40,Medium,40,Press,56,Normal,Cover,2010/2011
2,1. FC Köln,Germany,8722,2012-02-22,53,Balanced,Little,53,Mixed,Organised,...,Normal,Organised,43,Medium,55,Press,67,Wide,Cover,2011/2012
3,1. FC Köln,Germany,8722,2013-09-20,60,Balanced,Little,56,Mixed,Organised,...,Normal,Organised,43,Medium,55,Press,67,Wide,Cover,2013/2014
4,1. FC Köln,Germany,8722,2014-09-19,59,Balanced,Normal,57,Mixed,Organised,...,Normal,Organised,57,Medium,46,Press,54,Normal,Cover,2014/2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1445,Watford,England,9817,2011-02-22,64,Balanced,Little,53,Mixed,Organised,...,Normal,Organised,44,Medium,47,Press,66,Normal,Cover,2010/2011
1446,Watford,England,9817,2012-02-22,66,Balanced,Little,66,Mixed,Organised,...,Normal,Organised,48,Medium,50,Press,45,Normal,Cover,2011/2012
1447,Watford,England,9817,2013-09-20,67,Fast,Little,45,Mixed,Organised,...,Normal,Free Form,38,Medium,52,Press,45,Normal,Cover,2013/2014
1448,Watford,England,9817,2014-09-19,61,Balanced,Normal,56,Mixed,Organised,...,Normal,Free Form,38,Medium,52,Press,45,Normal,Cover,2014/2015


In [34]:
teams_attributes.isna().sum()

team                               0
country                            0
team_api_id                        0
date                               0
buildUpPlaySpeed                   0
buildUpPlaySpeedClass              0
buildUpPlayDribblingClass          0
buildUpPlayPassing                 0
buildUpPlayPassingClass            0
buildUpPlayPositioningClass        0
chanceCreationPassing              0
chanceCreationPassingClass         0
chanceCreationCrossing             0
chanceCreationCrossingClass        0
chanceCreationShooting             0
chanceCreationShootingClass        0
chanceCreationPositioningClass     0
defencePressure                    0
defencePressureClass               0
defenceAggression                  0
defenceAggressionClass             0
defenceTeamWidth                   0
defenceTeamWidthClass              0
defenceDefenderLineClass           0
season                            16
dtype: int64

In [35]:
teams_attributes[teams_attributes["season"].isna()]

Unnamed: 0,team,country,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,...,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass,season
109,Beerschot AC,Belgium,9993,2013-09-20,47,Balanced,Little,39,Mixed,Organised,...,Normal,Organised,45,Medium,46,Press,67,Wide,Cover,
200,Club Brugge KV,Belgium,8342,2013-09-20,45,Balanced,Little,48,Mixed,Organised,...,Normal,Organised,43,Medium,59,Press,45,Normal,Cover,
449,KAA Gent,Belgium,9991,2013-09-20,50,Balanced,Little,37,Mixed,Organised,...,Normal,Free Form,50,Medium,53,Press,40,Normal,Cover,
455,KRC Genk,Belgium,9987,2013-09-20,58,Balanced,Little,38,Mixed,Organised,...,Normal,Organised,47,Medium,45,Press,55,Normal,Cover,
461,KSV Cercle Brugge,Belgium,9984,2013-09-20,53,Balanced,Little,40,Mixed,Organised,...,Normal,Free Form,59,Medium,47,Press,62,Normal,Cover,
468,KV Kortrijk,Belgium,8571,2013-09-20,42,Balanced,Little,40,Mixed,Organised,...,Little,Organised,46,Medium,45,Press,52,Normal,Cover,
474,KV Mechelen,Belgium,8203,2013-09-20,52,Balanced,Little,33,Short,Organised,...,Normal,Free Form,42,Medium,60,Press,52,Normal,Cover,
660,RAEC Mons,Belgium,9998,2013-09-20,50,Balanced,Little,50,Mixed,Organised,...,Normal,Organised,46,Medium,45,Press,47,Normal,Cover,
689,RSC Anderlecht,Belgium,8635,2013-09-20,52,Balanced,Little,41,Mixed,Organised,...,Normal,Organised,60,Medium,43,Press,65,Normal,Cover,
789,SV Zulte-Waregem,Belgium,10000,2013-09-20,54,Balanced,Little,51,Mixed,Organised,...,Little,Organised,44,Medium,58,Press,37,Normal,Cover,


In [36]:
teams_attributes.fillna(value={"season": "2013/2014"}, inplace=True)
teams_attributes.isna().sum()

team                              0
country                           0
team_api_id                       0
date                              0
buildUpPlaySpeed                  0
buildUpPlaySpeedClass             0
buildUpPlayDribblingClass         0
buildUpPlayPassing                0
buildUpPlayPassingClass           0
buildUpPlayPositioningClass       0
chanceCreationPassing             0
chanceCreationPassingClass        0
chanceCreationCrossing            0
chanceCreationCrossingClass       0
chanceCreationShooting            0
chanceCreationShootingClass       0
chanceCreationPositioningClass    0
defencePressure                   0
defencePressureClass              0
defenceAggression                 0
defenceAggressionClass            0
defenceTeamWidth                  0
defenceTeamWidthClass             0
defenceDefenderLineClass          0
season                            0
dtype: int64

In [37]:
teams_attributes = pd.merge(
    table,
    teams_attributes,
    on=["team", "season", "country"],
    how="right",
)
teams_attributes

Unnamed: 0,season,team,country,drawn,lost,won,played,for,against,gd,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,2009/2010,1. FC Köln,Germany,11.0,14.0,9.0,34.0,33.0,42.0,-9.0,...,60,Normal,Organised,45,Medium,55,Press,70,Wide,Cover
1,2010/2011,1. FC Köln,Germany,5.0,16.0,13.0,34.0,47.0,62.0,-15.0,...,50,Normal,Organised,40,Medium,40,Press,56,Normal,Cover
2,2011/2012,1. FC Köln,Germany,6.0,20.0,8.0,34.0,39.0,75.0,-36.0,...,57,Normal,Organised,43,Medium,55,Press,67,Wide,Cover
3,2013/2014,1. FC Köln,Germany,,,,,,,,...,57,Normal,Organised,43,Medium,55,Press,67,Wide,Cover
4,2014/2015,1. FC Köln,Germany,13.0,12.0,9.0,34.0,34.0,40.0,-6.0,...,59,Normal,Organised,57,Medium,46,Press,54,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1445,2010/2011,Watford,England,,,,,,,,...,48,Normal,Organised,44,Medium,47,Press,66,Normal,Cover
1446,2011/2012,Watford,England,,,,,,,,...,60,Normal,Organised,48,Medium,50,Press,45,Normal,Cover
1447,2013/2014,Watford,England,,,,,,,,...,53,Normal,Free Form,38,Medium,52,Press,45,Normal,Cover
1448,2014/2015,Watford,England,,,,,,,,...,55,Normal,Free Form,38,Medium,52,Press,45,Normal,Cover


We can see that there are some null values after the merge, and that is because not all teams played in all seasons. So, we do not have data regarding points, goals, etc.

In [38]:
teams_attributes.dropna(inplace=True)
teams_attributes

Unnamed: 0,season,team,country,drawn,lost,won,played,for,against,gd,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,2009/2010,1. FC Köln,Germany,11.0,14.0,9.0,34.0,33.0,42.0,-9.0,...,60,Normal,Organised,45,Medium,55,Press,70,Wide,Cover
1,2010/2011,1. FC Köln,Germany,5.0,16.0,13.0,34.0,47.0,62.0,-15.0,...,50,Normal,Organised,40,Medium,40,Press,56,Normal,Cover
2,2011/2012,1. FC Köln,Germany,6.0,20.0,8.0,34.0,39.0,75.0,-36.0,...,57,Normal,Organised,43,Medium,55,Press,67,Wide,Cover
4,2014/2015,1. FC Köln,Germany,13.0,12.0,9.0,34.0,34.0,40.0,-6.0,...,59,Normal,Organised,57,Medium,46,Press,54,Normal,Cover
5,2015/2016,1. FC Köln,Germany,13.0,11.0,10.0,34.0,38.0,42.0,-4.0,...,71,Lots,Organised,42,Medium,56,Press,51,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1434,2015/2016,Frosinone,Italy,7.0,23.0,8.0,38.0,35.0,76.0,-41.0,...,50,Normal,Organised,55,Medium,57,Press,47,Normal,Cover
1436,2015/2016,GFC Ajaccio,France,13.0,17.0,8.0,38.0,37.0,58.0,-21.0,...,55,Normal,Organised,42,Medium,46,Press,53,Normal,Cover
1437,2015/2016,SV Darmstadt 98,Germany,11.0,14.0,9.0,34.0,38.0,53.0,-15.0,...,45,Normal,Organised,32,Deep,56,Press,41,Normal,Cover
1443,2015/2016,UD Las Palmas,Spain,8.0,18.0,12.0,38.0,45.0,53.0,-8.0,...,53,Normal,Organised,53,Medium,39,Press,57,Normal,Cover


In [39]:
teams_attributes.isna().sum()

season                            0
team                              0
country                           0
drawn                             0
lost                              0
won                               0
played                            0
for                               0
against                           0
gd                                0
points                            0
team_api_id                       0
date                              0
buildUpPlaySpeed                  0
buildUpPlaySpeedClass             0
buildUpPlayDribblingClass         0
buildUpPlayPassing                0
buildUpPlayPassingClass           0
buildUpPlayPositioningClass       0
chanceCreationPassing             0
chanceCreationPassingClass        0
chanceCreationCrossing            0
chanceCreationCrossingClass       0
chanceCreationShooting            0
chanceCreationShootingClass       0
chanceCreationPositioningClass    0
defencePressure                   0
defencePressureClass        

In [40]:
teams_attributes.shape

(1066, 33)

In [41]:
len(teams_attributes.columns)

33

In [42]:
teams_attributes.dtypes

season                                    object
team                                      object
country                                   object
drawn                                    float64
lost                                     float64
won                                      float64
played                                   float64
for                                      float64
against                                  float64
gd                                       float64
points                                   float64
team_api_id                                int64
date                              datetime64[ns]
buildUpPlaySpeed                           int64
buildUpPlaySpeedClass                     object
buildUpPlayDribblingClass                 object
buildUpPlayPassing                         int64
buildUpPlayPassingClass                   object
buildUpPlayPositioningClass               object
chanceCreationPassing                      int64
chanceCreationPassin

In [56]:
teams_attributes["buildUpPlaySpeedClass"].value_counts()

Balanced    841
Fast        139
Slow         86
Name: buildUpPlaySpeedClass, dtype: int64

In [57]:
teams_attributes["buildUpPlayDribblingClass"].value_counts()

Little    735
Normal    312
Lots       19
Name: buildUpPlayDribblingClass, dtype: int64

In [58]:
teams_attributes["buildUpPlayPassingClass"].value_counts()

Mixed    897
Short    102
Long      67
Name: buildUpPlayPassingClass, dtype: int64

In [59]:
teams_attributes["buildUpPlayPositioningClass"].value_counts()

Organised    996
Free Form     70
Name: buildUpPlayPositioningClass, dtype: int64

In [60]:
teams_attributes["chanceCreationPassingClass"].value_counts()

Normal    896
Risky     129
Safe       41
Name: chanceCreationPassingClass, dtype: int64

In [61]:
teams_attributes["chanceCreationCrossingClass"].value_counts()

Normal    873
Lots      155
Little     38
Name: chanceCreationCrossingClass, dtype: int64

In [62]:
teams_attributes["chanceCreationShootingClass"].value_counts()

Normal    877
Lots      157
Little     32
Name: chanceCreationShootingClass, dtype: int64

In [63]:
teams_attributes["chanceCreationPositioningClass"].value_counts()

Organised    935
Free Form    131
Name: chanceCreationPositioningClass, dtype: int64

In [64]:
teams_attributes["defencePressureClass"].value_counts()

Medium    910
Deep      110
High       46
Name: defencePressureClass, dtype: int64

In [65]:
teams_attributes["defenceAggressionClass"].value_counts()

Press      924
Double      76
Contain     66
Name: defenceAggressionClass, dtype: int64

In [66]:
teams_attributes["defenceTeamWidthClass"].value_counts()

Normal    940
Wide       82
Narrow     44
Name: defenceTeamWidthClass, dtype: int64

In [67]:
teams_attributes["defenceDefenderLineClass"].value_counts()

Cover           989
Offside Trap     77
Name: defenceDefenderLineClass, dtype: int64

In [68]:
numerical_features = [
    "buildUpPlaySpeed",
    "buildUpPlayPassing",
    "chanceCreationPassing",
    "chanceCreationCrossing",
    "chanceCreationShooting",
    "defencePressure",
    "defenceAggression",
    "defenceTeamWidth",
]

First, in order to tackle the question 'what team attributes lead to the most victories?' we can face this as a classification problem, which means what attributes predict the if a team wins or not a match.

We have 'matches' dataframe and 'team_attributes' dataframe. The observations for 'matches' dataframe are per match and the observations for 'team_attributes' are per season, so we'll merge the data per season.

In [69]:
numerical_attributes = teams_attributes[["season", "team", "country"] + numerical_features]
numerical_attributes

Unnamed: 0,season,team,country,buildUpPlaySpeed,buildUpPlayPassing,chanceCreationPassing,chanceCreationCrossing,chanceCreationShooting,defencePressure,defenceAggression,defenceTeamWidth
0,2009/2010,1. FC Köln,Germany,55,65,65,40,60,45,55,70
1,2010/2011,1. FC Köln,Germany,58,71,42,39,50,40,40,56
2,2011/2012,1. FC Köln,Germany,53,53,57,44,57,43,55,67
4,2014/2015,1. FC Köln,Germany,59,57,53,47,59,57,46,54
5,2015/2016,1. FC Köln,Germany,59,65,57,36,71,42,56,51
...,...,...,...,...,...,...,...,...,...,...,...
1434,2015/2016,Frosinone,Italy,63,46,51,50,50,55,57,47
1436,2015/2016,GFC Ajaccio,France,48,52,55,53,55,42,46,53
1437,2015/2016,SV Darmstadt 98,Germany,59,77,52,77,45,32,56,41
1443,2015/2016,UD Las Palmas,Spain,48,44,53,42,53,53,39,57


In [70]:
teams_attributes["team"].nunique(), matches["home_team"].nunique()

(271, 296)

In [71]:
set(numerical_attributes["team"].unique()).issubset(set(matches["home_team"].unique()))

True

In [72]:
condlist = [
    np.array(matches["home_team_goal"] > matches["away_team_goal"]).reshape(-1, 1),
    np.array(matches["home_team_goal"] < matches["away_team_goal"]).reshape(-1, 1),
]

choicelist = [[3,0], [0, 3]]
points = pd.DataFrame(
    np.select(condlist=condlist, choicelist=choicelist, default=[1, 1]),
    columns=["home_team_points", "away_team_points"],
)
matches = pd.concat(
    [
        matches,
        points,
    ],
    axis=1,
)
matches

Unnamed: 0,season,date,home_team_goal,away_team_goal,country,league,home_team,away_team,home_team_points,away_team_points
0,2008/2009,2008-08-17,1,1,Belgium,Belgium Jupiler League,KRC Genk,Beerschot AC,1,1
1,2008/2009,2008-08-16,0,0,Belgium,Belgium Jupiler League,SV Zulte-Waregem,Sporting Lokeren,1,1
2,2008/2009,2008-08-16,0,3,Belgium,Belgium Jupiler League,KSV Cercle Brugge,RSC Anderlecht,0,3
3,2008/2009,2008-08-17,5,0,Belgium,Belgium Jupiler League,KAA Gent,RAEC Mons,3,0
4,2008/2009,2008-08-16,1,3,Belgium,Belgium Jupiler League,FCV Dender EH,Standard de Liège,0,3
...,...,...,...,...,...,...,...,...,...,...
25974,2015/2016,2015-09-22,1,0,Switzerland,Switzerland Super League,FC St. Gallen,FC Thun,3,0
25975,2015/2016,2015-09-23,1,2,Switzerland,Switzerland Super League,FC Vaduz,FC Luzern,0,3
25976,2015/2016,2015-09-23,2,0,Switzerland,Switzerland Super League,Grasshopper Club Zürich,FC Sion,3,0
25977,2015/2016,2015-09-22,0,0,Switzerland,Switzerland Super League,Lugano,FC Zürich,1,1


In [73]:
statistics = pd.concat(
    [
        matches[["country", "season", "home_team", "home_team_points"]].rename(columns={"home_team": "team", "home_team_points": "points"}),
        matches[["country", "season", "away_team", "away_team_points"]].rename(columns={"away_team": "team", "away_team_points": "points"}),
    ],
    ignore_index=True,
)
statistics

Unnamed: 0,country,season,team,points
0,Belgium,2008/2009,KRC Genk,1
1,Belgium,2008/2009,SV Zulte-Waregem,1
2,Belgium,2008/2009,KSV Cercle Brugge,0
3,Belgium,2008/2009,KAA Gent,3
4,Belgium,2008/2009,FCV Dender EH,0
...,...,...,...,...
51953,Switzerland,2015/2016,FC Thun,0
51954,Switzerland,2015/2016,FC Luzern,3
51955,Switzerland,2015/2016,FC Sion,0
51956,Switzerland,2015/2016,FC Zürich,1


Remove the teams in 'statistics' dataframe for which we do not have attributes.

In [74]:
teams_with_attributes = teams_attributes["team"].unique()
statistics = statistics[statistics["team"].isin(teams_with_attributes)]
statistics

Unnamed: 0,country,season,team,points
0,Belgium,2008/2009,KRC Genk,1
1,Belgium,2008/2009,SV Zulte-Waregem,1
2,Belgium,2008/2009,KSV Cercle Brugge,0
3,Belgium,2008/2009,KAA Gent,3
5,Belgium,2008/2009,KV Mechelen,1
...,...,...,...,...
51953,Switzerland,2015/2016,FC Thun,0
51954,Switzerland,2015/2016,FC Luzern,3
51955,Switzerland,2015/2016,FC Sion,0
51956,Switzerland,2015/2016,FC Zürich,1


In [75]:
set(statistics["team"].unique()) == set(teams_attributes["team"].unique())

True

In [76]:
statistics["points"].value_counts()

3    19214
0    18896
1    12988
Name: points, dtype: int64

In [77]:
statistics.replace(to_replace={"points": {3: "won", 1: "drawn", 0: "lost"}}, inplace=True)
statistics



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,country,season,team,points
0,Belgium,2008/2009,KRC Genk,drawn
1,Belgium,2008/2009,SV Zulte-Waregem,drawn
2,Belgium,2008/2009,KSV Cercle Brugge,lost
3,Belgium,2008/2009,KAA Gent,won
5,Belgium,2008/2009,KV Mechelen,drawn
...,...,...,...,...
51953,Switzerland,2015/2016,FC Thun,lost
51954,Switzerland,2015/2016,FC Luzern,won
51955,Switzerland,2015/2016,FC Sion,lost
51956,Switzerland,2015/2016,FC Zürich,drawn


In [78]:
statistics.rename(columns={"points": "game_status"}, inplace=True)
statistics



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,country,season,team,game_status
0,Belgium,2008/2009,KRC Genk,drawn
1,Belgium,2008/2009,SV Zulte-Waregem,drawn
2,Belgium,2008/2009,KSV Cercle Brugge,lost
3,Belgium,2008/2009,KAA Gent,won
5,Belgium,2008/2009,KV Mechelen,drawn
...,...,...,...,...
51953,Switzerland,2015/2016,FC Thun,lost
51954,Switzerland,2015/2016,FC Luzern,won
51955,Switzerland,2015/2016,FC Sion,lost
51956,Switzerland,2015/2016,FC Zürich,drawn


Merge 'statistics' with 'team_attributes'.

In [79]:
df = pd.merge(
    statistics,
    numerical_attributes,
    on=["season", "country", "team"],
)
df

Unnamed: 0,country,season,team,game_status,buildUpPlaySpeed,buildUpPlayPassing,chanceCreationPassing,chanceCreationCrossing,chanceCreationShooting,defencePressure,defenceAggression,defenceTeamWidth
0,Belgium,2009/2010,Standard de Liège,drawn,53,40,55,55,65,70,70,65
1,Belgium,2009/2010,Standard de Liège,won,53,40,55,55,65,70,70,65
2,Belgium,2009/2010,Standard de Liège,drawn,53,40,55,55,65,70,70,65
3,Belgium,2009/2010,Standard de Liège,won,53,40,55,55,65,70,70,65
4,Belgium,2009/2010,Standard de Liège,drawn,53,40,55,55,65,70,70,65
...,...,...,...,...,...,...,...,...,...,...,...,...
37533,Switzerland,2015/2016,BSC Young Boys,lost,52,64,39,66,46,44,34,50
37534,Switzerland,2015/2016,BSC Young Boys,won,52,64,39,66,46,44,34,50
37535,Switzerland,2015/2016,BSC Young Boys,won,52,64,39,66,46,44,34,50
37536,Switzerland,2015/2016,BSC Young Boys,won,52,64,39,66,46,44,34,50


In [80]:
df["game_status"].value_counts(dropna=False)

won      14138
lost     13909
drawn     9491
Name: game_status, dtype: int64

In [81]:
df["game_status"].value_counts(normalize=True) * 100.0

won      37.663168
lost     37.053120
drawn    25.283713
Name: game_status, dtype: float64

In [82]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import SVC

In [83]:
X = df.loc[:, numerical_features]
X

Unnamed: 0,buildUpPlaySpeed,buildUpPlayPassing,chanceCreationPassing,chanceCreationCrossing,chanceCreationShooting,defencePressure,defenceAggression,defenceTeamWidth
0,53,40,55,55,65,70,70,65
1,53,40,55,55,65,70,70,65
2,53,40,55,55,65,70,70,65
3,53,40,55,55,65,70,70,65
4,53,40,55,55,65,70,70,65
...,...,...,...,...,...,...,...,...
37533,52,64,39,66,46,44,34,50
37534,52,64,39,66,46,44,34,50
37535,52,64,39,66,46,44,34,50
37536,52,64,39,66,46,44,34,50


In [84]:
y = df["game_status"]
y

0        drawn
1          won
2        drawn
3          won
4        drawn
         ...  
37533     lost
37534      won
37535      won
37536      won
37537     lost
Name: game_status, Length: 37538, dtype: object

In [85]:
model1 = LogisticRegression(multi_class="ovr")
model1.fit(X, y)

In [None]:
from collections import Counter

In [86]:
predictions1 = model1.predict(X)
Counter(predictions1)

array(['won', 'won', 'won', ..., 'lost', 'lost', 'lost'], dtype=object)

In [87]:
model1.feature_names_in_

array(['buildUpPlaySpeed', 'buildUpPlayPassing', 'chanceCreationPassing',
       'chanceCreationCrossing', 'chanceCreationShooting',
       'defencePressure', 'defenceAggression', 'defenceTeamWidth'],
      dtype=object)

In [88]:
model2 = LogisticRegression()
ovr = OneVsRestClassifier(model2)
ovr.fit(X, y)

In [89]:
predictions2 = ovr.predict(X)
predictions2

array(['won', 'won', 'won', ..., 'lost', 'lost', 'lost'], dtype='<U5')

In [90]:
model3 = SVC(decision_function_shape="ovo")
model3.fit(X, y)

In [91]:
predictions3 = model3.predict(X)
predictions3

array(['lost', 'lost', 'lost', ..., 'won', 'won', 'won'], dtype=object)

In [92]:
model4 = SVC()
ovo = OneVsOneClassifier(model4)
ovo.fit(X, y)

In [93]:
predictions4 = ovo.predict(X)
predictions4

array(['lost', 'lost', 'lost', ..., 'won', 'won', 'won'], dtype=object)

In [94]:
print((y == predictions1).sum())
print((y == predictions2).sum())
print((y == predictions3).sum())
print((y == predictions4).sum())

15543
15543
16988
16985


In [None]:
df2 = pd.concat(
    [y, pd.Series(predictions1)],
    ignore_index=True,
    axis=1,
).rename(columns={0: "y", 1: "predictions"})
df2

In [None]:
df2["equal"] = df2["y"] == df2["predictions"]
df2

In [None]:
df2.groupby(by=["y", "predictions"]).sum()

In [96]:
from collections import Counter

print(Counter(predictions1))
print(Counter(predictions2))
print(Counter(predictions3))
print(Counter(predictions4))

Counter({'won': 19768, 'lost': 17770})
Counter({'won': 19768, 'lost': 17770})
Counter({'lost': 22352, 'won': 15050, 'drawn': 136})
Counter({'lost': 22352, 'won': 15080, 'drawn': 106})


In [None]:
len(predictions1), len(y)

Create a Logistic Regression using multinomial.

In [None]:
model_multinomial = LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=1000)
model_multinomial.fit(X, y)

In [None]:
predictions_multinomial = model_multinomial.predict(X)

In [None]:
Counter(predictions_multinomial)

In [None]:
model_ovr = LogisticRegression(multi_class="ovr")
model_ovr.fit(X, y)

In [None]:
predictions_ovr = model_ovr.predict(X)
predictions_ovr

In [110]:
y = y.replace({"won": 3, "drawn": 1, "lost": 0})
y

0        1
1        3
2        1
3        3
4        1
        ..
37533    0
37534    3
37535    3
37536    3
37537    0
Name: game_status, Length: 37538, dtype: int64

In [98]:
from random import random
from sklearn.datasets import make_classification

X_test, y_test = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, n_classes=3, random_state=1)
model_test = LogisticRegression(multi_class="ovr")
model_test.fit(X_test, y_test)
predictions_test = model_test.predict(X_test)
Counter(predictions_test)

Counter({1: 321, 0: 389, 2: 290})

In [101]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_std = scaler.fit_transform(X)
# X_std
model_scaled = LogisticRegression(multi_class="ovr")
model_scaled.fit(X_std, y)
predictions_scaled = model_scaled.predict(X_std)
Counter(predictions_scaled)

Counter({'won': 19768, 'lost': 17770})

In [128]:
X_std.shape

(37538, 8)

In [102]:
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_std, y)
predictions_ovr = clf.predict(X_std)
Counter(predictions_ovr)

Counter({'won': 19768, 'lost': 17770})

In [103]:
Counter(y)

Counter({'drawn': 9491, 'won': 14138, 'lost': 13909})

In [120]:
from random import random
from sklearn.datasets import make_classification

X_test, y_test = make_classification(n_samples=37538, n_features=10, n_informative=5, n_redundant=5, n_classes=3, random_state=1)
model_test = LogisticRegression(multi_class="ovr")
model_test.fit(X, y)
predictions_test = model_test.predict(X)
Counter(predictions_test)

Counter({3: 19768, 0: 17770})

In [113]:
Counter(y_test)

Counter({1: 12514, 0: 12525, 2: 12499})

In [114]:
Counter(y.to_numpy())

Counter({1: 9491, 3: 14138, 0: 13909})