# The European Football Database



## 1. Introduction

The European Football Database records data from several football championship seasons. It is distributed as a SQLite database containing several tables, each of which can easily be exported to a CSV file and loaded with pandas.

The raw data does not contain the target column for the challenge; it has to be added, derived from the goals scored by the home and away teams.

In [2]:
import pandas as pd 

# Load the matches CSV file into a DataFrame.
path = 'matches.csv'
df_match = pd.read_csv(filepath_or_buffer=path)
# Uncomment to inspect the available column names:
# list(df_match.columns)

In [1]:
import problem
# Fetch the training data from the `problem` module; the call returns two
# objects, unpacked here as X and y.
X, y = problem.get_train_data()
# Bare expression: as the last statement of the cell, the column index of X
# is what the notebook displays.
X.columns

Index(['stage', 'date', 'home_team_api_id', 'away_team_api_id',
       'home_player_X1', 'home_player_X2', 'home_player_X3', 'home_player_X4',
       'home_player_X5', 'home_player_X6', 'home_player_X7', 'home_player_X8',
       'home_player_X9', 'home_player_X10', 'home_player_X11',
       'away_player_X1', 'away_player_X2', 'away_player_X3', 'away_player_X4',
       'away_player_X5', 'away_player_X6', 'away_player_X7', 'away_player_X8',
       'away_player_X9', 'away_player_X10', 'away_player_X11',
       'home_player_Y1', 'home_player_Y2', 'home_player_Y3', 'home_player_Y4',
       'home_player_Y5', 'home_player_Y6', 'home_player_Y7', 'home_player_Y8',
       'home_player_Y9', 'home_player_Y10', 'home_player_Y11',
       'away_player_Y1', 'away_player_Y2', 'away_player_Y3', 'away_player_Y4',
       'away_player_Y5', 'away_player_Y6', 'away_player_Y7', 'away_player_Y8',
       'away_player_Y9', 'away_player_Y10', 'away_player_Y11', 'B365H',
       'B365D', 'B365A', 'BWH', 'BWD', 'BWA

In [7]:
# Count how many entries of y take each distinct value.
y.value_counts()

0    8964
1    5473
2    4891
dtype: int64

### Code à caser

In [None]:
import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn.impute import SimpleImputer

import lightgbm as lgb


# Importing data

# Read the whole Match table into memory, then release the database handle.
conn = sqlite3.connect('database.sqlite')
query = "SELECT * FROM Match;"

try:
    df = pd.read_sql_query(query, conn)
finally:
    # sqlite3 connections are not closed automatically (not even by a `with`
    # block, which only manages the transaction), so close explicitly; this
    # also runs when the query raises.
    conn.close()

# Drop eight columns selected by position (77..84).
# NOTE(review): presumably columns that are not usable as features --
# confirm against the Match table schema.
df = df.drop(df.columns[range(77, 85)], axis='columns')

# Use the match 'id' as the row index. This removes 'id' from the columns, so
# the position of every column located after it shifts down by one.
df = df.set_index('id')


# Replacing Null values

# Columns at positions 10..75 are filled with their most frequent value.
# NOTE(review): which columns these positions cover depends on the table
# layout -- confirm against the Match schema.
imputed_cols = slice(10, 76)
imputer = SimpleImputer(strategy='most_frequent')
df.iloc[:, imputed_cols] = imputer.fit_transform(df.iloc[:, imputed_cols])

# Any value still missing anywhere in the frame is replaced by the constant 1.
df = df.fillna(1)

features = list(df.columns)

# Report the number of remaining nulls per column (computed in one pass).
null_counts = df.isnull().sum()
for feature in features:
    print(feature, " : ", null_counts[feature])
    
# Keeping useful features: drop the two goal counts (the target is derived
# from them) together with the season, date and identifier columns.
X = df.drop(['home_team_goal', 'away_team_goal', 'season', 'date',
             'country_id', 'league_id', 'match_api_id'], axis='columns')

# Target encoding, as implemented by the assignments below:
#   0 -> home win (default value, kept by every match not relabelled)
#   1 -> away win (home_team_goal <  away_team_goal)
#   2 -> draw     (home_team_goal == away_team_goal)
Y = pd.Series(0, index=df.index)

away = df[df['home_team_goal'] < df['away_team_goal']].index
draw = df[df['home_team_goal'] == df['away_team_goal']].index

# `away` and `draw` hold index labels (match ids), so assign through .loc to
# make the label-based lookup explicit.
Y.loc[away] = 1
Y.loc[draw] = 2

# Bar chart of the number of matches per class; the returned Axes is unused.
_ = Y.value_counts().plot(kind="bar")

# Train/Test split by season: matches from the six listed seasons
# (2008/2009 to 2013/2014) form the training set, matches from 2014/2015 and
# 2015/2016 form the test set.
train_index = df[df['season'].isin(['2008/2009', '2009/2010',
                                    '2010/2011', '2011/2012',
                                    '2012/2013', '2013/2014'])].index

test_index = df[df['season'].isin(['2014/2015', '2015/2016'])].index

# Renumber the rows of each split 1..n. The sizes are derived from the
# selections themselves instead of being hard-coded, so the split keeps
# working if the database content or the season lists change.
train_ids = pd.Index(range(1, len(train_index) + 1))
test_ids = pd.Index(range(1, len(test_index) + 1))

X_train = X.loc[train_index.values].set_index(train_ids)
X_test = X.loc[test_index.values].set_index(test_ids)
y_train = Y[train_index].set_axis(train_ids)
y_test = Y[test_index].set_axis(test_ids)

# First model

# `import sklearn` alone does not guarantee that the `sklearn.metrics`
# submodule is loaded (it only works when another import happens to load it),
# so import the submodule explicitly before using it.
import sklearn.metrics

# Baseline: LightGBM classifier with its default hyper-parameters, fitted on
# the training split and scored by accuracy on the test split.
model = lgb.LGBMClassifier()
model = model.fit(X_train, y_train)
preds = model.predict(X_test)
accuracy = sklearn.metrics.accuracy_score(y_test, preds)
print('accuracy : ', accuracy)

# Per-feature visualisation:
# class labels and the matplotlib colour paired with each of them by position
# (0 -> 'r', 1 -> 'b', 2 -> 'g') in the plotting helpers.
labels = [0, 1, 2]
colors = ['r', 'b', 'g']

def plot_classwise_normalized(feature, bins=None):
    """Overlay one density-normalised histogram of *feature* per class.

    Draws on the current matplotlib axes, reading the module-level
    ``X_train``, ``y_train``, ``labels`` and ``colors``.

    Args:
        feature: Column label of ``X_train`` to plot.
        bins: Bin specification forwarded to ``plt.hist``. When ``None``,
            10 evenly spaced edges from the minimum to the maximum of the
            feature in ``X_train`` are used, so every class shares the same
            bins.
    """
    column = X_train[feature]
    if bins is None:
        bins = np.linspace(column.min(), column.max(), 10)
    for label, color in zip(labels, colors):
        plt.hist(column[y_train == label].values, density=True, bins=bins,
                 alpha=0.8, color=color)
    # The axis labels do not depend on the class: set them once, not per loop.
    plt.xlabel(feature)
    plt.ylabel("Density")

        
#for feature in features :
#    plt.figure()
#    plot_classwise_normalized(feature)
    
    
def plot_classwise_scatter(feature1, feature2, range1=None, range2=None):
    """Scatter *feature2* against *feature1*, one colour per class.

    Draws on the current matplotlib axes, reading the module-level
    ``X_train``, ``y_train``, ``labels`` and ``colors``.

    Args:
        feature1: Column label of ``X_train`` plotted on the x axis.
        feature2: Column label of ``X_train`` plotted on the y axis.
        range1: ``(low, high)`` x-axis limits; defaults to the min/max of
            ``feature1`` in ``X_train``.
        range2: ``(low, high)`` y-axis limits; defaults to the min/max of
            ``feature2`` in ``X_train``.
    """
    if range1 is None:
        range1 = [X_train[feature1].min(), X_train[feature1].max()]
    if range2 is None:
        range2 = [X_train[feature2].min(), X_train[feature2].max()]
    # The limits are identical for every class: fix them once before plotting.
    plt.xlim(range1[0], range1[1])
    plt.ylim(range2[0], range2[1])
    for label, color in zip(labels, colors):
        # Filter the rows of this class once and reuse them for both axes.
        class_rows = X_train[y_train == label]
        plt.scatter(class_rows[feature1], class_rows[feature2],
                    alpha=0.3, s=80, c=color, marker='.')

## 2. Description of the columns

There are several interesting columns in the matches database.

## 3. Exploratory Data Analysis

Here we will draw some plots, as was done in the Stars challenge.

## 4. Explanation of the challenge

The goal is to predict the outcome of a match.

### Example of a pipeline with a Random Forest classifier

In [None]:
# code pipeline Random Forest