In [1]:
import pandas as pd 
import numpy as np
from time import time
import seaborn as sns

In [2]:
import scikitplot as skplt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn import preprocessing

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
from matplotlib import pyplot

import sys
import warnings
warnings.filterwarnings("ignore")

## Feature Engineering

In this section we want to process all our data, i.e. encoding the features and automatic feature selection.

In [12]:
df1 = pd.read_csv(r'..\reports\CleanedData.csv', index_col = 0)

In [11]:
# convert strings in numbers 
le = preprocessing.LabelEncoder()

le.fit(pd.concat([df1.team, df1.Away_team]).drop_duplicates())

df1['team'] = le.transform(df1['team'])
df1['Away_team'] = le.transform(df1['Away_team'])

le.fit(df1['result'].drop_duplicates())
df1['result'] = le.transform(df1['result'])

df1[['team','Away_team','result']] = df1[['team','Away_team','result']].stack().rank(method='dense').unstack().astype(int)

ValueError: Index contains duplicate entries, cannot reshape

In [13]:
df1.loc[(df1['season']==1928) & (df1['division']==1) & (df1['matchday']==18)]

Unnamed: 0,season,division,matchday,team,Away_team,W,L,T,result,GD,GD_cum,rank,last_results
85,1928,1,18,Arenas Club,Racing,8.0,6.0,3.0,L,-2.0,-3.0,3.0,"['L', 'W', 'L', 'L', 'W', 'L', 'T', 'W', 'W', ..."
85,1928,1,18,Athletic,Real Madrid,7.0,6.0,4.0,W,0.0,8.0,5.0,"['T', 'L', 'W', 'T', 'L', 'T', 'L', 'W', 'L', ..."
86,1928,1,18,Athletic Madrid,Espanyol,7.0,8.0,2.0,W,0.0,-4.0,7.0,"['T', 'W', 'W', 'L', 'L', 'W', 'T', 'L', 'L', ..."
87,1928,1,18,Barcelona,Real Unión,10.0,4.0,3.0,W,0.0,11.0,2.0,"['T', 'W', 'T', 'W', 'W', 'W', 'W', 'W', 'W', ..."
86,1928,1,18,Catalunya,Donostia,6.0,7.0,4.0,L,3.0,-3.0,8.0,"['W', 'L', 'T', 'T', 'W', 'L', 'T', 'W', 'W', ..."
88,1928,1,18,Donostia,Catalunya,7.0,6.0,4.0,W,1.0,4.0,4.0,"['W', 'L', 'W', 'L', 'T', 'W', 'T', 'L', 'L', ..."
87,1928,1,18,Espanyol,Athletic Madrid,7.0,6.0,4.0,L,0.0,0.0,6.0,"['T', 'L', 'L', 'T', 'T', 'W', 'W', 'L', 'W', ..."
89,1928,1,18,Racing,Arenas Club,2.0,12.0,3.0,W,-3.0,-29.0,10.0,"['L', 'W', 'L', 'T', 'T', 'T', 'L', 'L', 'L', ..."
88,1928,1,18,Real Madrid,Athletic,11.0,5.0,1.0,L,2.0,15.0,1.0,"['W', 'W', 'W', 'W', 'T', 'L', 'L', 'L', 'W', ..."
89,1928,1,18,Real Unión,Barcelona,5.0,10.0,2.0,L,-1.0,1.0,9.0,"['L', 'L', 'L', 'W', 'L', 'L', 'W', 'W', 'L', ..."


In [5]:
# Encoding last results column
def last_results(row):
    row = [row.strip() for row in eval(row)]
    result = 0
    while row:
        if row[0] == 'L': result -= 1
        elif row[0] == 'W': result += 1
        row.pop(0)
    return result

df1['last_results'] = df1['last_results'].apply(lambda row: last_results(row))

TypeError: eval() arg 1 must be a string, bytes or code object

In [None]:
# Splitting season to compare it later
df1['season2'] = df1['season']
df1 = df1.astype({"season": str})
df1[['season','season2']] = df1['season'].str.split("-",expand=True)
df1 = df1.astype({"season": int})
df1 = df1.drop('season2', axis=1)
df1

The features we are interested in have to be as independent as possible from each other. Therefore, we are going to
make sure that we don't have any correlated columns with a value higher than .95

In [None]:
cor_matrix = df1.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))

to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
print(); print(to_drop)  


In [None]:
# Compute the correlation matrix
corr = df1.corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=False)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

### Automatic Variable Selection
In order to have an automatic variable selection we can use a **model-based** feature selection. Consequently, we will use a supervised machine learning model to judge the importance of each feature.

We will add some noise feature to our data and expect that the feature selection will identify the features that are non informative, removing the data features that are irrelevant.

For that purpose, we will add 20 noise features and select the 50% that are more correlated. Then, we will discard the noise features. The variables selected can be shown belove: black are original features and white noise features

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

target = df1.result
X = df1.loc[:, df1.columns != 'result']

# get deterministic random numbers
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(df1), 12))
X_w_noise = np.hstack([X, noise])

X_train,X_test,y_train,y_test = train_test_split(X_w_noise, target, test_size=0.5)

select = SelectFromModel( RandomForestClassifier(n_estimators=100, random_state=42), threshold="median")

select.fit(X_train, y_train)
X_train_l1 = select.transform(X_train)
print("X_train.shape: {}".format(X_train.shape))
print("y_train.shape: {}".format(y_train.shape))

print("X_train_l1.shape: {}".format(X_train_l1.shape))

mask = select.get_support()
# visualize the mask -- black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel("Sample index")

X_test_l1 = select.transform(X_test)

In [None]:
#plot our results
def confusion_matrix(trained_model, y_test, X_test):
    Y_test_pred = trained_model.predict(X_test)

    fig = plt.figure(figsize=(15,6))
    ax1 = fig.add_subplot(121)
    skplt.metrics.plot_confusion_matrix(y_test, Y_test_pred,
                                        title="Confusion Matrix",
                                        cmap="Oranges",
                                        ax=ax1)

    ax2 = fig.add_subplot(122)
    skplt.metrics.plot_confusion_matrix(y_test, Y_test_pred,
                                        normalize=True,
                                        title="Confusion Matrix",
                                        cmap="Purples",
                                        ax=ax2)

def ROC_PRC_matrix(trained_model, y_test, X_test):    
    y_test_probs = trained_model.predict_proba(X_test)
    
    fig = plt.figure(figsize=(15,6))
    ax1 = fig.add_subplot(121)
    skplt.metrics.plot_roc_curve(y_test, y_test_probs,
                       title="Digits ROC Curve", figsize=(12,6), ax=ax1)
    
    ax2 = fig.add_subplot(122)
    skplt.metrics.plot_precision_recall_curve(y_test, y_test_probs,
                        title="Digits Precision-Recall Curve", figsize=(12,6), ax= ax2)

In [None]:
names = ["Logistic Regression", "Decision Tree", "Random Forest"]
         
classifiers = [
    #KNeighborsClassifier(3),
    LogisticRegression(),
    DecisionTreeClassifier(max_depth=1),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
]


for name, clf in zip(names, classifiers):
        clf.fit(X_train_l1, y_train)
        prediction_in = clf.predict(X_train_l1)
        accuracy_r2_in = r2_score(y_train, prediction_in)
        prediction_out = clf.predict(X_test_l1)
        accuracy_r2_out = r2_score(y_test, prediction_out)
        print() 
        print(name) 
        print("-" * 20)
        print("Accuracy on training set: ",clf.score(X_train_l1, y_train), "\tR2 training set: ", accuracy_r2_in )
        print("Accuracy on test set: ",clf.score(X_test_l1, y_test), "\tR2 test set: ", accuracy_r2_out )
        ROC_PRC_matrix(clf, y_test, X_test_l1)
        confusion_matrix(clf, y_test, X_test_l1)


In [None]:
# # get importance

# models = [lr_classifier]
  
# for model in models: 
# 	importance = model.coef_[0]
# 	#summarize feature importance
# 	for i,v in enumerate(importance):
# 		print('Feature: %0d, Score: %.5f' % (i,v))
# 	# plot feature importance
# 	pyplot.bar([x for x in range(len(importance))], importance)
# 	pyplot.show()

In [None]:
import sqlite3
with sqlite3.connect("../laliga.sqlite") as conn:
    df = pd.read_sql("SELECT * FROM Matches", con = conn)

test_dataset = df.loc[df.score.isnull()]

In [None]:
test_dataset

In [None]:
#normalize with Normalize
# fit scaler on training data
scale_2 = MinMaxScaler().fit(features)

# transform training data
scale_df_normalize = scale_2.transform(features)
featureNorm= pd.DataFrame(scale_df_normalize, columns= feature_cols)
featureNorm.head()

for col in features:
    sns.kdeplot(featureNorm[col], shade=True)