In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.multiclass import OneVsRestClassifier
import matplotlib.pyplot as plt

In [2]:
def load_in_data():
    x_tr = pd.read_csv('x_train.csv', index_col = 'match_api_id')
    y_tr = pd.read_csv('y_train.csv', index_col = 'match_api_id')
    x_cv = pd.read_csv('x_val.csv', index_col = 'match_api_id')
    y_cv = pd.read_csv('y_val.csv', index_col = 'match_api_id')
    x_te = pd.read_csv('x_test.csv', index_col = 'match_api_id')
    y_te = pd.read_csv('y_test.csv', index_col = 'match_api_id')
    return x_tr, y_tr, x_cv, y_cv, x_te, y_te

In [24]:
x_tr, y_tr, x_cv, y_cv, x_te, y_te = load_in_data()

In [25]:
['home_player_1_overall_rating', 'home_player_1_crossing',
       'home_player_1_finishing', 'home_player_1_heading_accuracy',
       'home_player_1_short_passing', 'home_player_1_volleys',
       'home_player_1_dribbling', 'home_player_1_curve',
       'home_player_1_long_passing', 'home_player_1_ball_control',
       'home_player_1_acceleration', 'home_player_1_sprint_speed',
       'home_player_1_reactions', 'home_player_1_balance',
       'home_player_1_shot_power', 'home_player_1_strength',
       'home_player_1_long_shots', 'home_player_1_aggression',
       'home_player_1_interceptions', 'home_player_1_positioning',
       'home_player_1_vision', 'home_player_1_penalties',
       'home_player_1_marking', 'home_player_1_sliding_tackle',
       'home_player_1_gk_diving', 'home_player_1_gk_handling',
       'home_player_1_gk_positioning', 'home_player_1_gk_reflexes']

['home_player_1_overall_rating',
 'home_player_1_crossing',
 'home_player_1_finishing',
 'home_player_1_heading_accuracy',
 'home_player_1_short_passing',
 'home_player_1_volleys',
 'home_player_1_dribbling',
 'home_player_1_curve',
 'home_player_1_long_passing',
 'home_player_1_ball_control',
 'home_player_1_acceleration',
 'home_player_1_sprint_speed',
 'home_player_1_reactions',
 'home_player_1_balance',
 'home_player_1_shot_power',
 'home_player_1_strength',
 'home_player_1_long_shots',
 'home_player_1_aggression',
 'home_player_1_interceptions',
 'home_player_1_positioning',
 'home_player_1_vision',
 'home_player_1_penalties',
 'home_player_1_marking',
 'home_player_1_sliding_tackle',
 'home_player_1_gk_diving',
 'home_player_1_gk_handling',
 'home_player_1_gk_positioning',
 'home_player_1_gk_reflexes']

In [26]:
columns_keep = ['overall_rating',
                'crossing',
                'finishing',
                'heading_accuracy',
                'short_passing',
                'volleys',
                'dribbling',
                'curve',
                'long_passing',
                'ball_control',
                'acceleration',
                'sprint_speed',
                'reactions',
                'balance',
                'shot_power',
                'strength',
                'long_shots',
                'aggression',
                'interceptions',
                'positioning',
                'vision',
                'penalties',
                'marking',
                'standing_tackle',
                'sliding_tackle',
                'gk_diving',
                'gk_handling',
                'gk_positioning',
                'gk_reflexes']
positions = ['home_player_1',
             'home_player_2',
             'home_player_3',
             'home_player_4',
             'home_player_5',
             'home_player_6',
             'home_player_7',
             'home_player_8',
             'home_player_9',
             'home_player_10',
             'home_player_11',
             'away_player_1',
             'away_player_2',
             'away_player_3',
             'away_player_4',
             'away_player_5',
             'away_player_6',
             'away_player_7',
             'away_player_8',
             'away_player_9',
             'away_player_10',
             'away_player_11']
columns = []
for pos in positions:
    arr = [pos + '_' + column_name for column_name in columns_keep]
    columns = columns + arr
x_tr = x_tr[columns]
x_cv = x_cv[columns]
x_te = x_te[columns]

In [27]:
min_max_scaler = preprocessing.MinMaxScaler()
x_tr = pd.DataFrame(min_max_scaler.fit_transform(x_tr), index = x_tr.index, columns = x_tr.columns)
x_cv = pd.DataFrame(min_max_scaler.fit_transform(x_cv), index = x_cv.index, columns = x_cv.columns)
x_te = pd.DataFrame(min_max_scaler.fit_transform(x_te), index = x_te.index, columns = x_te.columns)

In [28]:
x_tr.fillna(x_tr.mean(), inplace=True)
x_cv.fillna(x_cv.mean(), inplace=True)
x_te.fillna(x_te.mean(), inplace=True)

In [29]:
def plot_learning_curve(estimator, x_tr, y_tr):
    train_sizes, train_scores, test_scores = \
        learning_curve(estimator, x_tr, y_tr, train_sizes = np.linspace(0.2, 1, 20))
    
    plt.plot(train_sizes, -train_scores.mean(1), 'o-', label = str(estimator.__class__.__name__) + ' train')
    plt.plot(train_sizes, -test_scores.mean(1), 'o-', label = str(estimator.__class__.__name__) + ' test')
    plt.legend(loc="best")

In [30]:
def do_stuff(model, x_tr, y_tr, x_cv, y_cv):
    clf = OneVsRestClassifier(model).fit(x_tr, y_tr)
    predict_tr = clf.predict(x_tr).reshape((-1, 1))
    predict_cv = clf.predict(x_cv).reshape((-1, 1))
    print(np.mean(predict_tr == y_tr))
    print(np.mean(predict_cv == y_cv))
    #plot_learning_curve(clf, x_tr, y_tr)
    return clf

In [48]:
def logistic_model(x_tr, y_tr, x_cv, y_cv):
    log = linear_model.LogisticRegression(max_iter = 500, C = 1, l1_ratio = 0.9, penalty = 'elasticnet', solver = 'saga', verbose = 2)
    clf = log.fit(x_tr, y_tr)
    return clf

def svm_model(x_tr, y_tr, x_cv, y_cv):
    svc = svm.SVC(kernel = 'rbf', max_iter = 1000, verbose = 5)
    return do_stuff(svc, x_tr, y_tr, x_cv, y_cv)
    
def rf_model(x_tr, y_tr, x_cv, y_cv):
    rf = ensemble.RandomForestClassifier(n_estimators = 100, max_featuers = 'auto', max_depth = 3, verbose = 2)
    return do_stuff(rf, x_tr, y_tr, x_cv, y_cv)

In [49]:
clf = logistic_model(x_tr, y_tr, x_cv, y_cv)

  return f(**kwargs)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 260 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.3min finished


In [10]:
clf1 = svm_model(x_tr, y_tr, x_cv, y_cv)

[LibSVM]



[LibSVM]



[LibSVM]



result    0.337132
dtype: float64
result    0.370201
dtype: float64


In [43]:
clf2 = rf_model(x_tr, y_tr, x_cv, y_cv)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s


building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s


building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s


building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.

result    0.49937
dtype: float64
result    0.514625
dtype: float64


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [47]:
predict_tr = clf.predict(x_tr).reshape((-1, 1))
predict_cv = clf.predict(x_cv).reshape((-1, 1))
predict_te = clf.predict(x_te).reshape((-1, 1))
print(np.mean(predict_tr == y_tr))
print(np.mean(predict_cv == y_cv))
np.mean(predict_te == y_te)

result    0.549151
dtype: float64
result    0.513711
dtype: float64


result    0.535224
dtype: float64

In [42]:
coefs = clf.coef_
coefs[0].reshape((-1, 29))

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -2.63553348e-02,  0.00000000e+00,  0.00000000e+00,
         1.92773849e-01,  0.00000000e+00,  8.05225005e-02,
        -1.56858522e-01, -2.03819933e-02, -8.30416979e-02,
         8.43846553e-02,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -1.36520348e-01,  0.00000000e+00, -8.70792572e-02,
         0.00000000e+00,  7.81609584e-02, -1.32211107e-01,
         0.00000000e+00,  0.00000000e+00, -2.03228089e-01,
        -1.21600278e-01,  2.10137937e-01],
       [-1.36583161e-02,  0.00000000e+00, -6.90390688e-02,
         0.00000000e+00,  1.73304104e-01, -6.71001283e-02,
        -2.41346539e-02,  4.25870396e-02,  0.00000000e+00,
        -2.52380425e-01,  1.52677019e-01,  0.00000000e+00,
         0.00000000e+00, -2.42185174e-01, -5.22122770e-02,
        -3.04700781e-01,  4.09458613e-02,  2.87914054e-02,
         0.00000000e+00,  0.00000000e+00, -1.10021347e-01,
        -6.97

In [35]:
x_tr.columns
len([0,2,3,4,5,6,7,8,10,11,12,13,15,16,17,20,21,22,23,24,25,26,27,29,30,31,33,34])

28

In [88]:
clf.coef_.shape

(3, 770)

In [109]:
c0 = coefs[0, :].reshape((22, 35))
c1 = coefs[1, :].reshape((22, 35))
c2 = coefs[2, :].reshape((22, 35))
cols = x_tr.columns.to_numpy().reshape((22, 35))

In [104]:
maxes0 = np.argmax(c0, axis = 1)
maxes1 = np.argmax(c1, axis = 1)
maxes2 = np.argmax(c2, axis = 1)
maxes0.sort()
maxes1.sort()
maxes2.sort()
print(maxes0)
print(maxes1)
print(maxes2)
print(x_tr.columns[maxes0])
print(x_tr.columns[maxes1])
print(x_tr.columns[maxes2])

[ 0  0  5  6  7  7 11 11 11 12 13 14 20 25 27 27 27 28 29 30 32 34]
[ 0  1  2  2  3 10 11 15 15 16 17 20 21 21 21 23 24 25 28 30 31 32]
[ 0  0  2  2  3  5  5  7 10 12 13 14 15 24 25 29 29 29 31 32 33 33]
Index(['home_player_1_overall_rating', 'home_player_1_overall_rating',
       'home_player_1_short_passing', 'home_player_1_volleys',
       'home_player_1_dribbling', 'home_player_1_dribbling',
       'home_player_1_ball_control', 'home_player_1_ball_control',
       'home_player_1_ball_control', 'home_player_1_acceleration',
       'home_player_1_sprint_speed', 'home_player_1_agility',
       'home_player_1_strength', 'home_player_1_vision',
       'home_player_1_marking', 'home_player_1_marking',
       'home_player_1_marking', 'home_player_1_standing_tackle',
       'home_player_1_sliding_tackle', 'home_player_1_gk_diving',
       'home_player_1_gk_kicking', 'home_player_1_gk_reflexes'],
      dtype='object')
Index(['home_player_1_overall_rating', 'home_player_1_potential',
       

In [54]:
clf.classes_

array([-1,  0,  1], dtype=int64)

In [105]:
maxes0 = np.argmax(c0, axis = 0)
maxes1 = np.argmax(c1, axis = 0)
maxes2 = np.argmax(c2, axis = 0)
maxes0.sort()
maxes1.sort()
maxes2.sort()
print(maxes0)
print(maxes1)
print(maxes2)

[ 0  0  1  3  3  5  6  8  8  9  9 10 11 11 11 13 14 14 15 15 16 16 16 17
 17 18 18 19 19 19 19 20 20 21 21]
[ 0  1  1  1  2  2  3  3  4  4  5  5  5  6  6  7  7  8  8  8  9 10 10 10
 11 11 15 15 16 16 16 18 19 20 20]
[ 0  1  1  1  2  3  4  5  5  6  6  7  8  9  9 10 11 11 11 12 12 14 15 15
 15 16 16 16 18 20 20 20 21 21 21]


In [101]:
x_tr.columns[22]

'home_player_1_aggression'

In [96]:
770/22

35.0