In [326]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [327]:
# Read the raw training and test tables from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [328]:
# Keep every column except the first one in both frames.
# NOTE(review): presumably column 0 is a row identifier — confirm against the CSV schema.
# Re-running this cell removes one more column each time.
train, test = train.iloc[:, 1:], test.iloc[:, 1:]

In [329]:
def sep_area(x) :
    """Encode which side of the pitch a position label refers to.

    Parameters
    ----------
    x : str
        Position label.

    Returns
    -------
    int
        1 if the label contains 'L', 2 if it contains 'R' (and no 'L'),
        3 otherwise.
    """
    # 'L' is tested first, so a label containing both letters maps to 1.
    for side_letter, side_code in (('L', 1), ('R', 2)) :
        if side_letter in x :
            return side_code
    return 3

In [330]:
# Add the pitch-side code (see `sep_area`) to both frames.
for frame in (train, test) :
    frame['Position_Area'] = frame['Position'].apply(sep_area)

In [331]:
def position_category(x) :
    """Group a position label into one of four coarse categories.

    Parameters
    ----------
    x : str
        Position label.

    Returns
    -------
    int
        1 for labels ending in 'M', 2 for labels ending in 'B',
        3 for exactly 'GK', and 4 for everything else.
    """
    for suffix, category in (('M', 1), ('B', 2)) :
        if x.endswith(suffix) :
            return category
    return 3 if x == 'GK' else 4

In [332]:
# Add the coarse position group (see `position_category`) to both frames.
for frame in (train, test) :
    frame['Position_Category'] = frame['Position'].apply(position_category)

In [333]:
# Body-mass index: Weight divided by the square of (Height / 100).
# NOTE(review): assumes Height is in centimetres and Weight in kilograms — confirm.
for frame in (train, test) :
    height_scaled = frame['Height'] / 100
    frame['BMI'] = frame['Weight'] / (height_scaled ** 2)

In [334]:
train['AttackingWorkRate'] = train['AttackingWorkRate'].map({'High' : 3, 'Medium' : 2, 'Low' : 1})
test['AttackingWorkRate'] = test['AttackingWorkRate'].map({'High' : 3, 'Medium' : 2, 'Low' : 1})

In [335]:
train['DefensiveWorkRate'] = train['DefensiveWorkRate'].map({'High' : 3, 'Medium' : 2, 'Low' : 1})
test['DefensiveWorkRate'] = test['DefensiveWorkRate'].map({'High' : 3, 'Medium' : 2, 'Low' : 1})

In [336]:
# Binary flag: 1 when the preferred foot is 'Right', 0 for any other value.
for frame in (train, test) :
    frame['PreferredFoot'] = frame['PreferredFoot'].eq('Right').astype('int64')

In [337]:
# True when the preferred-foot flag and the pitch-side code point to opposite sides:
# PreferredFoot == 1 with Position_Area == 1, or PreferredFoot == 0 with Position_Area == 2.
for frame in (train, test) :
    foot_is_one = frame['PreferredFoot'] == 1
    foot_is_zero = frame['PreferredFoot'] == 0
    area_is_one = frame['Position_Area'] == 1
    area_is_two = frame['Position_Area'] == 2
    frame['Oppsite_Foot_Side'] = (foot_is_one & area_is_one) | (foot_is_zero & area_is_two)

In [338]:
# Names of the goalkeeper ability columns; the outfield groups drop these before modelling.
gk_ability = [
    'GKDiving',
    'GKHandling',
    'GKKicking',
    'GKPositioning',
    'GKReflexes',
]

In [339]:
# Group the rating columns of `train` by the pattern their name contains.
# Each list holds the column names matching the pattern, in column order.
rating_cols = train.columns
forward_rating = rating_cols[rating_cols.str.contains('WRating|TRating|FRating')].tolist()
midfielder_rating = rating_cols[rating_cols.str.contains('MRating')].tolist()
defense_rating = rating_cols[rating_cols.str.contains('BRating')].tolist()
gk_rating = rating_cols[rating_cols.str.contains('GKRating')].tolist()

In [340]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [341]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [342]:
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

In [343]:
le = LabelEncoder()

In [344]:
skf = StratifiedKFold(n_splits = 8, random_state = 42, shuffle = True)

### Forward

In [442]:
f = train[train['Position_Category'] == 4]

In [443]:
te_f = test[test['Position_Category'] == 4]

In [444]:
f.drop(defense_rating + midfielder_rating + gk_rating, inplace = True, axis = 1)

In [445]:
f['Centrailty'] = f[forward_rating].max(axis = 1) / (f[forward_rating].max(axis = 1) - f[forward_rating].min(axis = 1))

In [446]:
te_f['Centrailty'] = te_f[forward_rating].max(axis = 1) / (te_f[forward_rating].max(axis = 1) - te_f[forward_rating].min(axis = 1))

In [447]:
f.drop(forward_rating + gk_ability, axis = 1, inplace = True)

In [448]:
f['Position'] = le.fit_transform(f['Position'])

In [449]:
te_f['Position'] = le.transform(te_f['Position'])

In [450]:
X = f.drop(['Prospect', 'Position_Category'], axis = 1)

In [451]:
y = f['Prospect']

In [452]:
target = te_f[X.columns]

In [454]:
# Cross-validate the forward model over the splits from `skf`. Each fold's classifier is
# scored with macro F1 on its held-out rows and adds 1 / n_splits of its positive-class
# probability for `target` to `cb_f`, so `cb_f` ends up as the fold average.
cb_f1 = []
cb_f = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :

    tr_x, val_x = X.iloc[tr_idx], X.iloc[val_idx]
    tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

    cb = CatBoostClassifier(
        random_state = 42,
        max_depth = 6,
        learning_rate = 0.03,
        n_estimators = 5000,
        eval_metric = 'F1',
    )
    cb.fit(
        tr_x, tr_y,
        eval_set = [(tr_x, tr_y), (val_x, val_y)],
        early_stopping_rounds = 1000,
        verbose = 0,
    )

    f1 = f1_score(val_y, cb.predict(val_x), average = 'macro')
    cb_f1.append(f1)
    print(f'{i + 1} Fold F1 = {f1}')

    cb_f += cb.predict_proba(target)[:, 1] / skf.n_splits
print(f'\n{cb.__class__.__name__} AVG of F1 = {np.mean(cb_f1)}')

1 Fold F1 = 0.7324561403508771
2 Fold F1 = 0.8124126362873917
3 Fold F1 = 0.776829268292683
4 Fold F1 = 0.8063492063492064
5 Fold F1 = 0.911976911976912
6 Fold F1 = 0.8680877355576151
7 Fold F1 = 0.8775867094141651
8 Fold F1 = 0.8400000000000001

CatBoostClassifier AVG of F1 = 0.8282123260286063


### Midfielder

In [429]:
m = train[train['Position_Category'] == 1]

In [430]:
te_m = test[test['Position_Category'] == 1]

In [431]:
m.drop(defense_rating + forward_rating + gk_rating, inplace = True, axis = 1)

In [432]:
m['Centrailty'] = m[midfielder_rating].max(axis = 1) / (m[midfielder_rating].max(axis = 1) - m[midfielder_rating].min(axis = 1))

In [433]:
te_m['Centrailty'] = te_m[forward_rating].max(axis = 1) / (te_m[forward_rating].max(axis = 1) - te_m[forward_rating].min(axis = 1))

In [434]:
m.drop(midfielder_rating + gk_ability, axis = 1, inplace = True)

In [435]:
m['Position'] = le.fit_transform(m['Position'])

In [436]:
te_m['Position'] = le.transform(te_m['Position'])

In [437]:
X = m.drop(['Prospect', 'Position_Category'], axis = 1)

In [438]:
y = m['Prospect']

In [439]:
target = te_m[X.columns]

In [441]:
# Cross-validate the midfield model over the splits from `skf`. Each fold's classifier is
# scored with macro F1 on its held-out rows and adds 1 / n_splits of its positive-class
# probability for `target` to `cb_m`, so `cb_m` ends up as the fold average.
cb_f1 = []
cb_m = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :

    tr_x, val_x = X.iloc[tr_idx], X.iloc[val_idx]
    tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

    cb = CatBoostClassifier(
        random_state = 42,
        max_depth = 7,
        learning_rate = 0.03,
        n_estimators = 5000,
        eval_metric = 'F1',
    )
    cb.fit(
        tr_x, tr_y,
        eval_set = [(tr_x, tr_y), (val_x, val_y)],
        early_stopping_rounds = 1000,
        verbose = 0,
    )

    f1 = f1_score(val_y, cb.predict(val_x), average = 'macro')
    cb_f1.append(f1)
    print(f'{i + 1} Fold F1 = {f1}')

    cb_m += cb.predict_proba(target)[:, 1] / skf.n_splits
print(f'\n{cb.__class__.__name__} AVG of F1 = {np.mean(cb_f1)}')

1 Fold F1 = 0.8073479969214566
2 Fold F1 = 0.7809209492377809
3 Fold F1 = 0.7940616518815571
4 Fold F1 = 0.8101169800774545
5 Fold F1 = 0.7737931034482759
6 Fold F1 = 0.7970215993931409
7 Fold F1 = 0.7839262187088274
8 Fold F1 = 0.7370243576122457

CatBoostClassifier AVG of F1 = 0.7855266071600924


### Goalkeeper (GK)

In [418]:
g = train[train['Position_Category'] == 3]

In [419]:
te_g = test[test['Position_Category'] == 3]

In [420]:
g.drop(defense_rating + forward_rating + midfielder_rating, inplace = True, axis = 1)

In [421]:
del g['Position']

In [422]:
del g['Position_Area']

In [423]:
del g['Position_Category']

In [424]:
del g['Oppsite_Foot_Side']

In [425]:
X = g.drop(['Prospect'], axis = 1)

In [426]:
y = g['Prospect']

In [427]:
target = te_g[X.columns]

In [379]:
# Cross-validate the goalkeeper model over the splits from `skf`. Each fold's classifier is
# scored with macro F1 on its held-out rows and adds 1 / n_splits of its positive-class
# probability for `target` to `cb_g`, so `cb_g` ends up as the fold average.
cb_f1 = []
cb_g = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :

    tr_x, val_x = X.iloc[tr_idx], X.iloc[val_idx]
    tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

    cb = CatBoostClassifier(
        random_state = 42,
        max_depth = 5,
        learning_rate = 0.03,
        n_estimators = 5000,
        eval_metric = 'F1',
    )
    cb.fit(
        tr_x, tr_y,
        eval_set = [(tr_x, tr_y), (val_x, val_y)],
        early_stopping_rounds = 1000,
        verbose = 0,
    )

    f1 = f1_score(val_y, cb.predict(val_x), average = 'macro')
    cb_f1.append(f1)
    print(f'{i + 1} Fold F1 = {f1}')

    cb_g += cb.predict_proba(target)[:, 1] / skf.n_splits
print(f'\n{cb.__class__.__name__} AVG of F1 = {np.mean(cb_f1)}')

1 Fold F1 = 0.8043326345213136
2 Fold F1 = 0.7737272155876807
3 Fold F1 = 0.8627726952850105
4 Fold F1 = 0.8627726952850105
5 Fold F1 = 0.6432090077410274
6 Fold F1 = 0.832857142857143
7 Fold F1 = 0.6290760869565217
8 Fold F1 = 0.6871657754010695

CatBoostClassifier AVG of F1 = 0.7619891567043471


### Defender (DF)

In [406]:
d = train[train['Position_Category'] == 2]

In [407]:
te_d = test[test['Position_Category'] == 2]

In [408]:
d.drop(midfielder_rating + forward_rating + gk_rating, inplace = True, axis = 1)

In [409]:
d['Centrailty'] = d[defense_rating].max(axis = 1) / (d[defense_rating].max(axis = 1) - d[defense_rating].min(axis = 1))

In [410]:
te_d['Centrailty'] = te_d[forward_rating].max(axis = 1) / (te_d[forward_rating].max(axis = 1) - te_d[forward_rating].min(axis = 1))

In [411]:
d.drop(defense_rating + gk_ability, axis = 1, inplace = True)

In [412]:
d['Position'] = le.fit_transform(d['Position'])

In [413]:
te_d['Position'] = le.transform(te_d['Position'])

In [414]:
X = d.drop(['Prospect', 'Position_Category'], axis = 1)

In [415]:
y = d['Prospect']

In [416]:
target = te_d[X.columns]

In [417]:
# Cross-validate the defender model (max_depth = 5) over the splits from `skf`. Each fold's
# classifier is scored with macro F1 on its held-out rows and adds 1 / n_splits of its
# positive-class probability for `target` to `cb_d`, so `cb_d` ends up as the fold average.
cb_f1 = []
cb_d = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :

    tr_x, val_x = X.iloc[tr_idx], X.iloc[val_idx]
    tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

    cb = CatBoostClassifier(
        random_state = 42,
        max_depth = 5,
        learning_rate = 0.03,
        n_estimators = 5000,
        eval_metric = 'F1',
    )
    cb.fit(
        tr_x, tr_y,
        eval_set = [(tr_x, tr_y), (val_x, val_y)],
        early_stopping_rounds = 1000,
        verbose = 0,
    )

    f1 = f1_score(val_y, cb.predict(val_x), average = 'macro')
    cb_f1.append(f1)
    print(f'{i + 1} Fold F1 = {f1}')

    cb_d += cb.predict_proba(target)[:, 1] / skf.n_splits
print(f'\n{cb.__class__.__name__} AVG of F1 = {np.mean(cb_f1)}')

1 Fold F1 = 0.8649165983046212
2 Fold F1 = 0.7821354383049439
3 Fold F1 = 0.7904411764705883
4 Fold F1 = 0.6998946998946999
5 Fold F1 = 0.7600849256900212
6 Fold F1 = 0.8527550260610574
7 Fold F1 = 0.7931665649786455
8 Fold F1 = 0.8665152203543844

CatBoostClassifier AVG of F1 = 0.8012387062573703


In [391]:
# Second cross-validation pass on the current X / y / target, this time with max_depth = 6.
# `cb_d` is reset to zeros here, so the fold-averaged probabilities from this run replace
# whatever `cb_d` held before this cell.
cb_f1 = []
cb_d = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :

    tr_x, val_x = X.iloc[tr_idx], X.iloc[val_idx]
    tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

    cb = CatBoostClassifier(
        random_state = 42,
        max_depth = 6,
        learning_rate = 0.03,
        n_estimators = 5000,
        eval_metric = 'F1',
    )
    cb.fit(
        tr_x, tr_y,
        eval_set = [(tr_x, tr_y), (val_x, val_y)],
        early_stopping_rounds = 1000,
        verbose = 0,
    )

    f1 = f1_score(val_y, cb.predict(val_x), average = 'macro')
    cb_f1.append(f1)
    print(f'{i + 1} Fold F1 = {f1}')

    cb_d += cb.predict_proba(target)[:, 1] / skf.n_splits
print(f'\n{cb.__class__.__name__} AVG of F1 = {np.mean(cb_f1)}')

1 Fold F1 = 0.8556962025316455
2 Fold F1 = 0.7744904102519996
3 Fold F1 = 0.7780661907852044
4 Fold F1 = 0.7111486486486486
5 Fold F1 = 0.7638346206269877
6 Fold F1 = 0.836940836940837
7 Fold F1 = 0.8215789473684212
8 Fold F1 = 0.8665152203543844

CatBoostClassifier AVG of F1 = 0.801033884688516


In [392]:
submission = pd.read_csv('sample_submission.csv')

In [400]:
submission.loc[test[test['Position_Category'] == 4].index, 'Prospect'] = cb_f
submission.loc[test[test['Position_Category'] == 1].index, 'Prospect'] = cb_m
submission.loc[test[test['Position_Category'] == 3].index, 'Prospect'] = cb_g
submission.loc[test[test['Position_Category'] == 2].index, 'Prospect'] = cb_d

In [402]:
submission['Prospect'] = np.where(submission['Prospect'] >= 0.5, 1, 0)

In [404]:
submission['Prospect'].value_counts()

0    1069
1     557
Name: Prospect, dtype: int64

In [405]:
# Save the final labels; the row index is not written to the file.
submission.to_csv(
    'baseline.csv',
    index=False,
)