In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from ngboost import NGBRegressor

### 데이터 불러오기

In [2]:
train = pd.read_csv('FIFA_train.csv')
test = pd.read_csv('FIFA_test.csv')
submission = pd.read_csv('submission.csv')

In [3]:
train.head()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value
0,0,L. Messi,31,south america,2021,ST,left,5.0,94,94,4.0,110500000.0
1,3,De Gea,27,europe,2020,GK,right,4.0,91,93,1.0,72000000.0
2,7,L. Suárez,31,south america,2021,ST,right,5.0,91,91,3.0,80000000.0
3,8,Sergio Ramos,32,europe,2020,DF,right,4.0,91,91,3.0,51000000.0
4,9,J. Oblak,25,europe,2021,GK,right,3.0,90,93,1.0,68000000.0


In [4]:
test.head()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves
0,1,Cristiano Ronaldo,33,europe,2022,ST,right,5.0,94,94,5.0
1,2,Neymar Jr,26,south america,2022,ST,right,5.0,92,93,5.0
2,4,K. De Bruyne,27,europe,2023,MF,right,4.0,91,92,4.0
3,5,E. Hazard,27,europe,2020,ST,right,4.0,91,91,4.0
4,6,L. Modrić,32,europe,2020,MF,right,4.0,91,91,4.0


In [5]:
submission.head()

Unnamed: 0,id,value
0,1,0
1,2,0
2,4,0
3,5,0
4,6,0


### 이전 공유 코드와 마찬가지로 계약기간 변수를 만들어줍니다.

In [6]:
def con_period(x):
    if x == 'Dec 31, 2018' :
        return '2019'
    elif x == 'Jun 30, 2020' :
        return '2020.5'
    elif x == 'Jun 30, 2019' :
        return '2019.5'
    elif x == 'May 31, 2020' :
        return '2020.3333'
    elif x == 'May 31, 2019' :
        return '2019.3333'
    elif x == 'Jan 31, 2019' :
        return '2019.0833'
    elif x == 'Jan 1, 2019' :
        return '2019'
    elif x == 'Jan 12, 2019' :
        return '2019.034'
    elif x == 'Dec 31, 2019' :
        return '2020'
    elif x == 'Jun 1, 2019' :
        return '2019.416'
    else :
        return x

In [7]:
train.contract_until = train.contract_until.apply(con_period).astype('float64') - 2018
test.contract_until = test.contract_until.apply(con_period).astype('float64') - 2018

***
### Log-Transformation

In [8]:
train[['age', 'stat_potential']] = np.log1p(train[['age', 'stat_potential']])
test[['age', 'stat_potential']] = np.log1p(test[['age', 'stat_potential']])

### Train Set

In [10]:
X = train[['age', 'continent', 'contract_until', 'position', 'reputation', 'stat_overall', 'stat_potential', 'stat_skill_moves']]
y = np.log1p(train['value'])

In [11]:
X = pd.get_dummies(columns = ['continent', 'position'], data = X)

### Test Set

In [12]:
target = test[['age', 'continent', 'contract_until', 'position', 'reputation', 'stat_overall', 'stat_potential', 'stat_skill_moves']]

In [13]:
target = pd.get_dummies(columns = ['continent', 'position'], data = target)

### 10-Fold Crossvalidation

In [14]:
kf = KFold(n_splits = 10, random_state = 521, shuffle = True)

### 1) NGBRegressor

In [20]:
ngb = NGBRegressor(random_state = 521, verbose = 500, n_estimators = 500)

In [21]:
ngb_pred = np.zeros((target.shape[0]))
rmse_list = []
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    ngb.fit(tr_x, tr_y)
    pred = np.expm1([0 if x < 0 else x for x in ngb.predict(val_x)])

    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    
    sub_pred = np.expm1([0 if x < 0 else x for x in ngb.predict(target)]) / 10
    ngb_pred += sub_pred
print(f'{ngb.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')

[iter 0] loss=1.7562 val_loss=0.0000 scale=1.0000 norm=1.2861
[iter 100] loss=0.7611 val_loss=0.0000 scale=2.0000 norm=1.1141
[iter 200] loss=-0.1028 val_loss=0.0000 scale=2.0000 norm=0.9194
[iter 300] loss=-0.7609 val_loss=0.0000 scale=2.0000 norm=0.8144
[iter 400] loss=-1.0723 val_loss=0.0000 scale=2.0000 norm=0.8506
[iter 0] loss=-1.1726 val_loss=0.0000 scale=1.0000 norm=0.4586
[iter 100] loss=-1.2344 val_loss=0.0000 scale=1.0000 norm=0.4660
[iter 200] loss=-1.2778 val_loss=0.0000 scale=1.0000 norm=0.4740
[iter 300] loss=-1.3129 val_loss=0.0000 scale=2.0000 norm=0.9557
[iter 400] loss=-1.3429 val_loss=0.0000 scale=2.0000 norm=0.9620
[iter 0] loss=-1.3300 val_loss=0.0000 scale=2.0000 norm=1.0117
[iter 100] loss=-1.3749 val_loss=0.0000 scale=1.0000 norm=0.4872
[iter 200] loss=-1.3933 val_loss=0.0000 scale=1.0000 norm=0.4881
[iter 300] loss=-1.4100 val_loss=0.0000 scale=1.0000 norm=0.4890
[iter 400] loss=-1.4252 val_loss=0.0000 scale=1.0000 norm=0.4889
[iter 0] loss=-1.4211 val_loss=0.

### 2) RandomForestRegressor

In [36]:
rf = RandomForestRegressor(random_state = 521, n_estimators = 150)

In [37]:
rf_pred = np.zeros((target.shape[0]))
rmse_list = []
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    rf.fit(tr_x, tr_y)
    pred = np.expm1([0 if x < 0 else x for x in rf.predict(val_x)])

    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    
    sub_pred = np.expm1([0 if x < 0 else x for x in rf.predict(target)]) / 10
    rf_pred += sub_pred
print(f'{rf.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')

RandomForestRegressor의 10fold 평균 RMSE는 812259.6958769986


### 3) ExtraTreesRegressor

In [44]:
etc = ExtraTreesRegressor(random_state = 521, n_estimators = 500)

In [45]:
etc_pred = np.zeros((target.shape[0]))
rmse_list = []
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    etc.fit(tr_x, tr_y)
    pred = np.expm1([0 if x < 0 else x for x in etc.predict(val_x)])

    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    
    sub_pred = np.expm1([0 if x < 0 else x for x in etc.predict(target)]) / 10
    etc_pred += sub_pred
print(f'{etc.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')

ExtraTreesRegressor의 10fold 평균 RMSE는 742490.4940104572


### 4) GradientBoostingRegressor

In [54]:
gb = GradientBoostingRegressor(random_state = 521, max_depth = 5)

In [55]:
gb_pred = np.zeros((target.shape[0]))
rmse_list = []
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    gb.fit(tr_x, tr_y)
    pred = np.expm1([0 if x < 0 else x for x in gb.predict(val_x)])

    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    
    sub_pred = np.expm1([0 if x < 0 else x for x in gb.predict(target)]) / 10
    gb_pred += sub_pred
print(f'{gb.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')

GradientBoostingRegressor의 10fold 평균 RMSE는 590051.6390746763


### 5) CatBoostRegressor

In [75]:
cb = CatBoostRegressor(random_state = 521, silent = True, depth = 3)

In [76]:
cb_pred = np.zeros((target.shape[0]))
rmse_list = []
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    cb.fit(tr_x, tr_y)
    pred = np.expm1([0 if x < 0 else x for x in cb.predict(val_x)])

    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    
    sub_pred = np.expm1([0 if x < 0 else x for x in cb.predict(target)]) / 10
    cb_pred += sub_pred
print(f'{cb.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')

CatBoostRegressor의 10fold 평균 RMSE는 445035.320543002


### 6) LGBMRegressor

In [97]:
lgbm = LGBMRegressor(random_state = 521, max_depth = 4, n_estimators = 1000)

In [98]:
lgbm_pred = np.zeros((target.shape[0]))
rmse_list = []
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    lgbm.fit(tr_x, tr_y)
    pred = np.expm1([0 if x < 0 else x for x in lgbm.predict(val_x)])

    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    
    sub_pred = np.expm1([0 if x < 0 else x for x in lgbm.predict(target)]) / 10
    lgbm_pred += sub_pred
print(f'{lgbm.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')

LGBMRegressor의 10fold 평균 RMSE는 600903.6346210752


### 7) XGBRegressor

In [113]:
xgb = XGBRegressor(random_state = 521, max_depth = 5)

In [114]:
xgb_pred = np.zeros((target.shape[0]))
rmse_list = []
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    xgb.fit(tr_x, tr_y)
    pred = np.expm1([0 if x < 0 else x for x in xgb.predict(val_x)])

    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    
    sub_pred = np.expm1([0 if x < 0 else x for x in xgb.predict(target)]) / 10
    xgb_pred += sub_pred
print(f'{xgb.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')

XGBRegressor의 10fold 평균 RMSE는 581307.7105113387


### Blending Models

In [132]:
submission['value'] = (gb_pred + rf_pred + etc_pred + lgbm_pred + cb_pred + ngb_pred + xgb_pred) / 7

In [133]:
q1 = submission['value'].quantile(0.004)
q2 = submission['value'].quantile(0.99)

submission['value'] = submission['value'].apply(lambda x: x if x > q1 else x * 0.9)
submission['value'] = submission['value'].apply(lambda x: x if x < q2 else x * 1.1)

In [134]:
submission

Unnamed: 0,id,value
0,1,6.535701e+07
1,2,1.025468e+08
2,4,8.708519e+07
3,5,9.148247e+07
4,6,6.999558e+07
...,...,...
3823,16924,5.841319e+04
3824,16929,4.985170e+04
3825,16932,5.951643e+04
3826,16937,4.977167e+04


In [135]:
submission.to_csv('fifa0520.csv', index = False)