In [2]:
# https://dacon.io/competitions/open/235538/data
# id : 선수 고유의 아이디
# name : 이름
# age : 나이
# continent : 선수들의 국적이 포함되어 있는 대륙입니다
# contract_until : 선수의 계약기간이 언제까지인지 나타내어 줍니다
# position : 선수가 선호하는 포지션입니다. ex) 공격수, 수비수 등
# prefer_foot : 선수가 선호하는 발입니다. ex) 오른발
# reputation : 선수가 유명한 정도입니다. ex) 높은 수치일 수록 유명한 선수
# stat_overall : 선수의 현재 능력치 입니다.
# stat_potential : 선수가 경험 및 노력을 통해 발전할 수 있는 정도입니다.
# stat_skill_moves : 선수의 개인기 능력치 입니다.
# value : FIFA가 선정한 선수의 이적 시장 가격 (단위 : 유로) 입니다


In [74]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from lightgbm import LGBMRegressor
from bayes_opt import BayesianOptimization

In [43]:
train_df = pd.read_csv('./data/FIFA_train.csv')
test_df = pd.read_csv('./data/FIFA_test.csv')
sub_df = pd.read_csv('./data/submission.csv')


In [44]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8932 entries, 0 to 8931
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                8932 non-null   int64  
 1   name              8932 non-null   object 
 2   age               8932 non-null   int64  
 3   continent         8932 non-null   object 
 4   contract_until    8932 non-null   object 
 5   position          8932 non-null   object 
 6   prefer_foot       8932 non-null   object 
 7   reputation        8932 non-null   float64
 8   stat_overall      8932 non-null   int64  
 9   stat_potential    8932 non-null   int64  
 10  stat_skill_moves  8932 non-null   float64
 11  value             8932 non-null   float64
dtypes: float64(3), int64(4), object(5)
memory usage: 837.5+ KB


In [45]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3828 entries, 0 to 3827
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                3828 non-null   int64  
 1   name              3828 non-null   object 
 2   age               3828 non-null   int64  
 3   continent         3828 non-null   object 
 4   contract_until    3828 non-null   object 
 5   position          3828 non-null   object 
 6   prefer_foot       3828 non-null   object 
 7   reputation        3828 non-null   float64
 8   stat_overall      3828 non-null   int64  
 9   stat_potential    3828 non-null   int64  
 10  stat_skill_moves  3828 non-null   float64
dtypes: float64(2), int64(4), object(5)
memory usage: 329.1+ KB


In [46]:
train_df.drop(columns=['id','name'], inplace=True)
test_df.drop(columns=['id','name'], inplace=True)

In [47]:
train_df['age'].value_counts()

26    708
24    691
21    676
23    663
25    654
22    632
20    582
27    581
28    532
19    491
30    476
29    472
31    347
18    344
32    285
34    216
33    202
17    131
35     89
36     64
37     42
16     18
38     17
39     16
40      3
Name: age, dtype: int64

In [48]:
train_df['continent'].value_counts()

europe           5322
south america    1927
asia              787
africa            721
oceania           175
Name: continent, dtype: int64

In [49]:
train_df['contract_until'].value_counts()

2019            2366
2021            2308
2020            2041
2022             761
2023             506
Jun 30, 2019     501
2018             327
Dec 31, 2018      64
May 31, 2019      19
2024              12
Jan 31, 2019      10
Jun 30, 2020       9
2025               3
Jan 1, 2019        2
Jan 12, 2019       1
2026               1
May 31, 2020       1
Name: contract_until, dtype: int64

In [50]:
train_df['position'].value_counts()

MF    3428
DF    2791
ST    1705
GK    1008
Name: position, dtype: int64

In [51]:
train_df['prefer_foot'].value_counts()

right    6837
left     2095
Name: prefer_foot, dtype: int64

In [52]:
train_df['reputation'].value_counts()

1.0    8014
2.0     706
3.0     177
4.0      31
5.0       4
Name: reputation, dtype: int64

In [53]:
train_df['stat_overall'].value_counts()

67    570
66    559
68    543
69    532
64    492
65    479
63    473
70    463
72    445
71    431
62    363
73    332
61    305
74    303
60    303
75    281
59    205
76    204
58    170
77    169
57    148
56    143
78    106
55    104
79    103
54    102
53     91
80     83
52     70
82     58
81     53
51     51
83     46
50     42
84     27
85     22
86     11
88     10
87      9
89      7
48      7
49      6
47      4
90      3
91      3
94      1
Name: stat_overall, dtype: int64

In [54]:
train_df['contract_until'] = train_df['contract_until'].apply(lambda x : int(x[-4:]))
test_df['contract_until'] = test_df['contract_until'].apply(lambda x : int(x[-4:]))

In [55]:
train_df['contract_until'].unique().tolist()

[2021, 2020, 2019, 2023, 2022, 2024, 2026, 2018, 2025]

In [56]:
train_df['contract_until'].value_counts()

2019    2899
2021    2308
2020    2051
2022     761
2023     506
2018     391
2024      12
2025       3
2026       1
Name: contract_until, dtype: int64

In [57]:
train_df.groupby('contract_until').mean()['value']

contract_until
2018    1.286407e+06
2019    1.746123e+06
2020    2.397036e+06
2021    2.827543e+06
2022    5.899074e+06
2023    5.802115e+06
2024    2.450833e+07
2025    1.405333e+07
2026    5.050000e+07
Name: value, dtype: float64

In [58]:
print(train_df['age'].min())
print(train_df['age'].max())

16
40


In [59]:
def age_group(age):
    if age < 20: age = 15
    elif age >= 20 and age < 25 : age = 20
    elif age >= 25 and age < 30: age = 25
    elif age >= 30 and age < 35: age = 30
    elif age >= 35: age = 35
    return age

def age_group2(age):
    if age < 20: age = 10
    elif age >= 20 and age < 30 : age = 20
    elif age >= 30 and age < 40: age = 30
    elif age >= 40: age = 40
    return age

train_df['age'] = train_df['age'].apply(lambda x : age_group2(x))
test_df['age'] = test_df['age'].apply(lambda x : age_group2(x))

In [60]:
train_df['age'].value_counts()

20    6191
30    1754
10     984
40       3
Name: age, dtype: int64

In [61]:
train_df.groupby('age').mean()['value']

age
10    7.277287e+05
20    3.090281e+06
30    2.831690e+06
40    1.436667e+06
Name: value, dtype: float64

In [62]:
train_df['continent'].value_counts()


europe           5322
south america    1927
asia              787
africa            721
oceania           175
Name: continent, dtype: int64

In [63]:
train_df.groupby('continent').mean()['value']

continent
africa           2.972247e+06
asia             1.035146e+06
europe           2.928125e+06
oceania          8.225429e+05
south america    3.183204e+06
Name: value, dtype: float64

In [64]:
temp_list = train_df['continent'].unique().tolist()
train_df['continent'] = train_df.continent.map(lambda x : temp_list.index(x))
temp_list = test_df['continent'].unique().tolist()
test_df['continent'] = test_df.continent.map(lambda x : temp_list.index(x))

In [65]:
train_df['continent'].value_counts()

1    5322
0    1927
3     787
2     721
4     175
Name: continent, dtype: int64

In [66]:
train_df.groupby('position').mean()['value']


position
DF    2.304348e+06
GK    1.992073e+06
MF    3.121762e+06
ST    3.330361e+06
Name: value, dtype: float64

In [67]:
temp_list = train_df['position'].unique().tolist()
train_df['position'] = train_df.position.map(lambda x : temp_list.index(x))
temp_list = test_df['position'].unique().tolist()
test_df['position'] = test_df.position.map(lambda x : temp_list.index(x))

In [68]:
train_df.groupby('prefer_foot').mean()['value']

prefer_foot
left     2.865232e+06
right    2.752150e+06
Name: value, dtype: float64

In [69]:
temp_list = train_df['prefer_foot'].unique().tolist()
train_df['prefer_foot'] = train_df.prefer_foot.map(lambda x : temp_list.index(x))
temp_list = test_df['prefer_foot'].unique().tolist()
test_df['prefer_foot'] = test_df.prefer_foot.map(lambda x : temp_list.index(x))

In [70]:
y = train_df['value']
X = train_df.drop('value', axis= 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [71]:
lgbm = LGBMRegressor()

lgbm.fit(X_train, y_train).score(X_test, y_test)

0.9744428483760013

In [75]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import mean_absolute_error

  # MAPE Metric
def mean_absolute_percentage_error(y_test, y_pred):
  y_test, y_pred = np.array(y_test), np.array(y_pred)
  return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

  # 탐색 대상 함수 (XGBRegressor)
def lgbm_cv(max_depth,learning_rate, n_estimators, 
            min_child_weight, subsample
            ,colsample_bytree, silent=True, nthread=-1):
  

      # 모델 정의
  model = LGBMRegressor(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        # gamma=gamma,
                        min_child_weight=min_child_weight,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree, 
                        nthread=nthread)
      # 모델 훈련
  model.fit(X_train, y_train)

      # 예측값 출력
  y_pred = model.predict(X_test)

  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  r2 = r2_score(y_test, y_pred)
  mape = mean_absolute_percentage_error(y_test, y_pred)
  mean_absolute_error(y_test, y_pred)

      # 오차 최적화로 사용할 metric 반환
  return r2

In [76]:
# 실험해보고자하는 hyperparameter 집합
pbounds = {'max_depth': (1, 15),
           'learning_rate': (0.01, 0.5),
           'n_estimators': (50, 1000),
          #  'gamma': (0, 100),/
           'min_child_weight': (0, 3),
           'subsample': (0.1, 0.99),
           'colsample_bytree' :(0.1, 0.99)
           }

  # verbose = 2 항상 출력, verbose = 1 최댓값일 때 출력, verbose = 0 출력 안함
  # random_state : Bayesian Optimization 상의 랜덤성이 존재하는 부분을 통제 
bo=BayesianOptimization(f=lgbm_cv, pbounds=pbounds, verbose=2, random_state=1 )

  # init_points :  초기 Random Search 갯수
  # n_iter : 반복 횟수 (몇개의 입력값-함숫값 점들을 확인할지! 많을 수록 정확한 값을 얻을 수 있다.)
  # acq : Acquisition Function들 중 Expected Improvement(EI) 를 사용
  # xi : exploration 강도 (기본값은 0.0)
bo.maximize(init_points=2, n_iter=10, acq='ei', xi=0.01)

  # 찾은 파라미터 값 확인
print(bo.max)

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.9398   [0m | [0m0.4711   [0m | [0m0.363    [0m | [0m1.002    [0m | [0m0.907    [0m | [0m189.4    [0m | [0m0.1822   [0m |
| [95m2        [0m | [95m0.9625   [0m | [95m0.2658   [0m | [95m0.1793   [0m | [95m6.555    [0m | [95m1.616    [0m | [95m448.2    [0m | [95m0.7098   [0m |
| [0m3        [0m | [0m0.9392   [0m | [0m0.1598   [0m | [0m0.4424   [0m | [0m6.773    [0m | [0m1.157    [0m | [0m449.3    [0m | [0m0.1903   [0m |
| [95m4        [0m | [95m0.9706   [0m | [95m0.632    [0m | [95m0.166    [0m | [95m10.28    [0m | [95m1.345    [0m | [95m213.6    [0m | [95m0.3696   [0m |
| [0m5        [0m | [0m0.9397   [0m | [0m0.8985   [0m | [0m0.2695   [0m | [0m1.826    [0m | [0m2.512    [0m | [0m265.5    [0m 

In [78]:
# 2차 학습
lgbm_ff = LGBMRegressor(random_state=0, learning_rate=0.0699, metric = 'mae',
                      n_estimators=679,max_depth=11,min_child_weight=2.6042)



lgbm_ff.fit(X, y)

LGBMRegressor(learning_rate=0.0699, max_depth=11, metric='mae',
              min_child_weight=2.6042, n_estimators=679, random_state=0)

In [79]:
y_pred = lgbm_ff.predict(X_test)
MSE = mean_squared_error(y_test, y_pred)
np.sqrt(MSE)

581100.5760913864

In [None]:
pred = lgbm_ff.predict(test_df)
sub_df['value'] = pred

In [None]:
sub_df.to_csv('./save/submission.csv', index=False)

In [32]:
model = CatBoostRegressor(random_state = 123)
model.fit(X_train, y_train, eval_set = [(X_test,y_test)])

Learning rate set to 0.066818
0:	learn: 5628830.7010575	test: 5162979.6827977	best: 5162979.6827977 (0)	total: 155ms	remaining: 2m 35s
1:	learn: 5343138.5946339	test: 4887506.0326632	best: 4887506.0326632 (1)	total: 157ms	remaining: 1m 18s
2:	learn: 5076240.5266443	test: 4624673.8275921	best: 4624673.8275921 (2)	total: 158ms	remaining: 52.7s
3:	learn: 4833628.7274693	test: 4402552.7401185	best: 4402552.7401185 (3)	total: 160ms	remaining: 39.8s
4:	learn: 4588291.6804829	test: 4179058.8163623	best: 4179058.8163623 (4)	total: 161ms	remaining: 32.1s
5:	learn: 4354088.8244695	test: 3973889.3800404	best: 3973889.3800404 (5)	total: 163ms	remaining: 27s
6:	learn: 4129790.4954815	test: 3761400.3176633	best: 3761400.3176633 (6)	total: 164ms	remaining: 23.3s
7:	learn: 3924067.3033692	test: 3556816.7149940	best: 3556816.7149940 (7)	total: 166ms	remaining: 20.5s
8:	learn: 3731828.6608202	test: 3367206.8927873	best: 3367206.8927873 (8)	total: 167ms	remaining: 18.4s
9:	learn: 3549787.4451687	test: 31

<catboost.core.CatBoostRegressor at 0x296aaab57f0>

In [33]:
y_pred = model.predict(X_test)
MSE = mean_squared_error(y_test, y_pred)
np.sqrt(MSE)

878921.2490333221

In [None]:
pred = model.predict(test_df)
sub_df['value'] = pred

In [None]:
sub_df.to_csv('./save/submission.csv', index=False)