In [433]:
# https://dacon.io/competitions/open/235538/data
# id : 선수 고유의 아이디
# name : 이름
# age : 나이
# continent : 선수들의 국적이 포함되어 있는 대륙입니다
# contract_until : 선수의 계약기간이 언제까지인지 나타내어 줍니다
# position : 선수가 선호하는 포지션입니다. ex) 공격수, 수비수 등
# prefer_foot : 선수가 선호하는 발입니다. ex) 오른발
# reputation : 선수가 유명한 정도입니다. ex) 높은 수치일 수록 유명한 선수
# stat_overall : 선수의 현재 능력치 입니다.
# stat_potential : 선수가 경험 및 노력을 통해 발전할 수 있는 정도입니다.
# stat_skill_moves : 선수의 개인기 능력치 입니다.
# value : FIFA가 선정한 선수의 이적 시장 가격 (단위 : 유로) 입니다


In [470]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [435]:
train_df = pd.read_csv('./data/FIFA_train.csv')
test_df = pd.read_csv('./data/FIFA_test.csv')
sub_df = pd.read_csv('./data/submission.csv')


In [436]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8932 entries, 0 to 8931
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                8932 non-null   int64  
 1   name              8932 non-null   object 
 2   age               8932 non-null   int64  
 3   continent         8932 non-null   object 
 4   contract_until    8932 non-null   object 
 5   position          8932 non-null   object 
 6   prefer_foot       8932 non-null   object 
 7   reputation        8932 non-null   float64
 8   stat_overall      8932 non-null   int64  
 9   stat_potential    8932 non-null   int64  
 10  stat_skill_moves  8932 non-null   float64
 11  value             8932 non-null   float64
dtypes: float64(3), int64(4), object(5)
memory usage: 837.5+ KB


In [437]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3828 entries, 0 to 3827
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                3828 non-null   int64  
 1   name              3828 non-null   object 
 2   age               3828 non-null   int64  
 3   continent         3828 non-null   object 
 4   contract_until    3828 non-null   object 
 5   position          3828 non-null   object 
 6   prefer_foot       3828 non-null   object 
 7   reputation        3828 non-null   float64
 8   stat_overall      3828 non-null   int64  
 9   stat_potential    3828 non-null   int64  
 10  stat_skill_moves  3828 non-null   float64
dtypes: float64(2), int64(4), object(5)
memory usage: 329.1+ KB


In [438]:
train_df.drop(columns=['id','name'], inplace=True)
test_df.drop(columns=['id','name'], inplace=True)

In [439]:
train_df['age'].value_counts()

26    708
24    691
21    676
23    663
25    654
22    632
20    582
27    581
28    532
19    491
30    476
29    472
31    347
18    344
32    285
34    216
33    202
17    131
35     89
36     64
37     42
16     18
38     17
39     16
40      3
Name: age, dtype: int64

In [440]:
train_df['continent'].value_counts()

europe           5322
south america    1927
asia              787
africa            721
oceania           175
Name: continent, dtype: int64

In [441]:
train_df['contract_until'].value_counts()

2019            2366
2021            2308
2020            2041
2022             761
2023             506
Jun 30, 2019     501
2018             327
Dec 31, 2018      64
May 31, 2019      19
2024              12
Jan 31, 2019      10
Jun 30, 2020       9
2025               3
Jan 1, 2019        2
2026               1
May 31, 2020       1
Jan 12, 2019       1
Name: contract_until, dtype: int64

In [442]:
train_df['position'].value_counts()

MF    3428
DF    2791
ST    1705
GK    1008
Name: position, dtype: int64

In [443]:
train_df['prefer_foot'].value_counts()

right    6837
left     2095
Name: prefer_foot, dtype: int64

In [444]:
train_df['reputation'].value_counts()

1.0    8014
2.0     706
3.0     177
4.0      31
5.0       4
Name: reputation, dtype: int64

In [445]:
train_df['stat_overall'].value_counts()

67    570
66    559
68    543
69    532
64    492
65    479
63    473
70    463
72    445
71    431
62    363
73    332
61    305
74    303
60    303
75    281
59    205
76    204
58    170
77    169
57    148
56    143
78    106
55    104
79    103
54    102
53     91
80     83
52     70
82     58
81     53
51     51
83     46
50     42
84     27
85     22
86     11
88     10
87      9
48      7
89      7
49      6
47      4
91      3
90      3
94      1
Name: stat_overall, dtype: int64

In [446]:
train_df['contract_until'] = train_df['contract_until'].apply(lambda x : x[-4:])
test_df['contract_until'] = test_df['contract_until'].apply(lambda x : x[-4:])

In [447]:
train_df['contract_until'].unique().tolist()

['2021', '2020', '2019', '2023', '2022', '2024', '2026', '2018', '2025']

In [448]:
train_df['contract_until'].value_counts()

2019    2899
2021    2308
2020    2051
2022     761
2023     506
2018     391
2024      12
2025       3
2026       1
Name: contract_until, dtype: int64

In [449]:
train_df.groupby('contract_until').mean()['value']

contract_until
2018    1.286407e+06
2019    1.746123e+06
2020    2.397036e+06
2021    2.827543e+06
2022    5.899074e+06
2023    5.802115e+06
2024    2.450833e+07
2025    1.405333e+07
2026    5.050000e+07
Name: value, dtype: float64

In [450]:
print(train_df['age'].min())
print(train_df['age'].max())

16
40


In [451]:
def age_group(age):
    if age < 20: age = 15
    elif age >= 20 and age < 25 : age = 20
    elif age >= 25 and age < 30: age = 25
    elif age >= 30 and age < 35: age = 30
    elif age >= 35: age = 35
    return age

def age_group2(age):
    if age < 20: age = 10
    elif age >= 20 and age < 30 : age = 20
    elif age >= 30 and age < 40: age = 30
    elif age >= 40: age = 40
    return age

train_df['age'] = train_df['age'].apply(lambda x : age_group2(x))
test_df['age'] = test_df['age'].apply(lambda x : age_group2(x))

In [452]:
train_df['age'].value_counts()

20    6191
30    1754
10     984
40       3
Name: age, dtype: int64

In [453]:
train_df.groupby('age').mean()['value']

age
10    7.277287e+05
20    3.090281e+06
30    2.831690e+06
40    1.436667e+06
Name: value, dtype: float64

In [454]:
train_df['continent'].value_counts()


europe           5322
south america    1927
asia              787
africa            721
oceania           175
Name: continent, dtype: int64

In [455]:
train_df.groupby('continent').mean()['value']

continent
africa           2.972247e+06
asia             1.035146e+06
europe           2.928125e+06
oceania          8.225429e+05
south america    3.183204e+06
Name: value, dtype: float64

In [456]:
temp_list = train_df['continent'].unique().tolist()
train_df['continent'] = train_df.continent.map(lambda x : temp_list.index(x))
temp_list = test_df['continent'].unique().tolist()
test_df['continent'] = test_df.continent.map(lambda x : temp_list.index(x))

In [457]:
train_df['continent'].value_counts()

1    5322
0    1927
3     787
2     721
4     175
Name: continent, dtype: int64

In [458]:
train_df.groupby('position').mean()['value']


position
DF    2.304348e+06
GK    1.992073e+06
MF    3.121762e+06
ST    3.330361e+06
Name: value, dtype: float64

In [459]:
temp_list = train_df['position'].unique().tolist()
train_df['position'] = train_df.position.map(lambda x : temp_list.index(x))
temp_list = test_df['position'].unique().tolist()
test_df['position'] = test_df.position.map(lambda x : temp_list.index(x))

In [460]:
train_df.groupby('prefer_foot').mean()['value']

prefer_foot
left     2.865232e+06
right    2.752150e+06
Name: value, dtype: float64

In [461]:
temp_list = train_df['prefer_foot'].unique().tolist()
train_df['prefer_foot'] = train_df.prefer_foot.map(lambda x : temp_list.index(x))
temp_list = test_df['prefer_foot'].unique().tolist()
test_df['prefer_foot'] = test_df.prefer_foot.map(lambda x : temp_list.index(x))

In [462]:
y = train_df['value']
X = train_df.drop('value', axis= 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [463]:
model = CatBoostRegressor(random_state = 123)
model.fit(X_train, y_train, eval_set = [(X_test,y_test)])

Learning rate set to 0.069338
0:	learn: 5617274.7671884	test: 5152616.3642692	best: 5152616.3642692 (0)	total: 1.09ms	remaining: 1.09s
1:	learn: 5321545.7013885	test: 4867428.9696962	best: 4867428.9696962 (1)	total: 2.22ms	remaining: 1.11s
2:	learn: 5045946.2846730	test: 4595965.4221137	best: 4595965.4221137 (2)	total: 3.23ms	remaining: 1.07s
3:	learn: 4781179.7743354	test: 4363687.6954777	best: 4363687.6954777 (3)	total: 4.29ms	remaining: 1.07s
4:	learn: 4529436.8977062	test: 4134237.5126681	best: 4134237.5126681 (4)	total: 5.35ms	remaining: 1.06s
5:	learn: 4291354.1560008	test: 3925107.1305864	best: 3925107.1305864 (5)	total: 6.41ms	remaining: 1.06s
6:	learn: 4063065.4587795	test: 3708876.6891367	best: 3708876.6891367 (6)	total: 7.43ms	remaining: 1.05s
7:	learn: 3853650.1712369	test: 3500229.0087922	best: 3500229.0087922 (7)	total: 8.48ms	remaining: 1.05s
8:	learn: 3659915.1704373	test: 3317328.8317483	best: 3317328.8317483 (8)	total: 9.52ms	remaining: 1.05s
9:	learn: 3477105.9009717

<catboost.core.CatBoostRegressor at 0x17d8be732e0>

In [471]:
y_pred = model.predict(X_test)
MSE = mean_squared_error(y_test, y_pred)
np.sqrt(MSE)

878513.1325762578

In [464]:
pred = model.predict(test_df)
sub_df['value'] = pred

In [465]:
sub_df['value'] = pred

In [466]:
sub_df.to_csv('./save/submission.csv', index=False)