In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import warnings
# Suppress every warning category for the rest of the session. This also hides
# deprecation notices from the imported libraries, so comment it out when debugging.
warnings.filterwarnings('ignore')

In [2]:
# Show which files are present in the local data directory.
os.listdir(path='./data')

['submission_baseline_rf2.csv',
 'data_pickle.pkl',
 'FIFA_train.csv',
 'data_feather.ftr',
 'submission_baseline_rf.csv',
 'FIFA_test.csv',
 'submission.csv']

In [3]:
# Build the three input file locations inside the relative 'data' directory.
train_path, test_path, submission_path = (
    os.path.join('data', file_name)
    for file_name in ('FIFA_train.csv', 'FIFA_test.csv', 'submission.csv')
)

In [4]:
# Read the training set, the test set and the submission template, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, submission_path)
)

In [5]:
# Preview the first five training rows (includes the `value` target column).
train.head(n=5)

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value
0,0,L. Messi,31,south america,2021,ST,left,5.0,94,94,4.0,110500000.0
1,3,De Gea,27,europe,2020,GK,right,4.0,91,93,1.0,72000000.0
2,7,L. Suárez,31,south america,2021,ST,right,5.0,91,91,3.0,80000000.0
3,8,Sergio Ramos,32,europe,2020,DF,right,4.0,91,91,3.0,51000000.0
4,9,J. Oblak,25,europe,2021,GK,right,3.0,90,93,1.0,68000000.0


In [6]:
# Preview the first five test rows.
test.head(n=5)

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves
0,1,Cristiano Ronaldo,33,europe,2022,ST,right,5.0,94,94,5.0
1,2,Neymar Jr,26,south america,2022,ST,right,5.0,92,93,5.0
2,4,K. De Bruyne,27,europe,2023,MF,right,4.0,91,92,4.0
3,5,E. Hazard,27,europe,2020,ST,right,4.0,91,91,4.0
4,6,L. Modrić,32,europe,2020,MF,right,4.0,91,91,4.0


In [7]:
# Preview the first five rows of the submission template.
submission.head(n=5)

Unnamed: 0,id,value
0,1,0
1,2,0
2,4,0
3,5,0
4,6,0


In [8]:
# Columns that exist in train but not in test.
set(train.columns).difference(test.columns)

{'value'}

In [9]:
# Use the `value` column as the regression target.
y_train = train.loc[:, 'value']

In [10]:
# Stack the train features (target removed) on top of the test rows so both
# go through the same preprocessing; the row index is rebuilt as 0..n-1.
data = pd.concat([train.drop(columns='value'), test], ignore_index=True)

In [11]:
# Print the (rows, columns) of train, test and the combined frame, one per line.
print(train.shape, test.shape, data.shape, sep='\n')

(8932, 12)
(3828, 11)
(12760, 11)


In [12]:
# Remove the columns that are not used as features (row id and player name).
data = data.drop(columns=['id', 'name'])
data.head(n=5)

Unnamed: 0,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves
0,31,south america,2021,ST,left,5.0,94,94,4.0
1,27,europe,2020,GK,right,4.0,91,93,1.0
2,31,south america,2021,ST,right,5.0,91,91,3.0
3,32,europe,2020,DF,right,4.0,91,91,3.0
4,25,europe,2021,GK,right,3.0,90,93,1.0


In [13]:
# Count missing values in each column.
data.isna().sum(axis=0)

age                 0
continent           0
contract_until      0
position            0
prefer_foot         0
reputation          0
stat_overall        0
stat_potential      0
stat_skill_moves    0
dtype: int64

In [14]:
# Number of distinct values in each column.
for col in data:
    print(f'column: {col}\n# of unique values: {data[col].nunique()}\n')

column: age
# of unique values: 26

column: continent
# of unique values: 5

column: contract_until
# of unique values: 19

column: position
# of unique values: 4

column: prefer_foot
# of unique values: 2

column: reputation
# of unique values: 5

column: stat_overall
# of unique values: 47

column: stat_potential
# of unique values: 46

column: stat_skill_moves
# of unique values: 5



In [15]:
# List of distinct values in each column.
for col in data:
    print(f'column: {col}\nunique values: {data[col].unique()}\n')

column: age
unique values: [31 27 32 25 26 29 33 30 40 24 28 34 23 22 35 36 21 18 19 37 20 39 17 38
 16 42]

column: continent
unique values: ['south america' 'europe' 'africa' 'asia' 'oceania']

column: contract_until
unique values: ['2021' '2020' '2019' '2023' '2022' '2024' 'Jun 30, 2019' '2026'
 'Dec 31, 2018' '2018' '2025' 'Jun 30, 2020' 'May 31, 2020' 'May 31, 2019'
 'Jan 31, 2019' 'Jan 1, 2019' 'Jan 12, 2019' 'Dec 31, 2019' 'Jun 1, 2019']

column: position
unique values: ['ST' 'GK' 'DF' 'MF']

column: prefer_foot
unique values: ['left' 'right']

column: reputation
unique values: [5. 4. 3. 1. 2.]

column: stat_overall
unique values: [94 91 90 89 88 87 86 85 84 83 82 81 80 79 78 77 76 75 74 73 72 71 70 69
 68 67 66 65 64 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 92]

column: stat_potential
unique values: [94 93 91 90 92 89 88 87 86 85 84 83 82 81 80 79 78 77 76 75 74 73 72 71
 70 69 68 67 66 65 64 63 62 61 60 59 58 57 56 55 54 53 52 50 48 95]

column: stat_skill_moves
uniq

In [16]:
# Keep only the trailing four characters of each contract end value, i.e. the
# year, so that entries such as 'Jun 30, 2019' and '2019' become the same label.
data['contract_until'] = data['contract_until'].str[-4:]
data['contract_until'].unique()

array(['2021', '2020', '2019', '2023', '2022', '2024', '2026', '2018',
       '2025'], dtype=object)

In [17]:
# One-hot encode the four categorical columns; every other column is kept as is.
categorical_cols = ['continent', 'contract_until', 'position', 'prefer_foot']
data = pd.get_dummies(data, columns=categorical_cols)

In [18]:
# (rows, columns) of the combined frame after one-hot encoding.
data.shape

(12760, 25)

In [19]:
# Preview the first five rows of the encoded feature frame.
data.head(n=5)

Unnamed: 0,age,reputation,stat_overall,stat_potential,stat_skill_moves,continent_africa,continent_asia,continent_europe,continent_oceania,continent_south america,...,contract_until_2023,contract_until_2024,contract_until_2025,contract_until_2026,position_DF,position_GK,position_MF,position_ST,prefer_foot_left,prefer_foot_right
0,31,5.0,94,94,4.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
1,27,4.0,91,93,1.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
2,31,5.0,91,91,3.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
3,32,4.0,91,91,3.0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
4,25,3.0,90,93,1.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1


In [20]:
# Save the preprocessed feature frame (train and test rows together) to disk
# in two formats: a pickle file and a feather file, both under ./data.
data.to_pickle('./data/data_pickle.pkl')
data.to_feather('./data/data_feather.ftr')

In [21]:
# Split the combined frame back apart: the first len(train) rows are the
# training features, the remainder are the test features (re-indexed from 0).
n_train = len(train)
X_train = data.iloc[:n_train]
X_test = data.iloc[n_train:].reset_index(drop=True)

In [22]:
# Log-scale the target with log1p. It is derived from the raw `value` column
# rather than from `y_train` itself so that re-running this cell does not apply
# the transform a second time (predictions are inverted with a single expm1).
y_train = np.log1p(train['value'])

In [23]:
# Hyper-parameters passed to RandomForestRegressor: number of trees and a
# fixed seed.
params = dict(n_estimators=300, random_state=2021)

In [24]:
# Build the forest from `params` and fit it on the training features and the
# log-scaled target; the trailing expression displays the fitted estimator.
rf = RandomForestRegressor(**params).fit(X_train, y_train)
rf

RandomForestRegressor(n_estimators=300, random_state=2021)

In [25]:
# Predict on the test features and undo the log1p scaling with expm1.
pred = np.expm1(rf.predict(X_test))

In [26]:
# Predictions are assigned by position, so the submission template must list
# the same ids in the same order as the test set; fail loudly otherwise instead
# of writing values against the wrong ids.
assert np.array_equal(submission['id'].to_numpy(), test['id'].to_numpy()), \
    'submission ids are not aligned with the test set rows'

# Fill in the predicted values and write the submission file without the index.
submission['value'] = pred
submission.to_csv('./data/submission_baseline_rf.csv', index=False)

In [27]:
# Read the written submission file back in to check its contents.
pd.read_csv(filepath_or_buffer='./data/submission_baseline_rf.csv')

Unnamed: 0,id,value
0,1,5.280492e+07
1,2,7.738953e+07
2,4,6.680251e+07
3,5,7.004369e+07
4,6,5.986099e+07
...,...,...
3823,16924,5.900439e+04
3824,16929,5.088056e+04
3825,16932,5.939972e+04
3826,16937,4.440723e+04
