# 데이터 불러오기

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import numpy as np
import pandas as pd

# Load the competition training and test sets from the local data/ directory.
train = pd.read_csv("data/FIFA_train.csv")
test = pd.read_csv("data/FIFA_test.csv")

# 전처리 1

* train data의 value 전처리 우선
* value 를 log 스케일로
* log(value) 가 10~18 사이인 값만 사용

In [3]:
# Work on the log scale of the target and keep only the rows whose log(value)
# lies in the closed interval [10, 18].
train["log_value"] = np.log(train["value"])
in_range = train["log_value"].between(10, 18)
train = train[in_range]

In [4]:
# Stack train rows followed by test rows into one frame so the preprocessing
# below is applied to both. Test rows carry no value/log_value, so those
# columns are NaN for them (visible in the preview).
all_data = pd.concat([train, test], ignore_index=True)
all_data.tail(3)

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value,log_value
12735,16932,Y. Góez,18,south america,2021,MF,right,1.0,48,65,2.0,,
12736,16937,A. Kaltner,18,europe,2020,ST,right,1.0,47,61,2.0,,
12737,16943,K. Fujikawa,19,asia,2021,MF,right,1.0,47,61,2.0,,


* (stat_overall + stat_potential)/2 하여 새로운 피쳐를 만들고 원래 피쳐 제거

In [5]:
# Average the overall and potential ratings into one combined feature.
stat_sum = all_data["stat_overall"].add(all_data["stat_potential"])
all_data["all_stat"] = stat_sum.div(2)

In [6]:
# The two source ratings are summarised by all_stat, so remove them.
redundant_stats = ["stat_overall", "stat_potential"]
all_data = all_data.drop(columns=redundant_stats)

In [7]:
# Preview the last three rows to confirm all_stat replaced the two stat columns.
all_data.tail(3)

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_skill_moves,value,log_value,all_stat
12735,16932,Y. Góez,18,south america,2021,MF,right,1.0,2.0,,,56.5
12736,16937,A. Kaltner,18,europe,2020,ST,right,1.0,2.0,,,54.0
12737,16943,K. Fujikawa,19,asia,2021,MF,right,1.0,2.0,,,54.0


* 계약 만료 연도를 2018년 기준 남은 계약 기간(연 단위)으로 변환

In [8]:
# contract_until may hold full date strings as well as plain years; translate
# each known date string to the year this notebook assigns to it.
# NOTE(review): the targets are not uniform (e.g. "Jun 30, 2019" -> 2018 but
# "May 31, 2019" -> 2019, and "Dec 31, 2019" -> 2020) — confirm this is intended.
contract_date_to_year = {
    "Jun 30, 2019": 2018,
    "Dec 31, 2018": 2018,
    "May 31, 2019": 2019,
    "Jan 31, 2019": 2018,
    "Jun 30, 2020": 2019,
    "Jan 1, 2019": 2018,
    "May 31, 2020": 2020,
    "Jan 12, 2019": 2018,
    "Dec 31, 2019": 2020,
    "Jun 1, 2019": 2018,
}
# Values without an entry in the table are passed through unchanged.
all_data["contract_until"] = all_data["contract_until"].map(
    lambda raw: contract_date_to_year.get(raw, raw)
)

# Cast to int so that "2019" and 2019 end up as the same value.
all_data["contract_until"] = all_data["contract_until"].astype(int)

In [9]:
# Turn the contract end year into the number of years remaining relative to
# 2018. Plain vectorized subtraction gives the same result as the previous
# per-element apply(lambda ...) without a Python-level call per row.
all_data["contract_until"] = all_data["contract_until"] - 2018

In [10]:
# Preview the last three rows to check the converted contract_until values.
all_data.tail(3)

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_skill_moves,value,log_value,all_stat
12735,16932,Y. Góez,18,south america,3,MF,right,1.0,2.0,,,56.5
12736,16937,A. Kaltner,18,europe,2,ST,right,1.0,2.0,,,54.0
12737,16943,K. Fujikawa,19,asia,3,MF,right,1.0,2.0,,,54.0


* 왼발 오른발 변수 제거, value 제거, id, name 제거

In [11]:
# Remove the columns not used as model inputs: the preferred foot, the raw
# value (log_value stays as the target), and the id/name identifiers.
unused_columns = ["prefer_foot", "value", "id", "name"]
all_data = all_data.drop(columns=unused_columns)

In [12]:
# Preview the last three rows to confirm the unused columns are gone.
all_data.tail(3)

Unnamed: 0,age,continent,contract_until,position,reputation,stat_skill_moves,log_value,all_stat
12735,18,south america,3,MF,1.0,2.0,,56.5
12736,18,europe,2,ST,1.0,2.0,,54.0
12737,19,asia,3,MF,1.0,2.0,,54.0


# 전처리 2

In [13]:
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse

In [185]:
# Preview the frame once more before encoding the categorical columns.
all_data.tail(3)

Unnamed: 0,age,continent,contract_until,position,reputation,stat_skill_moves,log_value,all_stat
12735,18,south america,3,MF,1.0,2.0,,56.5
12736,18,europe,2,ST,1.0,2.0,,54.0
12737,19,asia,3,MF,1.0,2.0,,54.0


* 원-핫인코딩

In [180]:
# One-hot encode the two categorical columns; the encoded result is stacked
# with the numeric features in a later cell.
categorical_columns = ["continent", "position"]
categorical_frame = all_data[categorical_columns]
ohe = OneHotEncoder()
encoded_matrix = ohe.fit_transform(categorical_frame)

* 데이터 모으기

In [190]:
# Numeric columns are used as-is: convert them to CSR and append the one-hot
# block column-wise so each row holds numeric + categorical features.
feature = ["age", "contract_until", "reputation", "stat_skill_moves", "all_stat"]
numeric_block = sparse.csr_matrix(all_data[feature])
all_data_sprs = sparse.hstack([numeric_block, encoded_matrix], format="csr")

* 데이터 분할하기

In [202]:
# all_data was built as train rows followed by test rows, so the first
# num_train rows of the feature matrix are the training samples and the rest
# belong to the test set.
num_train = train.shape[0]
train_rows = all_data_sprs[:num_train]
test_rows = all_data_sprs[num_train:]
X = train_rows.toarray()
X_test = test_rows.toarray()
# The target is the log-scaled value column of the training frame.
y = train["log_value"]

In [203]:
# Hold out 20% of the training rows for validation. A fixed random_state keeps
# the split (and therefore the reported scores) reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 베이스라인 평가

In [204]:
from Metrics import regression_scores

## 기본 베이스라인 모델

In [205]:
# Baseline estimators. Each keeps its own module-level name because later
# cells refer to them directly (xgb is used to build the submission).
lr = LinearRegression()
ridge = Ridge(alpha=0.1)
lasso = Lasso(alpha=0.1)
elastic = ElasticNet(alpha=0.1)
bayes_ridge = BayesianRidge()
tree = DecisionTreeRegressor()
forest = RandomForestRegressor()
et = ExtraTreesRegressor()
gb = GradientBoostingRegressor()
xgb = XGBRegressor()

# Explicit name -> estimator table; replaces eval() on the name strings.
models = {
    "lr": lr,
    "ridge": ridge,
    "lasso": lasso,
    "elastic": elastic,
    "bayes_ridge": bayes_ridge,
    "tree": tree,
    "forest": forest,
    "et": et,
    "gb": gb,
    "xgb": xgb,
}
model_lst = list(models)

# Fit every baseline on the training split and print its validation metrics,
# tagged with the model's name.
for model in model_lst:
    estimator = models[model]
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_val)
    score = regression_scores(y_val, y_pred)
    score["name"] = model
    print(score)

{'MSE': 0.152, 'RMSE': 0.39, 'MAE': 0.277, 'NMAE': 0.02, 'MAPE': 2.044, 'R^2': 0.924, 'name': 'lr'}
{'MSE': 0.152, 'RMSE': 0.39, 'MAE': 0.277, 'NMAE': 0.02, 'MAPE': 2.044, 'R^2': 0.924, 'name': 'ridge'}
{'MSE': 0.175, 'RMSE': 0.418, 'MAE': 0.303, 'NMAE': 0.022, 'MAPE': 2.23, 'R^2': 0.913, 'name': 'lasso'}
{'MSE': 0.162, 'RMSE': 0.403, 'MAE': 0.289, 'NMAE': 0.021, 'MAPE': 2.123, 'R^2': 0.919, 'name': 'elastic'}
{'MSE': 0.152, 'RMSE': 0.39, 'MAE': 0.277, 'NMAE': 0.02, 'MAPE': 2.044, 'R^2': 0.924, 'name': 'bayes_ridge'}
{'MSE': 0.042, 'RMSE': 0.206, 'MAE': 0.132, 'NMAE': 0.01, 'MAPE': 0.968, 'R^2': 0.979, 'name': 'tree'}
{'MSE': 0.025, 'RMSE': 0.157, 'MAE': 0.107, 'NMAE': 0.008, 'MAPE': 0.782, 'R^2': 0.988, 'name': 'forest'}
{'MSE': 0.027, 'RMSE': 0.165, 'MAE': 0.111, 'NMAE': 0.008, 'MAPE': 0.813, 'R^2': 0.986, 'name': 'et'}
{'MSE': 0.021, 'RMSE': 0.144, 'MAE': 0.108, 'NMAE': 0.008, 'MAPE': 0.787, 'R^2': 0.99, 'name': 'gb'}
{'MSE': 0.019, 'RMSE': 0.139, 'MAE': 0.096, 'NMAE': 0.007, 'MAPE'

# 예측 및 제출

In [214]:
# Load the submission template and print both shapes so the row counts can be
# compared before filling in predictions.
submission = pd.read_csv("data/submission.csv")
print(submission.shape, X_test.shape)
# xgb was fitted on log_value, so exponentiate its predictions to get back to
# the original value scale.
log_value_pred = xgb.predict(X_test)
submission["value"] = np.exp(log_value_pred)

(3828, 2) (3828, 14)


In [225]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv("submission_1.csv", index=False)

In [224]:
# Read the saved file back to inspect what was written.
pd.read_csv("submission_1.csv")

Unnamed: 0,id,value
0,1,3.230921e+07
1,2,6.027849e+07
2,4,6.356716e+07
3,5,6.652283e+07
4,6,4.972656e+07
...,...,...
3823,16924,5.452637e+04
3824,16929,4.324331e+04
3825,16932,5.702469e+04
3826,16937,6.363220e+04
