In [30]:
### 데이터 불러오기
import pandas as pd

X_train = pd.read_csv('./data/연습문제/FIFA_X_train.csv', encoding = 'cp949')
X_test = pd.read_csv('./data/연습문제/FIFA_X_test.csv', encoding = 'cp949')
y_train = pd.read_csv('./data/연습문제/FIFA_y_train.csv', encoding = 'cp949')

### EDA

### 데이터 전처리
# 1) 불필요 변수 제거
X_train = X_train.drop('ID', axis = 1)
y_train = y_train.drop('ID', axis = 1)


test_id = X_test.pop('ID')

# 2) 결측치 처리
    # 'Position_Class' <- 'Position'에서 분류가 덜 됨.
X_train['Position_Class'] = X_train['Position_Class'].fillna('unknown')
pd.crosstab(index = X_train['Position'], columns = X_train['Position_Class'])

train_pos = X_train['Position'].copy()
train_cls = X_train['Position_Class'].copy()
test_pos = X_test['Position'].copy()
test_cls = X_test['Position_Class'].copy()

pos = ['CM','GK','LF','RDM','RWB']
pos_cls = ['Midfielder','GoalKeeper','Forward','Defender','Defender']

for p,c in zip(pos, pos_cls) :
    train_cls[train_pos == p] = c
    test_cls[test_pos == p] = c
    
X_train['Position_Class'] = train_cls
X_test['Position_Class'] = test_cls

X_train = X_train.drop('Position', axis = 1)
X_test = X_test.drop('Position', axis = 1)

    # 'Height_cm' <- 'Height'에서 계산이 덜 됨.
Height_train = X_train['Height'].copy()
Height_cm_train = X_train['Height_cm'].copy()
Height_test = X_test['Height'].copy()
Height_cm_test = X_test['Height_cm'].copy()

split_str_train = Height_train.str.split("'", expand = True).astype('float64')
split_str_test = Height_test.str.split("'", expand = True).astype('float64')

Height_cm_train = Height_train.fillna(split_str_train[0]*30 + split_str_train[1]*2.5)
Height_cm_test = Height_test.fillna(split_str_test[0]*30 + split_str_test[1]*2.5)

X_train['Height_cm'] = Height_cm_train
X_test['Height_cm'] = Height_cm_test

X_train = X_train.drop('Height', axis = 1)
X_test = X_test.drop('Height', axis = 1)

    # 'Weight_lb'는 train에서만 2% 결측이므로 결측행 제거
cond_na = X_train['Weight_lb'].isna()

y_train = y_train[~ cond_na]
X_train = X_train[~ cond_na]

# 3) 범주형 데이터 처리
    # 'Age' 일부 마스킹 되어 있음.
X_train['Age_gp'] = X_train['Age'].str[0]
X_test['Age_gp'] = X_test['Age'].str[0]

X_train = X_train.drop('Age', axis = 1)
X_test = X_test.drop('Age', axis = 1)

    # 'Club' 불필요해 보임. 제거
X_train = X_train.drop(columns = 'Club')
X_test = X_test.drop(columns = 'Club')

    # 'Work_Rate'
work_rate_train = X_train['Work_Rate'].str.split("/ ", expand = True)
work_rate_test = X_test['Work_Rate'].str.split("/ ", expand = True)

work_rate_train = work_rate_train.replace(" ", "")
work_rate_test = work_rate_test.replace(" ", "")

X_train['WR_Attack'] = work_rate_train[0]
X_train['WR_Defend'] = work_rate_train[1]
X_test['WR_Attack'] = work_rate_test[0]
X_test['WR_Defend'] = work_rate_test[0]

X_train = X_train.drop(columns = 'Work_Rate')
X_test = X_test.drop(columns = 'Work_Rate')

# 4) 수치형 데이터 처리
    # 'Jersey_Number' 등번호는 불필요해 보임. 제거
X_train = X_train.drop('Jersey_Number', axis = 1)
X_test = X_test.drop('Jersey_Number', axis = 1)

    # 'Contract_Valid_Until' 범주형 변수로 취급
# print(X_train['Contract_Valid_Until'].sort_values().unique())
# print(X_test['Contract_Valid_Until'].sort_values().unique())  # 2026이 없음

X_train['CVU_gp'] = X_train['Contract_Valid_Until'].astype('object')
X_test['CVU_gp'] = X_test['Contract_Valid_Until'].astype('object')

X_train = X_train.drop('Contract_Valid_Until', axis = 1)
X_test = X_test.drop('Contract_Valid_Until', axis = 1)

column_conti = ['Overall','Height_cm','Weight_lb','Release_Clause','Wage']
X_train[column_conti].corr()

    # 'Release_Clause', 'Wage' 연관성이 높으므로 둘 중 하나를 제거
X_train = X_train.drop('Release_Clause', axis = 1)
X_test = X_test.drop('Release_Clause', axis = 1)

# 5) 데이터 분할
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size = 0.3,
                                                  random_state = 2022)

# 6) 인코딩
from sklearn.preprocessing import OneHotEncoder

X_train_obj = X_train.select_dtypes('object').copy()
X_val_obj = X_val.select_dtypes('object').copy()
X_test_obj = X_test.select_dtypes('object').copy()

ohe = OneHotEncoder(sparse = False, handle_unknown = 'ignore').fit(X_train_obj)

X_train_oh = ohe.transform(X_train_obj)
X_val_oh = ohe.transform(X_val_obj)
X_test_oh = ohe.transform(X_test_obj)

# 7) 스케일링
from sklearn.preprocessing import StandardScaler

X_train_num = X_train.select_dtypes(exclude = 'object').copy()
X_val_num = X_val.select_dtypes(exclude = 'object').copy()
X_test_num = X_test.select_dtypes(exclude = 'object').copy()

scale = StandardScaler().fit(X_train_num)

X_train_scale = scale.transform(X_train_num)
X_val_scale = scale.transform(X_val_num)
X_test_scale = scale.transform(X_test_num)

# 8) 데이터셋 준비
import numpy as np

X_train = np.concatenate([X_train_oh, X_train_scale], axis = 1)
X_val = np.concatenate([X_val_oh, X_val_scale], axis = 1)
X_test = np.concatenate([X_test_oh, X_test_scale], axis = 1)

y_train = y_train.values.ravel()
y_val = y_val.values.ravel()

### 모델 학습 및 예측
# 학습
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor

    # RandomForest
rf = RandomForestRegressor(n_estimators = 500,
                           max_depth = 3,
                           min_samples_leaf = 10,
                           max_features = 50,
                           random_state = 2022)
model_rf = rf.fit(X_train, y_train)

    # Bagging
dt = DecisionTreeRegressor(max_depth = 3,
                           min_samples_leaf = 10)
bag = BaggingRegressor(base_estimator = dt,
                       n_estimators = 500,
                       random_state = 2022)
model_bag = bag.fit(X_train, y_train)

    # AdaBoost
dt = DecisionTreeRegressor(max_depth = 3,
                           min_samples_leaf = 10)
ada = AdaBoostRegressor(base_estimator = dt,
                        n_estimators = 500,
                        learning_rate = 0.5,
                        random_state = 2022)
model_ada = ada.fit(X_train, y_train)


# 검증
from sklearn.metrics import mean_squared_error

pred_rf = model_rf.predict(X_val)
pred_bag = model_bag.predict(X_val)
pred_ada = model_ada.predict(X_val)

rmse_rf = mean_squared_error(y_val, pred_rf, squared = False)
rmse_bag = mean_squared_error(y_val, pred_bag, squared = False)
rmse_ada = mean_squared_error(y_val, pred_ada, squared = False)

print(rmse_rf)
print(rmse_bag)
print(rmse_ada)

# 예측
y_pred = model_bag.predict(X_test)

obj = {'ID':test_id,
       'Value':y_pred}
result = pd.DataFrame(obj)

### 결과제출
result.to_csv('FIFA.csv', index = False)

2312.0464561397657
1845.808246966615
2749.9705313226004
