In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import *
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('C:/Users/ss/Desktop/11주_4조/data/FIFA_train.csv', encoding='utf-8')

In [3]:
from sklearn.preprocessing import LabelEncoder
pos = df['position'].values
le = LabelEncoder()
df['position'] = le.fit_transform(pos)
df = df.drop(['contract_until', 'continent', 'prefer_foot'], axis = 1)
df.head()

Unnamed: 0,id,name,age,position,reputation,stat_overall,stat_potential,stat_skill_moves,value
0,0,L. Messi,31,3,5.0,94,94,4.0,110500000.0
1,3,De Gea,27,1,4.0,91,93,1.0,72000000.0
2,7,L. Suárez,31,3,5.0,91,91,3.0,80000000.0
3,8,Sergio Ramos,32,0,4.0,91,91,3.0,51000000.0
4,9,J. Oblak,25,1,3.0,90,93,1.0,68000000.0


In [4]:
# 데이터 분할

X = np.zeros(len(df)*6).reshape(len(df), 6)
# age 부터 skill_moves 까지를 X로 설정
for i,col in enumerate(df.columns[2:-1]) :
  X[:,i] =  df[col].values
y = df['value'].values
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2,
                                                    random_state = 1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7145, 6), (1787, 6), (7145,), (1787,))

In [5]:
# 정규화 -> LGBM 적용 pipe 생성
from lightgbm import LGBMRegressor
from sklearn.pipeline import make_pipeline
from sklearn import set_config
set_config(display = 'diagram')
pipe_lgbm = make_pipeline(StandardScaler(),
                          LGBMRegressor(n_estimate = 400, randomstate=1, metric = 'mse'))
pipe_lgbm.fit(X_train, y_train)
pipe_lgbm



In [6]:
pipe_lgbm.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'lgbmregressor', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'lgbmregressor__boosting_type', 'lgbmregressor__class_weight', 'lgbmregressor__colsample_bytree', 'lgbmregressor__importance_type', 'lgbmregressor__learning_rate', 'lgbmregressor__max_depth', 'lgbmregressor__min_child_samples', 'lgbmregressor__min_child_weight', 'lgbmregressor__min_split_gain', 'lgbmregressor__n_estimators', 'lgbmregressor__n_jobs', 'lgbmregressor__num_leaves', 'lgbmregressor__objective', 'lgbmregressor__random_state', 'lgbmregressor__reg_alpha', 'lgbmregressor__reg_lambda', 'lgbmregressor__silent', 'lgbmregressor__subsample', 'lgbmregressor__subsample_for_bin', 'lgbmregressor__subsample_freq', 'lgbmregressor__n_estimate', 'lgbmregressor__randomstate', 'lgbmregressor__metric'])

In [7]:
# 그리드 서치
from sklearn.model_selection import GridSearchCV
param_grid = { 'lgbmregressor__max_depth':[3,5,8], 
               'lgbmregressor__learning_rate' : [0.1,0.01],
              'lgbmregressor__min_child_samples' : [1,10,20,30],
              'lgbmregressor__min_child_weight' : [1,3,5]}

gs = GridSearchCV(estimator = pipe_lgbm,
                  scoring = 'accuracy',
                  cv = 5,  # 5겹 교차검증
                  param_grid = param_grid,
                  refit = True,  # 훈련후 바로 적용
                  return_train_score = True # 훈련 성능 리턴
                  )
gs.fit(X_train, y_train)
print(f'최적의 하이퍼파라미터 세트:{gs.best_params_}')









최적의 하이퍼파라미터 세트:{'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__min_child_samples': 1, 'lgbmregressor__min_child_weight': 1}


In [8]:
# 데이터 스케일링 및 모델 생성

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
y_train = scaler.fit_transform(y_train.reshape(-1,1)).flatten()
X_test = scaler.fit_transform(X_test)
y_test = scaler.fit_transform(y_test.reshape(-1,1)).flatten()
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

lgbm = LGBMRegressor(max_depth = 3, n_estimate = 400, learning_rate = 0.1,
                     metric = 'mse', randomstate=1)
lgbm.fit(X_train, y_train)

(7145, 6) (1787, 6) (7145,) (1787,)


In [9]:
# 모델 성능 평가 함수 생성

def regression_scores(y_true, y_pred) :
  import numpy as np
  from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
  
  MSE = mean_squared_error(y_true, y_pred)
  RMSE = np.sqrt(mean_squared_error(y_true,y_pred))
  MAE = np.mean( np.abs((y_true - y_pred) / y_true) )
  NMAE =mean_absolute_error(y_true, y_pred)/ np.mean( np.abs(y_true) )
  MAPE = np.mean( np.abs((y_true - y_pred) / y_true) ) *100
  R2 = r2_score(y_true, y_pred)
  return {'MSE' : np.round(MSE, 3), 
          'RMSE' : np.round(RMSE, 3),
          'MAE' : np.round(MAE, 3),
          'NMAE' : np.round(NMAE, 3),
          'MAPE' : np.round(MAPE, 3),
          'R2' : np.round(R2, 3)}

In [10]:
# 모델 성능 평가
y_pred = lgbm.predict(X_test)
regression_scores(y_test, y_pred)

{'MSE': 0.01,
 'RMSE': 0.101,
 'MAE': 0.201,
 'NMAE': 0.076,
 'MAPE': 20.127,
 'R2': 0.99}

In [12]:
gs.best_estimator_

In [13]:
test = pd.read_csv('C:/Users/ss/Desktop/11주_4조/data/FIFA_test.csv', encoding='utf-8')

In [14]:
from sklearn.preprocessing import LabelEncoder
pos = test['position'].values
le = LabelEncoder()
test['position'] = le.fit_transform(pos)
test = test.drop(['contract_until', 'continent', 'prefer_foot'], axis = 1)
test.head()

Unnamed: 0,id,name,age,position,reputation,stat_overall,stat_potential,stat_skill_moves
0,1,Cristiano Ronaldo,33,3,5,94,94,5
1,2,Neymar Jr,26,3,5,92,93,5
2,4,K. De Bruyne,27,2,4,91,92,4
3,5,E. Hazard,27,3,4,91,91,4
4,6,L. Modrić,32,2,4,91,91,4


In [15]:
X = np.zeros(len(test)*6).reshape(len(test), 6)
# age 부터 skill_moves 까지를 X로 설정
for i,col in enumerate(test.columns[2:]) :
  X[:,i] =  test[col].values

X.shape

(3828, 6)

In [16]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
y_pred = lgbm.predict(X)
y_pred

array([10.26472894, 11.99773076, 11.36174772, ..., -0.44893689,
       -0.4443672 , -0.44236527])

In [20]:
sub['value'] = y_pred
sub.to_csv('LGBM.csv', index = False)

In [19]:
sub = pd.read_csv('C:/Users/ss/Desktop/11주_4조/data/submission.csv', encoding='utf-8')