In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

import warnings
warnings.filterwarnings(action='ignore')
%matplotlib inline

# Scikit-Learn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error as MSE

# Ensemble Learning
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [6]:
# Source directory (absolute path on the author's machine):
# C:/Users/MILAB/Desktop/자료/2022년 2학기/데이터마이닝/과제4
# NOTE(review): hard-coded local path — the notebook will not run elsewhere as-is.

# Load the FIFA training CSV and preview its first rows.
fifa_data = pd.read_csv('C:/Users/MILAB/Desktop/자료/2022년 2학기/데이터마이닝/과제4/FIFA_train.csv')
fifa_data.head()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value
0,0,L. Messi,31,south america,2021,ST,left,5.0,94,94,4.0,110500000.0
1,3,De Gea,27,europe,2020,GK,right,4.0,91,93,1.0,72000000.0
2,7,L. Suárez,31,south america,2021,ST,right,5.0,91,91,3.0,80000000.0
3,8,Sergio Ramos,32,europe,2020,DF,right,4.0,91,91,3.0,51000000.0
4,9,J. Oblak,25,europe,2021,GK,right,3.0,90,93,1.0,68000000.0


In [7]:
# Summarize the frame: column names, dtypes, and non-null counts.
fifa_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8932 entries, 0 to 8931
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                8932 non-null   int64  
 1   name              8932 non-null   object 
 2   age               8932 non-null   int64  
 3   continent         8932 non-null   object 
 4   contract_until    8932 non-null   object 
 5   position          8932 non-null   object 
 6   prefer_foot       8932 non-null   object 
 7   reputation        8932 non-null   float64
 8   stat_overall      8932 non-null   int64  
 9   stat_potential    8932 non-null   int64  
 10  stat_skill_moves  8932 non-null   float64
 11  value             8932 non-null   float64
dtypes: float64(3), int64(4), object(5)
memory usage: 837.5+ KB


## 1. 데이터 전처리

### 1.1. Continent를 범주화
- 1 : 유럽
- 2 : 남미
- 3 : 아시아
- 4 : 아프리카
- 5 : 오세아니아

In [8]:
def continent_num(x):
    """Translate a continent name into its integer category code.

    Codes: 'europe' -> 1, 'south america' -> 2, 'asia' -> 3,
    'africa' -> 4, 'oceania' -> 5. Matching is exact and case-sensitive.

    Returns:
        The integer code, or None when ``x`` is not one of the five names.
    """
    codes = {
        'europe': 1,
        'south america': 2,
        'asia': 3,
        'africa': 4,
        'oceania': 5,
    }
    # dict.get yields None for anything unmapped, like the original fall-through.
    return codes.get(x)

In [9]:
# Encode `continent` as an int64 category code and replace the text column with it.
fifa_data = (
    fifa_data
    .assign(continent_num=fifa_data['continent'].apply(continent_num).astype('int64'))
    .drop(columns=['continent'])
)

In [10]:
# Preview: `continent` should now be replaced by `continent_num`.
fifa_data.head()

Unnamed: 0,id,name,age,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value,continent_num
0,0,L. Messi,31,2021,ST,left,5.0,94,94,4.0,110500000.0,2
1,3,De Gea,27,2020,GK,right,4.0,91,93,1.0,72000000.0,1
2,7,L. Suárez,31,2021,ST,right,5.0,91,91,3.0,80000000.0,2
3,8,Sergio Ramos,32,2020,DF,right,4.0,91,91,3.0,51000000.0,1
4,9,J. Oblak,25,2021,GK,right,3.0,90,93,1.0,68000000.0,1


### 1.2. Contract_until을 수정 
- 본 데이터는 2018년 기준이기 때문에 2018년부터 남은 기간을 채택

In [11]:
def contract_calculate(x):
    """Extract the contract expiry year from a ``contract_until`` value as an int.

    The original returned an ``int`` for long strings but the unchanged ``str``
    for short ones, and raised ``TypeError`` (from ``len``) when the value was
    already numeric. This version always returns an ``int``.

    Args:
        x: Either a string whose last four characters are the year (a bare
            year such as '2021', or a longer value assumed to end in a
            four-digit year, e.g. a full date — confirm against the data),
            or a value that is already numeric.

    Returns:
        int: The expiry year.

    Raises:
        ValueError: If the trailing characters cannot be parsed as an integer.
    """
    if not isinstance(x, str):
        # Already numeric (e.g. the column was parsed as integers).
        return int(x)
    # Strip first so trailing whitespace cannot shift the four-character slice.
    return int(x.strip()[-4:])

In [12]:
# Remaining contract length in years, measured from 2018 (the reference year of
# this dataset); the raw `contract_until` column is dropped afterwards.
fifa_data = (
    fifa_data
    .assign(
        contract_period=fifa_data['contract_until']
        .apply(contract_calculate)
        .astype('int64') - 2018
    )
    .drop(columns=['contract_until'])
)

In [13]:
# Preview: `contract_until` should now be replaced by `contract_period`.
fifa_data.head()

Unnamed: 0,id,name,age,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value,continent_num,contract_period
0,0,L. Messi,31,ST,left,5.0,94,94,4.0,110500000.0,2,3
1,3,De Gea,27,GK,right,4.0,91,93,1.0,72000000.0,1,2
2,7,L. Suárez,31,ST,right,5.0,91,91,3.0,80000000.0,2,3
3,8,Sergio Ramos,32,DF,right,4.0,91,91,3.0,51000000.0,1,2
4,9,J. Oblak,25,GK,right,3.0,90,93,1.0,68000000.0,1,3


### 1.3. Position을 범주화
- 1 : 골키퍼
- 2 : 수비수
- 3 : 미드필더
- 4 : 공격수

In [14]:
def position_num(x):
    """Translate a position label into its integer category code.

    Codes: 'GK' -> 1, 'DF' -> 2, 'MF' -> 3, 'ST' -> 4. Matching is exact
    and case-sensitive.

    Returns:
        The integer code, or None when ``x`` is not one of the four labels.
    """
    codes = {'GK': 1, 'DF': 2, 'MF': 3, 'ST': 4}
    # Unmapped labels fall through to None, like the original if/elif chain.
    return codes.get(x)

In [15]:
# Encode `position` as an int64 category code and replace the text column with it.
fifa_data = (
    fifa_data
    .assign(position_num=fifa_data['position'].apply(position_num).astype('int64'))
    .drop(columns=['position'])
)

In [16]:
# Preview: `position` should now be replaced by `position_num`.
fifa_data.head()

Unnamed: 0,id,name,age,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value,continent_num,contract_period,position_num
0,0,L. Messi,31,left,5.0,94,94,4.0,110500000.0,2,3,4
1,3,De Gea,27,right,4.0,91,93,1.0,72000000.0,1,2,1
2,7,L. Suárez,31,right,5.0,91,91,3.0,80000000.0,2,3,4
3,8,Sergio Ramos,32,right,4.0,91,91,3.0,51000000.0,1,2,2
4,9,J. Oblak,25,right,3.0,90,93,1.0,68000000.0,1,3,1


### 1.4. 선호발(prefer_foot)을 이진 값(0/1)으로 인코딩
- 0 : 왼발 (left)
- 1 : 오른발 (right)

In [17]:
def prefer_foot_num(x):
    """Translate the preferred foot into a binary code.

    Codes: 'left' -> 0, 'right' -> 1. Matching is exact and case-sensitive.

    Returns:
        0 or 1, or None when ``x`` is neither 'left' nor 'right'.
    """
    codes = {'left': 0, 'right': 1}
    # Unmapped values fall through to None, like the original if/elif chain.
    return codes.get(x)

In [18]:
# Encode `prefer_foot` as an int64 0/1 flag and replace the text column with it.
fifa_data = (
    fifa_data
    .assign(prefer_foot_num=fifa_data['prefer_foot'].apply(prefer_foot_num).astype('int64'))
    .drop(columns=['prefer_foot'])
)

In [19]:
# Preview: `prefer_foot` should now be replaced by `prefer_foot_num`.
fifa_data.head()

Unnamed: 0,id,name,age,reputation,stat_overall,stat_potential,stat_skill_moves,value,continent_num,contract_period,position_num,prefer_foot_num
0,0,L. Messi,31,5.0,94,94,4.0,110500000.0,2,3,4,0
1,3,De Gea,27,4.0,91,93,1.0,72000000.0,1,2,1,1
2,7,L. Suárez,31,5.0,91,91,3.0,80000000.0,2,3,4,1
3,8,Sergio Ramos,32,4.0,91,91,3.0,51000000.0,1,2,2,1
4,9,J. Oblak,25,3.0,90,93,1.0,68000000.0,1,3,1,1


### 1.5. 데이터 분석에서 불필요한 Column들 제거
- id, name 제거

In [20]:
# Remove the `id` and `name` columns, which are not needed for the analysis.
fifa_data = fifa_data.drop(columns=['id', 'name'])

In [21]:
# Preview: `id` and `name` should no longer be present.
fifa_data.head()

Unnamed: 0,age,reputation,stat_overall,stat_potential,stat_skill_moves,value,continent_num,contract_period,position_num,prefer_foot_num
0,31,5.0,94,94,4.0,110500000.0,2,3,4,0
1,27,4.0,91,93,1.0,72000000.0,1,2,1,1
2,31,5.0,91,91,3.0,80000000.0,2,3,4,1
3,32,4.0,91,91,3.0,51000000.0,1,2,2,1
4,25,3.0,90,93,1.0,68000000.0,1,3,1,1


### 1.6. Column 순서 변경
- 데이터 분석의 편의성을 위함

In [22]:
# Reorder the columns; `value` is placed last.
fifa_data = fifa_data.loc[:, [
    'age',
    'continent_num',
    'contract_period',
    'position_num',
    'prefer_foot_num',
    'reputation',
    'stat_overall',
    'stat_potential',
    'stat_skill_moves',
    'value',
]]

In [23]:
# Preview the frame with its new column order.
fifa_data.head()

Unnamed: 0,age,continent_num,contract_period,position_num,prefer_foot_num,reputation,stat_overall,stat_potential,stat_skill_moves,value
0,31,2,3,4,0,5.0,94,94,4.0,110500000.0
1,27,1,2,1,1,4.0,91,93,1.0,72000000.0
2,31,2,3,4,1,5.0,91,91,3.0,80000000.0
3,32,1,2,2,1,4.0,91,91,3.0,51000000.0
4,25,1,3,1,1,3.0,90,93,1.0,68000000.0


## 전처리한 데이터를 csv로 추출

In [24]:
# Persist the preprocessed frame as CSV.
# NOTE(review): `index` is left at its default, so the row index is written as an
# extra leading column; readers must account for it (e.g. `index_col=0`). Confirm
# this is intended; otherwise pass `index=False`.
# NOTE(review): hard-coded absolute local path — not portable to other machines.
fifa_data.to_csv('C:/Users/MILAB/Desktop/자료/2022년 2학기/데이터마이닝/과제4/fifa_data_preprocessing.csv')