# 필요한 라이브러리 import

In [8]:
import pandas as pd
import os
import re
import csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import joblib

# 선수의 포지션을 예측하는 모델 생성

In [3]:
# Load the labelled player data from CSV into a DataFrame.
# Each player in this file has been labelled with one of three position
# groups: attack, midfield or defence.
position_df = pd.read_csv("./datas/new_position.csv", encoding="utf-8")

In [4]:
# Keep only the columns that are needed (player name, the per-player
# statistics, and the position label), then replace every NaN with 0.
position_df = position_df[
    [
        '이름',
        '골', '도움',
        '태클 성공률', '태클',
        '헤더 시도', '헤더 성공%',
        '경기 당 드리블', '드리블',
        '패스 시도', '패스 %',
        '기회 창출/90', 'Int/90', '달린 거리/90분',
        'Cr A', 'Cr C/A',
        '슈팅 수', '유효 슈팅 %',
        '구분',
    ]
].fillna(0)

# Preview the first five rows.
position_df.head(5)

Unnamed: 0,이름,골,도움,태클 성공률,태클,헤더 시도,헤더 성공%,경기 당 드리블,드리블,패스 시도,패스 %,기회 창출/90,Int/90,달린 거리/90분,Cr A,Cr C/A,슈팅 수,유효 슈팅 %,구분
0,Javi Manquillo,0,1,63,1.71,86,59,0.93,32,1006,79,0.06,2.67,7.8,89,34,16,6.0,미드필더
1,Joe Willock,2,5,72,1.4,56,21,0.0,0,997,83,0.48,1.03,7.6,65,26,28,43.0,미드필더
2,Allan Saint-Maximin,7,3,44,0.57,37,19,1.19,38,995,64,0.31,0.91,6.9,115,33,65,29.0,공격수
3,Miguel Almirón,6,4,74,0.57,56,27,0.81,24,979,79,0.27,1.05,8.8,55,29,33,39.0,미드필더
4,Jamal Lewis,0,2,75,1.5,88,59,0.0,0,953,78,0.07,1.41,8.3,71,33,8,0.0,미드필더


In [5]:
# Standardize every feature column, i.e. all columns between the player name
# (first column) and the position label (last column).
sc = StandardScaler()
feature_cols = list(position_df.columns[1:-1])
# Assign by column label so each column is replaced outright by its float
# result. Writing floats through .iloc into the integer-typed stat columns
# relies on silent upcasting, which pandas >= 2.1 deprecates (FutureWarning).
position_df[feature_cols] = sc.fit_transform(position_df[feature_cols])
# NOTE(review): the scaler is fit on the full dataset before any train/test
# split, so held-out rows contribute to the scaling statistics.
position_df.head()

Unnamed: 0,이름,골,도움,태클 성공률,태클,헤더 시도,헤더 성공%,경기 당 드리블,드리블,패스 시도,패스 %,기회 창출/90,Int/90,달린 거리/90분,Cr A,Cr C/A,슈팅 수,유효 슈팅 %,구분
0,Javi Manquillo,-0.798046,-0.612579,-0.342579,0.049987,0.255353,0.279957,1.692981,2.224985,0.470151,-0.504583,-0.921118,0.53369,-0.151591,1.097997,0.761079,-0.469197,-1.548728,미드필더
1,Joe Willock,-0.310596,0.999624,0.465414,-0.390711,-0.408804,-1.504916,-1.000003,-0.847019,0.443762,0.052268,1.918385,-0.99854,-0.381048,0.321786,0.188396,-0.028878,0.676239,미드필더
2,Allan Saint-Maximin,0.90803,0.193523,-2.048342,-1.570646,-0.829436,-1.598857,2.445858,2.800986,0.437897,-2.592774,0.769062,-1.110655,-1.184148,1.938892,0.689494,1.328773,-0.165641,공격수
3,Miguel Almirón,0.664305,0.596573,0.644969,-1.570646,-0.408804,-1.223094,1.345499,1.456984,0.390983,-0.504583,0.498634,-0.979854,0.995695,-0.001635,0.403152,0.154589,0.435702,미드필더
4,Jamal Lewis,-0.798046,-0.209528,0.734746,-0.24855,0.29963,0.279957,-1.000003,-0.847019,0.314747,-0.643796,-0.85351,-0.643511,0.422052,0.515839,0.689494,-0.762743,-1.909533,미드필더


In [10]:
split_count = 5

# Features are the columns between the player name and the label; the target
# is the '구분' position label.
X, Y = position_df.iloc[:, 1:-1], position_df.loc[:, '구분']

# Plain K-fold with default settings (folds are taken in row order).
# NOTE(review): if the CSV rows are grouped (e.g. by team), consider
# shuffle=True or a stratified splitter - confirm against the data.
kf = KFold(n_splits=split_count)

model_list = []
acc_score_list = []

for train_index, test_index in kf.split(X):
    x_train, y_train = X.values[train_index], Y.values[train_index]
    x_test, y_test = X.values[test_index], Y.values[test_index]

    # Fit a random forest classifier on this fold's training rows
    # (a multi-layer perceptron was tried here earlier).
    created_model = RandomForestClassifier(
        n_estimators=100,
        oob_score=True,
        random_state=123,
    )
    created_model.fit(x_train, y_train)

    # Score the model on the held-out rows; keep the accuracy and the model.
    pred = created_model.predict(x_test)
    acc_score_list.append(accuracy_score(y_test, pred))
    model_list.append(created_model)

In [11]:
# Display the fitted models, one per K-fold split.
model_list

[RandomForestClassifier(oob_score=True, random_state=123),
 RandomForestClassifier(oob_score=True, random_state=123),
 RandomForestClassifier(oob_score=True, random_state=123),
 RandomForestClassifier(oob_score=True, random_state=123),
 RandomForestClassifier(oob_score=True, random_state=123)]

In [12]:
# Display the held-out accuracy recorded for each fold's model.
acc_score_list

[0.9107142857142857,
 0.9642857142857143,
 0.9272727272727272,
 0.8909090909090909,
 0.8909090909090909]

In [13]:
# Compare my own position labels with the positions the model predicts.
# Select the features by excluding the non-feature columns by name rather
# than by position: once '예측 포지션' is appended below, iloc[:, 1:-1] would
# pull the '구분' label into the feature matrix and break a re-run of this cell.
feature_df = position_df.drop(columns=['이름', '구분', '예측 포지션'], errors='ignore')
# model_list[1] is the model from the second fold. A plain array is passed
# because the fold models were fit on arrays (X.values), which avoids
# scikit-learn's feature-name mismatch warning.
real_predicted = model_list[1].predict(feature_df.to_numpy())
position_df['예측 포지션'] = real_predicted
# NOTE(review): most rows were part of this model's training folds, so the
# mismatches shown here likely undercount its real error rate.
position_df[position_df['구분']!=position_df['예측 포지션']]

Unnamed: 0,이름,골,도움,태클 성공률,태클,헤더 시도,헤더 성공%,경기 당 드리블,드리블,패스 시도,패스 %,기회 창출/90,Int/90,달린 거리/90분,Cr A,Cr C/A,슈팅 수,유효 슈팅 %,구분,예측 포지션
66,Rodri,-0.554321,-1.01563,1.093854,3.163309,-0.563774,1.360274,-0.536694,-0.655019,-0.922625,1.305182,-0.245046,1.757606,-0.839962,-1.165951,-1.672823,-0.652663,1.39785,미드필더,수비수
82,Daniel James,-0.310596,-0.612579,-0.163025,0.391173,-1.294346,-1.880679,-1.000003,-0.847019,-1.635139,-1.479072,-0.650689,-0.961169,0.766238,-0.422082,0.832664,-0.322424,0.796507,공격수,미드필더


In [14]:
# Save the trained fold models (the whole list) to disk.
model_path = './models/position_model.pkl'
# Create the target directory first; joblib.dump fails if it does not exist.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# NOTE(review): the models were fit on StandardScaler-transformed features,
# but the fitted scaler `sc` is not saved here - confirm it is persisted
# elsewhere, otherwise predictions on new raw data cannot be reproduced.
joblib.dump(model_list, model_path)

['./models/position_model.pkl']