# 필요한 라이브러리 import

In [3]:
import pandas as pd
import os
import re
import csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# 선수의 포지션을 예측하는 모델 생성

In [5]:
# Load the labelled player data from CSV into a DataFrame.
# In this dataset each player's position has been labelled as one of
# three classes: forward, midfielder or defender.
position_csv_path = './datas/new_position.csv'
position_df = pd.read_csv(position_csv_path, encoding='utf-8')

In [8]:
# Keep only the columns that are needed: the player name, the per-player
# statistics, and the position label ('구분') as the last column.
stat_columns = [
    '골', '도움', '태클 성공률', '태클', '헤더 시도', '헤더 성공%',
    '경기 당 드리블', '드리블', '패스 시도', '패스 %', '기회 창출/90',
    'Int/90', '달린 거리/90분', 'Cr A', 'Cr C/A', '슈팅 수', '유효 슈팅 %',
]
position_df = position_df[['이름'] + stat_columns + ['구분']]

# Replace every NaN in the frame with 0.
position_df = position_df.fillna(0)

# Preview the first five rows.
position_df.head(5)

Unnamed: 0,이름,골,도움,태클 성공률,태클,헤더 시도,헤더 성공%,경기 당 드리블,드리블,패스 시도,패스 %,기회 창출/90,Int/90,달린 거리/90분,Cr A,Cr C/A,슈팅 수,유효 슈팅 %,구분
0,Javi Manquillo,0,1,63,1.71,86,59,0.93,32,1006,79,0.06,2.67,7.8,89,34,16,6.0,미드필더
1,Joe Willock,2,5,72,1.4,56,21,0.0,0,997,83,0.48,1.03,7.6,65,26,28,43.0,미드필더
2,Allan Saint-Maximin,7,3,44,0.57,37,19,1.19,38,995,64,0.31,0.91,6.9,115,33,65,29.0,공격수
3,Miguel Almirón,6,4,74,0.57,56,27,0.81,24,979,79,0.27,1.05,8.8,55,29,33,39.0,미드필더
4,Jamal Lewis,0,2,75,1.5,88,59,0.0,0,953,78,0.07,1.41,8.3,71,33,8,0.0,미드필더


In [9]:
# Train / test split.
# X holds the statistic columns (everything between the first column and
# the last one); y holds the labelled position from the '구분' column.
stat_features = position_df.iloc[:, 1:-1]
position_labels = position_df.loc[:, '구분']

# Hold out 30% of the rows for testing; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    stat_features,
    position_labels,
    test_size=0.3,
    random_state=123,
)

In [10]:
# Build the random-forest classifier and train it on the training split
# (fit() returns the estimator itself, so the two steps can be chained).
rf = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    random_state=123,
).fit(X_train, y_train)

# Predict positions for the held-out test rows.
predicted = rf.predict(X_test)

# Fraction of test rows whose predicted position matches the label.
accuracy = accuracy_score(y_true=y_test, y_pred=predicted)

# Roughly 90% accuracy on the test split.
accuracy

0.9047619047619048

In [11]:
# Compare the hand-assigned position labels with the model's predictions
# for every player in the dataset.
#
# The feature columns are selected by the names used for training rather
# than by position: this cell appends '예측 포지션' to position_df, so on a
# second run iloc[:, 1:-1] would include the '구분' label column as a
# feature and predict() would receive the wrong input.
#
# NOTE: position_df also contains the rows the model was trained on, so the
# mismatches listed here understate the error on unseen players.
real_predicted = rf.predict(position_df[X_train.columns])
position_df['예측 포지션'] = real_predicted

# Show only the players whose label and predicted position disagree.
position_df[position_df['구분'] != position_df['예측 포지션']]

Unnamed: 0,이름,골,도움,태클 성공률,태클,헤더 시도,헤더 성공%,경기 당 드리블,드리블,패스 시도,패스 %,기회 창출/90,Int/90,달린 거리/90분,Cr A,Cr C/A,슈팅 수,유효 슈팅 %,구분,예측 포지션
36,Fabinho,0,2,87,2.77,111,79,0.12,3,1105,93,0.19,4.79,7.8,33,15,14,14.0,미드필더,수비수
82,Daniel James,2,1,65,1.95,16,13,0.0,0,288,72,0.1,1.07,8.6,42,35,20,45.0,공격수,미드필더
161,Trezeguet,5,3,53,0.86,46,28,0.9,21,843,77,0.21,1.07,7.5,79,35,31,35.0,공격수,미드필더
179,Jean-Philippe Gbamin,0,0,78,2.75,47,77,0.0,0,334,89,0.2,1.48,8.5,20,25,5,40.0,미드필더,수비수
235,Harry Kane,10,5,63,0.67,73,64,0.76,17,1155,85,0.8,2.23,8.7,80,37,72,63.0,공격수,미드필더
245,Lucas Moura,6,2,60,1.52,44,27,0.49,8,563,74,0.24,1.28,9.1,43,34,52,37.0,공격수,미드필더
249,Gareth Bale,1,1,59,1.63,46,39,0.09,1,441,72,0.0,1.11,6.2,53,37,23,35.0,공격수,미드필더
260,Joel Campbell,3,2,53,0.76,28,29,0.46,6,442,75,0.23,0.61,6.9,45,35,20,40.0,공격수,미드필더


In [12]:
# Save the trained prediction model to disk.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (no-op when it is already there).
os.makedirs('./models', exist_ok=True)
joblib.dump(rf,'./models/position_model.pkl')

['./models/position_model.pkl']