In [137]:
import pandas as pd
import numpy as np
from sklearn import preprocessing,svm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,accuracy_score, confusion_matrix,classification_report


In [138]:
#Read Data CSV
dataset = pd.read_csv('fifa20.csv')
dataset.head(10)

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club,...,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona,...,68+2,66+2,66+2,66+2,68+2,63+2,52+2,52+2,52+2,63+2
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus,...,65+3,61+3,61+3,61+3,65+3,61+3,53+3,53+3,53+3,61+3
2,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain,...,66+3,61+3,61+3,61+3,66+3,61+3,46+3,46+3,46+3,61+3
3,200389,https://sofifa.com/player/200389/jan-oblak/20/...,J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid,...,,,,,,,,,,
4,183277,https://sofifa.com/player/183277/eden-hazard/2...,E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid,...,66+3,63+3,63+3,63+3,66+3,61+3,49+3,49+3,49+3,61+3
5,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,28,1991-06-28,181,70,Belgium,Manchester City,...,77+3,77+3,77+3,77+3,77+3,73+3,66+3,66+3,66+3,73+3
6,192448,https://sofifa.com/player/192448/marc-andre-te...,M. ter Stegen,Marc-André ter Stegen,27,1992-04-30,187,85,Germany,FC Barcelona,...,,,,,,,,,,
7,203376,https://sofifa.com/player/203376/virgil-van-di...,V. van Dijk,Virgil van Dijk,27,1991-07-08,193,92,Netherlands,Liverpool,...,79+3,83+3,83+3,83+3,79+3,81+3,87+3,87+3,87+3,81+3
8,177003,https://sofifa.com/player/177003/luka-modric/2...,L. Modrić,Luka Modrić,33,1985-09-09,172,66,Croatia,Real Madrid,...,81+3,81+3,81+3,81+3,81+3,79+3,72+3,72+3,72+3,79+3
9,209331,https://sofifa.com/player/209331/mohamed-salah...,M. Salah,Mohamed Salah Ghaly,27,1992-06-15,175,71,Egypt,Liverpool,...,70+3,67+3,67+3,67+3,70+3,66+3,57+3,57+3,57+3,66+3


# Data Preparation

In [139]:
#Drop Unused Columns
d = dataset.drop(['player_url', 'long_name'],axis=1)
d = d.drop(d.loc[:, 'dob':'weak_foot'].columns, axis=1)
d = d.drop(d.loc[:, 'work_rate':'passing'].columns, axis=1)
d = d.drop(d.loc[:, 'defending':'player_traits'].columns, axis=1)
d = d.drop(d.loc[:, 'power_shot_power':'power_long_shots'].columns, axis=1)
data = d.drop(d.loc[:, 'defending_marking':].columns, axis=1)


In [140]:
data

Unnamed: 0,sofifa_id,short_name,age,skill_moves,dribbling,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,...,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure
0,158023,L. Messi,32,4,96.0,88,95,70,92,88,...,84,93,95,95,48,40,94,94,75,96
1,20801,Cristiano Ronaldo,34,5,89.0,84,94,89,83,87,...,91,87,96,71,63,29,95,82,85,95
2,190871,Neymar Jr,27,5,95.0,87,87,62,87,87,...,89,96,92,84,51,36,87,90,90,94
3,200389,J. Oblak,26,1,,13,11,15,43,13,...,60,67,88,49,34,19,11,65,11,68
4,183277,E. Hazard,28,4,94.0,81,84,61,89,83,...,88,95,90,94,54,41,87,89,88,91
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18273,245006,Shao Shuai,22,2,33.0,21,17,40,35,27,...,58,45,40,73,46,48,28,25,39,41
18274,250995,Xiao Mingjie,22,2,35.0,24,20,42,43,28,...,61,43,41,76,42,49,23,25,37,35
18275,252332,Zhang Wei,19,2,45.0,32,33,49,53,32,...,54,52,52,57,50,45,38,38,36,39
18276,251110,Wang Haijian,18,2,47.0,39,34,47,54,28,...,63,55,54,59,57,49,31,48,36,40


In [141]:
#Menggabungkan kolom-kolom tertentu untuk meminimalisasikan dimensi data

#Concatenate movement columns
x = (data['movement_acceleration'] + data['movement_sprint_speed'] + data['movement_agility'] + data['movement_reactions'] + data['movement_balance'])/5
data['movement'] = x

#Concatenate attacking columns
x = (data['attacking_crossing'] + data['attacking_finishing'] + data['attacking_heading_accuracy'] + data['attacking_short_passing'] + data['attacking_volleys'])/5
data['attacking'] = x

#Concatenate skill columns
x = (data['skill_dribbling'] + data['skill_curve'] + data['skill_fk_accuracy'] + data['skill_long_passing'] + data['skill_ball_control'])/5
data['skill'] = x

#Concatenate mentality columns
x = (data['mentality_aggression'] + data['mentality_interceptions'] + data['mentality_positioning'] + data['mentality_vision'] + data['mentality_penalties'] + data['mentality_composure'])/6
data['mentality'] = x



In [142]:
#Drop unused columns
data = data.drop(data.loc[:, 'attacking_crossing':'mentality_composure'].columns, axis=1)
data.head(10)

Unnamed: 0,sofifa_id,short_name,age,skill_moves,dribbling,movement,attacking,skill,mentality
0,158023,L. Messi,32,4,96.0,91.6,86.6,94.4,74.5
1,20801,Cristiano Ronaldo,34,5,89.0,86.8,87.4,83.0,74.833333
2,190871,Neymar Jr,27,5,95.0,91.0,82.0,89.4,74.666667
3,200389,J. Oblak,26,1,,61.4,19.0,21.8,34.666667
4,183277,E. Hazard,28,4,94.0,92.2,79.6,86.8,75.0
5,192985,K. De Bruyne,28,4,86.0,79.6,80.8,87.2,81.5
6,192448,M. ter Stegen,27,1,,50.8,23.6,28.8,40.166667
7,203376,V. van Dijk,27,2,71.0,71.0,62.8,71.4,72.333333
8,177003,L. Modrić,33,4,89.0,84.4,76.2,86.0,81.333333
9,209331,M. Salah,27,4,89.0,91.4,78.2,81.0,77.0


In [143]:
#Check correlation between data 
data.corr()

Unnamed: 0,sofifa_id,age,skill_moves,dribbling,movement,attacking,skill,mentality
sofifa_id,1.0,-0.73668,-0.079645,-0.222207,-0.006552,-0.155565,-0.156383,-0.263559
age,-0.73668,1.0,0.045277,0.161913,-0.032349,0.147495,0.132553,0.264678
skill_moves,-0.079645,0.045277,1.0,0.714841,0.72092,0.800612,0.819368,0.714946
dribbling,-0.222207,0.161913,0.714841,1.0,0.755849,0.818624,0.891303,0.656674
movement,-0.006552,-0.032349,0.72092,0.755849,1.0,0.737778,0.766024,0.662269
attacking,-0.155565,0.147495,0.800612,0.818624,0.737778,1.0,0.935812,0.900419
skill,-0.156383,0.132553,0.819368,0.891303,0.766024,0.935812,1.0,0.896398
mentality,-0.263559,0.264678,0.714946,0.656674,0.662269,0.900419,0.896398,1.0


In [144]:
#Check null data or missing values in columns
data.isnull().sum()

sofifa_id         0
short_name        0
age               0
skill_moves       0
dribbling      2036
movement          0
attacking         0
skill             0
mentality         0
dtype: int64

In [145]:
#Filling missing value in column dribbling by mean of existing value
meanDrib = data['dribbling'].mean(skipna=True) 
data['dribbling'] = data['dribbling'].fillna(meanDrib)

In [146]:
#Ubah jadi 1 angka dibelakang koma
data['mentality'] = data['mentality'].round(1)
data['dribbling'] = data['dribbling'].round(1)

#Recheck null data or missing values in columns
data.isnull().sum()

sofifa_id      0
short_name     0
age            0
skill_moves    0
dribbling      0
movement       0
attacking      0
skill          0
mentality      0
dtype: int64

In [147]:
#Labelling Data

#function for define label
def label(x):
    if x == 5:
        y = 'Very High'
    elif x == 4:
        y = 'High'
    elif x == 3:
        y = 'Avarage'
    elif x == 2:
        y = 'Low'
    elif x == 1:
        y = 'very Low'
    return y
data
# #Insert column label to data
arr = pd.DataFrame(data['skill_moves']).values.tolist()
for i in range(len(data)):
     data.loc[i, 'label'] = label(arr[i][0])


In [148]:
#Filter Data untuk pemain-pemain yang berumur dan diatas 30 tahun
data = data[data.age >= 30]
#reset index data
data = data.reset_index(drop=True)

In [149]:
data

Unnamed: 0,sofifa_id,short_name,age,skill_moves,dribbling,movement,attacking,skill,mentality,label
0,158023,L. Messi,32,4,96.0,91.6,86.6,94.4,74.5,High
1,20801,Cristiano Ronaldo,34,5,89.0,86.8,87.4,83.0,74.8,Very High
2,177003,L. Modrić,33,4,89.0,84.4,76.2,86.0,81.3,High
3,138956,G. Chiellini,34,2,60.0,66.0,56.0,55.2,65.2,Low
4,153079,S. Agüero,31,4,88.0,85.4,81.8,79.4,73.0,High
...,...,...,...,...,...,...,...,...,...,...
3661,224695,Ge Zhen,32,2,51.0,59.0,35.0,35.0,39.7,Low
3662,11430,J. McCombe,36,2,23.0,29.4,32.0,26.0,35.8,Low
3663,245558,G. Maley,36,1,62.5,33.6,12.4,16.4,18.5,very Low
3664,252355,R. Hernández,30,2,48.0,52.6,40.8,47.8,49.3,Low


# Data Exploration
## Using Standard Scaler for scalling method

In [150]:
#create new dataframe and convert it to array numpy
norm = pd.DataFrame(data[['dribbling','movement','attacking','skill', 'mentality']])
arr = norm.values.tolist()
arr

[[96.0, 91.6, 86.6, 94.4, 74.5],
 [89.0, 86.8, 87.4, 83.0, 74.8],
 [89.0, 84.4, 76.2, 86.0, 81.3],
 [60.0, 66.0, 56.0, 55.2, 65.2],
 [88.0, 85.4, 81.8, 79.4, 73.0],
 [73.0, 75.2, 74.0, 75.4, 81.0],
 [84.0, 79.2, 84.8, 81.8, 78.3],
 [85.0, 80.8, 81.0, 80.0, 76.7],
 [80.0, 57.2, 66.0, 76.8, 81.7],
 [68.0, 57.8, 69.4, 65.4, 76.2],
 [62.5, 54.4, 16.0, 17.6, 32.0],
 [62.5, 56.8, 23.8, 32.0, 43.0],
 [62.5, 64.4, 18.8, 23.0, 33.8],
 [89.0, 78.8, 76.8, 84.4, 75.5],
 [79.0, 74.8, 83.0, 73.0, 78.8],
 [61.0, 62.4, 61.6, 59.0, 67.8],
 [87.0, 85.4, 78.8, 85.4, 74.8],
 [80.0, 84.0, 81.2, 75.8, 69.8],
 [78.0, 73.4, 72.6, 76.8, 77.7],
 [72.0, 68.0, 64.4, 70.0, 73.5],
 [86.0, 76.8, 82.0, 80.4, 74.5],
 [72.0, 66.0, 67.8, 71.6, 74.5],
 [91.0, 90.0, 71.4, 83.0, 70.8],
 [72.0, 62.2, 69.4, 70.4, 75.3],
 [66.0, 64.6, 61.6, 70.8, 71.5],
 [83.0, 88.0, 74.8, 77.6, 77.0],
 [62.5, 62.4, 15.6, 19.6, 38.0],
 [81.0, 68.0, 78.2, 85.4, 78.3],
 [87.0, 84.4, 74.6, 85.8, 73.7],
 [70.0, 63.0, 62.2, 69.2, 73.3],
 [79.0, 59

In [151]:
#scalling data using standard scalling
scaler = preprocessing.StandardScaler()
arr_norm = scaler.fit_transform(arr)

In [152]:
#Input scalled data to dataframe
n = 0
for c in ['dribbling','movement','attacking','skill', 'mentality']:
    singleArr = []
    for i in range(len(arr_norm)):
        singleArr.append(arr_norm[i][n])
    data.loc[:,c] = singleArr
    n=n+1

In [153]:
#ubah data jadi 1 angka dibelakang koma
data = data.round(1)
data

Unnamed: 0,sofifa_id,short_name,age,skill_moves,dribbling,movement,attacking,skill,mentality,label
0,158023,L. Messi,32,4,3.5,2.7,2.0,2.3,1.4,High
1,20801,Cristiano Ronaldo,34,5,2.7,2.3,2.1,1.7,1.4,Very High
2,177003,L. Modrić,33,4,2.7,2.0,1.4,1.8,1.9,High
3,138956,G. Chiellini,34,2,-0.3,0.4,0.3,0.1,0.7,Low
4,153079,S. Agüero,31,4,2.6,2.1,1.7,1.5,1.3,High
...,...,...,...,...,...,...,...,...,...,...
3661,224695,Ge Zhen,32,2,-1.3,-0.2,-0.9,-1.0,-1.2,Low
3662,11430,J. McCombe,36,2,-4.3,-2.8,-1.1,-1.5,-1.5,Low
3663,245558,G. Maley,36,1,-0.1,-2.4,-2.2,-2.0,-2.7,very Low
3664,252355,R. Hernández,30,2,-1.6,-0.7,-0.6,-0.3,-0.5,Low


In [154]:
#Convert dataframe to csv
data.to_csv('Data Eksplorasi Klasifikasi SVM.csv')

In [155]:
#encode string to integer
arr = data['label'].values.tolist()
enc = preprocessing.LabelEncoder()

#Input array label to array y
arr_y = enc.fit_transform(arr)
arr_y

array([1, 3, 1, ..., 4, 2, 2], dtype=int64)

In [156]:
#Input array fitur to array x
arr_x = pd.DataFrame(data[['dribbling','movement','attacking','skill', 'mentality']]).values.tolist()


In [157]:
#split data: 75% data latih dan 25% data uji
x_train,x_test,y_train,y_test = train_test_split(arr_x,arr_y,test_size=0.25,random_state=42)

# Klasifikasi
## Using Support Vector Machine with RBF, Polynomial, and Linear kernel

In [158]:
#Klasifikasi menggunakan SVM 
for kernel_arg in ['rbf','poly','linear']:
    #SVM dengan kernel RBF
    if (kernel_arg == 'rbf'):
        clf = svm.SVC(kernel ='rbf',C=1e3,gamma=0.1)
    #SVM dengan kernel Polynomial
    elif (kernel_arg == 'poly'):
        clf = svm.SVC(kernel ='poly',C=1e3,degree=3)
    #SVM dengan kernel linear
    else:
        clf = svm.SVC(kernel ='linear',C=1e3)
    clf.fit(x_train,y_train)
    y_predict = clf.predict(x_test)

    #Menghitung akurasi label prediksi 
    accuracy = accuracy_score(y_test, y_predict)
    report = classification_report(y_test,y_predict, target_names = ['Very High', 'High', 'Avarage', 'Low', 'Very Low'])
    
    print('The Accuracy of %s : %f'%(kernel_arg,accuracy))
    print(report)
    


The Accuracy of rbf : 0.837514
              precision    recall  f1-score   support

   Very High       0.76      0.84      0.80       340
        High       0.76      0.43      0.55        67
     Avarage       0.86      0.84      0.85       346
         Low       0.00      0.00      0.00         3
    Very Low       1.00      1.00      1.00       161

    accuracy                           0.84       917
   macro avg       0.68      0.62      0.64       917
weighted avg       0.84      0.84      0.84       917

The Accuracy of poly : 0.827699
              precision    recall  f1-score   support

   Very High       0.76      0.78      0.77       340
        High       0.75      0.40      0.52        67
     Avarage       0.82      0.88      0.85       346
         Low       0.50      0.33      0.40         3
    Very Low       1.00      1.00      1.00       161

    accuracy                           0.83       917
   macro avg       0.77      0.68      0.71       917
weighted avg  

  _warn_prf(average, modifier, msg_start, len(result))
