# Player overall and market value prediction using FIFA 19 dataset

#### Section 1 - Import statements 


In [None]:
import json
import os
import csv
import collections
from sklearn.pipeline import Pipeline

import numpy as np
import pandas as pd
import keras
import tensorflow as tf
import shutil
import time
from scipy.stats import zscore
from scipy import stats
from sklearn import preprocessing
from sklearn import utils
import sklearn.feature_extraction.text as sk_text
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor

import io
import requests
from sklearn import metrics
from keras import optimizers
from keras.optimizers import Adam, sgd
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.callbacks import EarlyStopping, ModelCheckpoint
import xgboost as xgb
import lightgbm as lgb

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report, r2_score, roc_curve, auc

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
# Encode text values to dummy variables (i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue).
def encode_text_dummy(df, name):
    """One-hot encode column `name` of `df` in place.

    For every distinct value `v` in the column a new column called
    "<name>-<v>" is appended to `df`, holding the indicator produced by
    `pd.get_dummies`. The original column is then removed.

    Returns None; `df` is modified in place.
    """
    indicator = pd.get_dummies(df[name])
    for category in indicator.columns:
        df["{}-{}".format(name, category)] = indicator[category]
    df.drop(columns=[name], inplace=True)


# Encode text values to indexes (i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    """Replace column `name` of `df` with integer labels, in place.

    The labels come from a freshly fitted scikit-learn `LabelEncoder`.

    Returns the encoder's `classes_` array, i.e. the original values
    corresponding to the assigned integer codes.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardise column `name` of `df` in place: (value - mean) / sd.

    mean, sd: optional pre-computed statistics. Whichever is left as None
    is computed from the column itself (`Series.mean()` / `Series.std()`).

    Returns None; `df` is modified in place.
    """
    center = df[name].mean() if mean is None else mean
    spread = df[name].std() if sd is None else sd
    df[name] = (df[name] - center) / spread


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill the missing entries of column `name` with that column's median.

    Returns None; the column is reassigned on `df`.
    """
    column = df[name]
    df[name] = column.fillna(column.median())


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    """Fill the missing entries of column `name` with `default_value`.

    Returns None; the column is reassigned on `df`.
    """
    filled = df[name].fillna(default_value)
    df[name] = filled


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split `df` into a float32 feature matrix and a float32 target array.

    df: DataFrame whose columns, except `target`, are all used as features.
    target: name of the column to predict.

    Returns a tuple (x, y):
      * x - values of every non-target column, cast to float32.
      * y - if the target dtype is int64, int32 or object, the target is
            treated as a class label and one-hot encoded with
            `pd.get_dummies`; otherwise the target values are returned
            as-is (regression). Both variants are cast to float32.
    """
    # Imported locally: the `collections.Sequence` alias was removed in
    # Python 3.10; the ABC lives in `collections.abc`.
    from collections.abc import Sequence

    feature_columns = [column for column in df.columns if column != target]

    # Find out the type of the target column.
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type

    # TensorFlow likes 32 bits.
    features = df[feature_columns].values.astype(np.float32)

    # The builtin `object` replaces the `np.object` alias, which was removed
    # in NumPy 1.24; it compares equal to the object dtype in the same way.
    if target_type in (np.int64, np.int32, object):
        # Classification
        dummies = pd.get_dummies(df[target])
        return features, dummies.values.astype(np.float32)

    # Regression
    return features, df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as "H:MM:SS.ss".

    Hours are not padded; minutes are zero-padded to two digits and
    seconds to two integer digits plus two decimals.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    seconds_into_hour = sec_elapsed % seconds_per_hour
    minutes = int(seconds_into_hour / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)


# Regression chart.
def chart_regression(pred,y,sort=True):
    """Plot expected values against predictions as two line series.

    pred: predictions, used as the 'pred' column of a DataFrame.
    y: expected values; must offer `.flatten()` (e.g. a NumPy array).
    sort: when true, both series are ordered by ascending expected value.

    Shows the figure with `plt.show()`; returns None.
    """
    frame = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        frame = frame.sort_values(by=['y'])
    plt.plot(frame['y'].tolist(), label='expected')
    plt.plot(frame['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is sd or more standard deviations from its mean
def remove_outliers(df, name, sd):
    """Drop, in place, the rows of `df` whose `name` value is an outlier.

    A row is dropped when |value - column mean| >= sd * column standard
    deviation, with mean and standard deviation taken from the column as
    it is when the function is called.

    Returns None; `df` is modified in place.
    """
    column = df[name]
    distance_from_mean = np.abs(column - column.mean())
    cutoff = sd * column.std()
    outlier_rows = df.index[distance_from_mean >= cutoff]
    df.drop(outlier_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale column `name` of `df` in place.

    Values are mapped so that `data_low` becomes `normalized_low` and
    `data_high` becomes `normalized_high`.

    data_low, data_high: bounds of the input data. Each one that is left
    as None is taken from the column itself (its minimum / maximum). They
    are resolved independently, so a caller may supply just one of them;
    previously supplying only `data_low` raised a TypeError and supplying
    only `data_high` was silently ignored.

    Returns None; `df` is modified in place.
    """
    # Series.min()/max() skip missing values, unlike the builtins.
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


In [None]:
# Plot a confusion matrix.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw confusion matrix `cm` as a colour-mapped image on the current figure.

    cm: the confusion matrix to display.
    names: class names, used as tick labels on both axes.
    title: figure title.
    cmap: matplotlib colour map passed to `imshow`.

    The function does not call `plt.show()`; returns None.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class; x labels are rotated by 45 degrees.
    class_positions = np.arange(len(names))
    plt.xticks(class_positions, names, rotation=45)
    plt.yticks(class_positions, names)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    

# Plot an ROC. pred - the predictions, y - the expected output.
def plot_roc(pred,y):
    """Plot the ROC curve of predictions `pred` against expected output `y`.

    The curve comes from `roc_curve(y, pred)`; the area under it (`auc`)
    is shown in the legend. A dashed diagonal from (0, 0) to (1, 1) is
    drawn for reference.

    Shows the figure with `plt.show()`; returns None.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(y, pred)
    area_under_curve = auc(false_positive_rate, true_positive_rate)

    plt.figure()
    plt.plot(false_positive_rate, true_positive_rate,
             label='ROC curve (area = %0.2f)' % area_under_curve)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show()

#### Section 3 - Data read and pre-processing

In [None]:
# Load the raw player table.
df1 = pd.read_csv('./data1.csv')

# Keep only the rows whose "Value" string contains no 'K', then take the
# first 11,000 of them as an independent copy.
value_has_k = df1["Value"].str.contains('K')
df = df1[~value_has_k][0:11000].copy()


In [None]:
# Remove four columns in place, then preview the first rows.
df.drop(columns=['Unnamed: 0','Photo','Flag','Club Logo'], inplace=True)
df.head()

Unnamed: 0,ID,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,Special,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,158023,L. Messi,31,Argentina,94,94,FC Barcelona,€110.5M,€565K,2202,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,20801,Cristiano Ronaldo,33,Portugal,94,94,Juventus,€77M,€405K,2228,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,190871,Neymar Jr,26,Brazil,92,93,Paris Saint-Germain,€118.5M,€290K,2143,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,193080,De Gea,27,Spain,91,93,Manchester United,€72M,€260K,1471,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,192985,K. De Bruyne,27,Belgium,91,92,Manchester City,€102M,€355K,2281,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


In [None]:
# Filling the missing values, column by column.
#
# All replacements are collected in one mapping and applied with a single
# DataFrame-level fillna. Calling `df[col].fillna(..., inplace=True)` works on
# a temporary Series and is not guaranteed to write back into `df` (with
# pandas Copy-on-Write it never does).

# Continuous skill ratings are imputed with the column mean. 'FKAccuracy' was
# previously filled with itself, which left its missing values untouched.
mean_filled_columns = [
    'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
    'LongPassing', 'BallControl', 'HeadingAccuracy', 'Finishing', 'Crossing',
]
fill_values = {name: df[name].mean() for name in mean_filled_columns}

# Remaining columns get a fixed default (or the median for 'Skill Moves').
fill_values.update({
    'Weight': '200lbs',
    'Contract Valid Until': 2019,
    'Height': "5'11",
    'Loaned From': 'None',
    'Joined': 'Jul 1, 2018',
    'Jersey Number': 8,
    'Body Type': 'Normal',
    'Position': 'ST',
    'Club': 'No Club',
    'Work Rate': 'Medium/ Medium',
    'Skill Moves': df['Skill Moves'].median(),
    'Weak Foot': 3,
    'Preferred Foot': 'Right',
    'International Reputation': 1,
    'Wage': '€200K',
})

df.fillna(value=fill_values, inplace=True)

In [None]:
# Turn wage strings such as '€565K' into floats (565.0): strip the 'K'
# characters, then the '€' characters, from both ends before converting.
wage_text = df['Wage'].str.strip('K').str.strip('€')
df['Wage'] = wage_text.astype('float64')

In [None]:
# Numeric columns to rescale.
col = [
    'Age', 'Overall', 'Wage', 'Potential',
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
]

# Rescale every listed column, in place, with encode_numeric_range's
# default arguments.
for column_name in col:
    encode_numeric_range(df, column_name)

In [None]:
#Label encoding

encode_text_index(df,'Preferred Foot')
encode_text_index(df,'Body Type')
encode_text_index(df,'Nationality')

array(['Albania', 'Algeria', 'Angola', 'Antigua & Barbuda', 'Argentina',
       'Armenia', 'Australia', 'Austria', 'Belarus', 'Belgium', 'Benin',
       'Bermuda', 'Bolivia', 'Bosnia Herzegovina', 'Brazil', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cameroon', 'Canada', 'Cape Verde',
       'Central African Rep.', 'Chad', 'Chile', 'China PR', 'Colombia',
       'Comoros', 'Congo', 'Costa Rica', 'Croatia', 'Cuba', 'Curacao',
       'Cyprus', 'Czech Republic', 'DR Congo', 'Denmark',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'England',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'FYR Macedonia', 'Fiji',
       'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany',
       'Ghana', 'Greece', 'Guinea', 'Guinea Bissau', 'Haiti', 'Honduras',
       'Hungary', 'Iceland', 'India', 'Iran', 'Iraq', 'Israel', 'Italy',
       'Ivory Coast', 'Jamaica', 'Japan', 'Kazakhstan', 'Kenya',
       'Korea DPR', 'Korea Republic', 'Kosovo', 'Kuwait', 'Latvia',
       'Lib

In [None]:
# Display the DataFrame's column labels (inspection only).
df.columns

Index(['ID', 'Name', 'Age', 'Nationality', 'Overall', 'Potential', 'Club',
       'Value', 'Wage', 'Special', 'Preferred Foot',
       'International Reputation', 'Weak Foot', 'Skill Moves', 'Work Rate',
       'Body Type', 'Real Face', 'Position', 'Jersey Number', 'Joined',
       'Loaned From', 'Contract Valid Until', 'Height', 'Weight', 'LS', 'ST',
       'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM',
       'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB',
       'RCB', 'RB', 'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing',
       'Volleys', 'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing',
       'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Reactions',
       'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving',
       'GKHandling', 'GKKicking

In [None]:
# Drop, in place, every row that still has a missing value in ANY column.
# NOTE(review): this also looks at columns that never enter the model (the
# later `df_x` selection uses only a subset of the columns) - confirm that
# discarding rows because of those columns is intended.
df.dropna(inplace = True)

In [None]:
# Numeric columns screened for outliers.
col = [
    'Age', 'Overall', 'Potential',
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
]

# Drop rows lying 3 or more standard deviations from the column mean. The
# passes run one after another on the same DataFrame, so each column's
# statistics are computed on the rows that survived the earlier passes.
for column_name in col:
    remove_outliers(df, column_name, 3)

In [None]:
# Strip the 'M' characters, then the '€' characters, from both ends of each
# value string (e.g. '€110.5M' -> '110.5'). The column still holds strings.
df['Value'] = df['Value'].str.strip('M').str.strip('€')


In [None]:
# Display the dtype of every column (inspection only).
df.dtypes

ID                            int64
Name                         object
Age                         float64
Nationality                   int64
Overall                     float64
Potential                   float64
Club                         object
Value                        object
Wage                        float64
Special                       int64
Preferred Foot                int64
International Reputation    float64
Weak Foot                   float64
Skill Moves                 float64
Work Rate                    object
Body Type                     int64
Real Face                    object
Position                     object
Jersey Number               float64
Joined                       object
Loaned From                  object
Contract Valid Until         object
Height                       object
Weight                       object
LS                           object
ST                           object
RS                           object
LW                          

In [None]:
# Preview the last rows of the processed DataFrame (inspection only).
df.tail()

Unnamed: 0,ID,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,Special,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
10856,236772,D. Szoboszlai,0.035714,58,0.369565,0.744681,FC Red Bull Salzburg,1.1,0.00708,1719,...,0.55914,0.264368,0.244186,0.202381,0.101124,0.098901,0.133333,0.101124,0.053763,€2.3M
10900,238319,T. Edwards,0.107143,40,0.369565,0.723404,Stoke City,1.0,0.00708,1686,...,0.612903,0.62069,0.616279,0.678571,0.101124,0.142857,0.133333,0.05618,0.064516,€2.6M
10922,234229,R. McCrorie,0.142857,105,0.369565,0.680851,Rangers FC,1.1,0.014159,1604,...,0.612903,0.632184,0.686047,0.678571,0.101124,0.043956,0.111111,0.101124,0.064516,€2.6M
11323,239948,S. Reyes,0.142857,82,0.347826,0.638298,Monarcas Morelia,1.0,0.00354,1508,...,0.548387,0.229885,0.244186,0.22619,0.123596,0.098901,0.155556,0.146067,0.096774,€2.3M
11457,241266,W. Geubbels,0.0,47,0.347826,0.808511,AS Monaco,1.0,0.00885,1576,...,0.645161,0.229885,0.162791,0.142857,0.044944,0.131868,0.055556,0.157303,0.139785,€2.9M


In [None]:
# Columns used as model inputs, in the order they appear in the feature matrix.
feature_columns = [
    'Age', 'Overall', 'Potential',
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Nationality', 'Wage', 'Interceptions', 'Positioning',
    'Vision', 'Penalties', 'Composure', 'Marking', 'StandingTackle',
    'SlidingTackle', 'GKDiving', 'GKHandling', 'GKKicking',
    'GKPositioning', 'GKReflexes', 'Preferred Foot', 'Body Type',
]

# Independent copy so later changes to df_x do not touch df.
df_x = df[feature_columns].copy()

In [None]:
# Target column as float64. `astype` returns a new Series rather than
# converting in place, so its result has to be assigned; previously it was
# discarded and `df_y` (and the `Y` array built from it) kept the values as
# strings.
df_y = df['Value'].astype('float64')
df_y

80       50.0
82       40.5
84       40.5
85       38.0
86       45.0
87       45.0
89       28.5
90       37.0
93       37.5
95       30.0
96       26.0
97       37.5
98       30.5
99       33.0
100      38.5
101      35.5
102       9.0
103      26.0
104      15.5
105      37.0
106      21.5
107      22.0
108       9.0
110      34.0
112      32.0
113      34.0
114      40.5
118      34.0
119      35.5
120      31.5
         ... 
10083     1.0
10087     1.1
10155     1.0
10172     1.2
10178     1.2
10200     1.0
10238     1.1
10240     1.1
10315     1.0
10343     1.1
10352     1.1
10361     1.1
10457     1.0
10461     1.2
10479     1.2
10501     1.1
10523     1.1
10526     1.0
10557     1.2
10567     1.2
10620     1.1
10752     1.2
10801     1.1
10836     1.2
10841     1.2
10856     1.1
10900     1.0
10922     1.1
11323     1.0
11457     1.0
Name: Value, Length: 4876, dtype: float64

In [None]:
# Sanity check: True if any feature cell is still missing.
df_x.isnull().values.any()



False

In [None]:
# Feature matrix as a NumPy array (one row per player, one column per feature).
X = np.array(df_x)

In [None]:
# Target values as a NumPy array, aligned row-for-row with X.
Y = np.array(df_y)

In [None]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split
# is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1 )

In [None]:
#Linear Regression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import  linear_model

# Fit on the training split, predict the held-out split.
lr = linear_model.LinearRegression().fit(X_train, Y_train)
lr_pred = lr.predict(X_test)

# Report header.
rule = '--------------------------------------------------'
for header_line in (rule, 'MODEL: Linear Regression', rule):
    print(header_line)

# First nine expected values next to the first nine predictions.
print('Actual values:', Y_test[:9], sep='\n')
print('Predicted values:', lr_pred[:9], sep='\n')

# Root-mean-squared error on the test split; arguments follow the
# (y_true, y_pred) order of the function signature.
lr_mse = mean_squared_error(Y_test, lr_pred)
lr_rmse = np.sqrt(lr_mse)
print('RMSE Score for Linear Regression:', lr_rmse)

--------------------------------------------------
MODEL: Linear Regression
--------------------------------------------------
Actual values:
['1.4' '8' '5.5' '6.5' '1' '2.4' '1.5' '3.4' '2.4']
Predicted values:
[ 2.26560693 13.53921301  6.77351927  7.48412607 -1.15617165  1.1750294
  2.26290932  5.84962154  1.35286005]
RMSE Score for Linear Regression: 2.132842772609482


In [None]:
# Ridge regression with a small fixed penalty (alpha=0.01), fitted on the
# training split. `normalize` is no longer passed: False was its default, and
# the keyword was removed from Ridge in scikit-learn 1.2, where passing it
# raises a TypeError.
ri = Ridge(alpha=0.01).fit(X_train,Y_train)
ri_pred = ri.predict(X_test)

# Root-mean-squared error on the test split; arguments follow the
# (y_true, y_pred) order of the function signature.
ri_mse  = mean_squared_error(Y_test, ri_pred)
ri_rmse = np.sqrt(ri_mse)

print ('--------------------------------------------------')
print ('MODEL: Ridge')
print ('--------------------------------------------------')
print('Actual values:')
print(Y_test[0:9])
print('Predicted values:')
print(ri_pred[0:9])
print('RMSE Score for Ridge:',ri_rmse)




--------------------------------------------------
MODEL: Ridge
--------------------------------------------------
Actual values:
['1.4' '8' '5.5' '6.5' '1' '2.4' '1.5' '3.4' '2.4']
Predicted values:
[ 2.2605936  13.54420442  6.77559208  7.48312509 -1.15072305  1.17880496
  2.26274962  5.85012653  1.35430619]
RMSE Score for Ridge: 2.132786908843801


In [None]:
# Ridge regression with the penalty chosen by 5-fold cross-validation on the
# training split.
ricv = RidgeCV(cv=5)
ricv.fit(X_train, Y_train)
ricv_pred = ricv.predict(X_test)

# Root-mean-squared error on the test split; arguments follow the
# (y_true, y_pred) order of the function signature.
ricv_mse = mean_squared_error(Y_test, ricv_pred)
ricv_rmse = np.sqrt(ricv_mse)

# Report: header, first nine expected values and predictions, then the score.
rule = '--------------------------------------------------'
for header_line in (rule, 'MODEL: RidgeCV', rule):
    print(header_line)
print('Actual values:', Y_test[:9], sep='\n')
print('Predicted values:', ricv_pred[:9], sep='\n')
print('RMSE Score for RidgeCV:', ricv_rmse)


--------------------------------------------------
MODEL: RidgeCV
--------------------------------------------------
Actual values:
['1.4' '8' '5.5' '6.5' '1' '2.4' '1.5' '3.4' '2.4']
Predicted values:
[ 2.21694857 13.58728777  6.7933785   7.47516326 -1.10383923  1.21231204
  2.26158856  5.85400108  1.36732777]
RMSE Score for RidgeCV: 2.1328646944445198


In [None]:
# Elastic net (combined L1/L2 penalty) fitted on the training split.
en = ElasticNet(alpha=0.005, l1_ratio=0.1)
en.fit(X_train, Y_train)
en_pred = en.predict(X_test)

# Root-mean-squared error on the test split; arguments follow the
# (y_true, y_pred) order of the function signature.
en_mse = mean_squared_error(Y_test, en_pred)
en_rmse = np.sqrt(en_mse)

# Report: header, first nine expected values and predictions, then the score.
rule = '--------------------------------------------------'
for header_line in (rule, 'MODEL: ElasticNet', rule):
    print(header_line)
print('Actual values:', Y_test[:9], sep='\n')
print('Predicted values:', en_pred[:9], sep='\n')
print('RMSE Score for ElasticNet:', en_rmse)



--------------------------------------------------
MODEL: ElasticNet
--------------------------------------------------
Actual values:
['1.4' '8' '5.5' '6.5' '1' '2.4' '1.5' '3.4' '2.4']
Predicted values:
[ 1.49542881 13.48004272  6.90013078  7.59579243  0.3503782   3.58503227
  2.39411495  5.22196195  2.76687153]
RMSE Score for ElasticNet: 2.7750444208644995


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

# Random forest of 100 shallow trees (depth 2); random_state fixes the seed.
regr = RandomForestRegressor(max_depth=2, random_state=1, n_estimators=100)

# Fit on the training split only. Fitting on the full X, Y (as before) lets
# the model see the test rows, so the RMSE below would be optimistically
# biased and not comparable with the other models.
regr.fit(X_train, Y_train)
regr_pred = regr.predict(X_test)

# Root-mean-squared error on the test split; arguments follow the
# (y_true, y_pred) order of the function signature.
regr_mse  = mean_squared_error(Y_test, regr_pred)
regr_rmse = np.sqrt(regr_mse)

print ('--------------------------------------------------')
print ('MODEL: Random Forest Regression')
print ('--------------------------------------------------')
print('Actual values:')
print(Y_test[0:9])
print('Predicted values:')
print(regr_pred[0:9])
print('RMSE Score for Random Forest Regression:',regr_rmse)


--------------------------------------------------
MODEL: Random Forest Regression
--------------------------------------------------
Actual values:
['1.4' '8' '5.5' '6.5' '1' '2.4' '1.5' '3.4' '2.4']
Predicted values:
[ 2.32026252 15.07321179  2.32026252  7.16684017  2.32026252  2.32026252
  2.32026252  2.32026252  2.32026252]
RMSE Score for Random Forest Regression: 2.403798479357281


In [None]:
# Lasso fitted with the LARS algorithm (alpha=0.1).
lars = linear_model.LassoLars(alpha=.1)

# Fit on the training split only. Fitting on the full X, Y (as before) lets
# the model see the test rows, so the RMSE below would not be a genuine
# held-out score.
lars.fit(X_train, Y_train)
lars_pred = lars.predict(X_test)

# Root-mean-squared error on the test split; arguments follow the
# (y_true, y_pred) order of the function signature.
lars_mse  = mean_squared_error(Y_test, lars_pred)
lars_rmse = np.sqrt(lars_mse)

print ('--------------------------------------------------')
print ('MODEL: Lars Lasso Regression')
print ('--------------------------------------------------')
print('Actual values:')
print(Y_test[0:9])
print('Predicted values:')
print(lars_pred[0:9])
print('RMSE Score for Lars Lasso Regression:',lars_rmse)


--------------------------------------------------
MODEL: Lars Lasso Regression
--------------------------------------------------
Actual values:
['1.4' '8' '5.5' '6.5' '1' '2.4' '1.5' '3.4' '2.4']
Predicted values:
[5.42325677 5.42325677 5.42325677 5.42325677 5.42325677 5.42325677
 5.42325677 5.42325677 5.42325677]
RMSE Score for Lars Lasso Regression: 6.199130539851531
