# I. Import library and load data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv("processed data.csv")
data = data.drop(columns = ['Unnamed: 0'])
data.head()

Unnamed: 0,MP,Min,Goals,Assists,CrdY,CrdR,Comp,Shots,Acceleration,Aggression,...,Vision,Volleys,age,name,overall,potential,preferred_foot,position common,market value,wage value
0,34,2983,0.06,0.0,0.15,0.03,Ligue 1,0.54,37,81,...,45,24,33,Yunis Abdelhamid,76,76,Left,defender,3600000.0,23000.0
1,31,2462,0.04,0.0,0.44,0.11,Ligue 1,0.66,50,79,...,65,38,21,Salis Abdul Samed,70,75,Right,midfielder,2200000.0,7000.0
2,34,2956,0.0,0.06,0.27,0.0,Ligue 1,0.91,77,76,...,66,37,28,Laurent Abergel,75,75,Right,midfielder,4900000.0,18000.0
3,24,726,0.0,0.12,0.37,0.0,Bundesliga,2.22,81,39,...,44,53,22,Dickson Abiama,68,76,Right,striker,2700000.0,9000.0
4,30,2536,0.14,0.0,0.07,0.04,Serie A,0.57,64,77,...,65,39,33,Francesco Acerbi,83,83,Left,defender,17500000.0,75000.0


# II. Convert category data into numeric data and remove columns that don't related in build model

In [3]:
# Comp = name of league
data['Comp'].replace(data['Comp'].unique(),[1, 3, 2, 4, 5], inplace=True)
# drop name column and preferred_foot
data = data.drop(columns = ['name','preferred_foot'])

In [4]:
#convert the categorical variables to dummies
data = pd.get_dummies(data) 
# log tranformation for market vakue
data['market value'] = np.log(data['market value'])
data.head()

Unnamed: 0,MP,Min,Goals,Assists,CrdY,CrdR,Comp,Shots,Acceleration,Aggression,...,Volleys,age,overall,potential,market value,wage value,position common_defender,position common_goalkeeper,position common_midfielder,position common_striker
0,34,2983,0.06,0.0,0.15,0.03,1,0.54,37,81,...,24,33,76,76,15.096444,23000.0,1,0,0,0
1,31,2462,0.04,0.0,0.44,0.11,1,0.66,50,79,...,38,21,70,75,14.603968,7000.0,0,0,1,0
2,34,2956,0.0,0.06,0.27,0.0,1,0.91,77,76,...,37,28,75,75,15.404746,18000.0,0,0,1,0
3,24,726,0.0,0.12,0.37,0.0,3,2.22,81,39,...,53,22,68,76,14.808762,9000.0,0,0,0,1
4,30,2536,0.14,0.0,0.07,0.04,2,0.57,64,77,...,39,33,83,83,16.677711,75000.0,1,0,0,0


# III. Split training and test

In [5]:
data_build = data.copy()
y = data_build[['market value']]
x = data_build.drop(columns =['market value'])
# Standardization
def standard_scale(x):
    convert = StandardScaler().fit(x)
    return convert.transform(x)
column = x.columns
#x = standard_scale(x)
x_train, x_test, y_train, y_test = train_test_split(x,y , test_size = 0.33, random_state = 3 )

x_train = pd.DataFrame(x_train, columns = column)
x_test =  pd.DataFrame(x_test, columns = column)


# IV.Emsemble models : Stacking

In [6]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

## 1. Layer1 : Lasso, Ridge, Tree, Forest 

In [7]:
def Stacking(model, train, y, test ):
    folds = KFold(n_splits = 5)
    test_pred = []
    train_pred= []
    for train_index, val_index in folds.split(train, y):
        
        x_train = train.iloc[train_index]
        x_val = train.iloc[val_index]
        y_train = y.iloc[train_index]
        y_val= y.iloc[val_index]
        
        model.fit(x_train, y_train)
        for i in model.predict(x_val):
            train_pred.append(i)
        test_pred.append(list(model.predict(test)))
    test_pred = np.average(np.array(test_pred), axis=0)
    return test_pred, train_pred


model1 = Lasso(alpha = 0.005)
test_pred_1 ,train_pred_1 = Stacking(model1, x_train, y_train, x_test) 
train_pred_1 = pd.DataFrame(train_pred_1)
test_pred_1 = pd.DataFrame(test_pred_1)

model2 = Ridge(alpha = 0.4)
test_pred_2 ,train_pred_2 = Stacking(model2, x_train, y_train, x_test) 
train_pred_2 = pd.DataFrame(train_pred_2)
test_pred_2 = pd.DataFrame(test_pred_2)

model3 = KNeighborsRegressor(n_neighbors = 7,weights = 'distance', metric = 'manhattan')
test_pred_3 ,train_pred_3 = Stacking(model3, x_train, y_train, x_test) 
train_pred_3 = pd.DataFrame(train_pred_3)
test_pred_3 = pd.DataFrame(test_pred_3)

model4 = DecisionTreeRegressor(random_state = 3,criterion = 'friedman_mse', max_depth = 16 ,max_features = None,
                              min_samples_leaf = 2 )
test_pred_4 ,train_pred_4 = Stacking(model4, x_train, y_train, x_test) 
train_pred_4 = pd.DataFrame(train_pred_4)
test_pred_4 = pd.DataFrame(test_pred_4)

model5 = RandomForestRegressor(random_state = 3, n_estimators = 500,
                              max_features = None, min_samples_leaf = 1)
test_pred_5 ,train_pred_5 = Stacking(model5, x_train, y_train, x_test) 
train_pred_5 = pd.DataFrame(train_pred_5)
test_pred_5 = pd.DataFrame(test_pred_5)

  model.fit(x_train, y_train)
  model.fit(x_train, y_train)
  model.fit(x_train, y_train)
  model.fit(x_train, y_train)
  model.fit(x_train, y_train)


In [8]:
# add predictions of 4 models as  new features
x_train['Lasso'] = train_pred_1.values
x_train['Ridge'] = train_pred_2.values
x_train['Tree'] = train_pred_4.values
x_train['Forest'] = train_pred_5.values


x_test['Lasso'] = test_pred_1.values
x_test['Ridge'] = test_pred_2.values
x_test['Tree'] = test_pred_4.values
x_test['Forest'] = test_pred_5.values

print(x_train.shape)
print(x_test.shape)


(1341, 56)
(661, 56)


## 2. Layer 2: XGBoost  

#### we use GridSearchCV to optimize parameter

In [17]:
regressor = XGBRegressor()

params = {
    'n_estimators': [300,500, 1000],
    
    'learning_rate': [x/100 for x in range(1, 21, 2)],
    'max_depth':[int(x) for x in range(3, 11, 2)]
}
scoring_fnc = 'neg_mean_squared_error'
grid = GridSearchCV(estimator=regressor, param_grid=params, scoring= scoring_fnc, cv= 5 , verbose= 5)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=300;, score=-0.566 total time=   0.7s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=300;, score=-0.599 total time=   0.8s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=300;, score=-0.586 total time=   0.7s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=300;, score=-0.562 total time=   0.7s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=300;, score=-0.617 total time=   0.7s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=500;, score=-0.019 total time=   1.4s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=500;, score=-0.028 total time=   1.2s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=500;, score=-0.026 total time=   1.2s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=500;, score=-0.023 total time=   1.2s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=500;,

[CV 5/5] END learning_rate=0.03, max_depth=5, n_estimators=500;, score=-0.011 total time=   1.8s
[CV 1/5] END learning_rate=0.03, max_depth=5, n_estimators=1000;, score=-0.009 total time=   3.8s
[CV 2/5] END learning_rate=0.03, max_depth=5, n_estimators=1000;, score=-0.014 total time=   3.8s
[CV 3/5] END learning_rate=0.03, max_depth=5, n_estimators=1000;, score=-0.017 total time=   3.8s
[CV 4/5] END learning_rate=0.03, max_depth=5, n_estimators=1000;, score=-0.012 total time=   3.9s
[CV 5/5] END learning_rate=0.03, max_depth=5, n_estimators=1000;, score=-0.011 total time=   4.1s
[CV 1/5] END learning_rate=0.03, max_depth=7, n_estimators=300;, score=-0.010 total time=   1.3s
[CV 2/5] END learning_rate=0.03, max_depth=7, n_estimators=300;, score=-0.014 total time=   1.3s
[CV 3/5] END learning_rate=0.03, max_depth=7, n_estimators=300;, score=-0.017 total time=   1.3s
[CV 4/5] END learning_rate=0.03, max_depth=7, n_estimators=300;, score=-0.012 total time=   1.3s
[CV 5/5] END learning_rat

[CV 5/5] END learning_rate=0.05, max_depth=9, n_estimators=300;, score=-0.012 total time=   1.9s
[CV 1/5] END learning_rate=0.05, max_depth=9, n_estimators=500;, score=-0.011 total time=   2.8s
[CV 2/5] END learning_rate=0.05, max_depth=9, n_estimators=500;, score=-0.014 total time=   4.3s
[CV 3/5] END learning_rate=0.05, max_depth=9, n_estimators=500;, score=-0.017 total time=   5.6s
[CV 4/5] END learning_rate=0.05, max_depth=9, n_estimators=500;, score=-0.013 total time=   4.8s
[CV 5/5] END learning_rate=0.05, max_depth=9, n_estimators=500;, score=-0.012 total time=   3.7s
[CV 1/5] END learning_rate=0.05, max_depth=9, n_estimators=1000;, score=-0.011 total time=   5.0s
[CV 2/5] END learning_rate=0.05, max_depth=9, n_estimators=1000;, score=-0.014 total time=   4.4s
[CV 3/5] END learning_rate=0.05, max_depth=9, n_estimators=1000;, score=-0.017 total time=   4.3s
[CV 4/5] END learning_rate=0.05, max_depth=9, n_estimators=1000;, score=-0.013 total time=   4.1s
[CV 5/5] END learning_rate

[CV 5/5] END learning_rate=0.09, max_depth=3, n_estimators=1000;, score=-0.012 total time=   2.7s
[CV 1/5] END learning_rate=0.09, max_depth=5, n_estimators=300;, score=-0.009 total time=   1.2s
[CV 2/5] END learning_rate=0.09, max_depth=5, n_estimators=300;, score=-0.014 total time=   1.2s
[CV 3/5] END learning_rate=0.09, max_depth=5, n_estimators=300;, score=-0.019 total time=   1.2s
[CV 4/5] END learning_rate=0.09, max_depth=5, n_estimators=300;, score=-0.013 total time=   1.1s
[CV 5/5] END learning_rate=0.09, max_depth=5, n_estimators=300;, score=-0.013 total time=   1.1s
[CV 1/5] END learning_rate=0.09, max_depth=5, n_estimators=500;, score=-0.009 total time=   1.9s
[CV 2/5] END learning_rate=0.09, max_depth=5, n_estimators=500;, score=-0.014 total time=   1.9s
[CV 3/5] END learning_rate=0.09, max_depth=5, n_estimators=500;, score=-0.018 total time=   1.9s
[CV 4/5] END learning_rate=0.09, max_depth=5, n_estimators=500;, score=-0.013 total time=   1.9s
[CV 5/5] END learning_rate=0.

[CV 5/5] END learning_rate=0.11, max_depth=7, n_estimators=500;, score=-0.014 total time=   1.7s
[CV 1/5] END learning_rate=0.11, max_depth=7, n_estimators=1000;, score=-0.011 total time=   2.6s
[CV 2/5] END learning_rate=0.11, max_depth=7, n_estimators=1000;, score=-0.016 total time=   2.4s
[CV 3/5] END learning_rate=0.11, max_depth=7, n_estimators=1000;, score=-0.018 total time=   2.4s
[CV 4/5] END learning_rate=0.11, max_depth=7, n_estimators=1000;, score=-0.012 total time=   2.4s
[CV 5/5] END learning_rate=0.11, max_depth=7, n_estimators=1000;, score=-0.014 total time=   2.4s
[CV 1/5] END learning_rate=0.11, max_depth=9, n_estimators=300;, score=-0.012 total time=   1.4s
[CV 2/5] END learning_rate=0.11, max_depth=9, n_estimators=300;, score=-0.016 total time=   1.4s
[CV 3/5] END learning_rate=0.11, max_depth=9, n_estimators=300;, score=-0.018 total time=   1.3s
[CV 4/5] END learning_rate=0.11, max_depth=9, n_estimators=300;, score=-0.012 total time=   1.3s
[CV 5/5] END learning_rat

[CV 5/5] END learning_rate=0.15, max_depth=3, n_estimators=300;, score=-0.012 total time=   0.7s
[CV 1/5] END learning_rate=0.15, max_depth=3, n_estimators=500;, score=-0.010 total time=   1.2s
[CV 2/5] END learning_rate=0.15, max_depth=3, n_estimators=500;, score=-0.013 total time=   1.2s
[CV 3/5] END learning_rate=0.15, max_depth=3, n_estimators=500;, score=-0.016 total time=   1.2s
[CV 4/5] END learning_rate=0.15, max_depth=3, n_estimators=500;, score=-0.015 total time=   1.3s
[CV 5/5] END learning_rate=0.15, max_depth=3, n_estimators=500;, score=-0.012 total time=   1.3s
[CV 1/5] END learning_rate=0.15, max_depth=3, n_estimators=1000;, score=-0.010 total time=   2.6s
[CV 2/5] END learning_rate=0.15, max_depth=3, n_estimators=1000;, score=-0.013 total time=   2.6s
[CV 3/5] END learning_rate=0.15, max_depth=3, n_estimators=1000;, score=-0.016 total time=   2.6s
[CV 4/5] END learning_rate=0.15, max_depth=3, n_estimators=1000;, score=-0.015 total time=   2.5s
[CV 5/5] END learning_rate

[CV 5/5] END learning_rate=0.17, max_depth=5, n_estimators=1000;, score=-0.014 total time=   2.3s
[CV 1/5] END learning_rate=0.17, max_depth=7, n_estimators=300;, score=-0.014 total time=   1.1s
[CV 2/5] END learning_rate=0.17, max_depth=7, n_estimators=300;, score=-0.015 total time=   1.1s
[CV 3/5] END learning_rate=0.17, max_depth=7, n_estimators=300;, score=-0.020 total time=   1.1s
[CV 4/5] END learning_rate=0.17, max_depth=7, n_estimators=300;, score=-0.016 total time=   1.1s
[CV 5/5] END learning_rate=0.17, max_depth=7, n_estimators=300;, score=-0.014 total time=   1.1s
[CV 1/5] END learning_rate=0.17, max_depth=7, n_estimators=500;, score=-0.014 total time=   1.4s
[CV 2/5] END learning_rate=0.17, max_depth=7, n_estimators=500;, score=-0.015 total time=   1.4s
[CV 3/5] END learning_rate=0.17, max_depth=7, n_estimators=500;, score=-0.020 total time=   1.4s
[CV 4/5] END learning_rate=0.17, max_depth=7, n_estimators=500;, score=-0.016 total time=   1.3s
[CV 5/5] END learning_rate=0.

[CV 5/5] END learning_rate=0.19, max_depth=9, n_estimators=500;, score=-0.015 total time=   1.2s
[CV 1/5] END learning_rate=0.19, max_depth=9, n_estimators=1000;, score=-0.012 total time=   2.0s
[CV 2/5] END learning_rate=0.19, max_depth=9, n_estimators=1000;, score=-0.017 total time=   2.0s
[CV 3/5] END learning_rate=0.19, max_depth=9, n_estimators=1000;, score=-0.019 total time=   1.9s
[CV 4/5] END learning_rate=0.19, max_depth=9, n_estimators=1000;, score=-0.015 total time=   2.0s
[CV 5/5] END learning_rate=0.19, max_depth=9, n_estimators=1000;, score=-0.015 total time=   1.9s


In [18]:
# get best parameter
grid.best_estimator_.get_params()

{'objective': 'reg:squarederror',
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': 0,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.03,
 'max_bin': 256,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0,
 'max_depth': 3,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 1000,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'sampling_method': 'uniform',
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

In [19]:
from xgboost import XGBRegressor
model = XGBRegressor(n_estimators = 1000, learning_rate = 0.03, gamma = 0.0, max_depth = 3  )
model.fit(x_train, y_train)
y_pre = model.predict(x_test)
print('Root mean squared' ,(mean_squared_error(y_pre,y_test )**(1/2)))
print('R2-scored ', r2_score(y_pre, y_test))

Root mean squared 0.08157912622666642
R2-scored  0.9958506745627055
