In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame

# 1. Import Data

In [65]:
epl_1 = pd.read_csv('tmp_csv/epl_1.csv',index_col=0)
# csv to dataframe
# index_col=0 => for name 
epl_1.head()

Unnamed: 0_level_0,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,new_foreign,age_cat,club_id,big_club,new_signing
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Alexis Sanchez,28,LW,1,65.0,4329,12.0,0.171,264,3.0,0,4,1,1,0
Mesut Ozil,28,AM,1,50.0,4395,9.5,0.056,167,2.0,0,4,1,1,0
Petr Cech,35,GK,4,7.0,1529,5.5,0.059,134,2.0,0,6,1,1,0
Theo Walcott,28,RW,1,20.0,2393,7.5,0.015,122,1.0,0,4,1,1,0
Laurent Koscielny,31,CB,3,22.0,912,6.0,0.007,121,2.0,0,4,1,1,0


### feature introduction

1. name : Name of player<br>
2. club : Club of player<br>
3. age : Age of the Player<br>
4. position : usual position<br>
5. position_cat : 1-for attack 2-for midifielders 3-for defendenders 4-for goalkeepers<br>
6. market_value : As on transfermrkt.com on July 20th, 2017<br>
7. page_view : Average daily Wikipedia page views from 9/1,2016 to 5/1, 2017  
8. fpl_value : Value in Fantasy Premier League as on July 20th, 2017<br>
9. fpl_sel : % of FPL players who have selected that player in their team<br>
10. fpl_points : : FPL points accumulated over the previous season<br>
11. region : 1-for England 2-for EU 3-for Americas 4-for Rest of World
12. big_club : Whether one of the Top 6 clubs
13. new_signing : Whether a new signing for 2017/18 (till 20th July)

# 2. Data Pre-Processing

In [68]:
# remove the club because I think if club belongs to a big club, it affects the market_value
# remove nationality because there is region feature
# from correlation, market_value, page_views, fpl_value, fpl_sel, fpl_points, big_club 만 가지고 해보기로함.
cols = epl_1.columns.tolist()
cols

['age',
 'position',
 'position_cat',
 'market_value',
 'page_views',
 'fpl_value',
 'fpl_sel',
 'fpl_points',
 'region',
 'new_foreign',
 'age_cat',
 'club_id',
 'big_club',
 'new_signing']

In [69]:
cols = cols[3:8]+cols[0:1]+cols[12:14]
cols

['market_value',
 'page_views',
 'fpl_value',
 'fpl_sel',
 'fpl_points',
 'age',
 'big_club',
 'new_signing']

In [70]:
epl_1 = epl_1[cols]
epl_1.head()

Unnamed: 0_level_0,market_value,page_views,fpl_value,fpl_sel,fpl_points,age,big_club,new_signing
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alexis Sanchez,65.0,4329,12.0,0.171,264,28,1,0
Mesut Ozil,50.0,4395,9.5,0.056,167,28,1,0
Petr Cech,7.0,1529,5.5,0.059,134,35,1,0
Theo Walcott,20.0,2393,7.5,0.015,122,28,1,0
Laurent Koscielny,22.0,912,6.0,0.007,121,31,1,0


In [71]:
epl_1.isnull().sum()

market_value    0
page_views      0
fpl_value       0
fpl_sel         0
fpl_points      0
age             0
big_club        0
new_signing     0
dtype: int64

# 3. Y : market _ value , X : rest of columns

In [72]:
# Target Variable ; y is market_value
# rest of the columns are X 
epl_final_y = epl_1.iloc[:,0]
epl_final_x = epl_1.iloc[:,1:len(epl_1.columns)]
epl_final_x.iloc[0:5,:]

Unnamed: 0_level_0,page_views,fpl_value,fpl_sel,fpl_points,age,big_club,new_signing
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alexis Sanchez,4329,12.0,0.171,264,28,1,0
Mesut Ozil,4395,9.5,0.056,167,28,1,0
Petr Cech,1529,5.5,0.059,134,35,1,0
Theo Walcott,2393,7.5,0.015,122,28,1,0
Laurent Koscielny,912,6.0,0.007,121,31,1,0


# 4. Normalization + PCA

In [73]:
# normalization
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [74]:
scaler = StandardScaler()
scaler.fit(epl_final_x)
epl_x_scaled = scaler.transform(epl_final_x)

epl_pca_num = PCA(n_components=7) 
epl_pca_num_res = epl_pca_num.fit(epl_x_scaled)

PCA_result = ['Variance', 'Variance\%','Cum\%'] 
epl_pca_num_res_pc = np.transpose(np.array(epl_pca_num_res.components_).reshape(np.shape(epl_pca_num_res.components_))) 
epl_pca_num_res_var = np.array(np.transpose(epl_pca_num_res.explained_variance_[:,np.newaxis])) 
epl_pca_num_res_varp = np.array(np.transpose(epl_pca_num_res.explained_variance_ratio_[:,np.newaxis])) 
epl_pca_num_res_sum = np.cumsum(np.round(epl_pca_num_res.explained_variance_ratio_, decimals=8)*100) 
epl_pca_num_res_varc = np.array(np.transpose(epl_pca_num_res_sum[:,np.newaxis])) 
epl_pca_num_res_tab = DataFrame(np.concatenate((epl_pca_num_res_pc, 
                                                   epl_pca_num_res_var, 
                                                   epl_pca_num_res_varp, 
                                                   epl_pca_num_res_varc), 
                                                  axis = 0), 
                                   index = np.concatenate((np.array(epl_final_x.columns), PCA_result)), 
                                   columns = ['PC1', 'PC2', 'PC3', 'PC4','PC5','PC6','PC7'])
epl_pca_num_res_tab


  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7
page_views,0.501267,-0.07402,0.001668,-0.08519,0.146323,0.583391,-0.611748
fpl_value,0.500039,-0.058566,-0.006652,0.096202,0.463477,0.191559,0.696941
fpl_sel,0.415297,0.166522,-0.135059,0.464373,-0.739456,0.052215,0.128037
fpl_points,0.444598,0.307848,0.18201,0.136973,0.270717,-0.702928,-0.297115
age,-0.019959,0.798112,0.323524,-0.410662,-0.106304,0.237798,0.146494
big_club,0.349768,-0.266266,-0.238843,-0.761774,-0.299398,-0.263724,0.101138
new_signing,0.07627,-0.400888,0.887057,-0.011855,-0.206776,-0.005218,0.060637
Variance,3.023708,1.167016,0.984894,0.723628,0.4926,0.385756,0.237615
Variance\%,0.431021,0.166355,0.140394,0.103151,0.070219,0.054989,0.033871
Cum\%,43.102123,59.737621,73.777017,84.09213,91.114012,96.612863,99.999999


In [86]:
epl_pca_num_res_ps = epl_pca_num.fit_transform(epl_x_scaled)
epl_pca_num_res_ps_tab = DataFrame(data= epl_pca_num_res_ps,
                                  index = epl_final_x.index,
                                  columns=['PS1', 'PS2', 'PS3', 'PS4','PS5','PS6','PS7'])
#epl_pca_final = epl_pca_num_res_ps_tab[:,0:20]
#epl_pca_final.iloc[0:10,:]
epl_pca_final = epl_pca_num_res_ps_tab.iloc[0:,:-2]
epl_pca_final.head()

Unnamed: 0_level_0,PS1,PS2,PS3,PS4,PS5
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alexis Sanchez,7.594551,1.040617,-0.276798,0.538106,1.665104
Mesut Ozil,5.045591,0.243611,-0.323124,-0.839073,1.819122
Petr Cech,1.72544,1.874666,0.1434,-1.649799,-0.405083
Theo Walcott,2.546598,0.108362,-0.373524,-1.250862,1.12048
Laurent Koscielny,1.109348,0.867138,-0.10789,-1.601989,0.389522


# 5. Split train:test = 60:40

In [87]:
from sklearn.model_selection import train_test_split
epl_final_x_train, epl_final_x_test, epl_final_y_train, epl_final_y_test = train_test_split(epl_pca_final,epl_final_y,test_size=0.4,random_state=777)
epl_final_x_test.describe()

Unnamed: 0,PS1,PS2,PS3,PS4,PS5
count,185.0,185.0,185.0,185.0,185.0
mean,-0.153949,0.047551,0.051996,-0.12962,-0.069664
std,1.42776,1.101378,1.029701,0.803558,0.686372
min,-1.830986,-2.710924,-1.562722,-2.318441,-2.696341
25%,-1.189284,-0.757145,-0.555592,-0.681486,-0.432481
50%,-0.556987,0.120256,-0.20464,0.095958,-0.013074
75%,0.54896,0.776407,0.211979,0.402362,0.257043
max,5.963389,3.119848,3.116929,1.598039,1.968401


# 6. Linear Regression

In [88]:
from sklearn.linear_model  import LinearRegression
sl_lr = LinearRegression().fit(epl_final_x_train,epl_final_y_train)

#result of mathematical model
print("lr.coef_  : {}\n".format(sl_lr.coef_))
print("lr.intercept_ : {}\n".format(sl_lr.intercept_))

lr.coef_  : [ 6.04603232 -1.40355346 -0.19742311 -0.87994567  1.4413351 ]

lr.intercept_ : 10.692877769339479



In [89]:
# result of model performace
print("Training set score : {:.2f}".format(sl_lr.score(epl_final_x_train,epl_final_y_train)))
print("Test set score {:.2f}".format(sl_lr.score(epl_final_x_test,epl_final_y_test)))

Training set score : 0.77
Test set score 0.71


# 7. OLS report

In [90]:
import statsmodels.api as sm

In [91]:
sm_lr = sm.OLS(epl_final_y_train,epl_final_x_train).fit()
sm_lr.summary()

0,1,2,3
Dep. Variable:,market_value,R-squared:,0.501
Model:,OLS,Adj. R-squared:,0.492
Method:,Least Squares,F-statistic:,54.36
Date:,"Sat, 22 Jun 2019",Prob (F-statistic):,5.76e-39
Time:,02:31:34,Log-Likelihood:,-1084.5
No. Observations:,276,AIC:,2179.0
Df Residuals:,271,BIC:,2197.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
PS1,6.2970,0.393,16.020,0.000,5.523,7.071
PS2,-1.6605,0.702,-2.364,0.019,-3.043,-0.278
PS3,-0.5971,0.775,-0.771,0.441,-2.122,0.928
PS4,0.3290,0.862,0.382,0.703,-1.369,2.027
PS5,2.5222,1.056,2.388,0.018,0.443,4.602

0,1,2,3
Omnibus:,28.167,Durbin-Watson:,0.554
Prob(Omnibus):,0.0,Jarque-Bera (JB):,127.206
Skew:,0.16,Prob(JB):,2.39e-28
Kurtosis:,6.31,Cond. No.,2.72


In [92]:
from sklearn.svm import SVR
from sklearn.feature_selection import RFE
from sklearn import datasets

selector = RFE(sl_lr,3).fit(epl_final_x_train,epl_final_y_train)
print(selector.support_)
print(selector.ranking_)

[ True  True False False  True]
[1 1 3 2 1]


In [93]:
# re-run
epl_final_train_x_selected = epl_final_x_train.iloc[:,selector.support_]
epl_final_test_x_selected = epl_final_x_test.iloc[:,selector.support_]
epl_final_train_x_selected.head()

Unnamed: 0_level_0,PS1,PS2,PS5
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ruben Loftus-Cheek,-0.09428,-0.69327,-1.877762
Chris Smalling,0.367406,-0.294291,-0.117888
Karl Darlow,-1.532859,-0.133354,-0.117887
Tom Cleverley,-0.70505,0.30202,0.499109
Granit Xhaka,1.238648,-0.765266,0.193991


In [94]:
from sklearn.linear_model  import LinearRegression
sl_lr_selected = LinearRegression().fit(epl_final_train_x_selected,epl_final_y_train)
# result of model performace
print("Training set score : {:.2f}".format(sl_lr_selected.score(epl_final_train_x_selected,epl_final_y_train)))
print("Test set score {:.2f}".format(sl_lr_selected.score(epl_final_test_x_selected,epl_final_y_test)))

Training set score : 0.77
Test set score 0.70
