# Documentation
## 1. Introduction
Using the soccer dataset, we are going to fit a linear regression model to predict the overall score of a player and then use techniques such as cross validation and lasso regression to improve it.



In [1]:
import scipy as sp
import os
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import re
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

# Load the dataset

In [2]:
pd.set_option('display.max_columns', None)
fifa = pd.read_csv("FIFA19data.csv", sep=r'\s*,\s*', engine='python')
fifa.head()

Unnamed: 0,ID,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,International Reputation,Weak Foot,Skill Moves,Work Rate,Body Type,Position,Contract Valid Until,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
0,158023,L. Messi,31,Argentina,94,94,FC Barcelona,�110.5M,�565K,5.0,4.0,4.0,Medium/ Medium,Messi,RF,2021,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0
1,20801,Cristiano Ronaldo,33,Portugal,94,94,Juventus,�77M,�405K,5.0,4.0,5.0,High/ Low,C. Ronaldo,ST,2022,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
2,190871,Neymar Jr,26,Brazil,92,93,Paris Saint-Germain,�118.5M,�290K,5.0,5.0,5.0,High/ Medium,Neymar,LW,2022,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0
3,193080,De Gea,27,Spain,91,93,Manchester United,�72M,�260K,4.0,3.0,1.0,Medium/ Medium,Lean,GK,2020,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0
4,192985,K. De Bruyne,27,Belgium,91,92,Manchester City,�102M,�355K,4.0,5.0,4.0,High/ High,Normal,RCM,2023,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0


# Data cleaning and transformation
### Eliminate unnecessary columns. 
#### Directly delete
1. ID
2. Name
3. Potential (no practical meaning)

#### Check for distribution
1. Body Type
2. Nationality
3. Club

In [3]:
fifa['Body Type'].value_counts()

Normal                 10595
Lean                    6417
Stocky                  1140
Neymar                     1
Messi                      1
Akinfenwa                  1
Shaqiri                    1
C. Ronaldo                 1
PLAYER_BODY_TYPE_25        1
Courtois                   1
Name: Body Type, dtype: int64

In [4]:
fifa.groupby('Body Type')['Overall'].mean()

Body Type
Akinfenwa              66.000000
C. Ronaldo             94.000000
Courtois               89.000000
Lean                   65.539816
Messi                  94.000000
Neymar                 92.000000
Normal                 66.512034
PLAYER_BODY_TYPE_25    88.000000
Shaqiri                81.000000
Stocky                 67.687719
Name: Overall, dtype: float64

Delete Body type because the average overall scores of the three main body types are very close, 
indicating it has no contribution to our dependent variable

In [6]:
fifa['Nationality'].value_counts()

England                 1662
Germany                 1198
Spain                   1072
Argentina                937
France                   914
Brazil                   827
Italy                    702
Colombia                 618
Japan                    478
Netherlands              453
Sweden                   397
China PR                 392
Chile                    391
Republic of Ireland      368
Mexico                   366
United States            353
Poland                   350
Norway                   341
Saudi Arabia             340
Denmark                  336
Korea Republic           335
Portugal                 322
Turkey                   303
Austria                  298
Scotland                 286
Belgium                  260
Australia                236
Switzerland              220
Uruguay                  149
Senegal                  130
                        ... 
Eritrea                    2
Uzbekistan                 2
Chad                       2
Bermuda       

In [7]:
fifa.groupby('Club')["Overall"].mean()

Club
1. FC Heidenheim 1846       65.750000
1. FC Kaiserslautern        63.384615
1. FC K�ln                  70.785714
1. FC Magdeburg             65.615385
1. FC N�rnberg              68.827586
1. FC Union Berlin          68.321429
1. FSV Mainz 05             70.843750
?l?sk Wroc?aw               62.200000
AC Ajaccio                  65.043478
AC Horsens                  60.640000
AD Alcorc�n                 67.413793
ADO Den Haag                66.678571
AEK Athens                  70.214286
AFC Wimbledon               60.461538
AIK                         65.074074
AJ Auxerre                  66.296296
AS B�ziers                  62.038462
AS Monaco                   72.939394
AS Nancy Lorraine           64.666667
AS Saint-�tienne            70.875000
AZ Alkmaar                  70.000000
Aalborg BK                  62.037037
Aarhus GF                   61.407407
Aberdeen                    64.333333
Accrington Stanley          61.178571
Adelaide United             61.400000
Ajax   

Delete nationality and club because of too many levels

In [9]:
fifa = fifa.drop('ID', 1)
fifa = fifa.drop('Name', 1)
fifa = fifa.drop('Nationality', 1)
fifa = fifa.drop('Club', 1)
fifa = fifa.drop('Body Type', 1)
fifa = fifa.drop('Potential', 1) 
fifa = fifa.drop('Value', 1)
fifa = fifa.drop('Contract Valid Until', 1)
fifa = fifa.drop('Work Rate', 1)

Transform Wage

In [11]:
value_wage = fifa['Wage'].apply(lambda x: x.strip('�MK'))

In [12]:
fifa.shape

(18207, 41)

In [13]:
fifa = fifa.drop('Wage', 1)

In [14]:
fifa = pd.concat([fifa,value_wage],axis = 1)

In [15]:
fifa.head()

Unnamed: 0,Age,Overall,International Reputation,Weak Foot,Skill Moves,Position,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Wage
0,31,94,5.0,4.0,4.0,RF,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,565
1,33,94,5.0,4.0,5.0,ST,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,405
2,26,92,5.0,5.0,5.0,LW,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,290
3,27,91,4.0,3.0,1.0,GK,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,260
4,27,91,4.0,5.0,4.0,RCM,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,355


Merge Position

In [17]:
fifa['Position'].value_counts()

ST     2152
GK     2025
CB     1778
CM     1394
LB     1322
RB     1291
RM     1124
LM     1095
CAM     958
CDM     948
RCB     662
LCB     648
LCM     395
RCM     391
LW      381
RW      370
RDM     248
LDM     243
LS      207
RS      203
RWB      87
LWB      78
CF       74
RAM      21
LAM      21
RF       16
LF       15
Name: Position, dtype: int64

In [18]:
fifa['Position'] = fifa['Position'].replace(['RB','LB','CB','LCB','RCB','RWB','LWB'],'DF')
fifa['Position'] = fifa['Position'].replace(['LM','RM','CM','CAM','CDM','LCM','RCM','LDM', 'RDM', 'RAM', 'LAM'],'MF')
fifa['Position'] = fifa['Position'].replace(['ST','LW','RW','LS','RS','CF','LF','RF'],'FW')

In [19]:
fifa.head()

Unnamed: 0,Age,Overall,International Reputation,Weak Foot,Skill Moves,Position,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Wage
0,31,94,5.0,4.0,4.0,FW,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,565
1,33,94,5.0,4.0,5.0,FW,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,405
2,26,92,5.0,5.0,5.0,FW,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,290
3,27,91,4.0,3.0,1.0,GK,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,260
4,27,91,4.0,5.0,4.0,MF,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,355


Checking for nulls

In [20]:
fifa.isnull().values.any()

True

In [21]:
fifa.describe() # There are 48 rows with only two columns being not NAs.

Unnamed: 0,Age,Overall,International Reputation,Weak Foot,Skill Moves,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
count,18207.0,18207.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0
mean,25.122206,66.238699,1.113222,2.947299,2.361308,49.734181,45.550911,52.298144,58.686712,42.909026,55.371001,47.170824,42.863153,52.711933,58.369459,64.614076,64.726967,63.503607,61.83661,63.966573,55.460047,65.089432,63.219946,65.311967,47.109973,55.868991,46.698276,49.958478,53.400903,48.548598,58.648274,47.281623,47.697836,45.661435,16.616223,16.391596,16.232061,16.388898,16.710887
std,4.669943,6.90893,0.394031,0.660456,0.756164,18.364524,19.52582,17.379909,14.699495,17.694408,18.910371,18.395264,17.478763,15.32787,16.686595,14.92778,14.649953,14.766049,9.010464,14.136166,17.237958,11.820044,15.894741,12.557,19.260524,17.367967,20.696909,19.529036,14.146881,15.704053,11.436133,19.904397,21.664004,21.289135,17.695349,16.9069,16.502864,17.034669,17.955119
min,16.0,46.0,1.0,1.0,1.0,5.0,2.0,4.0,7.0,4.0,4.0,6.0,3.0,9.0,5.0,12.0,12.0,14.0,21.0,16.0,2.0,15.0,12.0,17.0,3.0,11.0,3.0,2.0,10.0,5.0,3.0,3.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0
25%,21.0,62.0,1.0,3.0,2.0,38.0,30.0,44.0,54.0,30.0,49.0,34.0,31.0,43.0,54.0,57.0,57.0,55.0,56.0,56.0,45.0,58.0,56.0,58.0,33.0,44.0,26.0,38.0,44.0,39.0,51.0,30.0,27.0,24.0,8.0,8.0,8.0,8.0,8.0
50%,25.0,66.0,1.0,3.0,2.0,54.0,49.0,56.0,62.0,44.0,61.0,48.0,41.0,56.0,63.0,67.0,67.0,66.0,62.0,66.0,59.0,66.0,66.0,67.0,51.0,59.0,52.0,55.0,55.0,49.0,60.0,53.0,55.0,52.0,11.0,11.0,11.0,11.0,11.0
75%,28.0,71.0,1.0,3.0,3.0,64.0,62.0,64.0,68.0,57.0,68.0,62.0,57.0,64.0,69.0,75.0,75.0,74.0,68.0,74.0,68.0,73.0,74.0,74.0,62.0,69.0,64.0,64.0,64.0,60.0,67.0,64.0,66.0,64.0,14.0,14.0,14.0,14.0,14.0
max,45.0,94.0,5.0,5.0,5.0,93.0,95.0,94.0,93.0,90.0,97.0,94.0,94.0,93.0,96.0,97.0,96.0,96.0,96.0,96.0,95.0,95.0,96.0,97.0,94.0,95.0,92.0,95.0,94.0,92.0,96.0,94.0,93.0,91.0,90.0,92.0,91.0,90.0,94.0


In [22]:
fifa[(fifa['Position'].isnull().values==True) & (fifa['Weak Foot'].isnull().values==False)]

Unnamed: 0,Age,Overall,International Reputation,Weak Foot,Skill Moves,Position,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Wage
5018,37,70,1.0,3.0,2.0,,25.0,36.0,72.0,56.0,19.0,41.0,32.0,51.0,33.0,57.0,47.0,46.0,59.0,66.0,58.0,74.0,58.0,53.0,75.0,47.0,74.0,65.0,26.0,48.0,77.0,64.0,79.0,70.0,70.0,16.0,8.0,11.0,12.0,13.0,0
6736,33,68,1.0,3.0,3.0,,64.0,73.0,65.0,64.0,52.0,67.0,40.0,36.0,50.0,70.0,71.0,74.0,73.0,61.0,75.0,61.0,63.0,73.0,41.0,61.0,48.0,36.0,67.0,63.0,69.0,67.0,12.0,34.0,33.0,16.0,13.0,15.0,16.0,8.0,0
7922,33,67,1.0,3.0,2.0,,59.0,39.0,59.0,33.0,37.0,44.0,48.0,40.0,35.0,47.0,64.0,61.0,68.0,68.0,60.0,51.0,72.0,78.0,78.0,47.0,76.0,62.0,49.0,45.0,42.0,54.0,72.0,71.0,64.0,11.0,7.0,8.0,12.0,12.0,0
9905,23,66,1.0,4.0,3.0,,52.0,70.0,54.0,57.0,63.0,74.0,57.0,41.0,53.0,72.0,71.0,73.0,79.0,63.0,91.0,62.0,75.0,80.0,33.0,59.0,26.0,17.0,70.0,56.0,67.0,64.0,19.0,24.0,23.0,13.0,11.0,11.0,7.0,9.0,0
10628,26,65,1.0,2.0,2.0,,72.0,48.0,44.0,66.0,31.0,57.0,31.0,29.0,64.0,59.0,68.0,61.0,54.0,65.0,76.0,24.0,58.0,56.0,47.0,29.0,57.0,63.0,38.0,32.0,39.0,57.0,68.0,69.0,68.0,14.0,12.0,11.0,14.0,12.0,0
16450,31,57,1.0,3.0,1.0,,15.0,20.0,15.0,23.0,17.0,14.0,15.0,12.0,22.0,13.0,56.0,46.0,65.0,65.0,46.0,13.0,69.0,32.0,33.0,16.0,28.0,23.0,12.0,27.0,16.0,52.0,12.0,15.0,12.0,53.0,48.0,62.0,57.0,60.0,0
16539,23,57,1.0,4.0,2.0,,51.0,33.0,47.0,28.0,31.0,51.0,32.0,34.0,24.0,44.0,78.0,82.0,70.0,53.0,74.0,24.0,56.0,54.0,47.0,29.0,52.0,58.0,49.0,37.0,38.0,57.0,60.0,61.0,57.0,15.0,12.0,11.0,6.0,12.0,0
16793,31,56,1.0,3.0,2.0,,53.0,47.0,39.0,57.0,56.0,57.0,73.0,70.0,51.0,64.0,67.0,66.0,81.0,56.0,84.0,63.0,57.0,70.0,54.0,48.0,33.0,43.0,41.0,53.0,59.0,63.0,43.0,48.0,32.0,7.0,8.0,9.0,15.0,9.0,0
17129,26,55,1.0,4.0,2.0,,47.0,51.0,40.0,50.0,45.0,46.0,48.0,59.0,35.0,53.0,86.0,82.0,77.0,51.0,80.0,68.0,53.0,66.0,59.0,52.0,32.0,12.0,49.0,40.0,65.0,42.0,26.0,18.0,13.0,15.0,14.0,8.0,10.0,16.0,0
17339,23,54,1.0,3.0,2.0,,35.0,56.0,49.0,38.0,38.0,53.0,37.0,33.0,33.0,43.0,66.0,68.0,57.0,49.0,58.0,50.0,55.0,59.0,62.0,47.0,39.0,34.0,57.0,43.0,58.0,45.0,14.0,23.0,21.0,8.0,6.0,13.0,9.0,10.0,0


In [23]:
fifa[(fifa['Position'].isnull().values==True) & (fifa['Weak Foot'].isnull().values==True)].head()

Unnamed: 0,Age,Overall,International Reputation,Weak Foot,Skill Moves,Position,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Wage
13236,33,62,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1
13237,29,62,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1
13238,35,62,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3
13239,20,62,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1
13240,24,62,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1


 There are 48 rows that have most columns being NA and 12 rows whose 'position' with NA. 
 We decide to drop these rows because they are only a very small percent of the data.

In [24]:
fifa = fifa.dropna()

In [25]:
fifa.shape

(18147, 41)

# Linear regression

There are some columns which are highly position-specific. For example, some values like GK Positioning, GK Reflexes, etc. are always higher for the Goalkeepers and very low for outfield players. Hence, in order to accurately predict the overall score for Goalkeepers, we need to only consider the GK specific columns and not the rest. And so, we built a separate model for Goalkeepers, outfield players and a third model without the separation to observe if there is an improvement. Ideally, we would want to have four different models for each key position as some columns are more defend-specific and some are more attack-specific. But in the interest of this assignment, we demonstrated the idea for GK and non-GK players and continued with the other analysis. 

## fit model with GK data

In [26]:
fifa_GK = fifa[fifa['Position'] == 'GK']

In [27]:
fifa_GK = fifa_GK[['Age','Overall','International Reputation','GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
       'Wage']]

In [28]:
fifa_GK.head()

Unnamed: 0,Age,Overall,International Reputation,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Wage
3,27,91,4.0,90.0,85.0,87.0,88.0,94.0,260
9,25,90,3.0,86.0,92.0,78.0,88.0,89.0,94
18,26,89,3.0,87.0,85.0,88.0,85.0,90.0,240
19,26,89,4.0,85.0,91.0,72.0,86.0,88.0,240
22,32,89,5.0,90.0,86.0,91.0,87.0,87.0,130


In [29]:
# Creating dummy variables for categorical variables (in this case ordinal data)
for col in fifa_GK.columns:
    fifa_GK[col].fillna(value=fifa_GK[col].mode()[0], inplace=True)

factors = ['International Reputation']

for var in factors:
    cat_list='var'+'_'+var
    cat_list = pd.get_dummies(fifa_GK[var], prefix=var)
    fifa_GK = pd.concat([fifa_GK,cat_list], axis = 1)
    fifa_GK = fifa_GK.drop(var, 1)

In [30]:
X_GK = fifa_GK.copy()
X_GK = X_GK.drop('Overall', 1)
Y_GK = fifa_GK.copy()
Y_GK = Y_GK['Overall']

In [31]:
X_GK.shape

(2025, 12)

In [32]:
X_train_GK,X_test_GK,y_train_GK,y_test_GK=train_test_split(X_GK,Y_GK, test_size=0.9, random_state=31)

In [33]:
lm_GK = LinearRegression()
lm_GK.fit(X_train_GK, y_train_GK)
lm1_predictions_GK = lm_GK.predict(X_test_GK)
lm1_r2_GK = r2_score(y_test_GK,lm1_predictions_GK)
print(lm1_r2_GK)

0.990312597625216


## fit model with the remain data 

In [34]:
fifa_other = fifa[fifa['Position'] != 'GK']

In [35]:
fifa_other = fifa_other.drop(['GKDiving','GKHandling','GKKicking','GKPositioning'],axis = 1)

In [36]:
fifa_other.head()

Unnamed: 0,Age,Overall,International Reputation,Weak Foot,Skill Moves,Position,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKReflexes,Wage
0,31,94,5.0,4.0,4.0,FW,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,8.0,565
1,33,94,5.0,4.0,5.0,FW,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,11.0,405
2,26,92,5.0,5.0,5.0,FW,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,11.0,290
4,27,91,4.0,5.0,4.0,MF,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,13.0,355
5,27,91,4.0,4.0,4.0,FW,81.0,84.0,61.0,89.0,80.0,95.0,83.0,79.0,83.0,94.0,94.0,88.0,95.0,90.0,94.0,82.0,56.0,83.0,66.0,80.0,54.0,41.0,87.0,89.0,86.0,91.0,34.0,27.0,22.0,8.0,340


In [37]:
for col in fifa_other.columns:
    fifa_other[col].fillna(value=fifa_other[col].mode()[0], inplace=True)

factors = ['International Reputation', 'Weak Foot', 'Skill Moves', 'Position']

for var in factors:
    cat_list='var'+'_'+var
    cat_list = pd.get_dummies(fifa_other[var], prefix=var)
    fifa_other = pd.concat([fifa_other,cat_list], axis = 1)
    fifa_other = fifa_other.drop(var, 1)

In [38]:
fifa_other = fifa_other.copy()
X_other = fifa_other.drop('Overall', 1)
Y_other = fifa_other.copy()
Y_other = Y_other['Overall']

In [39]:
X_other.shape

(16122, 49)

In [40]:
X_train_other,X_test_other,y_train_other,y_test_other=train_test_split(X_other,Y_other, test_size=0.9, random_state=31)

In [41]:
lm_other = LinearRegression()
lm_other.fit(X_train_other, y_train_other)
lm1_predictions_other = lm_other.predict(X_test_other)
lm1_r2_other = r2_score(y_test_other,lm1_predictions_other)
print(lm1_r2_other)

0.8850051575389288


## fit model with all the data

In [42]:
for col in fifa.columns:
    fifa[col].fillna(value=fifa[col].mode()[0], inplace=True)

factors = ['International Reputation', 'Weak Foot', 'Skill Moves', 'Position']

for var in factors:
    cat_list='var'+'_'+var
    cat_list = pd.get_dummies(fifa[var], prefix=var)
    fifa = pd.concat([fifa,cat_list], axis = 1)
    fifa = fifa.drop(var, 1)

In [43]:
X = fifa.copy()
X = X.drop('Overall', 1)
Y = fifa.copy()
Y = Y['Overall']

In [44]:
X.shape

(18147, 55)

In [45]:
X_train,X_test,y_train,y_test=train_test_split(X,Y, test_size=0.9, random_state=31)

In [46]:
lm1 = LinearRegression()
lm1.fit(X_train, y_train)
lm1_predictions = lm1.predict(X_test)
lm1_r2 = r2_score(y_test,lm1_predictions)
print(lm1_r2)

0.8792659972267223


# CrossVal
#### GK

In [47]:
#cv_predictions = cross_val_predict(lm1, fifa, Y, cv=5)
#cv_r2 = r2_score(Y,cv_predictions)
cv_predictions_GK = cross_val_predict(lm_GK, X_test_GK, y_test_GK, cv=5)
cv_r2_GK = r2_score(y_test_GK,cv_predictions_GK)
print(cv_r2_GK)

0.9913316211091591


#### Other

In [48]:
cv_predictions_other = cross_val_predict(lm_other, X_test_other, y_test_other, cv=5)
cv_r2_other = r2_score(y_test_other,cv_predictions_other)
print(cv_r2_other)

0.8898574159970659


#### All the data

In [49]:
cv_predictions = cross_val_predict(lm1, X_test, y_test, cv=5)
cv_r2 = r2_score(y_test,cv_predictions)
print(cv_r2)

0.8809703182220356


When we fit a linear regression model with a 5-fold cross validation, we see that the R^2 value for each model increases. 
This is mainly because when we are using cross validation, it reduces the sampling variance as it ensures that each observation is left out in order to validate the model. In this case, there are 5 different folds or sets of the data that are used to validate the linear regression model. So there can be slightly varying results for the model on each combination of the folds, from which we could generate the most optimal set of parameter values to build the best regression model.

# Lasso
#### GK

In [50]:
lasso_GK = Lasso()
lasso_GK.fit(X_train_GK,y_train_GK)
lasso1_predictions_GK = lasso_GK.predict(X_test_GK)
train_score_GK=lasso_GK.score(X_train_GK,y_train_GK)
test_score_GK=lasso_GK.score(X_test_GK,y_test_GK)
coeff_used_GK = np.sum(lasso_GK.coef_!=0)
r2_lasso_GK = r2_score(y_test_GK, lasso1_predictions_GK)

In [51]:
print("training score:", train_score_GK)
print("test score: ", test_score_GK)
print("number of features used: ", coeff_used_GK)
print("test r2 score: ", r2_lasso_GK)

training score: 0.9911144174040093
test score:  0.9904218968405674
number of features used:  7
test r2 score:  0.9904218968405674


#### Other

In [52]:
lasso_other = Lasso()
lasso_other.fit(X_train_other,y_train_other)
lasso1_predictions_other = lasso_other.predict(X_test_other)
train_score_other=lasso_other.score(X_train_other,y_train_other)
test_score_other=lasso_other.score(X_test_other,y_test_other)
coeff_used_other = np.sum(lasso_other.coef_!=0)
r2_lasso_other = r2_score(y_test_other, lasso1_predictions_other)

In [53]:
print("training score:", train_score_other)
print("test score: ", test_score_other)
print("number of features used: ", coeff_used_other)
print("test r2 score: ", r2_lasso_other)

training score: 0.8652216400753341
test score:  0.8705230613901518
number of features used:  18
test r2 score:  0.8705230613901518


#### All the data

In [54]:
lasso = Lasso()
lasso.fit(X_train,y_train)
lasso1_predictions = lasso.predict(X_test)
train_score=lasso.score(X_train,y_train)
test_score=lasso.score(X_test,y_test)
coeff_used = np.sum(lasso.coef_!=0)
r2_lasso1 = r2_score(y_test, lasso1_predictions)

In [55]:
print("training score:", train_score)
print("test score: ", test_score)
print("number of features used: ", coeff_used)
print("test r2 score: ", r2_lasso1)

training score: 0.8754923713926073
test score:  0.8623543569626184
number of features used:  24
test r2 score:  0.8623543569626185


Lasso regression is an effective model to fit when the dataset has a large number of features. Alpha is a parameter to adjust penalty. With a higher value of alpha, fewer number of features will be included in the model and only those most important features are kept. The default value is usually 1, but when alpha becomes extremely large, all coefficient will be zero. 
From the result, we can observe that the number of features experiences a huge drop. Meanwhile, the R^2 of lasso regression decreases, because R^2 is determined by the proportion of the variance in the dependent variable that is predictable from independent variables. Thus, such a shrink in the number of features will definitely cause the R^2 to decrease. Specifically, we observe that the R^2 of test data is lower than that of train data, which also indicates that the lasso regression model with default value of Alpha is underfitting due to the reduction of too many variables.


# Finding ideal value of alpha
In order to select the best alpha value, a list of candidate values are created which contains: 1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1, 5, 10, 20. Leveraging cross-validation technique, we test and find that the best value of alpha is 0.01. By introducing Lasso Regression and α with the ideal value, the number of features naturally drops and the R^2 value also slightly increases. This is because using ideal value of alpha, for such a heterogenous dataset, lasso is using a set of most significant variables to seek maximum score. 

#### GK

In [56]:
lasso_GK = Lasso()

parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}

lasso_regressor_GK = GridSearchCV(lasso_GK, parameters, cv = 5)

lasso_regressor_GK.fit(X_train_GK, y_train_GK)



GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1, 5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [57]:
lasso_regressor_GK.best_params_

{'alpha': 0.01}

In [58]:
lasso_regressor_GK.score(X_train_GK,y_train_GK)

0.9918823327777881

In [59]:
coeff_used_GK = np.sum(lasso_regressor_GK.best_estimator_.coef_!=0)
print(coeff_used_GK)

7


In [60]:
lasso2_predictions_GK = lasso_regressor_GK.predict(X_test_GK)

In [61]:
lasso2_GK = r2_score(y_test_GK, lasso1_predictions_GK)

In [62]:
print("test r2 score: ", lasso2_GK)

test r2 score:  0.9904218968405674


#### Other

In [63]:
lasso_other = Lasso()

parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}

lasso_regressor_other = GridSearchCV(lasso_other, parameters, cv = 5)

lasso_regressor_other.fit(X_train_other, y_train_other)



GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1, 5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [64]:
lasso_regressor_other.best_params_

{'alpha': 0.01}

In [65]:
lasso_regressor_other.score(X_train_other,y_train_other)

0.8892989681133713

In [66]:
coeff_used_other = np.sum(lasso_regressor_other.best_estimator_.coef_!=0)
print(coeff_used_other)

39


In [67]:
lasso2_predictions_other = lasso_regressor_other.predict(X_test_other)

In [68]:
lasso2_other = r2_score(y_test_other, lasso2_predictions_other)

In [69]:
print("test r2 score: ", lasso2_other)

test r2 score:  0.8859130796697334


#### All the data

In [70]:
lasso = Lasso()

parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}

lasso_regressor = GridSearchCV(lasso, parameters, cv = 5)

lasso_regressor.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1, 5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [71]:
lasso_regressor.best_params_

{'alpha': 0.001}

In [72]:
lasso_regressor.score(X_train,y_train)

0.8935460129229055

In [73]:
coeff_used = np.sum(lasso_regressor.best_estimator_.coef_!=0)
print(coeff_used)

50


In [74]:
lasso2_predictions = lasso_regressor.predict(X_test)

In [75]:
lasso2 = r2_score(y_test, lasso2_predictions)

In [76]:
print("test r2 score: ", lasso2)

test r2 score:  0.879192966965149


# AIC BIC

In [77]:
def AIC(y_true, y_hat, coeff_used):
    resid = y_true - y_hat
    sse = sum(resid**2)
    n = len(y_hat)
    return n*np.log(sse/n)+2*coeff_used

def BIC(y_true, y_hat, coeff_used):
    resid = y_true - y_hat
    sse = sum(resid**2)
    n = len(y_hat)
    return n*np.log(sse/n)+np.log(n)*coeff_used

def AICc(y_true, y_hat, coeff_used):
    resid = y_true - y_hat
    sse = sum(resid**2)
    n = len(y_hat)
    return n*np.log(sse/n)+2*coeff_used*n/(n-coeff_used-1)

#### aic, bic and aicc of simple linear model

In [78]:
aic_lm1 = AIC(y_test, lm1_predictions, (len(X_test.columns)+1))
print('AIC: ',aic_lm1)
bic_lm1 = BIC(y_test, lm1_predictions, (len(X_test.columns)+1))
print('BIC: ',bic_lm1)
aicc_lm1 = AICc(y_test, lm1_predictions, (len(X_test.columns)+1))
print('AICc: ',aicc_lm1)

AIC:  28695.95735681517
BIC:  29127.210158098147
AICc:  28696.34959077929


#### aic, bic and aicc of lasso model

In [79]:
aic_lasso2 = AIC(y_test, lasso2_predictions, (coeff_used+1))
print('AIC: ',aic_lasso2)
bic_lasso2 = BIC(y_test, lasso2_predictions, (coeff_used+1))
print('BIC: ',bic_lasso2)
aicc_lasso2 = AICc(y_test, lasso2_predictions, (coeff_used+1))
print('AICc: ',aicc_lasso2)

AIC:  28695.833966763686
BIC:  29088.582053646398
AICc:  28696.15974527852


Both AIC and BIC for Lasso model are slightly lower than the values of linear model, so Lasso model is better. As long as the number of observations N is not too small (7 or less),  BIC is always less tolerant than AIC for free parameters. When N is sufficient, BIC can help us to reduce the chance of choosing a very large model, but it also leads to a larger chance of choosing a very small model. Thus, we can’t simply determine that BIC is always greater than AIC.
By comparing the way we calculate AIC and AICc, it is not hard to see AICc is always smaller than AIC. But as the ratio of N/df gets higher, AICc and AIC tend to be approximately equal, which applies to our case.
