In [116]:
import pandas as pd

df = pd.read_csv('../data/whoscored/whoscored_10000_matches.csv')

# 1. DATA CLEANING

In [117]:
print('all matchs:',df.shape)
df = df.loc[df['home_possession'] != 'null']
print('all detailed matchs:',df.shape)

all matchs: (9661, 27)
all detailed matchs: (761, 27)


In [118]:
df['match_id']=df['match_report-href'].str.extract('(\d+)',expand=False).astype(int)
df.drop('match_report-href',axis=1,inplace=True)
df=df.sort_values(by='match_id', ascending=1)

In [119]:
df['home_team_id']=df['home_team-href'].str.extract('(\d+)',expand=False).astype(int)
df['away_team_id']=df['away_team-href'].str.extract('(\d+)',expand=False).astype(int)
df.drop(['home_team-href','away_team-href'],axis=1,inplace=True)
df['tournament_id']=df['tournament-href'].str.extract('(\d+)',expand=False).astype(int)
df.drop(['tournament-href'],axis=1,inplace=True)

## 1.1. Inspect Tournament INFO

In [120]:
df[['match_report','tournament','tournament_id','country']].groupby(['country','tournament','tournament_id']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,match_report
country,tournament,tournament_id,Unnamed: 3_level_1
Brazil,Brasileirão - 2016,31,1
England,Championship - 2016/2017,252,271
England,Premier League - 2016/2017,252,8
Europe,UEFA Super Cup - 2015/2016,250,1
France,Ligue 1 - 2016/2017,74,187
International,European Championship - 2016 - EURO 1/8 Finals,247,2
Netherlands,Eredivisie - 2016/2017,155,153
Russia,Premier League - 2016/2017,182,136
Spain,Supercopa de Espana - 2015/2016,206,2


### only process tournment England Championahip (252)

In [121]:
df = df.loc[df['tournament_id'] == 252]

## 1.2. convert percentage to fraction

In [122]:
import numpy as np
df['away_possession']=df['away_possession'].replace('%','',regex=True).astype('float')/100
df['home_possession']=df['home_possession'].replace('%','',regex=True).astype('float')/100
df['away_aerial_duel_success']=df['away_aerial_duel_success'].replace('%','',regex=True).astype('float')/100
df['home_aerial_duel_success']=df['home_aerial_duel_success'].replace('%','',regex=True).astype('float')/100
df['home_pass_success']=df['home_pass_success'].replace('%','',regex=True).astype('float')/100
df['away_pass_success']=df['away_pass_success'].replace('%','',regex=True).astype('float')/100

## 1.3. extract home and away goal

In [123]:
df_ht=df['half_time'].str.split(':',expand=True)
df_ht.columns=['home_half_time_goal','away_half_time_goal']
df_ft=df['full_time'].str.split(':',expand=True)
df_ft.columns=['home_full_time_goal','away_full_time_goal']

df = pd.concat([df, df_ht], axis=1, join_axes=[df.index])
df = pd.concat([df, df_ft], axis=1, join_axes=[df.index])

## 1.4. change data type to integer

In [124]:
df['home_full_time_goal']=df['home_full_time_goal'].astype(int)
df['away_full_time_goal']=df['away_full_time_goal'].astype(int)
df['home_half_time_goal']=df['home_half_time_goal'].astype(int)
df['away_half_time_goal']=df['away_half_time_goal'].astype(int)
df['home_shots']=df['home_shots'].astype(int)
df['away_shots']=df['away_shots'].astype(int)
df['home_shots_on_target']=df['home_shots_on_target'].astype(int)
df['away_shots_on_target']=df['away_shots_on_target'].astype(int)
df['home_dribbles_won']=df['home_dribbles_won'].astype(int)
df['away_dribbles_won']=df['away_dribbles_won'].astype(int)
df['home_tackles']=df['home_tackles'].astype(int)
df['away_tackles']=df['away_tackles'].astype(int)

# 2. feature engineering
## 2.1. find goal difference

In [125]:
df['half_time_goal_diff']=df['home_half_time_goal']-df['away_half_time_goal']
df['full_time_goal_diff']=df['home_full_time_goal']-df['away_full_time_goal']

# 3. Training

In [126]:
home_input_columns=['home_team','home_shots','home_shots_on_target',
       'home_pass_success', 
       'home_aerial_duel_success', 
       'home_dribbles_won', 'home_tackles',
       'home_possession', 
       'home_team_id', 'home_half_time_goal',
       'home_full_time_goal']

away_input_columns=['away_shots', 
       'away_shots_on_target', 'away_pass_success',
       'away_aerial_duel_success',
       'away_dribbles_won',
       'away_tackles', 'away_possession', 
       'away_team_id', 
       'away_half_time_goal', 'away_full_time_goal']


home_team_info=df[home_input_columns].groupby('home_team_id').mean()
away_team_info=df[away_input_columns].groupby('away_team_id').mean()
home_team_info.head()

Unnamed: 0_level_0,home_shots,home_shots_on_target,home_pass_success,home_aerial_duel_success,home_dribbles_won,home_tackles,home_possession,home_half_time_goal,home_full_time_goal
home_team_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
19,10.916667,3.666667,0.741667,0.455,8.5,19.333333,0.485833,0.416667,1.25
20,14.818182,4.363636,0.773636,0.510909,8.454545,15.090909,0.530909,0.272727,0.727273
21,10.0,4.0,0.74,0.58,6.0,17.0,0.43,2.0,3.0
23,15.6,5.2,0.79,0.493,6.6,15.8,0.595,0.9,2.2
24,12.75,4.75,0.735,0.5675,9.166667,18.083333,0.48,0.666667,1.583333


## 3.1. prepare training set and test set

In [127]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

train_columns=['home_shots', 'away_shots', 'home_shots_on_target',
       'away_shots_on_target', 'home_pass_success', 'away_pass_success',
       'home_aerial_duel_success', 'away_aerial_duel_success',
       'home_dribbles_won', 'away_dribbles_won', 'home_tackles',
       'away_tackles', 'home_possession', 'away_possession']
target_column='full_time_goal_diff'
team_id_columns=['home_team_id','away_team_id']

#train_df, test_df = train_test_split(df[train_columns + [target_column] + team_id_columns], test_size = 0.2)
train_size=int(df.shape[0]*0.8)

train_df = df[train_columns + [target_column] + team_id_columns][:train_size]
test_df = df[train_columns + [target_column] + team_id_columns][train_size:]
train_x_df = train_df[train_columns]
train_y_df = train_df[target_column]
test_x_df = test_df[train_columns]
test_y_df = test_df[target_column]

## 3.2. linear regression

In [163]:
regr = LinearRegression()
regr.fit(train_x_df, train_y_df)
print('Coefficients: \n', regr.coef_)
print("Mean squared error: %.2f" % np.mean((regr.predict(test_x_df) - test_y_df) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(test_x_df, test_y_df))
print('Mean absolute error: %.2f' % mean_absolute_error(test_y_df,regr.predict(test_x_df)))


Coefficients: 
 [ -6.98157282e-02   4.30086639e-02   3.82026561e-01  -3.42443299e-01
   3.47836722e+00  -3.02189564e+00   9.99524832e+01   9.82296837e+01
   4.52181105e-02  -6.07347770e-03  -1.17633854e-02  -2.62111948e-02
  -2.52400960e+00   2.52400960e+00]
Mean squared error: 2.28
Variance score: 0.24
Mean absolute error: 1.20


## 3.3. Lasso

In [164]:
from sklearn.linear_model import Lasso
clf = Lasso(alpha=0.1)
clf.fit(train_x_df, train_y_df)

print('Coefficients: \n', clf.coef_)
print("Mean squared error: %.2f" % np.mean((clf.predict(test_x_df) - test_y_df) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % clf.score(test_x_df, test_y_df))
print('Mean absolute error: %.2f' % mean_absolute_error(test_y_df,clf.predict(test_x_df)))



Coefficients: 
 [ -7.12313049e-02   2.83112649e-02   3.58320974e-01  -3.22103352e-01
  -0.00000000e+00   0.00000000e+00   0.00000000e+00  -0.00000000e+00
   3.61939182e-02  -1.94466141e-04  -0.00000000e+00  -2.76031184e-02
  -0.00000000e+00   0.00000000e+00]
Mean squared error: 2.25
Variance score: 0.25
Mean absolute error: 1.23


## 3.4. Elastic Net

In [165]:
from sklearn.linear_model import ElasticNet

enf = ElasticNet(alpha=0.1)
enf.fit(train_x_df, train_y_df)

print('Coefficients: \n', enf.coef_)
print("Mean squared error: %.2f" % np.mean((enf.predict(test_x_df) - test_y_df) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % enf.score(test_x_df, test_y_df))
print('Mean absolute error: %.2f' % mean_absolute_error(test_y_df,enf.predict(test_x_df)))

Coefficients: 
 [-0.0777746   0.03495085  0.37373163 -0.33856046 -0.          0.          0.
 -0.          0.04197191 -0.00393122 -0.00158674 -0.02924034 -0.          0.        ]
Mean squared error: 2.25
Variance score: 0.25
Mean absolute error: 1.24


# 4. Prediction
## 4.1. Whoscored Team ID INFO

In [129]:
home_team_id_map=df.groupby(['home_team_id','home_team']).count().reset_index()[['home_team_id','home_team']]
away_team_id_map=df.groupby(['away_team_id','away_team']).count().reset_index()[['away_team_id','away_team']]


def team_id_to_name(team_id):
    away_team_id_name=away_team_id_map.loc[away_team_id_map['away_team_id']==team_id]    
    if away_team_id_name.size > 0:
        return away_team_id_name['away_team'].values[0]
    
    home_team_id_name=home_team_id_map.loc[home_team_id_map['home_team_id']==team_id]
    if home_team_id_name.size > 0:
        return home_team_id_name['home_team'].values[0]
    return None

for team_id in range(0, 1000):
    team_name = team_id_to_name(team_id)
    if team_name is not None:
        print(str(team_id),':',team_id_to_name(team_id))

13 : Arsenal
15 : Chelsea
18 : Southampton
19 : Leeds
20 : Derby
21 : Middlesbrough
23 : Newcastle
24 : Aston Villa
25 : Sheff Wed
26 : Liverpool
30 : Tottenham
31 : Everton
94 : Reading
96 : Stoke
142 : Barnsley
157 : Birmingham
158 : Blackburn
161 : Wolves
162 : Crystal Palace
165 : Ipswich
166 : Huddersfield
167 : Man City
168 : Norwich
170 : Fulham
171 : QPR
174 : Nottingham Forest
181 : Preston
182 : Bristol City
183 : Bournemouth
184 : Burnley
188 : Cardiff
189 : Brentford
194 : Wigan
210 : Rotherham
211 : Brighton
214 : Hull
259 : Swansea


In [130]:
home_team_info.head()

Unnamed: 0_level_0,home_shots,home_shots_on_target,home_pass_success,home_aerial_duel_success,home_dribbles_won,home_tackles,home_possession,home_half_time_goal,home_full_time_goal
home_team_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
19,10.916667,3.666667,0.741667,0.455,8.5,19.333333,0.485833,0.416667,1.25
20,14.818182,4.363636,0.773636,0.510909,8.454545,15.090909,0.530909,0.272727,0.727273
21,10.0,4.0,0.74,0.58,6.0,17.0,0.43,2.0,3.0
23,15.6,5.2,0.79,0.493,6.6,15.8,0.595,0.9,2.2
24,12.75,4.75,0.735,0.5675,9.166667,18.083333,0.48,0.666667,1.583333


## 4.2. Test Prediction

In [131]:
#try predict home team 19 vs away team 20
home_team_info_1=home_team_info.reset_index()
home_team_features=home_team_info_1.loc[home_team_info_1['home_team_id']==19]
del home_team_info_1
home_team_features

Unnamed: 0,home_team_id,home_shots,home_shots_on_target,home_pass_success,home_aerial_duel_success,home_dribbles_won,home_tackles,home_possession,home_half_time_goal,home_full_time_goal
0,19,10.916667,3.666667,0.741667,0.455,8.5,19.333333,0.485833,0.416667,1.25


In [132]:
away_team_info_1=away_team_info.reset_index()
away_team_features=away_team_info_1.loc[away_team_info_1['away_team_id']==20]
del away_team_info_1
away_team_features

Unnamed: 0,away_team_id,away_shots,away_shots_on_target,away_pass_success,away_aerial_duel_success,away_dribbles_won,away_tackles,away_possession,away_half_time_goal,away_full_time_goal
4,20,11.0,3.818182,0.744545,0.502727,7.454545,15.363636,0.480909,0.454545,1.090909


In [133]:
features = pd.concat([home_team_features.reset_index(), away_team_features.reset_index()], axis=1)[train_columns]
features

Unnamed: 0,home_shots,away_shots,home_shots_on_target,away_shots_on_target,home_pass_success,away_pass_success,home_aerial_duel_success,away_aerial_duel_success,home_dribbles_won,away_dribbles_won,home_tackles,away_tackles,home_possession,away_possession
0,10.916667,11.0,3.666667,3.818182,0.741667,0.744545,0.455,0.502727,8.5,7.454545,19.333333,15.363636,0.485833,0.480909


In [134]:
#normalize features
features['total_possession']=features['home_possession']+features['away_possession']
features['home_possession']=features['home_possession']/features['total_possession']
features['away_possession']=features['away_possession']/features['total_possession']
features.drop('total_possession',axis=1,inplace=True)
features

Unnamed: 0,home_shots,away_shots,home_shots_on_target,away_shots_on_target,home_pass_success,away_pass_success,home_aerial_duel_success,away_aerial_duel_success,home_dribbles_won,away_dribbles_won,home_tackles,away_tackles,home_possession,away_possession
0,10.916667,11.0,3.666667,3.818182,0.741667,0.744545,0.455,0.502727,8.5,7.454545,19.333333,15.363636,0.502547,0.497453


In [135]:
#test_x_df.head()
regr.predict(features)

array([-4.03896667])

## 4.3. Make prediction with all test data set

In [136]:
def predict_diff(home_team_id, away_team_id):
    home_team_info_1=home_team_info.reset_index()
    home_team_features=home_team_info_1.loc[home_team_info_1['home_team_id']==home_team_id]
    del home_team_info_1
    away_team_info_1=away_team_info.reset_index()
    away_team_features=away_team_info_1.loc[away_team_info_1['away_team_id']==away_team_id]
    del away_team_info_1
    away_team_features
    features = pd.concat([home_team_features.reset_index(), away_team_features.reset_index()], axis=1)[train_columns]
    features['total_possession']=features['home_possession']+features['away_possession']
    features['home_possession']=features['home_possession']/features['total_possession']
    features['away_possession']=features['away_possession']/features['total_possession']
    features.drop('total_possession',axis=1,inplace=True)
    return regr.predict(features)

In [137]:
test_df_predict = test_df[['home_team_id','away_team_id','full_time_goal_diff']]
test_df_predict['predict_goal_diff']=0
test_df_predict.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,home_team_id,away_team_id,full_time_goal_diff,predict_goal_diff
9164,25,165,-1,0
3645,157,211,-1,0
178,210,94,-1,0
7550,210,1786,-1,0
2179,157,142,-3,0


In [138]:
for index, row in test_df_predict.iterrows():
    predicted_diff=predict_diff(row['home_team_id'],row['away_team_id'])[0]    
    #print(row['full_time_goal_diff'],":",predicted_diff)
    row['predict_goal_diff']=predicted_diff

In [139]:
test_df_predict

Unnamed: 0,home_team_id,away_team_id,full_time_goal_diff,predict_goal_diff
9164,25,165,-1,1
3645,157,211,-1,5
178,210,94,-1,-7
7550,210,1786,-1,-5
2179,157,142,-3,0
5340,194,94,-3,0
3315,158,94,-1,0
1248,25,171,1,0
8333,161,171,-1,-2
2978,158,166,0,-8
