In [1]:
import time
from math import sqrt
import pandas as pd
import numpy as np
from matplotlib import pyplot
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, f1_score
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 100)

In [12]:
df_stats=pd.read_csv('data/statistics.csv', index_col=None)
df_stats=df_stats.loc[df_stats['period']=='1ST']
df_stats['Ball possession']=df_stats['Ball possession'].apply(lambda x: float(x.replace('%',''))/100)

In [13]:
df_stats_h=df_stats.loc[df_stats['ishome']==1].reset_index(drop=True)
df_stats_a=df_stats.loc[df_stats['ishome']==0].reset_index(drop=True)

In [7]:
df_stats

Unnamed: 0,mid,period,ishome,Ball possession,Shots on target,Shots off target,Corner kicks,Offsides,Fouls,Yellow cards,...,Hit woodwork,Big chances,Big chances missed,Possession lost,Long balls,Crosses,Dribbles,Tackles,Interceptions,Clearances
724,5583114,1ST,1,0.71,3.0,7.0,2.0,0.0,,1.0,...,0.0,,,,,,,,,
725,5583114,1ST,0,0.29,0.0,1.0,0.0,0.0,,0.0,...,0.0,,,,,,,,,
730,5583116,1ST,1,0.62,5.0,6.0,6.0,3.0,,0.0,...,,,,,,,,,,
731,5583116,1ST,0,0.38,2.0,2.0,2.0,0.0,,2.0,...,,,,,,,,,,
736,5583118,1ST,1,0.30,1.0,0.0,2.0,1.0,,0.0,...,0.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154253,7775071,1ST,0,0.33,1.0,5.0,2.0,0.0,,1.0,...,0.0,0.0,0.0,,,,,,,
154264,7779099,1ST,1,0.64,3.0,5.0,6.0,0.0,,2.0,...,,1.0,1.0,,,,,,,
154265,7779099,1ST,0,0.36,0.0,1.0,0.0,1.0,,0.0,...,,0.0,0.0,,,,,,,
154270,7779100,1ST,1,0.58,2.0,1.0,4.0,0.0,,1.0,...,,0.0,0.0,,,,,,,


In [60]:
descr=pd.DataFrame(df_stats_h.describe().iloc[0])
descr['pct']=descr['count']/df_stats_h.shape[0]
keep=descr.loc[descr['pct']>.8].index
df_stats_h=df_stats_h[keep]
df_stats_a=df_stats_a[keep]

In [15]:
descr=pd.DataFrame(df_stats_h.describe().iloc[0])
descr['pct']=descr['count']/df_stats_h.shape[0]

In [5]:
df_incidents=pd.read_csv('data/incidents.csv', index_col=None)
df_incidents=df_incidents.loc[df_incidents['incidentType']!='varDecision']
df_incidents=df_incidents.loc[df_incidents['incidentType']!='injuryTime']
df_incidents=df_incidents.loc[df_incidents['time']>0]
df_incidents=df_incidents.loc[df_incidents['time']<91]
torem=['period','injuryTime']
df_incidents=df_incidents.loc[~(df_incidents['incidentType'].isin(torem))]
df_incidents.loc[(df_incidents['addedTime']>0) & (df_incidents['time']==90),'time']=90.5
df_incidents.loc[(df_incidents['addedTime']>0) & (df_incidents['time']==45),'time']=45.5

df_stats=pd.read_csv('data/statistics.csv', index_col=None)
df_stats=df_stats.loc[df_stats['period']=='1ST']
df_stats=df_stats.loc[df_stats['mid'].isin(df_incidents['mid'].unique())]
df_stats['Ball possession']=df_stats['Ball possession'].apply(lambda x: float(str(x).replace('%',''))/100)

df_stats_h=df_stats.loc[df_stats['ishome']==1].reset_index(drop=True)
df_stats_a=df_stats.loc[df_stats['ishome']==0].reset_index(drop=True)

descr=pd.DataFrame(df_stats_h.describe().iloc[0])
descr['pct']=descr['count']/df_stats_h.shape[0]
keep=descr.loc[descr['pct']>.8].index
df_stats_h=df_stats_h[keep]
df_stats_a=df_stats_a[keep]
cols_stats=df_stats_h.columns[1:]
cols_stats_diffs=[x+'_diff' for x in cols_stats]
stats_diffs=df_stats_h[cols_stats]-df_stats_a[cols_stats]
stats_diffs.columns=cols_stats_diffs
df_stats_h=pd.concat([df_stats_h,stats_diffs], axis=1)
df_stats_a=pd.concat([df_stats_a,stats_diffs*-1], axis=1)

df_incidents=df_incidents.loc[df_incidents['mid'].isin(df_stats['mid'].unique())]

df_graph=pd.read_csv('data/graph.csv', index_col=None)
df_graph=df_graph.loc[df_graph['minute']>0]
df_graph=df_graph.loc[df_graph['minute']<91]
df_graph.columns=['mid','time','graph']
df_graph=df_graph.loc[df_graph['mid'].isin(df_incidents['mid'].unique())]
df=df_graph.merge(df_incidents[['mid','time','position1','incidentType','isHome']], on=['mid','time'], how='left')

df=pd.get_dummies(df, columns=['incidentType'], prefix=['it'] )
df=pd.get_dummies(df, columns=['position1'], prefix=['p'] )
df=pd.get_dummies(df, columns=['isHome'], prefix=['h'] )

df['h_goal']=df['it_goal']*df['h_True']
df['a_goal']=df['it_goal']*df['h_False']

df['h_red']=(df['it_red']+df['it_yellowRed'])*df['h_True']
df['a_red']=(df['it_red']+df['it_yellowRed'])*df['h_False']

df['h_yellow']=(df['it_yellow']+df['it_yellowRed'])*df['h_True']
df['a_yellow']=(df['it_yellow']+df['it_yellowRed'])*df['h_False']

df['h_penalty']=df['it_inGamePenalty']*df['h_True']
df['a_penalty']=df['it_inGamePenalty']*df['h_False']

df['h_subs']=df['it_substitution']*df['h_True']
df['a_subs']=df['it_substitution']*df['h_False']

df.drop(['it_yellowRed','it_inGamePenalty','it_substitution','it_yellow','it_red','it_goal','h_True','h_False'], axis = 1, inplace=True) 
df['half']=np.where(df['time']<46,0,1)
df['half_dyn']=df['time'].apply(lambda x: 1 if x<30 else 2 if x<46 else 0)


In [6]:
df_half=df.loc[df['half']==0].groupby('mid')[['graph','p_D','p_F','p_G','p_M','h_goal','a_goal','h_red','a_red','h_yellow','a_yellow','h_penalty','a_penalty','h_subs','a_subs']].mean().reset_index()

df_h1=df.loc[df['half']==0].groupby('mid')[['h_goal','a_goal']].sum().reset_index()
df_h1[['h_goal','a_goal']]=df_h1[['h_goal','a_goal']].astype(float)
df_h1.columns=['mid','h_goal_h1','a_goal_h1']

df_h2=df.loc[df['half']==1].groupby('mid')[['h_goal','a_goal']].sum().reset_index()
df_h2[['h_goal','a_goal']]=df_h2[['h_goal','a_goal']].astype(float)
df_h2.columns=['mid','h_goal_h2','a_goal_h2']

df_ft=df.groupby('mid')[['h_goal','a_goal']].sum().reset_index()
df_ft[['h_goal','a_goal']]=df_ft[['h_goal','a_goal']].astype(float)
df_ft.columns=['mid','h_goal_ft','a_goal_ft']

df_half=df_half.merge(df_h1, on='mid', how='left')
df_half=df_half.merge(df_h2, on='mid', how='left')
df_half=df_half.merge(df_ft, on='mid', how='left')

df_half['h1_res']=np.where(df_half['h_goal_h1']==df_half['a_goal_h1'],0,np.where(df_half['h_goal_h1']>df_half['a_goal_h1'],1,2))
df_half['h2_res']=np.where(df_half['h_goal_h2']==df_half['a_goal_h2'],0,np.where(df_half['h_goal_h2']>df_half['a_goal_h2'],1,2))
df_half['ft_res']=np.where(df_half['h_goal_ft']==df_half['a_goal_ft'],0,np.where(df_half['h_goal_ft']>df_half['a_goal_ft'],1,2))

# DIFFS 

df_half['goal_d']=df_half['h_goal_h1']-df_half['a_goal_h1']
df_half['red_d']=df_half['h_red']-df_half['a_red']
df_half['yellow_d']=df_half['h_yellow']-df_half['a_yellow']
df_half['penalty_d']=df_half['h_penalty']-df_half['a_penalty']
df_half['subs_d']=df_half['h_subs']-df_half['a_subs']
df_half.drop(columns=['p_D', 'p_F', 'p_G', 'p_M', 'h_goal', 'a_goal', 'h_red', 'a_red', 'h_yellow', 'a_yellow', 'h_penalty', 'a_penalty', 'h_subs', 'a_subs'], inplace=True)

# DYNAMICS
cols_dyn=['graph','h_goal','a_goal','h_red','a_red','h_yellow','a_yellow','h_penalty','a_penalty','h_subs','a_subs']
df_h1=df.loc[df['half_dyn']==1].groupby('mid')[cols_dyn].mean().reset_index()
df_h2=df.loc[df['half_dyn']==2].groupby('mid')[cols_dyn].mean().reset_index()
df_h1=df_h1.loc[df_h1['mid'].isin(df_h2['mid'])]
df_h1=df_h1.reset_index(drop=True)
df_h2=df_h2.reset_index(drop=True)
df_h2[cols_dyn]=df_h2[cols_dyn]-df_h1[cols_dyn]

df_h2['goal_dyn']=df_h2['h_goal']-df_h2['a_goal']
df_h2['red_dyn']=df_h2['h_red']-df_h2['a_red']
df_h2['yellow_dyn']=df_h2['h_yellow']-df_h2['a_yellow']
df_h2['penalty_dyn']=df_h2['h_penalty']-df_h2['a_penalty']
df_h2['subs_dyn']=df_h2['h_subs']-df_h2['a_subs']
df_h2=df_h2.rename(columns={'graph':'graph_dyn'})

df_half=df_half.merge(df_h2[['mid', 'graph_dyn', 'goal_dyn', 'red_dyn', 'yellow_dyn', 'penalty_dyn', 'subs_dyn']], on='mid', how='left')


In [8]:
df_home=df_half.copy()
df_home['home']=1
df_away=df_half.copy()
df_away['home']=0
df_away[['graph','goal_d','red_d','yellow_d','penalty_d','subs_d', 'graph_dyn', 'goal_dyn', 'red_dyn', 'yellow_dyn', 'penalty_dyn', 'subs_dyn']]=df_away[['graph','goal_d','red_d','yellow_d','penalty_d','subs_d', 'graph_dyn', 'goal_dyn', 'red_dyn', 'yellow_dyn', 'penalty_dyn', 'subs_dyn']]*-1
df_away['h1_res']=df_away['h1_res'].apply(lambda x: 0 if x==0 else 1 if x==2 else 2)
df_away['h2_res']=df_away['h2_res'].apply(lambda x: 0 if x==0 else 1 if x==2 else 2)
df_away['ft_res']=df_away['ft_res'].apply(lambda x: 0 if x==0 else 1 if x==2 else 2)
df_away=df_away.rename(columns={'h_goal_h1':'a_goal_h1', 'a_goal_h1':'h_goal_h1', 'h_goal_h2':'a_goal_h2', 'a_goal_h2':'h_goal_h2',
       'h_goal_ft':'a_goal_ft', 'a_goal_ft':'h_goal_ft'})
df_home=df_home.merge(df_stats_h, on='mid', how='left')
df_away=df_away.merge(df_stats_a, on='mid', how='left')
df_half=pd.concat([df_home, df_away]).reset_index(drop=True)
df_half=df_half.sample(frac=1).reset_index(drop=True)
df_half['htft_changed']=np.where(df_half['h1_res']==df_half['ft_res'],0,1)

In [10]:
print('H1-FT',f1_score(df_half['h1_res'], df_half['ft_res'], average='micro'))
print('H1-H2',f1_score(df_half['h1_res'], df_half['h2_res'], average='micro'))
print('HTFT Changed',df_half['htft_changed'].mean())

H1-FT 0.5956607495069034
H1-H2 0.37299429607121914
HTFT Changed 0.40433925049309666


In [11]:
#df_half=pd.get_dummies(df_half, columns=['h2_res'], prefix=['h2_res'] )
#df_half=pd.get_dummies(df_half, columns=['ft_res'], prefix=['ft_res'] )

cols_info=['mid', 'home', 'h_goal_h1', 'a_goal_h1', 'h_goal_h2', 'a_goal_h2', 'h_goal_ft', 'a_goal_ft','h1_res', 'h2_res','ft_res','htft_changed']
cols_num=['graph', 'goal_d', 'h_goal_h1', 'a_goal_h1','red_d','yellow_d','penalty_d','subs_d', 'graph_dyn', 'goal_dyn', 'red_dyn', 'yellow_dyn', 'penalty_dyn', 'subs_dyn']+list(cols_stats)+list(cols_stats_diffs)
cols_cat=['home','h1_res'] #h1_res
#cols_out=[ 'h2_res_0','h2_res_1','h2_res_2']
#cols_out=[ 'ft_res_0','ft_res_1','ft_res_2']
cols_out=[ 'h2_res','ft_res', 'htft_changed']
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(df_half[cols_num].values)
df_X=pd.concat([pd.DataFrame(data=scaled, columns=cols_num), df_half[cols_cat]], axis=1)
df_X[cols_cat].astype(int)
#df_X=pd.DataFrame(data=scaled, columns=cols_num)
df_y=df_half[cols_out]
df_info=df_half[cols_info]
cat_features=[len(df_X.columns)-2,len(df_X.columns)-1]


In [28]:
#df_X=pd.concat([pd.DataFrame(data=scaled, columns=cols_num), df_half[cols_info+cols_bin]], axis=1).values
#df_y=df_half[cols_out].values

#df_X=pd.concat([pd.DataFrame(data=scaled, columns=cols_num), df_half[cols_info+cols_bin]], axis=1)
#df_y=df_half[cols_out]

In [13]:
train_X, test_X, train_y, test_y, train_info, test_info = train_test_split(df_X, df_y, df_info, test_size=0.4, random_state=42)
validate_X, test_X, validate_y, test_y, validate_info, test_info = train_test_split(test_X, test_y, test_info, test_size=0.5, random_state=42)
print(train_X.shape, train_y.shape, train_info.shape, test_X.shape, test_y.shape ,test_info.shape, validate_X.shape, validate_y.shape ,validate_info.shape)

(22510, 46) (22510, 3) (22510, 12) (7504, 46) (7504, 3) (7504, 12) (7504, 46) (7504, 3) (7504, 12)


In [14]:
target='h2_res'
train_dataset = Pool(data=train_X, label=train_y[target], cat_features=cat_features)
eval_dataset = Pool(data=test_X, label=test_y[target], cat_features=cat_features)
validate_dataset = Pool(data=validate_X, label=validate_y[target], cat_features=cat_features)
model = CatBoostClassifier(iterations=10000, learning_rate=.001, depth=4, loss_function='MultiClass')
model.fit(train_dataset,eval_set=validate_dataset,early_stopping_rounds=50,use_best_model=True, verbose=False)
preds_class = model.predict(eval_dataset)
#preds_proba = model.predict_proba(eval_dataset)

In [15]:
dfr=pd.concat([test_info.reset_index(drop=True),pd.DataFrame(preds_class, columns=['pred_h2'])], axis=1)

In [16]:
target='ft_res'
train_dataset = Pool(data=train_X, label=train_y[target], cat_features=cat_features)
eval_dataset = Pool(data=test_X, label=test_y[target], cat_features=cat_features)
validate_dataset = Pool(data=validate_X, label=validate_y[target], cat_features=cat_features)
model = CatBoostClassifier(iterations=10000, learning_rate=.001, depth=4, loss_function='MultiClass')
model.fit(train_dataset,eval_set=validate_dataset,early_stopping_rounds=50,use_best_model=True, verbose=False)
preds_class = model.predict(eval_dataset)

In [17]:
dfr=pd.concat([dfr.reset_index(drop=True),pd.DataFrame(preds_class, columns=['pred_ft'])], axis=1)

In [18]:
target='htft_changed'
train_dataset = Pool(data=train_X, label=train_y[target], cat_features=cat_features)
eval_dataset = Pool(data=test_X, label=test_y[target], cat_features=cat_features)
validate_dataset = Pool(data=validate_X, label=validate_y[target], cat_features=cat_features)
model = CatBoostClassifier(iterations=10000, learning_rate=.001, depth=4)
model.fit(train_dataset,eval_set=validate_dataset,early_stopping_rounds=50,use_best_model=True)
preds_class = model.predict(eval_dataset)
preds_proba = model.predict_proba(eval_dataset)

:	learn: 0.5603634	test: 0.5638436	best: 0.5638436 (9784)	total: 6m 4s	remaining: 8.01s
9785:	learn: 0.5603616	test: 0.5638444	best: 0.5638436 (9784)	total: 6m 4s	remaining: 7.98s
9786:	learn: 0.5603600	test: 0.5638451	best: 0.5638436 (9784)	total: 6m 4s	remaining: 7.94s
9787:	learn: 0.5603586	test: 0.5638453	best: 0.5638436 (9784)	total: 6m 4s	remaining: 7.9s
9788:	learn: 0.5603573	test: 0.5638448	best: 0.5638436 (9784)	total: 6m 4s	remaining: 7.86s
9789:	learn: 0.5603560	test: 0.5638448	best: 0.5638436 (9784)	total: 6m 4s	remaining: 7.83s
9790:	learn: 0.5603545	test: 0.5638451	best: 0.5638436 (9784)	total: 6m 4s	remaining: 7.79s
9791:	learn: 0.5603527	test: 0.5638456	best: 0.5638436 (9784)	total: 6m 4s	remaining: 7.75s
9792:	learn: 0.5603525	test: 0.5638454	best: 0.5638436 (9784)	total: 6m 5s	remaining: 7.71s
9793:	learn: 0.5603518	test: 0.5638451	best: 0.5638436 (9784)	total: 6m 5s	remaining: 7.68s
9794:	learn: 0.5603507	test: 0.5638444	best: 0.5638436 (9784)	total: 6m 5s	remaining:

In [19]:
dfr=pd.concat([dfr.reset_index(drop=True),pd.DataFrame(preds_proba, columns=['pred_ch1','pred_ch0'])], axis=1)
dfr=pd.concat([dfr.reset_index(drop=True),pd.DataFrame(preds_class, columns=['pred_ch'])], axis=1)

In [20]:
dfr['h1_diff']=(dfr['h_goal_h1']-dfr['a_goal_h1']).abs()
dfr['h2_acc']=np.where(dfr['h2_res']==dfr['pred_h2'],1,0)
dfr['ft_acc']=np.where(dfr['ft_res']==dfr['pred_ft'],1,0)
dfr['ch_acc']=np.where(dfr['htft_changed']==dfr['pred_ch'],1,0)

In [21]:
dfr.groupby(['home','h1_diff'])[['htft_changed','pred_ch','h2_acc','ft_acc','ch_acc','pred_ch1','pred_ch0']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,htft_changed,pred_ch,h2_acc,ft_acc,ch_acc,pred_ch1,pred_ch0
home,h1_diff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.0,0.632154,0.999357,0.437942,0.436656,0.632797,0.368327,0.631673
0,1.0,0.304749,0.003298,0.434697,0.695251,0.69723,0.703984,0.296016
0,2.0,0.127376,0.0,0.412548,0.872624,0.872624,0.871978,0.128022
0,3.0,0.013158,0.0,0.381579,0.986842,0.986842,0.928428,0.071572
0,4.0,0.021739,0.0,0.478261,0.978261,0.978261,0.933059,0.066941
0,5.0,0.0,0.0,0.0,1.0,1.0,0.941094,0.058906
0,6.0,0.0,0.0,0.625,1.0,1.0,0.926927,0.073073
0,8.0,0.0,0.0,0.0,1.0,1.0,0.895666,0.104334
1,0.0,0.631223,1.0,0.429625,0.436386,0.631223,0.362562,0.637438
1,1.0,0.296066,0.00207,0.437543,0.703244,0.703244,0.70379,0.29621


In [106]:
dfr[['htft_changed','pred_ch','h2_acc','ft_acc','ch_acc','pred_ch1','pred_ch0']].mean()

htft_changed    0.400928
pred_ch         0.417169
h2_acc          0.429234
ft_acc          0.619954
ch_acc          0.710905
pred_ch1        0.595688
pred_ch0        0.404312
dtype: float64

In [105]:
#print('macro',f1_score(test_y.values, preds_class, average='macro'))
print('micro',f1_score(test_y.values, preds_class, average='micro'))
print('H1-H2',f1_score(test_info['h1_res'], test_info['h2_res'], average='micro'))
#print('weighted',f1_score(test_y.values, preds_class, average='weighted'))

ValueError: Classification metrics can't handle a mix of multiclass-multioutput and binary targets

In [139]:
test_y.values

array([[1],
       [0],
       [2],
       ...,
       [1],
       [2],
       [0]], dtype=int64)

In [140]:
preds_class

array([[1],
       [0],
       [2],
       ...,
       [1],
       [2],
       [0]], dtype=int64)

In [104]:
dfr.to_csv('pred.csv')

In [None]:
# design network
model = keras.Sequential()
model.add(layers.Dense(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(layers.Dense(1))
model.compile(loss='mae', optimizer='adam')
# fit network
history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=2, shuffle=False)
# plot history
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

In [None]:
yhat.min()

In [None]:
# make a prediction
yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
# invert scaling for forecast
inv_yhat = np.concatenate((yhat, test_X[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = np.concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

In [None]:
df.loc[df['mid']==8699223]