In [3]:
import pandas as pd
import numpy as np
import ast
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## Elective Graph ML

#### Code

CALL gds.graph.create('Player_sage', <br>
&emsp;&emsp;&emsp;&emsp;                   'Player', {PLAYS_WITH: { <br>
&emsp;&emsp;&emsp;&emsp;                          type: 'PLAYS_WITH', <br>
&emsp;&emsp;&emsp;&emsp;                          orientation: 'UNDIRECTED'}})

CALL gds.degree.mutate('Player_sage', {mutateProperty: 'degree'}) <br>
YIELD nodePropertiesWritten


CALL gds.beta.graphSage.train( <br>
&emsp;&emsp;  'Player_sage', <br>
&emsp;  { <br>
&emsp;&emsp;    modelName: 'graphSage', <br>
&emsp;&emsp;    featureProperties: ['degree'] <br>
&emsp;&emsp;    learningRate:0.001, <br>
&emsp;&emsp;    epochs:10 <br>
&emsp;  } <br>
)

CALL gds.beta.graphSage.stream( <br>
&emsp;&emsp;  'Player_sage', <br>
&emsp;  { <br>
&emsp;&emsp;    modelName: 'graphSage' <br>
&emsp;  }) <br>
YIELD nodeId, embedding <br>
RETURN gds.util.asNode(nodeId).first_name + ' ' + gds.util.asNode(nodeId).last_name AS Player, embedding <br>

In [4]:
sage_results = pd.read_csv('player_embeddings.csv')
sage_results.head()

Unnamed: 0,Player,embedding
0,Shayne Gostisbehere,"[0.11226033574093451,0.13159826734459273,0.122..."
1,Claude Giroux,"[0.11221969980490319,0.1318266041815291,0.1232..."
2,Travis Konecny,"[0.11223483076995378,0.13173648205579874,0.123..."
3,Ivan Provorov,"[0.11223948957578837,0.1317080497596955,0.1230..."
4,Brandon Manning,"[0.11225547090125365,0.13161756503640212,0.122..."


In [5]:
#the embedding vectors are being read in as a string representation of a list. Apply ast.literal_eval to convert to list
sage_results['embedding'] = sage_results['embedding'].apply(ast.literal_eval)

In [6]:
#expand each embedding element into its own column
embeddings = sage_results['embedding'].apply(pd.Series)
embeddings = embeddings.rename(columns = lambda x: 'embedding_' + str(x))

In [7]:
#concatentate the embedding columns with the original player names
concat_sage = pd.concat([sage_results['Player'], embeddings],axis=1)
concat_sage.head()

Unnamed: 0,Player,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_54,embedding_55,embedding_56,embedding_57,embedding_58,embedding_59,embedding_60,embedding_61,embedding_62,embedding_63
0,Shayne Gostisbehere,0.11226,0.131598,0.122953,0.148596,0.097817,0.088059,0.119645,0.1482,0.10925,...,0.15099,0.095985,0.161893,0.095068,0.153072,0.186881,0.133661,0.083386,0.175816,0.093023
1,Claude Giroux,0.11222,0.131827,0.123224,0.14891,0.097673,0.087653,0.119305,0.14801,0.108965,...,0.150739,0.095582,0.161601,0.094364,0.152951,0.187348,0.134062,0.083049,0.176651,0.092888
2,Travis Konecny,0.112235,0.131736,0.123127,0.148793,0.097722,0.087796,0.11943,0.148084,0.109067,...,0.15083,0.095729,0.161717,0.094628,0.153002,0.18718,0.133922,0.083164,0.176339,0.092935
3,Ivan Provorov,0.112239,0.131708,0.123098,0.148756,0.097737,0.08784,0.119468,0.148107,0.109097,...,0.150857,0.095775,0.161754,0.094709,0.153018,0.187128,0.133879,0.083198,0.176243,0.092949
4,Brandon Manning,0.112255,0.131618,0.122989,0.148631,0.097795,0.088003,0.119603,0.148182,0.109213,...,0.150959,0.095935,0.161869,0.09499,0.153065,0.186941,0.133717,0.083334,0.175909,0.093003


#### Classification model - let's see if we can predict the player's goals based on their embeddings

In [8]:
stats = pd.read_csv('first_300_game_stats.csv', index_col=0)
stats.head()

Unnamed: 0,game_id,firstName,lastName,goals
25937,2019020001,Dylan,DeMelo,0
25101,2019020001,Mark,Borowiecki,0
1233,2019020001,Artem,Anisimov,0
37697,2019020001,Scott,Sabourin,1
7788,2019020001,Morgan,Rielly,0


In [9]:
stats['Player'] = stats['firstName'] + ' ' + stats['lastName']
stats = stats[['Player', 'goals']].copy()

In [10]:
#group by player to count the number of goals they scored in the dataset
season_stats = stats.groupby('Player').sum()['goals'].reset_index()
season_stats.sort_values('goals', ascending=False)

Unnamed: 0,Player,goals
187,David Pastrnak,17
408,Leon Draisaitl,16
62,Auston Matthews,14
165,Connor McDavid,14
22,Alex Ovechkin,14
...,...,...
280,Jacob Middleton,0
561,Paul Carey,0
271,J.C. Beaudin,0
268,Isac Lundestrom,0


#### For simplicity sake, I am converting number of goals into three categories: 10 goals and up = 3, 6-10 goals = 2, 5 and below = 1

In [11]:
season_stats['goals'] = np.where(season_stats['goals'] >= 10, 3, (np.where(season_stats['goals'] <= 5, 1, 2)))
season_stats.sort_values('goals', ascending=False)

Unnamed: 0,Player,goals
22,Alex Ovechkin,3
88,Brayden Schenn,3
165,Connor McDavid,3
463,Matthew Tkachuk,3
187,David Pastrnak,3
...,...,...
267,Ilya Mikheyev,1
268,Isac Lundestrom,1
269,Ivan Barbashev,1
270,Ivan Provorov,1


In [12]:
sage_players_merge = season_stats.merge(concat_sage, on='Player')
sage_players_merge.head()

Unnamed: 0,Player,goals,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_54,embedding_55,embedding_56,embedding_57,embedding_58,embedding_59,embedding_60,embedding_61,embedding_62,embedding_63
0,Aaron Ekblad,1,0.112265,0.131563,0.122925,0.148558,0.097828,0.088098,0.119682,0.148227,...,0.151018,0.09603,0.161937,0.095156,0.153093,0.186831,0.133623,0.083413,0.175712,0.093035
1,Aaron Ness,1,0.11229,0.131449,0.122742,0.148369,0.097936,0.088375,0.119899,0.148328,...,0.151179,0.096287,0.162081,0.095576,0.153144,0.186522,0.133343,0.083663,0.17521,0.093131
2,Adam Boqvist,1,0.112257,0.131609,0.122982,0.148622,0.097798,0.088013,0.119612,0.148189,...,0.150965,0.095946,0.16188,0.095012,0.15307,0.186928,0.133708,0.083341,0.175883,0.093006
3,Adam Erne,1,0.112265,0.131574,0.122919,0.14856,0.097836,0.088109,0.119685,0.14822,...,0.15102,0.096033,0.161923,0.095149,0.153084,0.186824,0.133611,0.08343,0.17572,0.09304
4,Adam Fox,1,0.112259,0.131603,0.122962,0.148606,0.097811,0.088045,0.119632,0.148195,...,0.150983,0.095971,0.161886,0.095047,0.15307,0.186896,0.133675,0.083373,0.17584,0.093017


In [13]:
sage_players_merge['goals'].value_counts()

1    624
2     89
3     20
Name: goals, dtype: int64

In [14]:
X = sage_players_merge.drop(columns=['Player', 'goals'])
y = sage_players_merge['goals']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [15]:
parameters = {'kernel': ('linear', 'poly', 'rbf'), 'C':(1.0, 0.8, 0.6, 0.4, 0.2, 0.1, 0.01), 'degree':(5,4,3,2,1)}

svc = SVC(class_weight='balanced')

grid = GridSearchCV(svc, parameters)

grid.fit(X_train, y_train)

GridSearchCV(estimator=SVC(class_weight='balanced'),
             param_grid={'C': (1.0, 0.8, 0.6, 0.4, 0.2, 0.1, 0.01),
                         'degree': (5, 4, 3, 2, 1),
                         'kernel': ('linear', 'poly', 'rbf')})

In [16]:
grid.best_params_

{'C': 0.2, 'degree': 5, 'kernel': 'poly'}

In [17]:
grid_pred = grid.predict(X_test)

confusion_matrix(y_test, grid_pred)

array([[83, 86, 37],
       [ 9,  8, 11],
       [ 2,  4,  2]], dtype=int64)

In [36]:
print(classification_report(y_test, grid_pred, target_names=['class 1', 'class 2', 'class 3']))

              precision    recall  f1-score   support

     class 1       0.88      0.40      0.55       206
     class 2       0.08      0.29      0.13        28
     class 3       0.04      0.25      0.07         8

    accuracy                           0.38       242
   macro avg       0.33      0.31      0.25       242
weighted avg       0.76      0.38      0.49       242



In [18]:
def make_player():
    random_embedding = np.random.uniform(0,0.2,64).reshape(1,-1)
    return random_embedding

In [19]:
def predict_player(classifier):
    player = make_player()
    result = classifier.predict(player)
    return print("This player is predicted to score in group:", result[0])

In [38]:
predict_player(grid)

This player is predicted to score in group: 1
