In [26]:
import pandas as pd
import ast
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

## Graph Model - No Updates

## Elective Graph ML

#### Model - GraphSAGE

After some further research by the Seattle Kraken's analytics department, the team has decided to try to implement some graph machine learning into their methods to characterize NHL players. One model of interest is the GraphSAGE model, which embeds player features into float vectors. Using player nodes and the PLAYS_WITH edge, GraphSAGE will transform player attributes into a usable format for an ML classification model. The model will be trained on player degree, which has been calculated based on how many PLAYS_WITH edges a certain player has. Following training, the model will return each player and their associated embedding vector.

#### Code

CALL gds.graph.create('Player_sage', <br>
&emsp;&emsp;&emsp;&emsp;                   'Player', {PLAYS_WITH: { <br>
&emsp;&emsp;&emsp;&emsp;                          type: 'PLAYS_WITH', <br>
&emsp;&emsp;&emsp;&emsp;                          orientation: 'UNDIRECTED'}})

CALL gds.degree.mutate('Player_sage', {mutateProperty: 'degree'}) <br>
YIELD nodePropertiesWritten


CALL gds.beta.graphSage.train( <br>
&emsp;&emsp;  'Player_sage', <br>
&emsp;  { <br>
&emsp;&emsp;    modelName: 'graphSage', <br>
&emsp;&emsp;    featureProperties: ['degree'], <br>
&emsp;&emsp;    learningRate:0.001, <br>
&emsp;&emsp;    epochs:10 <br>
&emsp;  } <br>
)

CALL gds.beta.graphSage.stream( <br>
&emsp;&emsp;  'Player_sage', <br>
&emsp;  { <br>
&emsp;&emsp;    modelName: 'graphSage' <br>
&emsp;  }) <br>
YIELD nodeId, embedding <br>
RETURN gds.util.asNode(nodeId).first_name + ' ' + gds.util.asNode(nodeId).last_name AS Player, embedding <br>

In [2]:
# Load the per-player embedding vectors from CSV and preview the first rows.
# NOTE(review): presumably this file was exported from the GraphSAGE stream query shown above - confirm.
sage_results = pd.read_csv('player_embeddings.csv')
sage_results.head()

Unnamed: 0,Player,embedding
0,Shayne Gostisbehere,"[0.11226033574093451,0.13159826734459273,0.122..."
1,Claude Giroux,"[0.11221969980490319,0.1318266041815291,0.1232..."
2,Travis Konecny,"[0.11223483076995378,0.13173648205579874,0.123..."
3,Ivan Provorov,"[0.11223948957578837,0.1317080497596955,0.1230..."
4,Brandon Manning,"[0.11225547090125365,0.13161756503640212,0.122..."


In [3]:
# Summarize column dtypes and non-null counts to see how the embedding column was read in.
sage_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 733 entries, 0 to 732
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Player     733 non-null    object
 1   embedding  733 non-null    object
dtypes: object(2)
memory usage: 11.6+ KB


In [4]:
# The CSV stores each embedding as the string representation of a list, so parse it back
# into a real Python list with ast.literal_eval.
# Values that are no longer strings are passed through untouched, which keeps this cell safe
# to re-run: ast.literal_eval raises ValueError when handed an already-parsed list.
sage_results['embedding'] = sage_results['embedding'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [5]:
# Expand each embedding list into one column per element (embedding_0, embedding_1, ...).
# Building the frame directly from the list of lists avoids constructing a pandas Series for
# every row, which is what .apply(pd.Series) does. The original row index is passed through
# so the result still aligns with sage_results.
embeddings = pd.DataFrame(
    sage_results['embedding'].tolist(), index=sage_results.index
).add_prefix('embedding_')

In [6]:
# Place the player names next to their expanded embedding columns (aligned on the row index),
# then preview the combined frame.
concat_sage = pd.concat(
    objs=[sage_results['Player'], embeddings],
    axis='columns',
)
concat_sage.head()

Unnamed: 0,Player,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_54,embedding_55,embedding_56,embedding_57,embedding_58,embedding_59,embedding_60,embedding_61,embedding_62,embedding_63
0,Shayne Gostisbehere,0.11226,0.131598,0.122953,0.148596,0.097817,0.088059,0.119645,0.1482,0.10925,...,0.15099,0.095985,0.161893,0.095068,0.153072,0.186881,0.133661,0.083386,0.175816,0.093023
1,Claude Giroux,0.11222,0.131827,0.123224,0.14891,0.097673,0.087653,0.119305,0.14801,0.108965,...,0.150739,0.095582,0.161601,0.094364,0.152951,0.187348,0.134062,0.083049,0.176651,0.092888
2,Travis Konecny,0.112235,0.131736,0.123127,0.148793,0.097722,0.087796,0.11943,0.148084,0.109067,...,0.15083,0.095729,0.161717,0.094628,0.153002,0.18718,0.133922,0.083164,0.176339,0.092935
3,Ivan Provorov,0.112239,0.131708,0.123098,0.148756,0.097737,0.08784,0.119468,0.148107,0.109097,...,0.150857,0.095775,0.161754,0.094709,0.153018,0.187128,0.133879,0.083198,0.176243,0.092949
4,Brandon Manning,0.112255,0.131618,0.122989,0.148631,0.097795,0.088003,0.119603,0.148182,0.109213,...,0.150959,0.095935,0.161869,0.09499,0.153065,0.186941,0.133717,0.083334,0.175909,0.093003


#### Classification model - let's see if we can predict the player's position based on their embeddings

In [7]:
# Load the game/player table; per the preview below it carries one row per game_id/player_id
# together with the player's name parts and primaryPosition.
players = pd.read_csv('game_player.csv')

In [8]:
# Preview the first rows of the game/player table.
players.head()

Unnamed: 0,game_id,player_id,firstName,lastName,primaryPosition
0,2019020016,8476906,Shayne,Gostisbehere,D
1,2019020045,8476906,Shayne,Gostisbehere,D
2,2019020073,8476906,Shayne,Gostisbehere,D
3,2019020088,8476906,Shayne,Gostisbehere,D
4,2019020095,8476906,Shayne,Gostisbehere,D


In [9]:
# Build a "first last" name key so it can be matched against the 'Player' column of the
# embedding table.
players['Player'] = players['firstName'].add(' ').add(players['lastName'])

# Collapse the per-game rows down to one row per distinct (Player, primaryPosition) pair.
player_position = players[['Player', 'primaryPosition']].drop_duplicates()
player_position.head()

Unnamed: 0,Player,primaryPosition
0,Shayne Gostisbehere,D
47,Dale Weise,RW
75,Claude Giroux,C
160,Travis Konecny,C
243,Ivan Provorov,D


In [10]:
# Attach each player's position to their embedding via an inner join on the player name.
# NOTE(review): the join key is the display name, so distinct players who share a name
# would be matched to the same embedding row - confirm names are unique in both tables.
sage_players_merge = pd.merge(
    player_position,
    concat_sage,
    how='inner',
    on='Player',
)
sage_players_merge.head()

Unnamed: 0,Player,primaryPosition,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_54,embedding_55,embedding_56,embedding_57,embedding_58,embedding_59,embedding_60,embedding_61,embedding_62,embedding_63
0,Shayne Gostisbehere,D,0.11226,0.131598,0.122953,0.148596,0.097817,0.088059,0.119645,0.1482,...,0.15099,0.095985,0.161893,0.095068,0.153072,0.186881,0.133661,0.083386,0.175816,0.093023
1,Claude Giroux,C,0.11222,0.131827,0.123224,0.14891,0.097673,0.087653,0.119305,0.14801,...,0.150739,0.095582,0.161601,0.094364,0.152951,0.187348,0.134062,0.083049,0.176651,0.092888
2,Travis Konecny,C,0.112235,0.131736,0.123127,0.148793,0.097722,0.087796,0.11943,0.148084,...,0.15083,0.095729,0.161717,0.094628,0.153002,0.18718,0.133922,0.083164,0.176339,0.092935
3,Ivan Provorov,D,0.112239,0.131708,0.123098,0.148756,0.097737,0.08784,0.119468,0.148107,...,0.150857,0.095775,0.161754,0.094709,0.153018,0.187128,0.133879,0.083198,0.176243,0.092949
4,Brandon Manning,D,0.112255,0.131618,0.122989,0.148631,0.097795,0.088003,0.119603,0.148182,...,0.150959,0.095935,0.161869,0.09499,0.153065,0.186941,0.133717,0.083334,0.175909,0.093003


In [11]:
# Encode the target: replace the position codes with integer class labels.
# The fitted encoder is kept in `le` so the integer labels can be mapped back later.
le = preprocessing.LabelEncoder()
le.fit(sage_players_merge['primaryPosition'])
sage_players_merge['primaryPosition'] = le.transform(sage_players_merge['primaryPosition'])

In [12]:
# Confirm that primaryPosition now holds integer class labels instead of position codes.
sage_players_merge.head()

Unnamed: 0,Player,primaryPosition,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_54,embedding_55,embedding_56,embedding_57,embedding_58,embedding_59,embedding_60,embedding_61,embedding_62,embedding_63
0,Shayne Gostisbehere,1,0.11226,0.131598,0.122953,0.148596,0.097817,0.088059,0.119645,0.1482,...,0.15099,0.095985,0.161893,0.095068,0.153072,0.186881,0.133661,0.083386,0.175816,0.093023
1,Claude Giroux,0,0.11222,0.131827,0.123224,0.14891,0.097673,0.087653,0.119305,0.14801,...,0.150739,0.095582,0.161601,0.094364,0.152951,0.187348,0.134062,0.083049,0.176651,0.092888
2,Travis Konecny,0,0.112235,0.131736,0.123127,0.148793,0.097722,0.087796,0.11943,0.148084,...,0.15083,0.095729,0.161717,0.094628,0.153002,0.18718,0.133922,0.083164,0.176339,0.092935
3,Ivan Provorov,1,0.112239,0.131708,0.123098,0.148756,0.097737,0.08784,0.119468,0.148107,...,0.150857,0.095775,0.161754,0.094709,0.153018,0.187128,0.133879,0.083198,0.176243,0.092949
4,Brandon Manning,1,0.112255,0.131618,0.122989,0.148631,0.097795,0.088003,0.119603,0.148182,...,0.150959,0.095935,0.161869,0.09499,0.153065,0.186941,0.133717,0.083334,0.175909,0.093003


In [13]:
# Target is the encoded position; features are every column except the name and the target.
y = sage_players_merge['primaryPosition']
X = sage_players_merge.drop(['Player', 'primaryPosition'], axis='columns')

# Hold out 33% of the players for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [40]:
# Candidate SVC hyperparameters; GridSearchCV cross-validates every combination.
# Note that 'degree' only affects the 'poly' kernel.
parameters = dict(
    kernel=('linear', 'poly', 'rbf'),
    C=(1.0, 0.8, 0.6, 0.4, 0.2, 0.1, 0.01),
    degree=(5, 4, 3, 2, 1),
)

svc = SVC()
grid = GridSearchCV(estimator=svc, param_grid=parameters)

# Run the search on the training split; as the cell's last expression, the fitted
# search object is displayed.
grid.fit(X_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': (1.0, 0.8, 0.6, 0.4, 0.2, 0.1, 0.01),
                         'degree': (5, 4, 3, 2, 1),
                         'kernel': ('linear', 'poly', 'rbf')})

In [41]:
# Show the hyperparameter combination that scored best during the cross-validated search.
grid.best_params_

{'C': 0.1, 'degree': 4, 'kernel': 'poly'}

In [42]:
# Predict positions for the held-out players using the estimator selected by the grid search.
grid_pred = grid.predict(X_test)

# Confusion matrix of true (rows) vs. predicted (columns) encoded positions.
# It must be computed from grid_pred, the predictions produced in this cell, so the matrix
# reflects the tuned model and the cell runs on a fresh kernel.
confusion_matrix(y_test, grid_pred)

array([[43, 38,  0,  0],
       [19, 53,  0,  0],
       [10, 38,  0,  0],
       [ 6, 35,  0,  0]], dtype=int64)

As you can see, the model only ever predicted classes 0 and 1, corresponding to center and defenseman. The optimized model found by GridSearchCV was unable to correctly classify any of the other player positions. Clearly, further optimization is needed to produce a useful classification model. However, this is a good first step toward using player embeddings from the GraphSAGE model as input to an ML classifier.

## Cypher Action 1

#### Search Phrase

"Which player have been involved in most goals and assists plays with Player $last_name"

#### Description

Find other players who are connected to the player of interest when the play event was either a goal or an assist

#### Code 

MATCH (p1:Player {last_name:$last_name})-[i1:INVOLVES]-(pl:Play)-[i2:INVOLVES]-(p2:Player) <br>
WHERE i1.event IN ['Goal','Assist'] <br>
WITH p1,i1, pl,i2,p2 ,count(i1.event) AS degree <br>
RETURN p1, i1, pl,i2,p2 <br>
ORDER BY degree DESC <br>

#### Importance

Goals and assists are the most important aspects of a hockey game. Goals win games. Therefore, it is important to know who is involved in the most plays in which a goal or assist occurred. This cypher action allows a user to input a player's last name and determine which players were also involved in those plays. This gives the user an idea of who that player plays well with and thus who they have chemistry with. Chemistry among players is a very important part of hockey, as players that do not play well together will not score goals.

## Cypher Action 2

#### Search Phrase

"Which players are in the community $community"

#### Description

Find the players that are similar to each other by visualizing a community via community ID lookup

#### Code

MATCH (p1:Player)-[r:PLAYS_WITH]-(p2:Player) <br>
WHERE p1.community = \\$community AND p2.community = \\$community <br>
RETURN p1, p2

#### Importance

Using the modularity optimization results previously calculated, this cypher action allows the user to identify players who are in the same community. If the team has found a player that they deem skilled, this action allows them to search for players who are similar via community ID. Good players are likely associated with other good players.

## Visualization 1

#### Top 10 Players by PageRank and their connection to other players, ranked by degree of connections

Cypher Query: <br>

MATCH (p:Player) WHERE EXISTS (p.pagerank) <br>
WITH p ORDER BY p.pagerank DESC LIMIT 10 <br>
CALL apoc.cypher.run(" <br>
&emsp;&emsp;    MATCH (p)<-[r:PLAYS_WITH]-(p2) <br>
&emsp;&emsp;    WITH p,p2,r, SIZE((p)<-[r:PLAYS_WITH]-(p2)) as plays_with_degree <br>
&emsp;&emsp;    RETURN p,r, p2 <br>
&emsp;&emsp;    ORDER BY plays_with_degree DESC <br>
&emsp;&emsp;    LIMIT 10", {p:p}) YIELD value <br>
RETURN value

<table><tr>
<td> <img src="viz1_overall_structure.png" alt="Drawing" style="width: 500px;"/> </td>
<td> <img src="viz1_zoomed.png" alt="Drawing" style="width: 1000px;"/> </td>
</tr></table>

Above are two screenshots of visualization 1: the left is a view of the overall structure, while the right is a zoomed-in figure showing example nodes and connections. The overall structure shows there are two main clusters from this query. This is a monopartite graph showing player-to-player relationships based on the PLAYS_WITH edge. There are two features encoded into this graph: 1) Size of node is proportional to PageRank score (higher score = larger node). 2) Color signifies player position: Light Yellow = Center, Magenta = Right Wing, Cyan = Left Wing, and Orange = Defensemen.
<br> <br>
There is a lot of useful information in this graph. It gives a visual representation of who the important players are, based on their PageRank scores. Extending outward, the graph helps to identify who these important players play with. This allows insight into who plays well with whom if they're on the same team, or who plays well against whom if they're on different teams. This knowledge can then be further broken down by player position.

## Visualization 2

Example use of Cypher Action 1: Which player have been involved in most goals and assists plays with Player Giroux

<img src='viz2.png'>

Visualization 2 is an example of using a dynamic Bloom search phrase to investigate which players are directly connected to Claude Giroux when the associated play was a goal or an assist. Above we can see that players like Jakub Voracek and Ivan Provorov are highly connected to Claude Giroux across many goal/assist plays. This unveils who Giroux plays well with, as well as implicitly identifying who he does not.