In [None]:
import os
import re
import json
import numpy as np
import pandas as pd

import matplotlib.cm as cm
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

import clean_reports
import preprocess_reports
import setup_predictor
from Model import *
from train_test_predictor import train_and_test

# Download the NLTK data packages named below so they are available locally
# before any tokenising / stop-word / lemmatising helpers are used.
# The calls are kept as separate statements so the cell still displays the
# return value of the last download.
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

: 

In [None]:
# Relative path of the prospect CSV that feeds the whole notebook.
DATASET = "data/prospect-data.csv"

# Build the working dataframe via the project's clean() helper (called with
# raw=True), then preview the first five rows.
data = clean_reports.clean(DATASET, raw=True)

data.head(5)

In [None]:
# Print the per-column summary of the loaded frame; verbose=True forces every
# column to be listed instead of a truncated overview.
data.info(verbose=True)

In [None]:
# Row filtering.
# Rows whose Team is 'SEA' are removed. Dropping Seattle may need revisiting
# later, but for clustering it should not matter.
data = data.loc[data['Team'].ne('SEA')]

# Disabled experiment: restrict the data to forwards only.
# data = data[data['Position'].isin(['C', 'LW', 'RW'])]

data.info()

In [None]:
# Spot-check five random rows (no seed is passed, so the rows differ per run).
data.sample(n=5)

In [6]:
# Words handed to remove_words() for every report column.
# The list (order and repeated entries included) is kept exactly as it was.
HOCKEY_WORDS = [
    "usntdp", "ntdp", "development", "program",
    "khl", "shl", "ushl", "ncaa", "ohl", "chl", "whl", "qmjhl",
    "sweden", "russia", "usa", "canada", "ojhl", "finland",
    "finnish", "swedish", "russian", "american", "wisconsin",
    "michigan", "bc", "boston", "london", "bchl", "kelowna",
    "liiga",
    "portland", "minnesota", "ska", "frolunda", "sjhl", "college",
    "center", "left", "right", "saginaw", "kelowna", "frolunda",
    "slovakia",
]

# Columns whose name starts with 'Description' hold the scouting reports.
mask = data.columns.str.match('Description')
scouting_reports = data.columns[mask]

# Run every non-empty report column through the NLTK-based preprocessor and
# store the cleaned text in a copy, leaving `data` untouched.
preprocessed_df = data.copy()
for report in scouting_reports:
    raw_reports = data[report]
    # Only columns that contain at least one report are processed.
    if not raw_reports.isnull().all():
        report_preprocessor = preprocess_reports.NltkPreprocessor(raw_reports)
        without_names = report_preprocessor.remove_names(data['Name'])
        without_padding = without_names.remove_whitespace()
        without_hockey_words = without_padding.remove_words(HOCKEY_WORDS)
        preprocessed_df.loc[:, report] = without_hockey_words.get_text()


In [7]:
# Reshape wide -> long: each report column becomes its own row, tagged with the
# originating column name in 'reporter' and its content in 'text'.
player_attribute_cols = [
    'Year', 'Position', 'Height', 'Weight',
    'Drafted', 'Team', 'Average Ranking', 'Name',
]
melted_reports = preprocessed_df.melt(
    id_vars=player_attribute_cols,
    value_vars=scouting_reports.tolist(),
    var_name='reporter',
    value_name='text',
)

# Rows without report text carry no information, so they are dropped.
long_df = melted_reports.dropna(subset=['text'])



In [9]:
# Load the cached OpenAI embeddings and join them onto the preprocessed reports.
# NOTE: when the CSV is missing this cell defines nothing, so any later cell
# that reads `openai_df` / `full_df` will fail with a NameError.
openai_embeddings_path = 'data/reports_with_embeddings.csv'
if os.path.exists(openai_embeddings_path):
    openai_df = pd.read_csv(openai_embeddings_path)

    # The 'embeddings' column is stored as text (presumably a stringified list
    # of floats -- confirm against the writer). Parse it as data with
    # json.loads rather than eval(), which would execute any code placed in
    # the CSV.
    embeddings = np.vstack([
        np.asarray(json.loads(cell), dtype=np.float64)
        for cell in openai_df['embeddings']
    ])

    # Keep one float64 vector per row in the frame; later cells read this column.
    openai_df['embeddings'] = list(embeddings)

    openai_cols = [f'openai{i}' for i in range(embeddings.shape[1])]

    # One numeric column per embedding dimension, built straight from the
    # stacked matrix.
    embeddings_df = pd.DataFrame(embeddings, columns=openai_cols)
    embeddings_df.loc[:, 'player_name'] = openai_df['player_name']

    # pd.merge defaults to an inner join: players without a matching
    # 'player_name' are dropped from full_df.
    full_df = pd.merge(preprocessed_df, embeddings_df, left_on='Name', right_on='player_name')

In [16]:
# Reduce the OpenAI embeddings to 20 principal components and join them onto
# the preprocessed reports (this replaces the full-width `full_df`).
# PCA is already imported in the notebook's import cell, so it is not
# re-imported here.
#
# random_state is fixed because PCA's default solver selection may use a
# randomized SVD for inputs of this size, which would otherwise make the
# components (and everything trained on them) vary between runs.
pca = PCA(n_components=20, random_state=42)

# Stack the per-row embedding vectors into a (rows, dims) matrix.
embedding_matrix = np.vstack(openai_df['embeddings'].to_numpy())
X_pca = pca.fit_transform(embedding_matrix)

openai_pca_cols = [f'openai_pca{i}' for i in range(X_pca.shape[1])]

embeddings_pca_df = pd.DataFrame(X_pca, columns=openai_pca_cols)

embeddings_pca_df.loc[:, 'player_name'] = openai_df['player_name']

# Inner join on the player name: rows without embeddings are dropped.
full_df = pd.merge(preprocessed_df, embeddings_pca_df, left_on='Name', right_on='player_name')

In [None]:
# Inspect the first 20 columns of the merged frame (row count and non-null
# counts) as a check that players have OpenAI embeddings attached.
full_df.iloc[:, :20].info(verbose=True)

In [18]:
# Feature groups shared by every candidate model.
numeric_cols = ['Height', 'Weight'] + openai_pca_cols
categorical_cols = ['Position']
# Raw report text is currently excluded (alternative: scouting_reports.tolist()).
text_cols = []

# Column arguments common to all four setup() calls; only `func` differs.
shared_setup_kwargs = dict(
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
    text_cols=text_cols,
)

lr_model = setup_predictor.setup(func=LogisticOrdinalRegression(), **shared_setup_kwargs)
svm_model = setup_predictor.setup(func=SVC(probability=True), **shared_setup_kwargs)
mlp_model = setup_predictor.setup(func=MLPClassifier(), **shared_setup_kwargs)
rf_model = setup_predictor.setup(func=RandomForestOrdinalClassifier(), **shared_setup_kwargs)

In [19]:
# Model inputs, target and grouping key, all taken from the merged frame.
feature_cols = numeric_cols + categorical_cols + text_cols
X = full_df[feature_cols]
y = full_df['Drafted']
groups = full_df['Name']

# Empty result tables; one row per evaluated model is added later.
metric_names = ['accuracy', 'f1', 'precision', 'recall']
mean_df = pd.DataFrame(columns=metric_names)
std_df = pd.DataFrame(columns=metric_names)

In [20]:
# Split by draft year: rows up to and including 2022 are used for training,
# rows from 2023 form the test set.
train_idx = full_df[full_df['Year'] <= 2022].index.tolist()
test_idx = full_df[full_df['Year'] == 2023].index.tolist()

# train_idx / test_idx contain index *labels*, so rows are selected with .loc.
# Positional .iloc only returns the same rows while the index happens to be a
# clean 0..n-1 range, and it disagrees with the label-based `groups[train_idx]`
# lookups used further down.
X_train = X.loc[train_idx]
y_train = y.loc[train_idx]
X_test = X.loc[test_idx]
y_test = y.loc[test_idx]

In [None]:
# Random-forest pipeline: evaluate over a small hyper-parameter grid using the
# training rows only, then record the mean / std of each returned metric.
label = 'OpenAI_rand_forest_2023_prediction'

param_grid = {
    'clf__n_estimators': list(range(60, 110, 20)),  # 60, 80, 100
    'clf__max_depth': list(range(20, 100, 20)),     # 20, 40, 60, 80
}

train_groups = groups[train_idx]
rf_metrics = train_and_test(rf_model, X_train, y_train, train_groups, param_grid, notes=label)

rf_mean = {metric: np.mean(scores) for metric, scores in rf_metrics.items()}
rf_std = {metric: np.std(scores) for metric, scores in rf_metrics.items()}

mean_df.loc[label] = pd.Series(rf_mean)
std_df.loc[label] = pd.Series(rf_std)


# 2023 Predictions

In [None]:
# Single fit with one fixed hyper-parameter setting (seeded for repeatability),
# replacing the previously built rf_model.
fixed_forest = RandomForestOrdinalClassifier(n_estimators=80, max_depth=40, random_state=42)
rf_model = setup_predictor.setup(
    func=fixed_forest,
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
    text_cols=text_cols,
)

rf_model.fit(X_train, y_train)

In [None]:
# Rank the 2023 class from the model's predictions (the test set is the whole
# class, so a full ordering is meaningful).
#
# A single argsort() returns the *ordering permutation* (element k is the row
# holding the k-th smallest prediction), not the rank of row k. Applying
# argsort() a second time inverts that permutation, so entry k is the 0-based
# rank of test row k (smallest prediction -> rank 0).
# kind='stable' keeps rows with equal predictions in their original order so
# the ranking is reproducible.
y_test_pred = rf_model.predict(X_test).argsort(kind='stable').argsort()

In [None]:
# Pair each test-set player name with its 1-based ranking value.
foo = groups[test_idx].to_frame(name='name')
foo['ranking'] = y_test_pred + 1

In [None]:
# Display the players ordered by ranking value, lowest first.
foo.sort_values('ranking', ascending=True)