In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
# Fetch the NLTK data packages used by this notebook: one nltk.download call
# per package, in the same order as before.
for nltk_resource in ("punkt", 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

import clean_reports
#import preprocess_reports



: 

## EDA Data Cleaning

In [None]:
##from sentence_transformers import SentenceTransformer. # will be used for the bert transformer in the future
# Location of the raw prospect CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere once it is made relative to the repository or configurable.
DATASET_PATH = "/Users/jeremygonsalves/Documents/GitHub/PredictingDraftNHL/Data/prospect-data.csv"

# Load the dataset into a DataFrame through the project's cleaning helper.
data = clean_reports.clean(DATASET_PATH, raw=True)

# DATASET is the full DataFrame (the cells below use it that way). The path now
# has its own name, so DATASET no longer switches from a string to a DataFrame
# halfway through the cell.
DATASET = data
print("Dataset saved as DATASET")

# Per-draft-year subsets.
DATASET2023 = data[data['Year'] == 2023]
DATASET2022 = data[data['Year'] == 2022]
DATASET2021 = data[data['Year'] == 2021]

# Preview the first rows. This must be the last expression of the cell to be
# displayed; in the middle of the cell its result was silently discarded.
data.head()

In [None]:
def plot_drafted_vs_ranking(data):
    """Scatter 'Average Ranking' against 'Drafted' with a dashed y = x reference line.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``'Drafted'`` and ``'Average Ranking'``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``; nothing is returned.
    """
    drafted = data['Drafted']
    ranking = data['Average Ranking']

    fig, ax = plt.subplots()
    ax.scatter(drafted, ranking)

    # Reference line y = x spanning the observed draft positions. Two endpoints
    # are enough for a straight line and, unlike np.arange(min, max), they
    # include the maximum position. The line is skipped when there is no usable
    # 'Drafted' value (empty or all-NaN column) instead of raising.
    lowest, highest = drafted.min(), drafted.max()
    if pd.notna(lowest) and pd.notna(highest):
        ax.plot([lowest, highest], [lowest, highest], 'y--', label="y=x")
        ax.legend()

    ax.set_xlabel("Drafted Position")
    ax.set_ylabel("Average Ranking")
    ax.set_title("Predictive Average Ranking")
    plt.show()

# Draw the plot for the full data set and for two individual draft years.
# plot_drafted_vs_ranking shows its figure itself and returns None, so it is
# called directly; wrapping it in print() only added a stray "None" line
# after every figure.
for heading, subset in (
    ("For All Data", DATASET),
    ("For 2022", DATASET2022),
    ("For 2021", DATASET2021),
):
    print("-----------------")
    print(heading)
    plot_drafted_vs_ranking(subset)

In [None]:
# Lookup table from position abbreviation to the full position name.
HOCKEY_POSITIONS = dict(
    C='Center',
    D='Defender',
    RW='Right Wing',
    LW='Left Wing',
    G='Goalie',
)

# Number of rows per value of the 'Position' column.
data['Position'].value_counts()

In [None]:
# Select the columns whose label matches 'Description'; judging by the variable
# name these are the scouting-report text columns.
column_labels = data.columns
match = column_labels.str.match('Description')
scouting_reports = column_labels[match]

# Independent deep copy, so that replacing report text with word counts further
# down in this cell leaves `data` untouched.
token_count = data.copy(deep=True)

# Define a function to count tokens (words) using split
def count_tokens(text):
    """Return the number of whitespace-separated words in ``text``.

    Any value that is not a ``str`` (for example a missing entry) yields
    ``np.nan`` instead of a count.
    """
    if not isinstance(text, str):
        return np.nan
    words = text.split()
    return len(words)

# Replace every scouting-report cell with its word count. DataFrame.applymap is
# deprecated in recent pandas releases, so Series.map is applied column by
# column instead; it is element-wise as well and works on old and new versions.
token_count[scouting_reports] = token_count[scouting_reports].apply(
    lambda column: column.map(count_tokens)
)

# Mean word count per report column, sorted ascending.
average_token_count = token_count[scouting_reports].mean().sort_values()

# Display the result
print(average_token_count)


In [None]:
from sentence_transformers import SentenceTransformer
# Load the 'all-mpnet-base-v2' sentence-transformer model. The histogram cell
# below reads its max_seq_length to decide where to draw the cut-off line.
bert_model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
ncols = 4
# One panel per report column. Keep the original two rows as a minimum and add
# rows (ceiling division) when there are more than 8 report columns, instead of
# failing on the ninth one.
nrows = max(2, -(-len(scouting_reports) // ncols))
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 3.5 * nrows))
panels = ax.ravel()

# NOTE(review): the counts are whitespace-separated words, while max_seq_length
# is presumably measured in model tokens, so the dashed line is only a rough
# guide. Confirm before relying on it.
max_len = bert_model.max_seq_length

for panel, report in zip(panels, scouting_reports):
    counts = token_count[report].to_numpy(dtype=float)
    counts = counts[~np.isnan(counts)]  # rows without this report carry NaN

    panel.hist(counts, bins=50)
    panel.set_title(report)

    # Mark the model limit only when some report exceeds it; the size check
    # avoids calling max() on an empty array for a column with no reports.
    if counts.size and counts.max() > max_len:
        panel.axvline(max_len, color='k', linestyle='dashed', linewidth=1)  # Add vertical line

# Hide panels that have no report column to show.
for unused_panel in panels[len(scouting_reports):]:
    unused_panel.set_visible(False)

plt.suptitle("Distribution of Number of Tokens for Each Report", fontsize=22)
plt.show()


In [None]:
# Drop Seattle's picks (a new team as of 2021, holding that year's second
# overall pick) and keep draft years up to and including 2022; the later class
# is left to be predicted at a later time.
not_seattle = data['Team'] != 'SEA'
through_2022 = data['Year'] <= 2022
data = data[not_seattle & through_2022]

data.info()

## BERT Transformations


In [107]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer

In [None]:
# NOTE(review): `preprocessed_df` is not assigned in any cell visible in this
# notebook. Confirm where it comes from, otherwise this cell fails on a fresh kernel.
# Reshape to long format: one row per (player, report column) pair, with the
# report column's name in 'reporter' and its content in 'text'.
id_columns = ['Year', 'Position', 'Height', 'Weight', 'Drafted', 'Team', 'Average Ranking', 'Name']
long_df = pd.melt(
    preprocessed_df,
    id_vars=id_columns,
    value_vars=scouting_reports.tolist(),
    var_name='reporter',
    value_name='text',
)
# Drop the pairs for which there is no report text.
long_df = long_df.dropna(subset=['text'])


In [None]:


bert_embeddings_path = 'data/reports_with_bert_embeddings.csv'

# The embeddings are cached on disk: they are computed only when the cache file
# is missing, otherwise the saved table is loaded.
if not os.path.exists(bert_embeddings_path):
    # Load the BERT model
    bert_model = SentenceTransformer('all-mpnet-base-v2')

    # Generate one embedding vector per text in 'long_df', in row order.
    bert_embeddings = bert_model.encode(long_df['text'].values)
    bert_columns = [f'bert{i}' for i in range(bert_embeddings.shape[1])]

    # Reuse long_df's index for the embedding frame. long_df keeps a gapped
    # index after dropna(), and concat(axis=1) aligns on index labels, so a
    # fresh 0..n-1 index would pair embeddings with the wrong rows and add
    # NaN-filled rows.
    bert_df = pd.DataFrame(bert_embeddings, columns=bert_columns, index=long_df.index)
    long_df = pd.concat([long_df, bert_df], axis=1)

    # Make sure the target directory exists before writing the cache.
    cache_dir = os.path.dirname(bert_embeddings_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    long_df.to_csv(bert_embeddings_path, index=False)
    print("New File Was created")
else:
    # Load the DataFrame with BERT embeddings
    long_df = pd.read_csv(bert_embeddings_path)
    bert_columns = [col for col in long_df.columns if col.startswith('bert')]
    print("Loaded existing BERT embeddings file")