# Analyzing the dataset using TensorFlow

# Important Note!

It is important to acknowledge that we did not learn TensorFlow in class. Therefore, the majority of the code below was generated by AI and modified by me to fit the specific goals of the project. The AI used was ChatGPT. I will leave a prompt chain and a link to the query below.

TensorFlow has many different ways to analyze a dataset. For my analysis, I will be changing activation functions, learning rates, and epochs to see which combination yields the best results. It is important to note that modifying the parameters can significantly change the amount of computing time. 

In [1]:
# Imports
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.model_selection import ParameterGrid

Please note that TensorFlow was not covered in this class. As a result, generative AI played a larger role than normal in helping me complete this section of the project.

In [78]:
# This class was created with the help of generative AI.
# Source OpenAI
# URL: https://chat.openai.com/
# Parts of the code that were directly generated using chat GPT will be cited directly in the class.


class DataAnalyzer():
    """ Data analyzer class.

    Takes a dataset and uses TensorFlow to analyze the data and return predictions.
    The features are min-max scaled and reduced with a whitened PCA, a small dense
    network is trained on them, and the predictions are thresholded to 0/1.

    Methods:
        __init__(self, data, n_components, learning_rate, activation_function, features, target, epoch=100, verbose=1, test_size=0.2, threshold=0.35)
            Initialize all of the arguments needed to model the dataset with TensorFlow.

        process_data(self, features=None, target=None, n_components=None, test_size=None)
            Processes the given dataset using train_test_split and PCAs.

        model(self, pca_train_vectors, pca_test_vectors, y_train_scaled, y_test_scaled)
            Models the dataset using TensorFlow.

    """

    def __init__(self, data, n_components, learning_rate, activation_function, features, target, epoch=100, verbose=1, test_size=0.2, threshold=0.35):
        """Initialize all of the arguments needed to model the dataset with TensorFlow.
        Arguments:

        data [type: Pandas dataframe]
            The dataset that will be modeled with TensorFlow.

        n_components [type: int]
            The number of feature vectors you want to use with the PCA.

        learning_rate [type: float]
            The learning rate of the model.

        activation_function [type: string]
            The type of activation function used by the hidden layers.

        features [type: Pandas dataframe]
            The dataframe of features you would like to use to train and test the model.

        target [type: Pandas dataframe]
            The variable in the dataframe that you would like to predict.

        epoch [type: int]
            The number of epochs the model is trained for. Higher epochs correspond with longer runtimes.

        verbose [type: int]
            Adjust the visual output of the model when run.

        test_size [type: float]
            Scale the size of the training and testing sets.

        threshold [type: float]
            Cut-off applied to the predictions (on the original target scale);
            predictions greater than or equal to it become 1, all others 0."""

        # Initialize the input variables
        self.data = data
        self.learning_rate = learning_rate
        self.activation_function = activation_function
        self.features = features
        self.target = target
        self.epoch = epoch
        self.verbose = verbose
        self.test_size = test_size
        self.n_components = n_components
        self.threshold = threshold
        # Replaced by a freshly fitted scaler every time process_data runs.
        self.target_scaler = MinMaxScaler()
        # Percentage of variance kept by the PCA; filled in by process_data.
        self.total_variance = None

    def process_data(self, features=None, target=None, n_components=None, test_size=None):

        """Processes the given dataset using train_test_split and PCAs.

        Any argument left as None falls back to the value given to __init__.

        Arguments:

            features [type: Pandas dataframe]
                The dataframe of features you would like to use to train and test the model.

            target [type: Pandas dataframe]
                The variable in the dataframe that you would like to predict.

            n_components [type: int]
                The number of feature vectors you want to use with the PCA.

            test_size [type: float]
                Scale the size of the training and testing sets.

        Returns:

            pca_train_vectors, pca_test_vectors, y_train_scaled, y_test_scaled
                The PCA-transformed training and testing features followed by the
                min-max scaled training and testing targets (each target array has
                a single column). """

        # This class was created with the help of generative AI.
        # Source OpenAI
        # URL: https://chat.openai.com/

        # Honor the arguments that were passed in; only fall back to the
        # constructor values when the caller leaves one out.
        if features is None:
            features = self.features
        if target is None:
            target = self.target
        if n_components is None:
            n_components = self.n_components
        if test_size is None:
            test_size = self.test_size

        # Split the data into training and testing sets
        train_vectors, test_vectors, train_labels, test_labels = train_test_split(
            features, target, test_size=test_size, random_state=42
        )

        # Scale the features to [0, 1]; the scaler is fitted on the training split only
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(train_vectors)
        X_test_scaled = scaler.transform(test_vectors)

        # Normalize the target variable. The fitted scaler is kept on the
        # instance because model() needs it to undo the scaling.
        self.target_scaler = MinMaxScaler()
        y_train_scaled = self.target_scaler.fit_transform(np.asarray(train_labels).reshape(-1, 1))
        y_test_scaled = self.target_scaler.transform(np.asarray(test_labels).reshape(-1, 1))

        # Set up the pca object with the number of components we want to find
        pca = PCA(n_components=n_components, whiten=True)

        # Fit the training data to the pca model.
        pca.fit(X_train_scaled)

        # Keep the percentage of explained variance available for inspection
        self.total_variance = np.sum(pca.explained_variance_ratio_) * 100

        # Transforming the vectors
        pca_train_vectors = pca.transform(X_train_scaled)
        pca_test_vectors = pca.transform(X_test_scaled)

        return pca_train_vectors, pca_test_vectors, y_train_scaled, y_test_scaled

    def model(self, pca_train_vectors, pca_test_vectors, y_train_scaled, y_test_scaled):

        """Models the dataset using TensorFlow.

        Arguments:

            pca_train_vectors [type: numpy array]
                An array that the method process_data returns.
                An array of scaled and fitted feature values ready to be acted upon by the model.

            pca_test_vectors [type: numpy array]
                An array that the method process_data returns.
                An array of scaled and fitted feature values ready to be used to test the model's accuracy.

            y_train_scaled [type: numpy array]
                An array that the method process_data returns.
                An array of scaled and fitted target values ready to be acted upon by the model.

            y_test_scaled [type: numpy array]
                An array that the method process_data returns.
                An array of scaled and fitted target values ready to be used to test the model's accuracy.

        Returns:

            y_test_true, y_test_pred_binary, r2
                The test targets on the original scale, the thresholded (0/1)
                predictions, and the R-squared value between the two."""

        # The base code used to create the model was generated by CHAT GPT.
        # Source OpenAI
        # URL: https://chat.openai.com/
        # In order to get the correct code working a long series of prompts was required over multiple days of work.
        # For the sake of neatness I will not list the prompts here.
        # The general prompts were,
        # Can you generate a TensorFlow model to help me model NHL data?
        # Fix this error.
        # What other options do I have to improve accuracy?

        # Size the input layer from the data itself so it always matches the
        # number of PCA components that process_data actually produced.
        n_inputs = pca_train_vectors.shape[1]

        # Create the model
        network = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(n_inputs,)),  # Input layer
            tf.keras.layers.Dense(128, activation=self.activation_function, kernel_regularizer=tf.keras.regularizers.l2(0.01)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(64, activation=self.activation_function, kernel_regularizer=tf.keras.regularizers.l2(0.01)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.6),
            tf.keras.layers.Dense(32, activation=self.activation_function, kernel_regularizer=tf.keras.regularizers.l2(0.01)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(1)  # Output layer with one neuron for regression
        ])

        # Compile the model
        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        network.compile(optimizer=optimizer, loss='mean_squared_error')

        # Train the model
        network.fit(pca_train_vectors, y_train_scaled, epochs=self.epoch, verbose=self.verbose)

        # Predict on the test set (same verbosity as training)
        y_test_pred_scaled = network.predict(pca_test_vectors, verbose=self.verbose).flatten()

        # Transform the predictions back to the original scale
        y_test_pred = self.target_scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1)).flatten()

        # Inverse transform the true values back to the original scale
        y_test_true = self.target_scaler.inverse_transform(y_test_scaled.reshape(-1, 1)).flatten()

        # Convert to binary classification using the threshold
        y_test_pred_binary = (y_test_pred >= self.threshold).astype(int)

        # Calculate R-squared on the original scale.
        # NOTE(review): this scores the thresholded 0/1 predictions, not the
        # continuous network output - confirm that this is the intended metric.
        r2 = r2_score(y_test_true, y_test_pred_binary)

        # Returns the actual target, the predicted target, and the R-squared value.
        return y_test_true, y_test_pred_binary, r2

In [79]:
# Load the dataset taken from the NHL website and order its rows by the
# "Season Start" column in a single chained expression.
df = pd.read_csv('Stanley Cup Data - project_dataset.csv').sort_values(by="Season Start")

In [80]:
# Build the inputs for the Stanley Cup model:
#   features - every column of df except the ones listed below
#   target   - the 'Stanley Cup' column (the variable to predict)
stanley_excluded_columns = ['Season Start', 'Season End', 'Stanley Cup', 'Team', 'T', 'OT']
features = df.drop(columns=stanley_excluded_columns)
target = df['Stanley Cup']

In [81]:
# Stanley Cup run: configure the analyzer, prepare the data, then train and predict.

# Initialize the class
modeled_data = DataAnalyzer(
    data=df,
    n_components=15,
    learning_rate=0.01,
    activation_function='sigmoid',
    features=features,
    target=target,
    epoch=100,
    verbose=0,
)

# Process the dataset given (25% of the rows are held out for testing)
stanley_splits = modeled_data.process_data(features, target, n_components=15, test_size=0.25)
pca_train_vectors_s, pca_test_vectors_s, y_train_scaled_s, y_test_scaled_s = stanley_splits

# Create a model and return predictions
y_stanley, y_prediction_stanley, r2_stanley = modeled_data.model(
    pca_train_vectors_s,
    pca_test_vectors_s,
    y_train_scaled_s,
    y_test_scaled_s,
)

In [82]:
# Report the R-squared value returned by the Stanley Cup model run.
stanley_r2_message = f'The R-squared value for the stanley cup data is {r2_stanley}'
print(stanley_r2_message)

The R-squared value for the stanley cup data is -0.0342857142857147


It is clear that, with our given dataset and approach with TensorFlow, we are not going to be able to find a suitable R-squared value. However, we can still try to predict some other categories, such as the Presidents Cup. The Presidents Cup is awarded to the team with the most points at the end of the season. Since points are directly correlated with wins, TensorFlow should do a much better job at finding a sufficient R-squared value.

In [83]:
# Build the inputs for the Presidents Cup model:
#   new_features - every column of df except the ones listed below
#   new_target   - the 'Presidents Cup' column (the variable to predict)
presidents_excluded_columns = ['Season Start', 'Season End', 'Presidents Cup', 'Team', 'T', 'OT', 'GP']
new_features = df.drop(columns=presidents_excluded_columns)
new_target = df['Presidents Cup']

In [84]:
# Model data for new features and target

# Number of PCA components for the Presidents Cup run. The same value is passed
# to both the constructor and process_data so that the PCA output width and the
# network's input width can never disagree (previously 18 was given to the
# constructor and 15 to process_data).
presidents_n_components = 18

# Initialize the class
presidents_model = DataAnalyzer(df, presidents_n_components, 0.01, 'sigmoid', new_features, new_target, 100, 0)

# Process the dataset given (25% of the rows are held out for testing)
pca_train_vectors_p, pca_test_vectors_p, y_train_scaled_p, y_test_scaled_p = presidents_model.process_data(
    new_features, new_target, presidents_n_components, 0.25
)

# Create a model and return predictions
y_presidents, y_prediction_presidents, r2_presidents = presidents_model.model(
    pca_train_vectors_p, pca_test_vectors_p, y_train_scaled_p, y_test_scaled_p
)

In [92]:
# Base code obtained from Sklearn documentation
# Accessed at 10:40 AM 11/30/2023
# URL: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html#sklearn.metrics.confusion_matrix

# Build the confusion matrix for the Presidents Cup predictions and unpack its
# four cells (flattened in row order) into named counts.
presidents_confusion = confusion_matrix(y_presidents, y_prediction_presidents)
tn, fp, fn, tp = presidents_confusion.ravel()
print(f'True Negatives: {tn}\nFalse Postives: {fp}\nFalse Negatives: {fn}\nTrue Postives: {tp}')

# Accuracy = correctly classified rows over all rows, expressed as a percentage.
correct_predictions = tn + tp
all_predictions = tn + tp + fn + fp
accuracy = correct_predictions / all_predictions * 100
print(f'The accuracy is {accuracy}')

True Negatives: 169
False Postives: 10
False Negatives: 1
True Postives: 1
The accuracy is 93.92265193370166


## Sources

https://datatofish.com/read_excel/

https://www.nhl.com/stats/teams?aggregate=0&reportType=season&seasonFrom=19171918&seasonTo=20232024&gameType=2&filter=gamesPlayed,gte,1&sort=faceoffWinPct&page=0&pageSize=50

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html

https://www.nhl.com/news/nhl-stanley-cup-champions-winners-complete-list-287705398

https://python-charts.com/seaborn/themes/#:~:text=The%20seaborn%20library%20provides%20five,the%20set_style%20or%20set_style%20function.

https://datatofish.com/csv-to-excel-python/