In [2]:
import os
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [3]:
big_data = pd.read_csv('/Users/ohmpatel/Desktop/big_data.csv')

In [4]:
big_data = big_data.drop(['Unnamed: 0', 'game_id', 'game_date'], axis = 1)

In [5]:
# cut down number of defense packages
dp_freq = dict(big_data['defense_personnel'].value_counts())
dp_freq[np.NaN] = 0
defense_package = []
for idx, row in big_data.iterrows():
    if dp_freq[row['defense_personnel']] < 450 or row['defense_personnel'] == np.NaN:
        defense_package.append(np.NaN)
    else:
        defense_package.append(row['defense_personnel'])
big_data = big_data.drop(['defense_personnel'],axis=1)
big_data['defense_personnel'] = defense_package

# cut down number of offense packages
op_freq = dict(big_data['offense_personnel'].value_counts())
op_freq[np.NaN] = 0
offense_package = []
for idx, row in big_data.iterrows():
    if op_freq[row['offense_personnel']] < 450 or row['offense_personnel'] == np.NaN:
        offense_package.append(np.NaN)
    else:
        offense_package.append(row['offense_personnel'])
big_data = big_data.drop(['offense_personnel'],axis=1)
big_data['offense_personnel'] = offense_package

In [6]:
outcome_run_pass = []
outcome_buckets = []
for idx, row in big_data.iterrows():
  if row['play_type'] == 'pass':
    outcome_run_pass.append('pass')
    outcome_buckets.append('pass + ' + str(row['pass_location']))
  elif row['play_type'] == 'run':
    outcome_run_pass.append('run')
    outcome_buckets.append('run + ' + str(row['run_location']))

In [11]:
len(outcome_run_pass)

239532

In [12]:
X = big_data.drop(['play_sequence_num','pass_length', 'home_team', 'away_team', 'pass_location', 'run_location', 'play_type', 'surface'], axis=1)

In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239532 entries, 0 to 239531
Data columns (total 18 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   yardline_100                239532 non-null  float64
 1   game_seconds_remaining      239532 non-null  float64
 2   down                        238663 non-null  float64
 3   ydstogo                     239532 non-null  float64
 4   no_huddle                   239532 non-null  float64
 5   shotgun                     239532 non-null  float64
 6   posteam_timeouts_remaining  239532 non-null  float64
 7   defteam_timeouts_remaining  239532 non-null  float64
 8   score_differential          239532 non-null  float64
 9   roof                        239532 non-null  object 
 10  offense_formation           237081 non-null  object 
 11  defenders_in_box            238195 non-null  float64
 12  feelslike                   239532 non-null  float64
 13  humidity      

In [15]:
np.isnan(X)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [14]:
X[np.isnan(X)] = 0

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
X = pd.get_dummies(X)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=0.75)
X_pca = pca.fit_transform(X_scaled)
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

In [None]:
X_pca.shape

In [None]:
X.shape

In [None]:
# Plotting the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Increase fontsize and bold text
plt.figure(figsize=(12, 6))

# Plot the explained variance ratio
plt.subplot(1, 2, 1)
plt.bar(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio)
plt.title('Explained Variance Ratio', fontsize=16, fontweight='bold')
plt.xlabel('Principal Component', fontsize=14)
plt.ylabel('Variance Ratio', fontsize=14)

# Plot the cumulative explained variance
plt.subplot(1, 2, 2)
plt.plot(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, marker='o')
plt.title('Cumulative Explained Variance', fontsize=16, fontweight='bold')
plt.xlabel('Number of Principal Components', fontsize=14)
plt.ylabel('Cumulative Variance Ratio', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
loadings = pca.components_

# Creating a DataFrame for better visualization
# Assuming df.columns gives the original feature names
loadings_df = pd.DataFrame(data=loadings, columns=X.columns, index=[f'PC{i}' for i in range(loadings.shape[0])])

In [None]:
feature_importances = np.sum(loadings**2, axis=0)
feature_ranking = np.argsort(feature_importances)[::-1]

In [None]:
feature_ranking

In [None]:
len(feature_ranking)

In [None]:
list(X.columns)[9]

In [None]:
feature_importances