<a href="https://colab.research.google.com/github/jsohn0824/Hacklytics-submission/blob/main/7_Aash_Hackathon_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from google.colab import drive
from sklearn.preprocessing import StandardScaler, OneHotEncoder
drive.mount('/content/drive')
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from scipy.optimize import linear_sum_assignment
!pip install nba_api


filePath = 'shotlogs.csv'
df = pd.read_csv('/content/drive/MyDrive/shot_logs.csv')
df.dropna(inplace=True)
#change team to each assigned number
df['TEAM_ABBR'] = df.iloc[:, 1].str.extract(r' - ([A-Z]{3})')[0].str.replace(' ', '', regex=True).str.upper()
team_order = []
team_mapping = {}
for team in df['TEAM_ABBR']:
    if team not in team_mapping:
        team_mapping[team] = len(team_mapping) + 1
print(team_mapping)
#change made to 1 and missed to 0
df['FGM'] = df['SHOT_RESULT'].str.contains(r'(?i)made').astype(int)
#fill in missing data with average values
df['SHOT_CLOCK'] = df['SHOT_CLOCK'].fillna(df.groupby('PERIOD')['SHOT_CLOCK'].transform('mean'))

#Calculate shot angle
df['SHOT_ANGLE'] = np.arctan(df['SHOT_DIST']/10)
#Calculate how much the player is pressured on time
df['PRESSURE_RATIO'] = (1/df['CLOSE_DEF_DIST']+0.0000001) * (1/ (df['SHOT_CLOCK'] + 0.000000001))
#Convert game clock to seconds
df['GAME_CLOCK_SEC'] = df['GAME_CLOCK'].apply(lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1]))
#Calculate time for clutch(criteria = 2 min)
df['CLUTCH_TIME'] = ((df['GAME_CLOCK_SEC'] < 120) & (df['PERIOD'] >= 4)).astype(int)

#Extract Defender ID
df['DEFENDER_ID'] = df['CLOSEST_DEFENDER'].str.extract(r'\((.*?)\)')[0]
df['MATCHUP_TEAMS'] = df['MATCHUP'].str.split(' - ').str[-1]
df['OPPONENT_TEAM'] = df['MATCHUP_TEAMS'].apply(lambda x:x.split(' vs. ')[1] if ' vs. ' in x else None)

#find best clutch shooter
clutch_df = df[df['CLUTCH_TIME'] ==1].copy()



player_clutch_stats = clutch_df.groupby('player_name').agg(PLAYER_CLUTCH_FG = ('FGM','mean'),PLAYER_CLUTCH_SHOT_DIST = ('SHOT_DIST','mean'),PLAYER_CLUTCH_DEF_DIST = ('CLOSE_DEF_DIST','mean')).reset_index()
clutch_df = clutch_df.merge(player_clutch_stats, on = 'player_name', how = 'left')

#Group Statistics
numeric_features = ['SHOT_DIST', 'CLOSE_DEF_DIST', 'SHOT_CLOCK','SHOT_ANGLE', 'PRESSURE_RATIO', 'PLAYER_CLUTCH_FG', 'PLAYER_CLUTCH_SHOT_DIST']

categorical_features = ['PERIOD']

preprocessor = ColumnTransformer(transformers=[('num', StandardScaler(), numeric_features),('cat', OneHotEncoder(), categorical_features)])

X = clutch_df[numeric_features + categorical_features]
y = clutch_df['FGM']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.fillna(0).replace([np.inf, -np.inf], np.nan).fillna(0).clip(-1e10, 1e10)
X_test = X_test.fillna(0).replace([np.inf, -np.inf], np.nan).fillna(0).clip(-1e10, 1e10)
model = Pipeline([('preprocessor',preprocessor), ('classifier',RandomForestClassifier(random_state = 42))])

model.fit(X_train,y_train)


from sklearn import tree
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

#fn=data.feature_names
#cn=data.target_names
#fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=800)
#tree.plot_tree(rf.estimators_[0],
     #          feature_names = fn,
 #              class_names=cn,
       #        filled = True);
#fig.savefig('rf_individualtree.png')


def recommend_clutch_shooter(team_players,current_features,player_stats):
  predictions = []
  for player in team_players:
    input_data = current_features.copy()
    p_stats = player_stats[player_stats['player_name'] == player].iloc[0]
    input_data.update({'PLAYER_CLUTCH_FG':p_stats['PLAYER_CLUTCH_FG'],'PLAYER_CLUTCH_SHOT_DIST': p_stats['PLAYER_CLUTCH_SHOT_DIST']})
    input_df = pd.DataFrame([input_data])
    prob = model.predict_proba(input_df)[0][1]
    predictions.append((player,prob))
  return max(predictions, key = lambda x: x[1])[0]


#create match up
def create_matchup_matrix(team_a_players, team_b_players, matchup_data, is_offensive=True):
  matrix = []
  print("PRINTING MATRIX")
  print(matrix)
  for a_player in team_a_players:
      row = []
      for b_player in team_b_players:
          # Try to find historical matchup data
          if is_offensive:
              matchup = matchup_data[
                  (matchup_data['player_name'] == a_player) &
                  (matchup_data['DEFENDER_ID'] == b_player)
              ]
          else:
              matchup = matchup_data[
                  (matchup_data['player_name'] == b_player) &
                  (matchup_data['DEFENDER_ID'] == a_player)
              ]

          if not matchup.empty:
              fg_pct = matchup['FGM'].mean()
          else:
              # Fallback to player's overall average if no matchup data
              if is_offensive:
                  fg_pct = df[df['player_name'] == a_player]['FGM'].mean()
              else:
                  fg_pct = 1 - df[df['DEFENDER_ID'] == a_player]['FGM'].mean()

          if matchup.empty:
            fg_pct = df[df['player_name'] == a_player]['FGM'].mean()  # Use player’s overall FG% if no matchup data
          else:
            fg_pct = matchup['FGM'].mean()
          row.append(fg_pct)
      matrix.append(row)
  return np.array(matrix)

from itertools import permutations
def brute_force_assignment(matrix):
    n = len(matrix)
    best_score = float('inf')
    best_pairing = None
    for perm in permutations(range(n)):
        score = sum(matrix[i][perm[i]] for i in range(n))
        if score < best_score:
            best_score = score
            best_pairing = perm
    return best_pairing

def optimize_matchups(team_a, team_b, is_offensive=True):
  team_a_players = df[df['TEAM_ABBR'] == team_a]['player_name'].unique()
  team_b_players = df[df['TEAM_ABBR'] == team_b]['player_name'].unique()
  matchup_subset = df[df['MATCHUP'].str.contains(f"{team_a} vs. {team_b}|{team_b} vs. {team_a}")]
  matrix = create_matchup_matrix(team_a_players, team_b_players, matchup_subset, is_offensive)
  if np.any(np.isnan(matrix)) or np.any(np.isinf(matrix)):
        print("Error: Matrix contains invalid values (NaN or Inf).")
        # You can replace invalid values with 0 or another placeholder value here
        matrix = np.nan_to_num(matrix, nan=0, posinf=0, neginf=0)
  if is_offensive:
    row_ind, col_ind = linear_sum_assignment(-matrix)
  else:
    optimal_indices = brute_force_assignment(matrix)
    optimal_pairs = [(team_a_players[i], team_b_players[j], matrix[i][j]) for i, j in zip(optimal_indices[::2], optimal_indices[1::2])]


  if is_offensive:
    optimal_pairs = [(team_a_players[r], team_b_players[c], matrix[r][c]) for r, c in zip(row_ind, col_ind)]



  return optimal_pairs

# Example usage
team_a = 'GSW'
team_b = 'CLE'

from IPython.display import Image, display, HTML
player_id_dict = df.groupby('player_name')['player_id'].unique().apply(lambda x: x[0]).to_dict()
player_id_dict = {k.lower(): v for k,v in player_id_dict.items()}
print(player_id_dict)
defender_id_dict = df.groupby('CLOSEST_DEFENDER')['CLOSEST_DEFENDER_PLAYER_ID'].unique().apply(lambda x: x[0]).to_dict()
defender_id_dict = {key.split(',')[0]: value for key, value in defender_id_dict.items()}
defender_id_dict = {k.lower(): v for k,v in defender_id_dict.items()}
print(defender_id_dict)
base_url = 'https://cdn.nba.com/headshots/nba/latest/1040x760/'


# Offensive matchups (Team A's best offensive players vs Team B's weakest defenders)
offensive_matchups = optimize_matchups(team_a, team_b, is_offensive=True)
print(offensive_matchups)
print("Optimal Offensive Matchups:")
for matchup in offensive_matchups:
    player_id = player_id_dict[matchup[0]]
    defender_id = defender_id_dict[matchup[1].split()[1]]
    final_url_player = base_url + str(player_id) + '.png'
    final_url_defender = base_url + str(defender_id) + '.png'

    display(HTML(f"""
        <div style="display: flex; align-items: center;">
            <img src="{final_url_player}" width="100" style="margin-right: 10px;" />
            <p style="font-size: 18px; margin-right: 10px;">{matchup[2]:.2%}</p>
            <img src="{final_url_defender}" width="100" />
        </div>
    """))

    print(f"Shooter: {matchup[0]} vs Defender: {matchup[1]} | Expected FG%: {matchup[2]:.2%}")


defensive_matchups = optimize_matchups(team_a, team_b, is_offensive=False)
print("DEFENSIVE MATCHUP")
print(defensive_matchups)
print("\nOptimal Defensive Matchups:")
for matchup in defensive_matchups:
    player_id = player_id_dict[matchup[1]]
    defender_id = defender_id_dict[matchup[0].split()[1]]
    final_url_player = base_url + str(player_id) + '.png'
    final_url_defender = base_url + str(defender_id) + '.png'

    display(HTML(f"""
        <div style="display: flex; align-items: center;">
            <img src="{final_url_defender}" width="100" style="margin-right: 10px;" />
            <p style="font-size: 18px; margin-right: 10px;">{matchup[2]:.2%}</p>
            <img src="{final_url_player}" width="100" />
        </div>
    """))
    print(f"Defender: {matchup[0]} vs Shooter: {matchup[1]} | Expected FG% Allowed: {matchup[2]:.2%}")


#test clutch shots
clutch_df = df[(df['CLUTCH_TIME'] == 1) & (df['PERIOD'] >= 4)]
print(f"Clutch shots: {len(clutch_df)}")
print(clutch_df[['GAME_CLOCK_SEC', 'PERIOD', 'FGM']].sample(20))
#test model
y_pred = model.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred)}")

# Test Clutch Shooter Recommendation
# Test with dummy data
sample_player = df['player_name'].iloc[0]  # Pick a player from your dataset
sample_features = {
    'SHOT_DIST': 15,
    'CLOSE_DEF_DIST': 2,
    'SHOT_CLOCK': 5.0,
    'SHOT_ANGLE': np.arctan(15/10),
    'PRESSURE_RATIO': (1/2) * (1/5),
    'PERIOD': 4
}
'''# Get recommendations
best_shooter = recommend_clutch_shooter(
    team_players=[sample_player],  # Replace with real player IDs
    current_features=sample_features,
    player_stats=player_clutch_stats
)
print(f"Recommended shooter ID: {best_shooter}")
'''
#print(df.shape)
#print(df.head(6))
#print(df.mean())
#print(df.info())



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
{'CHA': 1, 'MIN': 2, 'UTA': 3, 'OKC': 4, 'LAL': 5, 'GSW': 6, 'ATL': 7, 'WAS': 8, 'SAC': 9, 'NOP': 10, 'TOR': 11, 'DAL': 12, 'ORL': 13, 'CLE': 14, 'IND': 15, 'BOS': 16, 'PHX': 17, 'MEM': 18, 'SAS': 19, 'POR': 20, 'DEN': 21, 'MIA': 22, 'PHI': 23, 'MIL': 24, 'HOU': 25, 'NYK': 26, 'DET': 27, 'LAC': 28, 'CHI': 29, 'BKN': 30}
{'aaron brooks': 201166, 'aaron gordon': 203932, 'al farouq aminu': 202329, 'al horford': 201143, 'al jefferson': 2744, 'alan anderson': 101187, 'alan crabbe': 203459, 'alex len': 203458, 'alexis ajinca': 201582, 'alonzo gee': 202087, 'amare stoudemire': 2405, 'amir johnson': 101161, 'andre drummond': 203083, 'andre iguodala': 2738, 'andre miller': 1889, 'andre roberson': 203460, 'andrew bogut': 101106, 'andrew wiggins': 203952, 'anthony bennett': 203461, 'anthony davis': 203076, 'anthony morrow': 201627, 'aron baynes': 203382, 'arron afflalo'

Shooter: harrison barnes vs Defender: matthew dellavedova | Expected FG%: 49.12%


Shooter: leandro barbosa vs Defender: kyrie irving | Expected FG%: 47.74%


Shooter: stephen curry vs Defender: shawn marion | Expected FG%: 49.31%


Shooter: klay thompson vs Defender: lebron james | Expected FG%: 46.38%


Shooter: marreese speights vs Defender: tristan thompson | Expected FG%: 50.10%


Shooter: shaun livingston vs Defender: kevin love | Expected FG%: 52.78%


Shooter: andrew bogut vs Defender: mike miller | Expected FG%: 55.08%


Shooter: andre iguodala vs Defender: joe harris | Expected FG%: 46.34%


Shooter: draymond green vs Defender: timofey mozgov | Expected FG%: 43.85%
PRINTING MATRIX
[]
DEFENSIVE MATCHUP
[('harrison barnes', 'kyrie irving', 0.4911504424778761), ('stephen curry', 'lebron james', 0.49309245483528164), ('marreese speights', 'kevin love', 0.5010438413361169), ('andrew bogut', 'joe harris', 0.5508474576271186)]

Optimal Defensive Matchups:


Defender: harrison barnes vs Shooter: kyrie irving | Expected FG% Allowed: 49.12%


Defender: stephen curry vs Shooter: lebron james | Expected FG% Allowed: 49.31%


Defender: marreese speights vs Shooter: kevin love | Expected FG% Allowed: 50.10%


Defender: andrew bogut vs Shooter: joe harris | Expected FG% Allowed: 55.08%
Clutch shots: 4778
        GAME_CLOCK_SEC  PERIOD  FGM
32669               27       4    0
26270              113       4    1
30347               34       4    1
112683              35       4    0
77065              103       4    0
119107              30       4    0
74072              113       4    0
72142               90       4    0
115861              37       4    1
10117               86       4    0
91396               33       4    0
6305                99       4    1
108915             110       4    1
68641               86       4    1
120552              59       4    1
79193               90       4    0
109158              40       4    1
13588               89       4    0
102715              43       4    0
82537               32       4    1
Model Accuracy: 0.6506276150627615


'# Get recommendations\nbest_shooter = recommend_clutch_shooter(\n    team_players=[sample_player],  # Replace with real player IDs\n    current_features=sample_features,\n    player_stats=player_clutch_stats\n)\nprint(f"Recommended shooter ID: {best_shooter}")\n'

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Load the CSV file
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/shot_logs.csv', index_col=0)
df.dropna(inplace=True)

df.describe()
scaler = StandardScaler()

df[['SHOT_NUMBER_T', 'PERIOD_T', 'SHOT_CLOCK_T', 'DRIBBLES_T', 'TOUCH_TIME_T','SHOT_DIST_T','PTS_TYPE_T','CLOSE_DEF_DIST_T', 'FGM_T','PTS_T']] = scaler.fit_transform(df[['SHOT_NUMBER', 'PERIOD', 'SHOT_CLOCK', 'DRIBBLES', 'TOUCH_TIME','SHOT_DIST','PTS_TYPE','CLOSE_DEF_DIST', 'FGM','PTS']])
df.dropna(inplace=True)

def optimise_k_means(data, max_k):
  means = []
  inertias = []

  for k in range(1, max_k):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(data)

    means.append(k)
    inertias.append(kmeans.inertia_)

  fig = plt.subplots(figsize=(10,5))
  plt.plot(means, inertias, 'o-')
  plt.xlabel('Number of Clusters')
  plt.ylabel('Inertia')
  plt.grid(True)
  plt.show()

df['hover'] = df['player_name']
df_team = df['player_name']
kmeans = KMeans(n_clusters=5, max_iter=100, random_state=0)  # You can adjust the number of clusters

# Fit the model to the scaled data
kmeans.fit(df[['SHOT_DIST_T', 'PTS_T']])

# Get the labels (cluster assignments)
labels = kmeans.labels_
df['cluster_label'] = labels

import plotly.express as px
fig = px.scatter_3d(df, x='PTS', y='SHOT_CLOCK', z='CLOSE_DEF_DIST',

                        color='cluster_label', hover_data=['player_name'],

                        title='3D Cluster Visualization')
fig.show()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
