In [1]:
import numpy as np
import pandas as pd
import warnings
import copy

from sklearn.model_selection import train_test_split

# Column and row display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)

# Max column width so we can read play descriptions
pd.set_option('display.max_colwidth', None)

np.set_printoptions(threshold=np.inf)


# Notebook cell width display
from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 98% !important; }</style>"))

# Float appearance, Pandas and NumPy
pd.set_option('display.float_format', '{:.2f}'.format)
np.set_printoptions(suppress=True, precision = 2)

# Supress warnings
warnings.filterwarnings('ignore')

In [3]:
aggregate = pd.read_csv('/mnt/c/Data_Science/Personal_Projects/nfl_wr_knn/working_exports/aggregate.csv')

In [3]:
aggregate.columns

Index(['player_name', 'player_position', 'season_year', 'player_game_count',
       'receptions', 'targets', 'yards', 'att_yards', 'yards_after_catch',
       'yards_after_contact', 'touchdown', 'routes', 'pass_plays',
       'contested_receptions', 'contested_targets', 'weather_attempt',
       'difficult_attempt', 'difficult_catch', 'difficult_success_rate',
       'difficult_pct', 'weather_catch', 'qb_bf_attempt', 'qb_bf_catch',
       'hurry_up_attempt', 'hurry_up_catch', 'possession_saver_attempt',
       'possession_saver_catch', 'clutch_catch', 'conversion_catch',
       'redzone_catch', 'deep_attempt', 'deep_catch', 'deep_sideline_attempt',
       'deep_sideline_catch', 'large_yac_catch', 'tackle_breaker_catch',
       'beast_catch', 'play_action_attempt', 'play_action_catch',
       'rpo_attempt', 'rpo_catch', 'cross_attempt', 'cross_catch',
       'corner_attempt', 'corner_catch', 'out_attempt', 'out_catch',
       'curl_attempt', 'curl_catch', 'post_attempt', 'post_catch',
 

In [120]:
aggregate['playing_style'] = None

aggregate.loc[aggregate['player_name'] == 'Tyreek Hill', 'playing_style'] = 'Speedster'
aggregate.loc[aggregate['player_name'] == 'Tyler Lockett', 'playing_style'] = 'Speedster'
aggregate.loc[aggregate['player_name'] == 'Quez Watkins', 'playing_style'] = 'Speedster'
aggregate.loc[aggregate['player_name'] == 'Justin Jefferson', 'playing_style'] = 'Versatile'
aggregate.loc[aggregate['player_name'] == 'Ja\'Marr Chase', 'playing_style'] = 'Versatile'
aggregate.loc[aggregate['player_name'] == 'Terry McLaurin', 'playing_style'] = 'Versatile'
aggregate.loc[aggregate['player_name'] == 'DK Metcalf', 'playing_style'] = 'Physical - Speedster'
aggregate.loc[aggregate['player_name'] == 'Chase Claypool', 'playing_style'] = 'Physical - Speedster'
aggregate.loc[aggregate['player_name'] == 'A.J. Brown', 'playing_style'] = 'Physical - Speedster'
aggregate.loc[aggregate['player_name'] == 'Tee Higgins', 'playing_style'] = 'Physical - Possession'
aggregate.loc[aggregate['player_name'] == 'Mike Evans', 'playing_style'] = 'Physical - Possession'
aggregate.loc[aggregate['player_name'] == 'Michael Pittman Jr.', 'playing_style'] = 'Physical - Possession'
aggregate.loc[aggregate['player_name'] == 'Stefon Diggs', 'playing_style'] = 'Route Technician'
aggregate.loc[aggregate['player_name'] == 'Davante Adams', 'playing_style'] = 'Route Technician'
aggregate.loc[aggregate['player_name'] == 'Jaylen Waddle', 'playing_style'] = 'Route Technician'
aggregate.loc[aggregate['player_name'] == 'Deebo Samuel', 'playing_style'] = 'YAC Specialist'
aggregate.loc[aggregate['player_name'] == 'Kadarius Toney', 'playing_style'] = 'YAC Specialist'
aggregate.loc[aggregate['player_name'] == 'Brandon Powell', 'playing_style'] = 'YAC Specialist'
aggregate.loc[aggregate['player_name'] == 'CeeDee Lamb', 'playing_style'] = 'Slot'
aggregate.loc[aggregate['player_name'] == 'Amon-Ra St. Brown', 'playing_style'] = 'Slot'
aggregate.loc[aggregate['player_name'] == 'Christian Kirk', 'playing_style'] = 'Slot'

In [121]:
aggregate[aggregate['playing_style'].notnull()]

Unnamed: 0,player_name,player_position,season_year,player_game_count,receptions,targets,yards,att_yards,yards_after_catch,yards_after_contact,touchdown,routes,pass_plays,contested_receptions,contested_targets,weather_attempt,difficult_attempt,difficult_catch,difficult_success_rate,difficult_pct,weather_catch,qb_bf_attempt,qb_bf_catch,hurry_up_attempt,hurry_up_catch,possession_saver_attempt,possession_saver_catch,clutch_catch,conversion_catch,redzone_catch,deep_attempt,deep_catch,deep_sideline_attempt,deep_sideline_catch,large_yac_catch,tackle_breaker_catch,beast_catch,play_action_attempt,play_action_catch,rpo_attempt,rpo_catch,cross_attempt,cross_catch,corner_attempt,corner_catch,out_attempt,out_catch,curl_attempt,curl_catch,post_attempt,post_catch,underneath_screen_attempt,underneath_screen_catch,flat_attempt,flat_catch,slant_attempt,slant_catch,wr_screen_attempt,wr_screen_catch,comeback_attempt,comeback_catch,go_attempt,go_catch,in_attempt,in_catch,slot_snaps,wide_snaps,cross_success_rate,corner_success_rate,out_success_rate,curl_success_rate,post_success_rate,underneath_screen_success_rate,flat_success_rate,slant_success_rate,wr_screen_success_rate,comeback_success_rate,go_success_rate,in_success_rate,deep_success_rate,play_action_success_rate,rpo_success_rate,hurry_up_success_rate,deep_sideline_success_rate,possession_saver_success_rate,route_rate,slot_rate,wide_rate,contested_catch_rate,cross_pct,corner_pct,out_pct,curl_pct,post_pct,underneath_screen_pct,flat_pct,slant_pct,wr_screen_pct,comeback_pct,go_pct,in_pct,deep_pct,play_action_pct,rpo_pct,hurry_up_pct,deep_sideline_pct,possession_saver_pct,adot,avg_yac,avg_yacon,catch_rate,yprr,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone,playing_style
0,Justin Jefferson,WR,2022,17,128,176,1809,1858,624,132,8,690,736,22,39,122.0,48.0,1.0,0.02,0.26,84.0,66.0,40.0,33.0,23.0,106.0,64.0,8.0,27,6,27.0,16.0,20.0,11.0,22.0,6.0,1.0,44.0,31.0,4.0,2.0,5.0,4.0,19.0,9.0,38.0,30.0,18.0,11.0,12.0,10.0,0.0,0.0,16.0,14.0,12.0,10.0,12.0,11.0,18.0,9.0,20.0,8.0,14.0,12.0,218,511,0.8,0.47,0.79,0.61,0.83,0.0,0.88,0.83,0.92,0.5,0.4,0.86,0.59,0.7,0.5,0.7,0.55,0.6,0.94,0.3,0.69,0.56,0.03,0.1,0.21,0.1,0.07,0.0,0.09,0.07,0.07,0.1,0.11,0.08,0.15,0.24,0.02,0.18,0.11,0.58,10.56,4.88,1.03,0.73,2.62,73.25,202,4.43,14,37.5,126,4.27,7.02,Versatile
1,Tyreek Hill,WR,2022,17,119,167,1710,2107,482,58,7,534,568,13,25,124.0,38.0,0.0,0.0,0.22,85.0,46.0,33.0,5.0,5.0,111.0,71.0,3.0,22,2,38.0,20.0,15.0,10.0,17.0,4.0,0.0,69.0,50.0,25.0,15.0,16.0,13.0,7.0,3.0,22.0,13.0,18.0,15.0,19.0,8.0,0.0,0.0,7.0,7.0,18.0,13.0,14.0,12.0,17.0,16.0,23.0,12.0,9.0,7.0,239,309,0.81,0.43,0.59,0.83,0.42,0.0,1.0,0.72,0.86,0.94,0.52,0.78,0.53,0.72,0.6,1.0,0.67,0.64,0.94,0.42,0.54,0.52,0.09,0.04,0.13,0.11,0.11,0.0,0.04,0.11,0.08,0.1,0.14,0.05,0.22,0.41,0.15,0.03,0.09,0.65,12.62,4.05,0.49,0.71,3.2,68.13,185,4.29,13,40.5,129,4.06,6.53,Speedster
2,Davante Adams,WR,2022,17,100,168,1516,2129,493,95,14,618,657,15,34,88.0,64.0,0.0,0.0,0.36,50.0,68.0,31.0,9.0,5.0,100.0,45.0,4.0,23,4,36.0,14.0,20.0,8.0,16.0,8.0,1.0,36.0,23.0,8.0,6.0,6.0,6.0,11.0,3.0,26.0,16.0,21.0,12.0,15.0,7.0,0.0,0.0,8.0,8.0,14.0,6.0,7.0,6.0,17.0,6.0,35.0,14.0,20.0,16.0,197,457,1.0,0.27,0.62,0.57,0.47,0.0,1.0,0.43,0.86,0.35,0.4,0.8,0.39,0.64,0.75,0.56,0.4,0.45,0.94,0.3,0.7,0.44,0.03,0.06,0.14,0.12,0.08,0.0,0.04,0.08,0.04,0.09,0.19,0.11,0.2,0.2,0.04,0.05,0.11,0.56,12.67,4.93,0.95,0.6,2.45,72.88,212,4.56,14,39.5,123,4.3,6.82,Route Technician
3,A.J. Brown,WR,2022,17,88,137,1496,1754,548,192,11,578,611,15,30,70.0,45.0,0.0,0.0,0.31,45.0,56.0,30.0,24.0,12.0,84.0,44.0,2.0,17,4,28.0,13.0,21.0,11.0,18.0,7.0,4.0,47.0,27.0,43.0,32.0,4.0,4.0,8.0,3.0,18.0,10.0,16.0,10.0,2.0,1.0,1.0,1.0,6.0,5.0,35.0,26.0,6.0,5.0,13.0,6.0,22.0,9.0,14.0,8.0,157,453,1.0,0.38,0.56,0.62,0.5,1.0,0.83,0.74,0.83,0.46,0.41,0.57,0.46,0.57,0.74,0.5,0.52,0.52,0.95,0.26,0.74,0.5,0.03,0.06,0.12,0.11,0.01,0.01,0.04,0.24,0.04,0.09,0.15,0.1,0.19,0.32,0.3,0.17,0.14,0.58,12.8,6.23,2.18,0.64,2.59,72.5,226,4.49,19,36.5,120,4.25,7.0,Physical - Speedster
4,Stefon Diggs,WR,2022,16,108,149,1429,1729,419,100,11,573,607,12,24,117.0,36.0,1.0,0.03,0.23,80.0,58.0,35.0,16.0,12.0,87.0,55.0,5.0,17,7,23.0,12.0,14.0,8.0,12.0,2.0,2.0,46.0,37.0,26.0,20.0,5.0,3.0,12.0,5.0,17.0,12.0,20.0,17.0,12.0,6.0,2.0,2.0,15.0,13.0,22.0,18.0,9.0,9.0,14.0,8.0,16.0,9.0,10.0,6.0,207,400,0.6,0.42,0.71,0.85,0.5,1.0,0.87,0.82,1.0,0.57,0.56,0.6,0.52,0.8,0.77,0.75,0.57,0.63,0.94,0.34,0.66,0.5,0.03,0.08,0.11,0.13,0.08,0.01,0.1,0.14,0.06,0.09,0.1,0.06,0.15,0.3,0.17,0.1,0.09,0.56,11.6,3.88,0.93,0.72,2.49,72.0,195,4.46,11,35.0,115,4.32,7.03,Route Technician
5,CeeDee Lamb,WR,2022,17,107,148,1359,1573,486,148,9,571,602,12,26,60.0,35.0,0.0,0.0,0.22,43.0,61.0,39.0,16.0,9.0,81.0,48.0,3.0,20,4,26.0,12.0,12.0,3.0,17.0,10.0,3.0,47.0,28.0,17.0,13.0,12.0,10.0,12.0,9.0,17.0,12.0,24.0,19.0,12.0,8.0,0.0,0.0,16.0,13.0,21.0,12.0,10.0,9.0,9.0,5.0,14.0,3.0,9.0,7.0,376,218,0.83,0.75,0.71,0.79,0.67,0.0,0.81,0.57,0.9,0.56,0.21,0.78,0.46,0.6,0.76,0.56,0.25,0.59,0.95,0.62,0.36,0.46,0.08,0.08,0.11,0.15,0.08,0.0,0.1,0.13,0.06,0.06,0.09,0.06,0.17,0.3,0.11,0.1,0.08,0.52,10.63,4.54,1.38,0.72,2.38,73.63,198,4.5,11,34.5,124,4.24,7.0,Slot
6,Jaylen Waddle,WR,2022,17,75,114,1356,1381,510,149,8,524,559,5,20,72.0,25.0,0.0,0.0,0.21,49.0,33.0,19.0,7.0,6.0,79.0,50.0,2.0,20,4,17.0,9.0,8.0,3.0,16.0,4.0,4.0,39.0,23.0,18.0,12.0,9.0,5.0,2.0,1.0,12.0,10.0,8.0,4.0,17.0,13.0,0.0,0.0,1.0,1.0,20.0,13.0,8.0,7.0,14.0,8.0,10.0,4.0,16.0,9.0,137,415,0.56,0.5,0.83,0.5,0.76,0.0,1.0,0.65,0.88,0.57,0.4,0.56,0.53,0.59,0.67,0.86,0.38,0.63,0.94,0.25,0.74,0.25,0.08,0.02,0.1,0.07,0.15,0.0,0.01,0.17,0.07,0.12,0.09,0.14,0.15,0.33,0.15,0.06,0.07,0.68,12.11,6.8,1.99,0.66,2.59,69.5,180,4.55,11,34.0,122,4.22,6.99,Route Technician
8,Terry McLaurin,WR,2022,17,77,115,1191,1537,394,143,5,583,621,17,26,36.0,36.0,0.0,0.0,0.3,25.0,48.0,27.0,10.0,9.0,73.0,38.0,6.0,16,3,28.0,10.0,17.0,5.0,17.0,5.0,3.0,32.0,21.0,19.0,12.0,10.0,9.0,9.0,2.0,14.0,10.0,12.0,10.0,4.0,2.0,0.0,0.0,6.0,5.0,13.0,10.0,11.0,10.0,10.0,4.0,19.0,8.0,12.0,7.0,130,490,0.9,0.22,0.71,0.83,0.5,0.0,0.83,0.77,0.91,0.4,0.42,0.58,0.36,0.66,0.63,0.9,0.29,0.52,0.94,0.21,0.79,0.65,0.08,0.07,0.12,0.1,0.03,0.0,0.05,0.11,0.09,0.08,0.16,0.1,0.23,0.27,0.16,0.08,0.14,0.61,13.37,5.12,1.86,0.67,2.04,72.13,208,4.35,18,37.5,125,4.15,7.01,Versatile
9,Amon-Ra St. Brown,WR,2022,16,106,139,1161,945,516,95,6,483,511,5,13,147.0,30.0,0.0,0.0,0.21,106.0,64.0,43.0,8.0,4.0,56.0,39.0,3.0,26,6,6.0,3.0,4.0,1.0,19.0,7.0,1.0,33.0,24.0,5.0,3.0,12.0,8.0,4.0,3.0,38.0,26.0,22.0,16.0,9.0,8.0,3.0,2.0,7.0,6.0,16.0,12.0,13.0,11.0,6.0,3.0,1.0,0.0,16.0,11.0,306,198,0.67,0.75,0.68,0.73,0.89,0.67,0.86,0.75,0.85,0.5,0.0,0.69,0.5,0.73,0.6,0.5,0.25,0.7,0.95,0.6,0.39,0.38,0.08,0.03,0.26,0.15,0.06,0.02,0.05,0.11,0.09,0.04,0.01,0.11,0.04,0.23,0.03,0.05,0.03,0.38,6.8,4.87,0.9,0.76,2.4,71.5,197,4.61,20,38.5,127,4.26,6.9,Slot
11,Mike Evans,WR,2022,15,77,123,1124,1639,216,61,6,626,665,17,26,47.0,42.0,0.0,0.0,0.33,27.0,33.0,20.0,34.0,20.0,86.0,50.0,1.0,17,3,30.0,13.0,18.0,10.0,7.0,2.0,2.0,26.0,17.0,5.0,4.0,2.0,1.0,9.0,3.0,18.0,11.0,20.0,15.0,12.0,5.0,0.0,0.0,3.0,3.0,12.0,8.0,5.0,3.0,16.0,13.0,21.0,9.0,9.0,6.0,177,488,0.5,0.33,0.61,0.75,0.42,0.0,1.0,0.67,0.6,0.81,0.43,0.67,0.43,0.65,0.8,0.59,0.56,0.58,0.94,0.27,0.73,0.65,0.02,0.07,0.14,0.16,0.09,0.0,0.02,0.09,0.04,0.13,0.17,0.07,0.24,0.2,0.04,0.27,0.14,0.68,13.33,2.81,0.79,0.63,1.8,76.75,231,4.53,12,37.0,120,4.26,7.08,Physical - Possession


# PCA

## Data preparation

In [122]:
aggregate.isnull().sum()

player_name                         0
player_position                     0
season_year                         0
player_game_count                   0
receptions                          0
targets                             0
yards                               0
att_yards                           0
yards_after_catch                   0
yards_after_contact                 0
touchdown                           0
routes                              0
pass_plays                          0
contested_receptions                0
contested_targets                   0
weather_attempt                     0
difficult_attempt                   0
difficult_catch                     0
difficult_success_rate              0
difficult_pct                       0
weather_catch                       0
qb_bf_attempt                       0
qb_bf_catch                         0
hurry_up_attempt                    0
hurry_up_catch                      0
possession_saver_attempt            0
possession_s

### Columns to drop

- route_rate
  - Route participation is usually very high, so this column isn't helpful. Not enough variation.

In [123]:
# Ensuring that we only include numeric columns for PCA
# Dropping non-numeric columns (assuming non-numeric columns are 'player_name' and 'season_year')

scalable_features_df = aggregate.drop(['player_name', 'player_position', 'season_year', 'player_game_count','receptions', 'targets', 'yards', 'att_yards', 'yards_after_catch',
                                       'yards_after_contact', 'touchdown', 'routes', 'pass_plays', 'contested_receptions', 'contested_targets', 'weather_attempt', 
                                       'weather_catch', 'difficult_attempt', 'difficult_catch', 'qb_bf_attempt', 'qb_bf_catch', 'hurry_up_attempt','hurry_up_catch', 'possession_saver_attempt', 'possession_saver_catch',
                                       'deep_attempt', 'deep_catch', 'deep_sideline_attempt', 'deep_sideline_catch', 'clutch_catch', 'conversion_catch', 'redzone_catch',
                                       'difficult_success_rate', 'cross_success_rate', 'curl_success_rate', 'post_success_rate', 'underneath_screen_success_rate',
                                        'flat_success_rate', 'slant_success_rate', 'wr_screen_success_rate', 'comeback_success_rate', 'go_success_rate',
                                        'in_success_rate', 'deep_success_rate', 'play_action_success_rate', 'rpo_success_rate', 'hurry_up_success_rate',
                                        'deep_sideline_success_rate', 'possession_saver_success_rate', 'route_rate', 'large_yac_catch', 'tackle_breaker_catch', 'beast_catch', 'play_action_attempt', 'play_action_catch', 'rpo_attempt', 'rpo_catch',
                                       'cross_attempt', 'cross_catch', 'corner_attempt', 'corner_catch', 'curl_attempt', 'out_attempt', 'out_catch', 'curl_catch', 'post_attempt', 'post_catch', 'underneath_screen_attempt',
                                       'underneath_screen_catch', 'flat_attempt', 'flat_catch', 'slant_attempt', 'slant_catch', 'wr_screen_attempt', 'wr_screen_catch',
                                       'comeback_attempt', 'comeback_catch', 'go_attempt', 'go_catch', 'in_attempt', 'in_catch', 'slot_snaps', 'wide_snaps', 'route_rate', 'playing_style'], axis=1)
scalable_features_df.head()

Unnamed: 0,difficult_pct,corner_success_rate,out_success_rate,slot_rate,wide_rate,contested_catch_rate,cross_pct,corner_pct,out_pct,curl_pct,post_pct,underneath_screen_pct,flat_pct,slant_pct,wr_screen_pct,comeback_pct,go_pct,in_pct,deep_pct,play_action_pct,rpo_pct,hurry_up_pct,deep_sideline_pct,possession_saver_pct,adot,avg_yac,avg_yacon,catch_rate,yprr,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone
0,0.26,0.47,0.79,0.3,0.69,0.56,0.03,0.1,0.21,0.1,0.07,0.0,0.09,0.07,0.07,0.1,0.11,0.08,0.15,0.24,0.02,0.18,0.11,0.58,10.56,4.88,1.03,0.73,2.62,73.25,202,4.43,14,37.5,126,4.27,7.02
1,0.22,0.43,0.59,0.42,0.54,0.52,0.09,0.04,0.13,0.11,0.11,0.0,0.04,0.11,0.08,0.1,0.14,0.05,0.22,0.41,0.15,0.03,0.09,0.65,12.62,4.05,0.49,0.71,3.2,68.13,185,4.29,13,40.5,129,4.06,6.53
2,0.36,0.27,0.62,0.3,0.7,0.44,0.03,0.06,0.14,0.12,0.08,0.0,0.04,0.08,0.04,0.09,0.19,0.11,0.2,0.2,0.04,0.05,0.11,0.56,12.67,4.93,0.95,0.6,2.45,72.88,212,4.56,14,39.5,123,4.3,6.82
3,0.31,0.38,0.56,0.26,0.74,0.5,0.03,0.06,0.12,0.11,0.01,0.01,0.04,0.24,0.04,0.09,0.15,0.1,0.19,0.32,0.3,0.17,0.14,0.58,12.8,6.23,2.18,0.64,2.59,72.5,226,4.49,19,36.5,120,4.25,7.0
4,0.23,0.42,0.71,0.34,0.66,0.5,0.03,0.08,0.11,0.13,0.08,0.01,0.1,0.14,0.06,0.09,0.1,0.06,0.15,0.3,0.17,0.1,0.09,0.56,11.6,3.88,0.93,0.72,2.49,72.0,195,4.46,11,35.0,115,4.32,7.03


## Feature scaling

In [124]:
# Standardizing the Data from scratch
# Subtract the mean and divide by the standard deviation for each column
mean = scalable_features_df.mean()
std = scalable_features_df.std()
scaled_features = (scalable_features_df - mean) / std

In [125]:
# Convert the scaled data to a NumPy array
scaled_features_array = scaled_features.to_numpy()
scaled_features_array

array([[ -0.15,   0.55,   0.84,  -0.36,   0.39,   0.68,  -0.33,   0.7 ,
          1.17,  -0.37,   0.11,  -0.17,   0.15,  -0.31,  -0.09,   0.17,
         -0.17,  -0.03,  -0.17,   0.27,  -0.63,   0.32,   0.03,   0.3 ,
         -0.11,   0.2 ,  -0.05,   0.51,   1.29,   0.38,   0.18,  -0.58,
         -0.09,   0.57,   0.37,   0.09,   0.24],
       [ -0.36,   0.42,   0.32,   0.17,  -0.24,   0.53,   0.23,  -0.22,
          0.27,  -0.33,   0.64,  -0.17,  -0.27,   0.02,   0.05,   0.19,
          0.03,  -0.34,   0.34,   1.27,   0.45,  -0.63,  -0.13,   0.64,
          0.24,  -0.06,  -0.37,   0.43,   1.88,  -1.73,  -0.88,  -2.03,
         -0.34,   1.68,   0.69,  -1.4 ,  -2.49],
       [  0.39,  -0.03,   0.39,  -0.35,   0.39,   0.26,  -0.28,   0.08,
          0.45,  -0.27,   0.32,  -0.17,  -0.24,  -0.21,  -0.29,   0.14,
          0.47,   0.45,   0.18,   0.03,  -0.43,  -0.5 ,   0.05,   0.21,
          0.25,   0.22,  -0.1 ,  -0.18,   1.12,   0.22,   0.81,   0.77,
         -0.09,   1.31,   0.05,   0.3 

## 1. Covariance Matrix

In [126]:
def calculate_covariance(one, two):
    """Calculate covariance between two features."""
    mean_one = np.mean(one) # Mean of feature one vector
    mean_two = np.mean(two) # Mean of feature two vector
    covariance = sum((one[i] - mean_one) * (two[i] - mean_two) for i in range(len(one))) / (len(one) - 1) # Sum of each value in feature vector less its mean, divided by (n-1)
    return covariance

In [127]:
# Transpose the data to get columns as features
transposed_data = scaled_features_array.T

In [128]:
scaled_features_array.shape, transposed_data.shape

((225, 37), (37, 225))

In [129]:
transposed_data

array([[ -0.15,  -0.36,   0.39,   0.13,  -0.3 ,  -0.36,  -0.42,  -0.46,
          0.07,  -0.46,  -0.04,   0.25,  -0.09,   0.42,  -0.18,  -0.4 ,
         -0.01,  -0.42,  -0.43,  -0.43,  -0.44,  -0.44,  -0.84,  -0.42,
         -0.04,   0.39,   0.33,  -0.03,   0.03,   0.26,  -0.12,  -0.55,
         -0.65,  -0.27,   0.06,  -0.32,  -0.14,  -0.38,  -0.35,  -0.59,
         -0.09,  -0.14,   0.12,  -0.16,   0.06,   0.62,  -0.58,  -0.18,
          0.12,  -0.32,  -0.21,   0.01,   0.48,  -0.98,  -0.17,  -0.02,
         -0.42,  -0.23,   0.68,   0.14,   0.4 ,   0.24,   0.14,   0.01,
          0.05,  -0.42,  -0.8 ,   0.35,  -0.19,  -0.92,  -0.12,   0.15,
          0.56,   0.16,  -0.57,  -0.33,  -0.7 ,   0.52,  -0.49,   0.15,
         -0.38,  -0.52,  -1.02,  -0.36,   0.64,  -0.45,  -0.28,  -0.14,
          0.44,   0.3 ,  -0.18,   0.12,  -0.27,  -0.37,   0.16,   0.71,
         -0.77,   0.62,   0.48,  -0.29,   0.31,   0.2 ,   0.69,  -0.58,
         -0.17,  -1.1 ,   0.59,   0.22,   0.09,   0.52,   0.59, 

In [130]:
# Initialize an empty covariance matrix
n_features = len(transposed_data)
cov_matrix = [[0 for _ in range(n_features)] for _ in range(n_features)]
n_features

37

In [131]:
# Calculate the covariance matrix
for i in range(n_features): # Iterates over features 1 to 101
    for j in range(n_features): # While holding row i the same, iterates over features 1 to 101 for row j, which changes 
        cov_matrix[i][j] = calculate_covariance(transposed_data[i], transposed_data[j])

In [132]:
cov_matrix

[[0.9999999999999998,
  -0.1377126562005336,
  -0.13487917147435877,
  -0.1340668382914854,
  0.16715207733039047,
  -0.10869473808481213,
  0.1530672837457566,
  0.07156165266795042,
  0.005926059636589294,
  -0.012122729851863629,
  0.02122059909021527,
  -0.12272963432948746,
  -0.19569451444401034,
  -0.1023332236394375,
  -0.30588357524948345,
  -0.0008084446222927039,
  0.39313049688344387,
  0.09618345311115861,
  0.3815116114929694,
  0.028141745973814105,
  -0.07827204552764974,
  -0.12723365929430755,
  0.31560650453050915,
  0.3977425033830329,
  0.47115008749322695,
  -0.00802887019058773,
  0.03700108535755029,
  -0.7260730523105768,
  -0.3208945938314071,
  0.11781745998405764,
  0.03852732595603893,
  -0.03379859251672055,
  0.021244749702608864,
  0.04462313785300339,
  0.05446378902388339,
  0.05709880126803722,
  0.10077700947534694],
 [-0.1377126562005336,
  1.0000000000000047,
  0.24829753848504038,
  0.0426276905450225,
  -0.00572324899876784,
  0.27015158241345316

In [133]:
cov_matrix_df = pd.DataFrame(cov_matrix)

In [134]:
# Step 1: Calculate the Covariance Matrix
# cov_matrix = np.cov(scaled_features_array.T)
# cov_matrix

## 2. Eigenvalues and Eigenvectors

In [135]:
# Step 2: Compute the Eigenvalues and Eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

In [136]:
eigenvalues_df = pd.DataFrame(eigenvalues)
eigenvalues_df.head()

Unnamed: 0,0
0,5.45
1,3.01
2,2.62
3,2.3
4,2.1


In [137]:
eigenvectors[:, 30]

array([ 0.08, -0.21,  0.1 , -0.1 ,  0.03,  0.12, -0.16,  0.05, -0.09,
        0.22,  0.07, -0.08,  0.19, -0.14, -0.21, -0.06,  0.17, -0.15,
       -0.07, -0.35,  0.21, -0.41, -0.2 ,  0.02,  0.04,  0.1 , -0.06,
        0.27, -0.19,  0.14,  0.  , -0.32, -0.16, -0.05, -0.01,  0.07,
       -0.03])

In [138]:
eigenvectors_df = pd.DataFrame(eigenvectors)
eigenvectors_df.iloc[:, 30]

0     0.08
1    -0.21
2     0.10
3    -0.10
4     0.03
5     0.12
6    -0.16
7     0.05
8    -0.09
9     0.22
10    0.07
11   -0.08
12    0.19
13   -0.14
14   -0.21
15   -0.06
16    0.17
17   -0.15
18   -0.07
19   -0.35
20    0.21
21   -0.41
22   -0.20
23    0.02
24    0.04
25    0.10
26   -0.06
27    0.27
28   -0.19
29    0.14
30    0.00
31   -0.32
32   -0.16
33   -0.05
34   -0.01
35    0.07
36   -0.03
Name: 30, dtype: float64

In [139]:
eigenvectors_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
0,-0.25,-0.01,-0.16,-0.15,0.11,0.19,-0.29,-0.02,0.12,-0.16,-0.0,0.05,0.24,-0.11,0.02,0.08,-0.06,0.13,-0.0,0.07,0.13,0.09,0.37,0.35,-0.1,-0.22,0.03,0.14,0.05,-0.22,0.08,0.34,0.2,0.1,0.12,0.1,-0.05
1,0.01,0.15,0.27,0.14,0.03,0.23,0.16,-0.0,0.02,-0.06,-0.33,-0.18,0.11,0.1,0.02,0.15,0.14,-0.14,0.0,0.02,-0.02,-0.01,0.08,0.07,-0.02,0.0,0.09,-0.33,0.12,0.01,-0.21,-0.09,-0.02,0.35,0.25,0.13,-0.42
2,0.09,0.14,0.33,0.08,0.08,0.21,-0.11,0.02,0.21,0.17,0.13,0.07,-0.12,-0.12,-0.06,0.11,-0.21,-0.06,0.0,-0.0,-0.04,-0.01,-0.09,-0.04,0.05,-0.06,-0.01,0.11,-0.38,-0.46,0.1,-0.17,-0.16,0.24,0.19,0.03,0.24
3,0.22,0.21,0.0,-0.26,0.05,-0.09,-0.16,-0.31,0.21,0.01,-0.06,-0.18,0.03,0.05,0.16,0.24,0.1,0.06,0.0,-0.48,0.47,0.0,-0.08,0.02,0.07,0.11,0.08,0.0,-0.0,-0.06,-0.1,-0.04,-0.01,-0.05,-0.16,-0.05,0.01
4,-0.25,-0.18,0.03,0.23,-0.04,0.1,0.11,0.35,-0.17,-0.04,0.05,0.23,-0.07,0.09,-0.2,-0.14,-0.09,-0.06,0.0,-0.48,0.5,0.07,0.07,0.02,0.09,0.03,0.04,0.04,-0.04,0.09,0.03,-0.01,-0.07,0.09,0.11,0.0,-0.03
5,-0.03,-0.03,0.36,0.18,0.08,-0.02,0.04,-0.03,0.01,0.18,-0.06,0.05,0.18,0.17,0.22,0.24,-0.15,-0.25,-0.0,-0.04,-0.05,-0.01,-0.15,-0.04,0.0,-0.12,-0.04,0.5,0.03,0.19,0.12,0.28,0.12,-0.08,-0.09,-0.18,-0.2
6,0.01,0.02,-0.17,0.02,-0.01,0.23,-0.13,-0.09,-0.42,-0.05,0.27,-0.11,0.34,0.14,-0.19,0.35,-0.25,-0.22,-0.31,0.02,-0.03,0.07,-0.05,-0.09,0.14,0.07,-0.06,-0.17,0.05,-0.07,-0.16,-0.03,0.02,-0.09,-0.01,-0.07,0.13
7,-0.06,0.07,0.2,0.0,0.07,0.22,0.17,0.05,0.02,-0.33,-0.43,-0.14,0.17,0.15,-0.09,-0.13,0.1,0.03,-0.18,-0.01,0.02,-0.07,-0.05,0.01,-0.11,-0.07,0.03,0.14,-0.02,-0.07,0.05,-0.1,-0.06,-0.44,0.01,0.13,0.4
8,0.07,0.19,0.23,0.05,0.11,0.27,-0.18,0.15,0.33,0.02,0.1,0.11,-0.09,-0.18,-0.16,0.02,-0.22,0.21,-0.22,0.04,0.01,0.04,0.09,0.02,-0.0,-0.08,-0.12,-0.19,0.05,0.5,-0.09,-0.1,0.09,-0.12,-0.25,-0.0,-0.03
9,0.06,-0.26,-0.05,0.17,0.22,-0.32,-0.12,0.01,0.23,-0.27,-0.02,0.11,0.11,0.15,0.12,0.05,0.2,-0.06,-0.47,0.04,0.02,-0.01,-0.02,-0.11,0.11,-0.16,0.01,-0.16,-0.09,0.07,0.22,-0.08,0.06,0.31,0.01,-0.12,0.13


## 3. Sort eigenvectors and eigenvalues by eigenvalue magnitude

In [140]:
sorted_index = np.argsort(eigenvalues)[::-1] # argsort sorts the eigenvalues in ascending order by default
                                             # start:stop:step. Start and stop are ommitted, so the slice is the entire array. The step is -1, so the index decreases by 1 with each step.
                                             # This effectively reverses the order.
sorted_eigenvalues = eigenvalues[sorted_index]
sorted_eigenvectors = eigenvectors[:,sorted_index] # Sorts the columns by index in descending order for each eigenvalue magnitude. Leaves rows the same.

In [141]:
np.argsort(eigenvalues)[2:5:]

array([20, 21, 22])

In [142]:
np.argsort(eigenvalues)[5:2:-1]

array([23, 22, 21])

In [143]:
sorted_index

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 27, 28, 33, 34, 36, 35, 32, 31, 30, 29, 26, 25, 24, 23, 22, 21,
       20, 19, 18])

In [144]:
sorted_eigenvalues

array([5.45, 3.01, 2.62, 2.3 , 2.1 , 1.73, 1.58, 1.48, 1.46, 1.39, 1.26,
       1.13, 1.1 , 1.05, 0.95, 0.9 , 0.84, 0.78, 0.68, 0.65, 0.57, 0.54,
       0.5 , 0.45, 0.4 , 0.39, 0.33, 0.31, 0.23, 0.2 , 0.17, 0.16, 0.12,
       0.09, 0.04, 0.04, 0.  ])

In [145]:
sorted_eigenvectors_df = pd.DataFrame(sorted_eigenvectors)
sorted_eigenvectors_df.head(10)

# eigenvectors_df.iloc[:, 86] appears last in the eigenvectors sorted by eigenvalue magnitude. So it should appear as sorted_eigenvectors_df.iloc[:, 100].

# eigenvectors_df.iloc[:, 100] appears 91st in the eigenvectors sorted by eigenvalue magnitude. So it should appear as sorted_eigenvectors_df.iloc[:, 90].

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
0,-0.25,-0.01,-0.16,-0.15,0.11,0.19,-0.29,-0.02,0.12,-0.16,-0.0,0.05,0.24,-0.11,0.02,0.08,-0.06,0.13,0.14,0.05,0.1,0.12,-0.05,0.1,0.2,0.34,0.08,-0.22,0.03,-0.22,-0.1,0.35,0.37,0.09,0.13,0.07,-0.0
1,0.01,0.15,0.27,0.14,0.03,0.23,0.16,-0.0,0.02,-0.06,-0.33,-0.18,0.11,0.1,0.02,0.15,0.14,-0.14,-0.33,0.12,0.35,0.25,-0.42,0.13,-0.02,-0.09,-0.21,0.01,0.09,0.0,-0.02,0.07,0.08,-0.01,-0.02,0.02,0.0
2,0.09,0.14,0.33,0.08,0.08,0.21,-0.11,0.02,0.21,0.17,0.13,0.07,-0.12,-0.12,-0.06,0.11,-0.21,-0.06,0.11,-0.38,0.24,0.19,0.24,0.03,-0.16,-0.17,0.1,-0.46,-0.01,-0.06,0.05,-0.04,-0.09,-0.01,-0.04,-0.0,0.0
3,0.22,0.21,0.0,-0.26,0.05,-0.09,-0.16,-0.31,0.21,0.01,-0.06,-0.18,0.03,0.05,0.16,0.24,0.1,0.06,0.0,-0.0,-0.05,-0.16,0.01,-0.05,-0.01,-0.04,-0.1,-0.06,0.08,0.11,0.07,0.02,-0.08,0.0,0.47,-0.48,0.0
4,-0.25,-0.18,0.03,0.23,-0.04,0.1,0.11,0.35,-0.17,-0.04,0.05,0.23,-0.07,0.09,-0.2,-0.14,-0.09,-0.06,0.04,-0.04,0.09,0.11,-0.03,0.0,-0.07,-0.01,0.03,0.09,0.04,0.03,0.09,0.02,0.07,0.07,0.5,-0.48,0.0
5,-0.03,-0.03,0.36,0.18,0.08,-0.02,0.04,-0.03,0.01,0.18,-0.06,0.05,0.18,0.17,0.22,0.24,-0.15,-0.25,0.5,0.03,-0.08,-0.09,-0.2,-0.18,0.12,0.28,0.12,0.19,-0.04,-0.12,0.0,-0.04,-0.15,-0.01,-0.05,-0.04,-0.0
6,0.01,0.02,-0.17,0.02,-0.01,0.23,-0.13,-0.09,-0.42,-0.05,0.27,-0.11,0.34,0.14,-0.19,0.35,-0.25,-0.22,-0.17,0.05,-0.09,-0.01,0.13,-0.07,0.02,-0.03,-0.16,-0.07,-0.06,0.07,0.14,-0.09,-0.05,0.07,-0.03,0.02,-0.31
7,-0.06,0.07,0.2,0.0,0.07,0.22,0.17,0.05,0.02,-0.33,-0.43,-0.14,0.17,0.15,-0.09,-0.13,0.1,0.03,0.14,-0.02,-0.44,0.01,0.4,0.13,-0.06,-0.1,0.05,-0.07,0.03,-0.07,-0.11,0.01,-0.05,-0.07,0.02,-0.01,-0.18
8,0.07,0.19,0.23,0.05,0.11,0.27,-0.18,0.15,0.33,0.02,0.1,0.11,-0.09,-0.18,-0.16,0.02,-0.22,0.21,-0.19,0.05,-0.12,-0.25,-0.03,-0.0,0.09,-0.1,-0.09,0.5,-0.12,-0.08,-0.0,0.02,0.09,0.04,0.01,0.04,-0.22
9,0.06,-0.26,-0.05,0.17,0.22,-0.32,-0.12,0.01,0.23,-0.27,-0.02,0.11,0.11,0.15,0.12,0.05,0.2,-0.06,-0.16,-0.09,0.31,0.01,0.13,-0.12,0.06,-0.08,0.22,0.07,0.01,-0.16,0.11,-0.11,-0.02,-0.01,0.02,0.04,-0.47


## 4. Select subset of eigenvectors to form principal components

In [146]:
# Cumulative sum divided by sum.
# Each element represents the marginal variance explained by adding one more principal component.
cumulative_var_explained = np.cumsum(sorted_eigenvalues) / np.sum(sorted_eigenvalues)

In [147]:
sorted_index

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 27, 28, 33, 34, 36, 35, 32, 31, 30, 29, 26, 25, 24, 23, 22, 21,
       20, 19, 18])

In [148]:
sorted_eigenvalues

array([5.45, 3.01, 2.62, 2.3 , 2.1 , 1.73, 1.58, 1.48, 1.46, 1.39, 1.26,
       1.13, 1.1 , 1.05, 0.95, 0.9 , 0.84, 0.78, 0.68, 0.65, 0.57, 0.54,
       0.5 , 0.45, 0.4 , 0.39, 0.33, 0.31, 0.23, 0.2 , 0.17, 0.16, 0.12,
       0.09, 0.04, 0.04, 0.  ])

In [149]:
np.cumsum(sorted_eigenvalues)

array([ 5.45,  8.46, 11.07, 13.37, 15.47, 17.2 , 18.79, 20.27, 21.73,
       23.12, 24.38, 25.51, 26.61, 27.66, 28.6 , 29.5 , 30.35, 31.13,
       31.81, 32.46, 33.02, 33.56, 34.06, 34.51, 34.91, 35.29, 35.63,
       35.94, 36.17, 36.37, 36.54, 36.7 , 36.82, 36.91, 36.96, 37.  ,
       37.  ])

In [150]:
np.sum(sorted_eigenvalues)

37.00000000000001

In [151]:
cumulative_var_explained

array([0.15, 0.23, 0.3 , 0.36, 0.42, 0.46, 0.51, 0.55, 0.59, 0.62, 0.66,
       0.69, 0.72, 0.75, 0.77, 0.8 , 0.82, 0.84, 0.86, 0.88, 0.89, 0.91,
       0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.98, 0.99, 0.99, 1.  ,
       1.  , 1.  , 1.  , 1.  ])

In [152]:
# Finds the indices where cumulative variance explained is at least 95%.
# These indices determine how many PCs are needed to explain at least 95% of the total variance.
# [0][0] To access the first index from the first array
# +1 because Python is 0-indexed
# Returns the number of PCs needed to explain at least 95% of the variance.
num_components = np.where(cumulative_var_explained >= 0.95)[0][0] + 1 

In [153]:
np.where(cumulative_var_explained >= 0.95) # Actually an array nested within an array

(array([25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]),)

In [154]:
num_components

26

In [155]:
pca_components = sorted_eigenvectors[:, :num_components]

In [156]:
pca_components_df = pd.DataFrame(pca_components)
pca_components_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,-0.25,-0.01,-0.16,-0.15,0.11,0.19,-0.29,-0.02,0.12,-0.16,-0.0,0.05,0.24,-0.11,0.02,0.08,-0.06,0.13,0.14,0.05,0.1,0.12,-0.05,0.1,0.2,0.34
1,0.01,0.15,0.27,0.14,0.03,0.23,0.16,-0.0,0.02,-0.06,-0.33,-0.18,0.11,0.1,0.02,0.15,0.14,-0.14,-0.33,0.12,0.35,0.25,-0.42,0.13,-0.02,-0.09
2,0.09,0.14,0.33,0.08,0.08,0.21,-0.11,0.02,0.21,0.17,0.13,0.07,-0.12,-0.12,-0.06,0.11,-0.21,-0.06,0.11,-0.38,0.24,0.19,0.24,0.03,-0.16,-0.17
3,0.22,0.21,0.0,-0.26,0.05,-0.09,-0.16,-0.31,0.21,0.01,-0.06,-0.18,0.03,0.05,0.16,0.24,0.1,0.06,0.0,-0.0,-0.05,-0.16,0.01,-0.05,-0.01,-0.04
4,-0.25,-0.18,0.03,0.23,-0.04,0.1,0.11,0.35,-0.17,-0.04,0.05,0.23,-0.07,0.09,-0.2,-0.14,-0.09,-0.06,0.04,-0.04,0.09,0.11,-0.03,0.0,-0.07,-0.01
5,-0.03,-0.03,0.36,0.18,0.08,-0.02,0.04,-0.03,0.01,0.18,-0.06,0.05,0.18,0.17,0.22,0.24,-0.15,-0.25,0.5,0.03,-0.08,-0.09,-0.2,-0.18,0.12,0.28
6,0.01,0.02,-0.17,0.02,-0.01,0.23,-0.13,-0.09,-0.42,-0.05,0.27,-0.11,0.34,0.14,-0.19,0.35,-0.25,-0.22,-0.17,0.05,-0.09,-0.01,0.13,-0.07,0.02,-0.03
7,-0.06,0.07,0.2,0.0,0.07,0.22,0.17,0.05,0.02,-0.33,-0.43,-0.14,0.17,0.15,-0.09,-0.13,0.1,0.03,0.14,-0.02,-0.44,0.01,0.4,0.13,-0.06,-0.1
8,0.07,0.19,0.23,0.05,0.11,0.27,-0.18,0.15,0.33,0.02,0.1,0.11,-0.09,-0.18,-0.16,0.02,-0.22,0.21,-0.19,0.05,-0.12,-0.25,-0.03,-0.0,0.09,-0.1
9,0.06,-0.26,-0.05,0.17,0.22,-0.32,-0.12,0.01,0.23,-0.27,-0.02,0.11,0.11,0.15,0.12,0.05,0.2,-0.06,-0.16,-0.09,0.31,0.01,0.13,-0.12,0.06,-0.08


## 5. Transform the original data

In [157]:
pca_transformed_data = np.dot(scaled_features_array, pca_components)

In [158]:
scaled_features_array.shape, pca_components.shape

((225, 37), (37, 26))

In [159]:
# Creating a DataFrame of the PCA-transformed data
pca_df = pd.DataFrame(pca_transformed_data, columns=[f'PC{i+1}' for i in range(num_components)])

In [160]:
pca_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26
0,-0.07,0.34,1.66,1.2,-0.23,0.62,0.43,-0.06,0.62,0.25,-0.28,0.34,-0.06,-0.52,-0.66,-0.1,-0.23,-0.26,-0.33,-0.11,-0.24,-0.31,0.45,0.01,0.77,0.39
1,-0.06,3.37,0.45,2.72,-0.57,-0.37,-0.02,-1.22,-1.03,-0.49,0.61,0.1,-0.43,-0.45,0.21,0.13,0.45,0.39,0.38,0.77,1.05,-0.31,0.9,-0.06,-0.05,1.0
2,-0.94,-0.17,0.81,0.61,-0.31,0.51,-0.31,-0.28,0.07,0.44,0.6,-0.23,-0.31,-0.55,0.34,-0.2,0.24,-0.05,-0.2,-0.4,0.02,0.34,0.76,0.98,0.33,0.95
3,-0.82,-0.6,1.8,0.38,-1.93,-0.53,-1.04,-0.22,-0.33,0.22,-0.23,0.33,0.35,0.22,0.28,-0.53,-0.72,0.57,0.19,0.36,0.24,0.49,0.05,0.02,-0.22,-0.46
4,0.3,0.75,1.71,-0.24,-0.0,-0.48,0.12,0.36,-0.77,-0.01,-0.27,0.7,0.06,-0.07,-0.2,-0.28,-0.04,-0.31,-0.06,-0.07,0.3,-0.14,0.13,0.27,0.33,0.12
5,0.95,1.02,1.59,-0.36,-0.17,0.0,0.08,-0.68,-0.14,-0.36,-0.35,-0.42,0.34,0.13,0.2,0.4,0.49,-0.29,-0.67,-0.78,0.18,-0.2,-0.46,-0.2,0.52,-0.01
6,0.13,1.48,1.18,-0.09,-0.72,-0.11,-0.57,1.56,-1.03,-0.28,0.72,0.99,-0.62,-0.6,-0.23,0.18,1.07,-0.47,-0.14,-0.34,0.44,0.93,-0.08,-0.42,-0.19,0.25
7,0.96,1.39,0.58,0.88,0.46,0.05,0.4,1.24,-0.42,0.19,-0.16,0.97,0.68,-0.25,-0.46,-0.11,0.35,0.02,-0.36,-1.27,-0.25,0.82,-0.51,-0.35,-0.21,0.06
8,-1.17,0.28,0.91,1.33,-1.28,0.34,0.02,-0.28,-0.02,0.36,0.08,0.52,0.12,0.3,-0.22,0.01,-0.5,0.17,0.81,0.65,0.2,0.12,0.59,-0.6,0.29,-0.32
9,2.19,0.14,1.25,0.58,-0.26,0.92,-0.74,-0.63,0.84,0.16,0.23,-0.79,-0.96,-0.29,-0.12,0.94,0.28,-0.09,-0.75,0.32,0.35,0.58,0.52,-0.04,0.02,0.52


In [161]:
# Display the shape of the original and the PCA-transformed data
original_shape = scaled_features_array.shape
pca_shape = pca_transformed_data.shape
original_shape, pca_shape, pca_components.shape

((225, 37), (225, 26), (37, 26))

# KNN

In [162]:
pca_df['playing_style'] = aggregate['playing_style'].values
pca_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,playing_style
0,-0.07,0.34,1.66,1.2,-0.23,0.62,0.43,-0.06,0.62,0.25,-0.28,0.34,-0.06,-0.52,-0.66,-0.1,-0.23,-0.26,-0.33,-0.11,-0.24,-0.31,0.45,0.01,0.77,0.39,Versatile
1,-0.06,3.37,0.45,2.72,-0.57,-0.37,-0.02,-1.22,-1.03,-0.49,0.61,0.1,-0.43,-0.45,0.21,0.13,0.45,0.39,0.38,0.77,1.05,-0.31,0.9,-0.06,-0.05,1.0,Speedster
2,-0.94,-0.17,0.81,0.61,-0.31,0.51,-0.31,-0.28,0.07,0.44,0.6,-0.23,-0.31,-0.55,0.34,-0.2,0.24,-0.05,-0.2,-0.4,0.02,0.34,0.76,0.98,0.33,0.95,Route Technician
3,-0.82,-0.6,1.8,0.38,-1.93,-0.53,-1.04,-0.22,-0.33,0.22,-0.23,0.33,0.35,0.22,0.28,-0.53,-0.72,0.57,0.19,0.36,0.24,0.49,0.05,0.02,-0.22,-0.46,Physical - Speedster
4,0.3,0.75,1.71,-0.24,-0.0,-0.48,0.12,0.36,-0.77,-0.01,-0.27,0.7,0.06,-0.07,-0.2,-0.28,-0.04,-0.31,-0.06,-0.07,0.3,-0.14,0.13,0.27,0.33,0.12,Route Technician


In [163]:
train = pca_df[pca_df['playing_style'].notnull()]
test = pca_df[pca_df['playing_style'].isnull()]

In [164]:
train.shape, test.shape

((21, 27), (204, 27))

In [165]:
X_train = train.drop('playing_style', axis = 1)
y_train = train['playing_style']

X_test = test.drop('playing_style', axis = 1)
# y_test = test['playing_style']

In [166]:
def knn(features, test_input, k):
    squared_distance = 0
    for feature in features:
        squared_distance += (X_train[feature] - test_input[feature])**2
    X_train['distance'] = squared_distance**(1/2)
    
    prediction = y_train[X_train['distance'].nsmallest(n=k).index].mode()[0]
    return prediction

In [167]:
pca_df.columns

Index(['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10',
       'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19',
       'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26',
       'playing_style'],
      dtype='object')

In [168]:
features = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10',
       'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19',
       'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26']

In [183]:
X_test['playing_style'] = X_test.apply(lambda x: knn(features, x, 4), axis = 1)

In [184]:
combined_playing_style = pd.concat([train['playing_style'], X_test['playing_style']])
final = aggregate
final['playing_style'] = combined_playing_style


# Checking results

In [185]:
pd.set_option('display.float_format', '{:.2f}'.format)

In [186]:
final = final[['player_name', 'playing_style', 'height_in', 'weight_lbs', '40', 'bench',
       'vertical', 'broad_jump', 'shuttle', '3_cone', 'cross_pct', 'corner_pct', 'out_pct', 'curl_pct', 'post_pct', 'underneath_screen_pct',
       'flat_pct', 'slant_pct', 'wr_screen_pct', 'comeback_pct', 'go_pct', 'in_pct', 'deep_pct', 'play_action_pct', 'rpo_pct',
       'hurry_up_pct', 'difficult_pct', 'deep_sideline_pct', 'possession_saver_pct', 'clutch_catch', 'conversion_catch',
       'redzone_catch', 'adot', 'avg_yac', 'avg_yacon', 'catch_rate', 'yprr', 'slot_rate', 'wide_rate', 'contested_catch_rate'
       ]]

In [187]:
final.head(40)

Unnamed: 0,player_name,playing_style,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone,cross_pct,corner_pct,out_pct,curl_pct,post_pct,underneath_screen_pct,flat_pct,slant_pct,wr_screen_pct,comeback_pct,go_pct,in_pct,deep_pct,play_action_pct,rpo_pct,hurry_up_pct,difficult_pct,deep_sideline_pct,possession_saver_pct,clutch_catch,conversion_catch,redzone_catch,adot,avg_yac,avg_yacon,catch_rate,yprr,slot_rate,wide_rate,contested_catch_rate
0,Justin Jefferson,Versatile,73.25,202,4.43,14,37.5,126,4.27,7.02,0.03,0.1,0.21,0.1,0.07,0.0,0.09,0.07,0.07,0.1,0.11,0.08,0.15,0.24,0.02,0.18,0.26,0.11,0.58,8.0,27,6,10.56,4.88,1.03,0.73,2.62,0.3,0.69,0.56
1,Tyreek Hill,Speedster,68.13,185,4.29,13,40.5,129,4.06,6.53,0.09,0.04,0.13,0.11,0.11,0.0,0.04,0.11,0.08,0.1,0.14,0.05,0.22,0.41,0.15,0.03,0.22,0.09,0.65,3.0,22,2,12.62,4.05,0.49,0.71,3.2,0.42,0.54,0.52
2,Davante Adams,Route Technician,72.88,212,4.56,14,39.5,123,4.3,6.82,0.03,0.06,0.14,0.12,0.08,0.0,0.04,0.08,0.04,0.09,0.19,0.11,0.2,0.2,0.04,0.05,0.36,0.11,0.56,4.0,23,4,12.67,4.93,0.95,0.6,2.45,0.3,0.7,0.44
3,A.J. Brown,Physical - Speedster,72.5,226,4.49,19,36.5,120,4.25,7.0,0.03,0.06,0.12,0.11,0.01,0.01,0.04,0.24,0.04,0.09,0.15,0.1,0.19,0.32,0.3,0.17,0.31,0.14,0.58,2.0,17,4,12.8,6.23,2.18,0.64,2.59,0.26,0.74,0.5
4,Stefon Diggs,Route Technician,72.0,195,4.46,11,35.0,115,4.32,7.03,0.03,0.08,0.11,0.13,0.08,0.01,0.1,0.14,0.06,0.09,0.1,0.06,0.15,0.3,0.17,0.1,0.23,0.09,0.56,5.0,17,7,11.6,3.88,0.93,0.72,2.49,0.34,0.66,0.5
5,CeeDee Lamb,Slot,73.63,198,4.5,11,34.5,124,4.24,7.0,0.08,0.08,0.11,0.15,0.08,0.0,0.1,0.13,0.06,0.06,0.09,0.06,0.17,0.3,0.11,0.1,0.22,0.08,0.52,3.0,20,4,10.63,4.54,1.38,0.72,2.38,0.62,0.36,0.46
6,Jaylen Waddle,Route Technician,69.5,180,4.55,11,34.0,122,4.22,6.99,0.08,0.02,0.1,0.07,0.15,0.0,0.01,0.17,0.07,0.12,0.09,0.14,0.15,0.33,0.15,0.06,0.21,0.07,0.68,2.0,20,4,12.11,6.8,1.99,0.66,2.59,0.25,0.74,0.25
7,DeVonta Smith,Route Technician,72.25,170,4.53,9,34.0,131,4.22,6.95,0.07,0.07,0.15,0.15,0.03,0.02,0.05,0.09,0.13,0.13,0.07,0.04,0.15,0.14,0.17,0.18,0.21,0.1,0.45,1.0,22,5,9.97,5.16,1.03,0.72,1.98,0.25,0.75,0.42
8,Terry McLaurin,Versatile,72.13,208,4.35,18,37.5,125,4.15,7.01,0.08,0.07,0.12,0.1,0.03,0.0,0.05,0.11,0.09,0.08,0.16,0.1,0.23,0.27,0.16,0.08,0.3,0.14,0.61,6.0,16,3,13.37,5.12,1.86,0.67,2.04,0.21,0.79,0.65
9,Amon-Ra St. Brown,Slot,71.5,197,4.61,20,38.5,127,4.26,6.9,0.08,0.03,0.26,0.15,0.06,0.02,0.05,0.11,0.09,0.04,0.01,0.11,0.04,0.23,0.03,0.05,0.21,0.03,0.38,3.0,26,6,6.8,4.87,0.9,0.76,2.4,0.6,0.39,0.38


In [188]:
# final.loc[final['player_name'] == 'Stefon Diggs'].iloc[:,10:22].sum(axis=1).sum()

In [189]:
final['playing_style'].value_counts()

playing_style
Route Technician         61
Physical - Possession    45
Slot                     39
Versatile                31
Physical - Speedster     20
Speedster                19
YAC Specialist           10
Name: count, dtype: int64

In [190]:
final[final['playing_style'] == 'Route Technician']

Unnamed: 0,player_name,playing_style,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone,cross_pct,corner_pct,out_pct,curl_pct,post_pct,underneath_screen_pct,flat_pct,slant_pct,wr_screen_pct,comeback_pct,go_pct,in_pct,deep_pct,play_action_pct,rpo_pct,hurry_up_pct,difficult_pct,deep_sideline_pct,possession_saver_pct,clutch_catch,conversion_catch,redzone_catch,adot,avg_yac,avg_yacon,catch_rate,yprr,slot_rate,wide_rate,contested_catch_rate
2,Davante Adams,Route Technician,72.88,212,4.56,14,39.5,123,4.3,6.82,0.03,0.06,0.14,0.12,0.08,0.0,0.04,0.08,0.04,0.09,0.19,0.11,0.2,0.2,0.04,0.05,0.36,0.11,0.56,4.0,23,4,12.67,4.93,0.95,0.6,2.45,0.3,0.7,0.44
4,Stefon Diggs,Route Technician,72.0,195,4.46,11,35.0,115,4.32,7.03,0.03,0.08,0.11,0.13,0.08,0.01,0.1,0.14,0.06,0.09,0.1,0.06,0.15,0.3,0.17,0.1,0.23,0.09,0.56,5.0,17,7,11.6,3.88,0.93,0.72,2.49,0.34,0.66,0.5
6,Jaylen Waddle,Route Technician,69.5,180,4.55,11,34.0,122,4.22,6.99,0.08,0.02,0.1,0.07,0.15,0.0,0.01,0.17,0.07,0.12,0.09,0.14,0.15,0.33,0.15,0.06,0.21,0.07,0.68,2.0,20,4,12.11,6.8,1.99,0.66,2.59,0.25,0.74,0.25
7,DeVonta Smith,Route Technician,72.25,170,4.53,9,34.0,131,4.22,6.95,0.07,0.07,0.15,0.15,0.03,0.02,0.05,0.09,0.13,0.13,0.07,0.04,0.15,0.14,0.17,0.18,0.21,0.1,0.45,1.0,22,5,9.97,5.16,1.03,0.72,1.98,0.25,0.75,0.42
13,Garrett Wilson,Route Technician,71.75,183,4.38,12,36.0,123,4.36,6.99,0.05,0.1,0.09,0.1,0.05,0.01,0.07,0.14,0.08,0.1,0.14,0.09,0.12,0.17,0.1,0.09,0.36,0.08,0.59,5.0,20,3,11.14,4.63,2.3,0.6,1.85,0.36,0.63,0.36
16,Chris Olave,Route Technician,72.38,187,4.39,12,32.0,124,4.23,6.99,0.01,0.06,0.2,0.18,0.04,0.01,0.03,0.08,0.03,0.13,0.17,0.06,0.24,0.19,0.03,0.07,0.29,0.14,0.67,1.0,20,2,14.79,2.92,0.75,0.63,2.42,0.34,0.66,0.33
26,Diontae Johnson,Route Technician,70.5,183,4.53,15,33.5,123,4.45,7.09,0.03,0.03,0.17,0.17,0.03,0.0,0.03,0.05,0.05,0.19,0.16,0.08,0.15,0.16,0.07,0.09,0.34,0.08,0.53,4.0,16,0,10.96,2.73,0.85,0.61,1.44,0.13,0.87,0.36
27,Drake London,Route Technician,75.88,219,4.55,16,35.5,121,4.28,7.05,0.04,0.06,0.16,0.1,0.07,0.01,0.03,0.13,0.09,0.09,0.14,0.09,0.13,0.19,0.26,0.15,0.28,0.05,0.61,3.0,13,4,10.63,3.21,0.6,0.63,2.07,0.22,0.78,0.54
29,Gabe Davis,Route Technician,74.0,216,4.54,14,35.0,124,4.59,7.08,0.01,0.09,0.12,0.17,0.1,0.0,0.03,0.08,0.01,0.19,0.1,0.11,0.26,0.19,0.11,0.08,0.33,0.12,0.72,2.0,12,3,15.33,3.04,0.44,0.52,1.43,0.09,0.9,0.35
33,Jakobi Meyers,Route Technician,73.63,203,4.63,13,37.0,118,4.23,7.07,0.08,0.08,0.18,0.09,0.1,0.01,0.07,0.07,0.08,0.03,0.1,0.08,0.16,0.21,0.05,0.07,0.24,0.07,0.51,1.0,20,3,10.24,3.52,0.48,0.72,1.9,0.7,0.3,0.57


In [191]:
final[(final['weight_lbs'] >= 220) & (final['40'] < 4.5)]

Unnamed: 0,player_name,playing_style,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone,cross_pct,corner_pct,out_pct,curl_pct,post_pct,underneath_screen_pct,flat_pct,slant_pct,wr_screen_pct,comeback_pct,go_pct,in_pct,deep_pct,play_action_pct,rpo_pct,hurry_up_pct,difficult_pct,deep_sideline_pct,possession_saver_pct,clutch_catch,conversion_catch,redzone_catch,adot,avg_yac,avg_yacon,catch_rate,yprr,slot_rate,wide_rate,contested_catch_rate
3,A.J. Brown,Physical - Speedster,72.5,226,4.49,19,36.5,120,4.25,7.0,0.03,0.06,0.12,0.11,0.01,0.01,0.04,0.24,0.04,0.09,0.15,0.1,0.19,0.32,0.3,0.17,0.31,0.14,0.58,2.0,17,4,12.8,6.23,2.18,0.64,2.59,0.26,0.74,0.5
14,DK Metcalf,Physical - Speedster,75.38,228,4.33,27,40.5,134,4.5,7.38,0.08,0.02,0.1,0.13,0.1,0.01,0.04,0.12,0.01,0.13,0.18,0.07,0.12,0.21,0.07,0.21,0.26,0.08,0.55,1.0,13,5,11.38,2.41,0.71,0.65,1.81,0.17,0.83,0.48
71,Chase Claypool,Physical - Speedster,76.25,238,4.42,19,40.5,126,4.37,7.08,0.07,0.07,0.12,0.14,0.04,0.01,0.09,0.07,0.07,0.1,0.12,0.09,0.21,0.15,0.12,0.06,0.31,0.09,0.49,2.0,12,1,10.93,3.24,1.13,0.61,1.07,0.61,0.39,0.47
97,Julio Jones,Versatile,74.75,220,4.34,17,38.5,135,4.25,6.66,0.05,0.09,0.09,0.09,0.05,0.0,0.02,0.23,0.02,0.05,0.14,0.16,0.28,0.09,0.05,0.33,0.4,0.12,0.58,4.0,5,1,13.79,5.17,1.21,0.56,1.22,0.28,0.72,0.29
183,Dezmon Patmon,Physical - Speedster,76.0,228,4.48,15,36.0,132,4.38,7.28,0.33,0.0,0.17,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.17,0.17,0.33,0.33,0.0,0.33,0.17,0.67,0.0,0,0,12.17,3.5,2.5,0.33,1.6,0.31,0.69,0.0
184,Dareke Young,Physical - Possession,74.0,223,4.44,22,37.0,135,4.19,6.88,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0.5,11.5,6.0,1.0,0.77,0.38,0.59,0.0
185,Simi Fehoko,Physical - Possession,75.88,222,4.43,16,34.5,120,4.26,6.78,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0,0,12.75,3.33,2.0,0.75,1.04,0.38,0.62,0.5
191,Keith Kirkwood,Physical - Possession,74.5,221,4.45,17,35.0,125,4.43,6.94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.5,0.0,0.5,0.25,0.0,0.0,0.25,0.5,0.75,0.0,1,0,13.75,2.0,0.0,0.5,0.26,0.64,0.34,0.5
204,Miles Boykin,Physical - Possession,75.75,220,4.42,12,43.5,140,4.07,6.77,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33,0.33,0.0,0.33,0.0,0.0,0.33,0.33,0.33,0.67,0.0,0,0,15.33,0.0,0.0,0.67,0.38,0.26,0.68,1.0
209,Jalen Camp,Physical - Possession,73.88,226,4.48,29,39.5,125,4.14,7.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0,7.0,0.0,0.0,1.0,1.0,0.38,0.62,1.0


In [192]:
final[(final['player_name'] == 'Julio Jones')]

Unnamed: 0,player_name,playing_style,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone,cross_pct,corner_pct,out_pct,curl_pct,post_pct,underneath_screen_pct,flat_pct,slant_pct,wr_screen_pct,comeback_pct,go_pct,in_pct,deep_pct,play_action_pct,rpo_pct,hurry_up_pct,difficult_pct,deep_sideline_pct,possession_saver_pct,clutch_catch,conversion_catch,redzone_catch,adot,avg_yac,avg_yacon,catch_rate,yprr,slot_rate,wide_rate,contested_catch_rate
97,Julio Jones,Versatile,74.75,220,4.34,17,38.5,135,4.25,6.66,0.05,0.09,0.09,0.09,0.05,0.0,0.02,0.23,0.02,0.05,0.14,0.16,0.28,0.09,0.05,0.33,0.4,0.12,0.58,4.0,5,1,13.79,5.17,1.21,0.56,1.22,0.28,0.72,0.29


In [193]:
final[(final['player_name'] == 'Brandon Aiyuk')]

Unnamed: 0,player_name,playing_style,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone,cross_pct,corner_pct,out_pct,curl_pct,post_pct,underneath_screen_pct,flat_pct,slant_pct,wr_screen_pct,comeback_pct,go_pct,in_pct,deep_pct,play_action_pct,rpo_pct,hurry_up_pct,difficult_pct,deep_sideline_pct,possession_saver_pct,clutch_catch,conversion_catch,redzone_catch,adot,avg_yac,avg_yacon,catch_rate,yprr,slot_rate,wide_rate,contested_catch_rate
20,Brandon Aiyuk,Versatile,71.63,205,4.5,11,40.0,128,4.27,7.02,0.05,0.02,0.16,0.1,0.07,0.0,0.04,0.15,0.06,0.05,0.08,0.23,0.12,0.19,0.11,0.04,0.21,0.08,0.52,4.0,9,6,10.04,4.97,1.36,0.7,1.91,0.24,0.76,0.41
