# Pseudo code for KNN classifier

In [1]:
import json
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
pro_qb_data_scores = pd.read_csv('../ProData/qb_career_stats_no_career_totals.csv')
college_qb_data = pd.read_csv('../CollegeData/all_passing.csv')

## Iterate through list of matches and map college id to nfl id

In [3]:
with open('./NFL EDA/matches.json', 'r') as match_file:
  match_data = json.load(match_file)

In [4]:
player_id_tuple = [(match['nfl_id'], match['college_id']) for match in match_data]

nfl_id, college_id = list(zip(*player_id_tuple))
player_ids = pd.DataFrame.from_dict({
  'nfl_id': nfl_id,
  'college_id': college_id
}).set_index('nfl_id')

player_ids.head()

Unnamed: 0_level_0,college_id
nfl_id,Unnamed: 1_level_1
UdezKe20,kenechi-udeze-1
QualEl00,elijah-qualls-1
AmarJa00,jace-amaro-1
AttaJe00,jeremiah-attaochu-1
AdamDa01,davante-adams-1


## Reset pro data index to college player id

In [5]:
target = pro_qb_data_scores.set_index('player_id').join(player_ids, how='inner').set_index('college_id')
print(len(target))
target.head()

169


Unnamed: 0_level_0,season_count,career_score
college_id,Unnamed: 1_level_1,Unnamed: 2_level_1
brandon-allen-2,3,-0.011637
derek-anderson-1,12,-0.125445
matt-barkley-1,6,-0.249274
john-beck-1,2,-0.110608
brooks-bollinger-1,3,-0.235905


## Split Scored Pro data into tiers for model training / testing

In [6]:
TIERS = 5

sorted_scores = target.sort_values(by='career_score').reset_index()
number_of_players = len(sorted_scores)
increment = number_of_players // TIERS


prev_idx = 0
for tier_num, i in enumerate(range(increment, number_of_players, increment)):
  if tier_num == TIERS:
    sorted_scores.loc[prev_idx:, 'tier'] = tier_num
  else:
    sorted_scores.loc[prev_idx:i, 'tier'] = tier_num
    
  prev_idx = i
  
target = sorted_scores.set_index('college_id').drop(['season_count', 'career_score'], axis=1)

## Clean College

In [7]:
players_to_keep = set(target.index.values).intersection(set(college_qb_data.playerId.values))


In [8]:

college_feature_data = college_qb_data.groupby('playerId').sum().loc[players_to_keep]
print(len(college_feature_data))
target = target.loc[players_to_keep]
print(len(target))
target.head()

166
166


  college_feature_data = college_qb_data.groupby('playerId').sum().loc[players_to_keep]
  target = target.loc[players_to_keep]


Unnamed: 0_level_0,tier
college_id,Unnamed: 1_level_1
patrick-ramsey-1,2.0
brooks-bollinger-1,1.0
brady-quinn-1,2.0
marques-tuiasosopo-1,0.0
brett-hundley-1,2.0


## Train Model

### Split train and test data

In [9]:
features = college_feature_data.sort_index()
target = target.sort_index()
print(len(target), print(len(features)))

166
166 None


In [10]:
# X -> features, y -> label
X = features
y = target['tier']
 
# dividing X, y into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

### Train model

In [11]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 8).fit(X_train, y_train)
 
# accuracy on X_test
accuracy = knn.score(X_test, y_test)
print(accuracy)
 
# creating a confusion matrix
knn_predictions = knn.predict(X_test)
cm = confusion_matrix(y_test, knn_predictions)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').