In [3]:
import csv
import json
import os
import re

from collections import defaultdict, OrderedDict

In [4]:
def remove_special_characters(match_string):
    return re.sub("(\/)|(\d)|(\.)|(')|(-)|(\s+)", '', match_string)

# Constants

In [8]:
nfl_acronym_team_map = {
    'Arizona Cardinals':'ARI', 
    'Atlanta Falcons':'ATL', 
    'Baltimore Ravens':'BAL', 
    'Buffalo Bills':'BUF', 
    'Carolina Panthers':'CAR', 
    'Chicago Bears':'CHI', 
    'Cincinnati Bengals':'CIN', 
    'Cleveland Browns':'CLE', 
    'Dallas Cowboys':'DAL', 
    'Denver Broncos':'DEN', 
    'Detroit Lions':'DET', 
    'Green Bay Packers':'GNB', 
    'Houston Texans':'HOU', 
    'Indianapolis Colts':'IND', 
    'Jacksonville Jaguars':'JAX', 
    'Kansas City Chiefs':'KAN', 
    'Las Vegas Raiders':'LVR', 
    'Los Angeles Chargers':'LAC', 
    'Los Angeles Rams':'LAR', 
    'Miami Dolphins':'MIA', 
    'Minnesota Vikings':'MIN', 
    'New England Patriots':'NWE', 
    'New Orleans Saints':'NOR', 
    'New York Giants':'NYG', 
    'New York Jets':'NYJ', 
    'Oakland Raiders':'OAK', 
    'Philadelphia Eagles':'PHI', 
    'Pittsburgh Steelers':'PIT', 
    'San Diego Chargers':'SDG', 
    'San Francisco 49ers':'SEA', 
    'Seattle Seahawks':'SEA', 
    'St. Louis Rams':'STL', 
    'Tampa Bay Buccaneers':'TAM', 
    'Tennessee Titans': 'TEN', 
    'Washington Football Team':'WAS',
    'Washington Redskins': 'WAS'
}

# Set professional matching data

Create a "match object" for each player in the NFL dataset.

In [14]:
with open('./ProData/cleaned/player_id_info.json', 'r') as nfl_data_file:
    nfl_name_data = json.load(nfl_data_file)

In [15]:
nfl_match_data = {}
for player_id, player_data in nfl_name_data.items():
    match_string = f'{player_data["name"]}{player_data["position"]}'
    matching_data = {
        'nfl_name': player_data['name'],
        'nfl_position': player_data['position'],
        'nfl_id': player_id,
        'nfl_match_string': remove_special_characters(match_string).lower()
    }
    nfl_match_data[player_id] = matching_data

### Add draft data 
Add draft data to the match object for each player

In [53]:
all_draft_data = {}
for _, _, files in os.walk('./ProData/cleaned/draft_data'):
    for file in files:
        with open(f'./ProData/cleaned/draft_data/{file}', 'r') as draft_data_file:
            data = json.load(draft_data_file)
            all_draft_data = {**data, **all_draft_data}

In [62]:
for player_id, plyr_draft_data in all_draft_data.items():
    team_acronym = plyr_draft_data['team']
    draft_pick = plyr_draft_data['draft_pick']
    
    nfl_name = nfl_match_data[player_id]['nfl_name']
    match_string = f'{remove_special_characters(nfl_name)}{draft_pick}'
    
    nfl_match_data[player_id]['nfl_drafted_by'] = plyr_draft_data['team']
    nfl_match_data[player_id]['nfl_overall_pick'] = plyr_draft_data['draft_pick']
    nfl_match_data[player_id]['nfl_match_string'] = match_string.lower()

# Load college data
Create match object for each player from the college dataset

In [69]:
with open('./CollegeData/draftedBy.json', 'r') as college_draft_file:
    college_draft_data = json.load(college_draft_file)

for player in college_draft_data:
    player_id = player['playerId']
    draft_pick = player['draftedOverall']

    match_string = f'{remove_special_characters(player_id)}{draft_pick}'

    player['match_string'] = match_string.lower()
    try:
        player['draftedBy'] = nfl_acronym_team_map[player['draftedBy']]
    except:
        player['invalidTeam'] = True
        continue
    
college_draft_data_clean = [player for player in college_draft_data if 'invalidTeam' not in player]

# Match

## Match NFL players and college players
In order to match college and NFL players we perform fuzzy matching for each professional player against all college players in the dataset. In order to account for misspellings and duplicate names, we decided to match based on name and overall draft position (to make this most accurate, we would have needed the "year drafted" from the college data, which is not present). However, we felt comfortable with the risk that two players with the exact same name would be drafted in the exact same spot over two drafts. 

The matching process is defined as follows:
1. From college and NFL dataset combine the players name and overall draft position
    * Player names were standardized by removing all special characters, white spaces, and numbers and then converted to lower case.
2. In order to be considered for a match, the two strings would need to have a match ratio of **60% or greater**
3. The college player that produces the highest match score will be saved for each NFL player.
4. To protect against false positives, we will consider a match to only be true if both of the following conditions are true:
    * The team that drafted the player from the college dataset matches the team that drafted the player from the NFL dataset
    * The overall draft pick from the college dataset matches the overall draft pick from the NFL dataset.

In [20]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [110]:
matches = []

min_score = 60

for pro_data in nfl_match_data.values():
    
    prev_match_score = 0
    match_college_id = ''
    match_obj = pro_data
    pro_match_string = pro_data['nfl_match_string']
    
    for college_data in college_draft_data_clean:
        match_score = fuzz.ratio(pro_match_string, college_data['match_string'])
        
        
        if match_score > prev_match_score and match_score > min_score:
            match_obj['match_score'] = match_score
            match_obj['college_match_str'] = college_data['match_string']
            match_obj['college_id'] = college_data['playerId']
            match_obj['college_position'] = college_data['draftedPos']
            match_obj['draft_team_college_data'] = college_data['draftedBy']
            match_obj['draft_pick_college_data'] = college_data['draftedOverall']
           
            prev_match_score = match_score
        
        if match_score == 100:
            break
    
    matches.append(match_obj)

In [111]:
no_match = [match_obj for match_obj in matches if 'match_score' not in match_obj]

In [112]:
initial_matches = [match_obj for match_obj in matches if 'match_score' in match_obj]

In [113]:
actual_matches = [match for match in initial_matches 
                 if match['nfl_drafted_by'] == match['draft_team_college_data'] and
                     str(match['nfl_overall_pick']) == match['draft_pick_college_data']]

In [1]:
print(f'No match above 60%: {len(no_match)}')
print(f'Players with match scores greater than 60: {len(initial_matches)}')
print(f'Number of true positives: {len(actual_matches)}')
print(f'Number of false positives: {len(nfl_match_data.values()) - len(actual_matches)}')
print(f'Number of players: {len(nfl_match_data.values())}')

NameError: name 'no_match' is not defined

## Save true positive matches

In [78]:
with open('./matches.json', 'w') as outfile:
    json.dump(actual_matches, outfile)

## Find number of non matches of relevant positions
Since we are not measuring success of players in the following positions: `['G', 'K', 'LS', 'P', 'T', 'OL', 'C']`, we will ignore non matches for these positions

In [95]:
false_matches = [match for match in initial_matches 
                 if match['nfl_drafted_by'] != match['draft_team_college_data'] or
                     str(match['nfl_overall_pick']) != match['draft_pick_college_data']]

In [97]:
no_match += false_matches

1402

In [98]:
positions_in_matchless = set([player['nfl_position'] for player in no_match])

In [100]:
ignore_positions = ['G', 'K', 'LS', 'P', 'T', 'OL', 'C']

In [101]:
relevant_non_matches = [non_match for non_match in no_match if non_match['nfl_position'] not in ignore_positions]

In [102]:
len(relevant_non_matches)

961

## Ignore players drafted after 2019

The college dataset only includes data from players through 2019.

In [103]:
all_draft_data = {}
for root, _, files in os.walk('./ProData/cleaned/draft_data'):
    for file in files:
        position = file.strip('.json')
        
        if position in ignore_positions:
            continue
        
        with open(f'{root}/{file}', 'r') as draft_data_file:
            draft_data = json.load(draft_data_file)
            
        for player_id, data in draft_data.items():
            all_draft_data = {**all_draft_data, **draft_data}

In [104]:
non_matches_with_draft_year = []
for player in relevant_non_matches:
    draft_year = all_draft_data[player['nfl_id']]['year']
    player['draft_year'] = int(re.sub('\*', '', draft_year))

In [105]:
college_data_cutoff_year = 2019

non_matches_pre_2020 = [player for player in relevant_non_matches 
                            if player['draft_year'] <= college_data_cutoff_year]

In [106]:
len(non_matches_pre_2020)

722

## Find percentage of matchless players by year drafted

In [2]:
draft_year_counts_missing_data = defaultdict(int)

for player in non_matches_pre_2020:
    draft_year = player['draft_year']
    draft_year_counts_missing_data[draft_year]+=1

missing_matches_by_year_perc = {}
for year, missing_matches_count in draft_year_counts_missing_data.items():
    missing_matches_by_year_perc[year] = 100 * missing_matches_count / len(non_matches_pre_2020)

sorted(missing_matches_by_year_perc.items(), key=lambda x: x[0])

NameError: name 'defaultdict' is not defined

# Find non matches that are statistically significant 

## Get all player data

In [242]:
season_data = {}

for root, _, files in os.walk('./ProData/results'):
    for file in files:
        draft_year, last_initial = file.strip('.json').split('_')

        with open(f'{root}/{file}', 'r') as player_data_file:
            nfl_player_data = json.load(player_data_file)

        for player_id, all_data in nfl_player_data.items():
            season_data[player_id] = all_data['career_statistics']

In [243]:
career_statistics = {}
for player in eligible_non_matches:
    player_id = player['nfl_id']
    
    career_statistics[player_id] = season_data[player_id].values()
    

In [182]:
from functools import reduce

In [234]:
def get_total_games_and_games_started(acc, curr):
    if 'gs' in curr and curr['gs']:
        acc['gs'] += int(curr['gs']) if len(curr['gs']) <= 2 else 0
    
    if 'g' in curr and curr['g']:
        acc['g'] += int(curr['g']) if len(curr['g']) <= 2 else 0
        
    if 'snaps_played' in curr and 'total_snaps' in curr:
        acc['snaps_played'] += curr['snaps_played']
        acc['total_snaps'] += round(curr['total_snaps'])
        acc['snap_perc'] = acc['snaps_played'] / acc['total_snaps'] if acc['total_snaps'] else 0
    return acc

In [244]:
missing_player_game_data = {}

for player_id, season_stats in career_statistics.items():
    reduce_init = {'gs': 0, 'g': 0, 'snap_perc': 0, 'snaps_played': 0, 'total_snaps': 0}
    game_data = reduce(get_total_games_and_games_started, season_stats, reduce_init)
    missing_player_game_data[player_id] = game_data   

In [245]:
missing_player_game_data

{'AikeWa00': {'gs': 4,
  'g': 93,
  'snap_perc': 0.09418360196215837,
  'snaps_played': 672,
  'total_snaps': 7135},
 'RobbFr20': {'gs': 148,
  'g': 180,
  'snap_perc': 0,
  'snaps_played': 0,
  'total_snaps': 0},
 'BellYe20': {'gs': 107,
  'g': 142,
  'snap_perc': 0.9897674418604652,
  'snaps_played': 2128,
  'total_snaps': 2150},
 'AndrWi20': {'gs': 0,
  'g': 30,
  'snap_perc': 0,
  'snaps_played': 0,
  'total_snaps': 0},
 'EmanKy00': {'gs': 32,
  'g': 72,
  'snap_perc': 0,
  'snaps_played': 0,
  'total_snaps': 0},
 'RobiTr02': {'gs': 8,
  'g': 38,
  'snap_perc': 0.25341560158660204,
  'snaps_played': 575,
  'total_snaps': 2269},
 'VandKy99': {'gs': 137,
  'g': 152,
  'snap_perc': 0.6199226305609284,
  'snaps_played': 641,
  'total_snaps': 1034},
 'FreeDw00': {'gs': 157,
  'g': 222,
  'snap_perc': 0.5209707822808671,
  'snaps_played': 2211,
  'total_snaps': 4244},
 'FreeEd20': {'gs': 0,
  'g': 20,
  'snap_perc': 0,
  'snaps_played': 0,
  'total_snaps': 0},
 'FishTr20': {'gs': 74,
  '

In [257]:
players_who_played = [player_id for player_id, game_info in missing_player_game_data.items()
                                if game_info['snap_perc'] > 0.3]

In [256]:
# before snap data was taken
len([player_id for player_id, game_info in missing_player_game_data.items()
                                if game_info['total_snaps'] == 0])

429