In [62]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os.path
import pickle
from IPython.core.debugger import set_trace


## Scraping the Draft list

In [2]:
def readstat(row, tag, stat):
    """
    Look up the first *tag* element in table *row* whose "data-stat"
    attribute equals *stat* and return its text.

    Returns None when the row contains no such element.
    """
    cell = row.find(tag, attrs={"data-stat": stat})
    return cell.text if cell is not None else None

In [205]:
def create_draft_df(bs_obj):
    """
    Returns draft info dataframe from profootballreference using
    BeautifulSoup object

    Parameters
    ----------
    bs_obj : parsed draft page; it must contain a table with class
        'sortable stats_table'.

    Returns
    -------
    pandas.DataFrame with one row per kept table row and the columns
    'draft_round', every name in ``data_columns``, 'player_link' and
    'college_link'. A link column holds the anchor's href, or "" when the
    cell has no anchor.

    Rows that have no 'player' cell or no 'college_link' cell are skipped.
    """
    # Parse the object that was passed in (not a notebook-level global).
    table = bs_obj.find("table", class_='sortable stats_table').find_all('tr')

    #columns of interest
    data_columns = ['draft_pick', 'team', 'player', 'pos', 'age', 'year_max', 'g', 'college_id']

    drafted = []

    # The first two rows and the last row of the table are not read
    # (presumably header/footer rows -- confirm against the page markup).
    for row in table[2:-1]:
        rowdata = {}

        # Get data column in <TH>
        rowdata['draft_round'] = readstat(row, "th", "draft_round")

        # Get data columns in <TD>
        for col in data_columns:
            rowdata[col] = readstat(row, "td", col)

        # Get player link
        player_cell = row.find("td", attrs={'data-stat': 'player'})
        if player_cell is None:
            continue
        player_anchor = player_cell.find("a")
        rowdata['player_link'] = "" if player_anchor is None else player_anchor.get('href')

        # Get college link
        college_cell = row.find("td", attrs={'data-stat': 'college_link'})
        if college_cell is None:
            continue
        college_anchor = college_cell.find("a")
        rowdata['college_link'] = "" if college_anchor is None else college_anchor.get('href')

        drafted.append(rowdata)

    return pd.DataFrame(drafted)

        

## Scraping the Combine


In [211]:
def create_combine_df(bs_obj):
    """
    Returns Combine info dataframe from profootballreference using
    BeautifulSoup object

    Parameters
    ----------
    bs_obj : parsed combine page. The first four characters of its <h1>
        text are used as the year, and its first <table> supplies the rows.

    Returns
    -------
    pandas.DataFrame with one row per table row that has a
    <th data-stat="player"> cell. Columns are 'player', 'player_link'
    (href of the player's anchor, or '' when there is none), every name in
    ``data_columns`` and 'year' (a string).
    """
    year = bs_obj.h1.text[:4]

    ctable = bs_obj.find("table")

    #columns of interest
    data_columns = ["pos", "school_name", "height", "weight", "forty_yd",
                    "vertical", "bench_reps", "broad_jump", "cone", "shuttle"]

    combine = []

    # The first <tr> of the table is not read.
    for row in ctable.find_all('tr')[1:]:
        rowdata = {}

        try:
            th = row.find("th", attrs={"data-stat": 'player'})
            if not th:
                # no player cell in this row
                continue
            if th.a:
                rowdata['player'] = th.a.text
                rowdata['player_link'] = th.a.get('href')
            else:
                rowdata['player'] = th.text
                rowdata['player_link'] = ''
        except AttributeError:
            # Malformed row: skip it instead of dropping into the debugger,
            # which would hang an unattended "Run All".
            continue

        # Get data columns
        for col in data_columns:
            rowdata[col] = readstat(row, "td", col)

        rowdata['year'] = year

        combine.append(rowdata)

    return pd.DataFrame(combine)

In [212]:
def convertHeighttoIn(x):
    """
    Converts player height in feet-inches to inches

    *x* is expected to be a string of the form "<feet>-<inches>",
    e.g. "6-3" -> 75.

    Returns 0 when *x* cannot be parsed that way: it is not a string
    (None, NaN, ...), it does not split into exactly two parts on "-",
    or either part is not an integer.
    """
    try:
        feet, inches = x.split("-")
        return int(feet) * 12 + int(inches)
    except (AttributeError, TypeError, ValueError):
        # Only parse failures map to 0; KeyboardInterrupt and other
        # unrelated exceptions are allowed to propagate.
        return 0

In [213]:
def calcBMI(h, w):
    """
    Body-mass index from height *h* (inches) and weight *w* (lbs).

    Returns 0 when either input equals 0; otherwise w / h**2 * 703.
    """
    missing_measurement = (h == 0) or (w == 0)
    return 0 if missing_measurement else w / h**2 * 703

In [214]:
def create_combine_df_altsite(bs_obj):
    """
    Build the combine dataframe from the alternative site (used for the
    pre-2000 years).

    Each <tr> of the page's first <table> becomes one record; the i-th
    <td> of a row is stored under the i-th name in ``columns``. Rows
    without any <td> yield an empty record.
    """
    columns = ["year", "player", "school_name", "pos", "height", "weight", "wonderlic",
               "forty_yd", "bench_reps", "vertical", "broad_jump", "shuttle", "cone"]

    def cell_text(td):
        # Use the anchor's text only for cells that have an <a> and no
        # <div>; every other cell contributes its own text.
        if not td.div and td.a:
            return td.a.text
        return td.text

    records = [
        {columns[i]: cell_text(td) for i, td in enumerate(tr.find_all("td"))}
        for tr in bs_obj.find("table").find_all("tr")
    ]

    return pd.DataFrame(records)

In [265]:
# Draft classes to scrape: 1990 through 2010 inclusive.
years = range(1990, 2011, 1)

draft_url = "https://www.pro-football-reference.com/years/{}/draft.htm"
combine_url = "https://www.pro-football-reference.com/draft/{}-combine.htm"
combine_url_alt = "https://nflcombineresults.com/nflcombinedata.php?year={}&pos=&college="


def _load_or_fetch(url, cache_path, label):
    """
    Return the HTTP response for *url*, using *cache_path* as a pickle cache.

    If *cache_path* exists the pickled response is loaded from it;
    otherwise the page is requested, pickled to *cache_path* and returned.
    *label* only prefixes the progress message.

    Raises requests.HTTPError for a 4xx/5xx reply so that an error page is
    never written to the cache.
    """
    if os.path.exists(cache_path):
        print(f"{label} already downloaded; loading from file")
        # NOTE: unpickling executes arbitrary code; this is acceptable only
        # because the cache files are written by this notebook itself.
        with open(cache_path, "rb") as file:
            return pickle.load(file)

    response = requests.get(url)
    response.raise_for_status()

    # Create the cache directory on first use so a fresh checkout works.
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, "wb") as file:
        pickle.dump(response, file)
    print(f"{label} downloaded and saved to file")
    return response


dflist = []

for year in years:
    url_d = draft_url.format(year)
    url_c = combine_url.format(year)
    url_c_alt = combine_url_alt.format(year)
    download_d = "data/{}_draft.pkl".format(year)
    download_c = "data/{}_combine.pkl".format(year)
    download_c_alt = "data/{}_combine_alt.pkl".format(year)

    print(f"Year: {year}")

    # --- draft list for *year* ---
    response = _load_or_fetch(url_d, download_d, f"Draft year {year}")
    bs = BeautifulSoup(response.text, "html5lib")
    df1 = create_draft_df(bs)

    # --- combine results for *year* ---
    if year >= 2000:
        #use primary site
        response = _load_or_fetch(url_c, download_c, f"Combine year {year}")
        bs2 = BeautifulSoup(response.text, "html5lib")
        df2 = create_combine_df(bs2)
    else:
        #use alt site
        response = _load_or_fetch(url_c_alt, download_c_alt, f"ALT Combine year {year}")
        bs2 = BeautifulSoup(response.text, "html5lib")
        df2 = create_combine_df_altsite(bs2)

    #Hall of Fame players have 'HOF' after name
    df1["player"] = df1.player.str.replace('HOF','')

    df1["player"] = df1.player.str.strip()
    df1['year'] = str(year)
    df1['playernametolink'] = df1.player.str.lower()

    df2["player"] = df2.player.str.strip()
    df2['playernametolink'] = df2.player.str.lower()

    # Both sources are joined on the lower-cased player name plus the year
    # (as a string); the left join keeps every drafted player.
    d_and_c = pd.merge(df1, df2, how="left", on=["playernametolink", 'year'])

    d_and_c['height_in'] = d_and_c.height.map(convertHeighttoIn)

    d_and_c['DraftYr'] = year

    dflist.append(d_and_c)

players = pd.concat(dflist)
players.head()
                            

Year: 1990
Draft year 1990 already downloaded; loading from file
ALT Combine year 1990 already downloaded; loading from file
Year: 1991
Draft year 1991 already downloaded; loading from file
ALT Combine year 1991 already downloaded; loading from file
Year: 1992
Draft year 1992 already downloaded; loading from file
ALT Combine year 1992 already downloaded; loading from file
Year: 1993
Draft year 1993 already downloaded; loading from file
ALT Combine year 1993 already downloaded; loading from file
Year: 1994
Draft year 1994 already downloaded; loading from file
ALT Combine year 1994 already downloaded; loading from file
Year: 1995
Draft year 1995 already downloaded; loading from file
ALT Combine year 1995 already downloaded; loading from file
Year: 1996
Draft year 1996 already downloaded; loading from file
ALT Combine year 1996 already downloaded; loading from file
Year: 1997
Draft year 1997 already downloaded; loading from file
ALT Combine year 1997 already downloaded; loading from file


Unnamed: 0,draft_round,draft_pick,team,player_x,pos_x,age,year_max,g,college_id,player_link,...,forty_yd,bench_reps,vertical,broad_jump,shuttle,cone,height_in,DraftYr,player_link_x,player_link_y
0,1,1,IND,Jeff George,QB,22,2001,131,Illinois,/players/G/GeorJe00.htm,...,,,,,,,0,1990,,
1,1,2,NYJ,Blair Thomas,RB,22,1995,64,Penn St.,/players/T/ThomBl00.htm,...,,,,,,,0,1990,,
2,1,3,SEA,Cortez Kennedy,DT,22,2000,167,Miami (FL),/players/K/KennCo00.htm,...,4.95,23.0,,,9.99,9.99,0,1990,,
3,1,4,TAM,Keith McCants,DE,22,1995,88,Alabama,/players/M/McCaKe21.htm,...,,,,,,,0,1990,,
4,1,5,SDG,Junior Seau,LB,21,2009,268,USC,/players/S/SeauJu00.htm,...,,,,,,,0,1990,,


In [281]:
# After the merge the draft-side name column is 'player_x'; call it 'player'.
players = players.rename(columns={"player_x": "player"})

In [282]:
# Drop the combine-side duplicate of the player name. errors="ignore" keeps
# the cell idempotent: re-running it after the column is gone is a no-op
# instead of a KeyError.
players = players.drop(columns=["player_y"], errors="ignore")

In [283]:
# Column names, dtypes and non-null counts of the merged frame.
players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5493 entries, 0 to 253
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   draft_round       5493 non-null   object
 1   draft_pick        5493 non-null   object
 2   team              5493 non-null   object
 3   player            5493 non-null   object
 4   pos_x             5493 non-null   object
 5   age               5493 non-null   object
 6   year_max          5493 non-null   object
 7   g                 5493 non-null   object
 8   college_id        5493 non-null   object
 9   player_link       2691 non-null   object
 10  college_link      5493 non-null   object
 11  year              5493 non-null   object
 12  playernametolink  5493 non-null   object
 13  school_name       4369 non-null   object
 14  pos_y             4369 non-null   object
 15  height            4369 non-null   object
 16  weight            4369 non-null   object
 17  wonderlic      

In [271]:
# Number of merged rows per draft year (missing DraftYr values are counted too).
players.DraftYr.value_counts(dropna = False)

1992    339
1991    336
1990    332
2003    262
2002    260
2005    258
2007    255
2009    255
1996    255
2000    254
2010    254
2006    254
2004    254
1999    252
2008    251
1995    248
2001    245
1997    241
1998    240
1993    227
1994    221
Name: DraftYr, dtype: int64

In [278]:
# Random spot check of 30 merged rows (no random_state, so each run differs).
players.sample(30)


Unnamed: 0,draft_round,draft_pick,team,player_x,pos_x,age,year_max,g,college_id,player_link,...,forty_yd,bench_reps,vertical,broad_jump,shuttle,cone,height_in,DraftYr,player_link_x,player_link_y
103,4,103,PIT,Adrian Cooper,TE,23.0,1996.0,77.0,Oklahoma,/players/C/CoopAd00.htm,...,4.94,19.0,31.5,110.0,4.67,9.99,0,1991,,
190,6,188,HOU,C.C. Brown,DB,22.0,2011.0,83.0,Louisiana,,...,,,,,,,0,2005,/players/B/BrowC.20.htm,
50,2,51,DEN,Allen Aldridge,LB,22.0,2001.0,128.0,Houston,/players/A/AldrAl21.htm,...,4.61,28.0,36.5,121.0,4.35,9.99,0,1994,,
85,3,86,PHI,Daniel Te'o-Nesheim,DE,23.0,2013.0,39.0,Washington,,...,4.73,29.0,37.0,121.0,4.18,6.91,75,2010,/players/T/Teo-Da99.htm,/players/T/Teo-Da99.htm
1,1,2,OAK,Robert Gallery,T,24.0,2011.0,104.0,Iowa,,...,4.98,24.0,30.0,105.0,4.38,7.42,79,2004,/players/G/GallRo20.htm,/players/G/GallRo20.htm
231,7,231,OAK,Clifton Black,DB,,,,Texas St.,,...,,,,,,,0,2000,,
184,7,183,NOR,Scott Hough,G,,,,Maine,,...,,,,,,,0,1990,,
105,4,106,MIN,Antonio Wilson,LB,22.0,2002.0,16.0,Texas A&M-Commerce,,...,4.56,11.0,,,,,74,2000,/players/W/WilsAn20.htm,/players/W/WilsAn20.htm
40,2,41,GNB,Robert Ferguson,WR,21.0,2008.0,83.0,Texas A&M,,...,4.48,,36.5,120.0,4.07,7.24,73,2001,/players/F/FergRo00.htm,/players/F/FergRo00.htm
188,6,189,SEA,Tyler Schmitt,LB,22.0,,,San Diego St.,,...,4.93,21.0,29.0,118.0,4.33,7.03,74,2008,/players/S/SchmTy99.htm,/players/S/SchmTy99.htm


In [263]:
# df2 is the combine frame left by the most recent pass of the scraping loop;
# count its distinct player links.
df2.player_link.nunique()

221

In [258]:
# Show the same combine frame ordered by player link.
df2.sort_values('player_link')

Unnamed: 0,player,player_link,pos,school_name,height,weight,forty_yd,vertical,bench_reps,broad_jump,cone,shuttle,year,playernametolink
207,Richard Mercier,,OG,Miami (FL),6-3,295,5.39,26.0,18,97,7.84,4.78,2000,richard mercier
323,Destry Wright,,RB,Jackson State,5-11,216,4.84,29.5,15,112,7.15,4.41,2000,destry wright
75,Adam Davis,,OG,Oklahoma State,6-4,309,5.66,26.0,,92,8.51,5.00,2000,adam davis
74,Darren Davis,,RB,Iowa State,5-8,189,4.81,32.0,13,109,7.04,4.47,2000,darren davis
73,Joe Dean Davenport,,TE,Arkansas,6-7,268,5.06,30.0,13,108,7.33,4.67,2000,joe dean davenport
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321,Antonio Wilson,/players/W/WilsAn20.htm,OLB,Texas A&M-Commerce,6-2,245,4.56,,11,,,,2000,antonio wilson
322,Cedric Woodard,/players/W/WoodCe20.htm,DT,Texas,6-2,290,5.21,30.5,23,109,,,2000,cedric woodard
324,Spergon Wynn,/players/W/WynnSp00.htm,QB,Texas State,6-3,229,4.91,34.0,,108,7.71,4.59,2000,spergon wynn
325,Bashir Yamini,/players/Y/YamiBa20.htm,WR,Iowa,6-3,191,4.53,33.0,,130,7.09,4.18,2000,bashir yamini


In [273]:
# Persist the merged draft + combine table. The output directory is created
# first so the cell also works on a fresh checkout.
os.makedirs("df", exist_ok=True)
players.to_csv('df/players3.csv')

In [272]:
!pwd


/Users/jmichit/Documents/Metis/linear_regression


## Import College stats


In [157]:
debug

> [0;32m/opt/anaconda3/envs/metis/lib/python3.8/site-packages/pandas/core/generic.py[0m(1684)[0;36m_get_label_or_level_values[0;34m()[0m
[0;32m   1682 [0;31m            [0mvalues[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0maxes[0m[0;34m[[0m[0maxis[0m[0;34m][0m[0;34m.[0m[0mget_level_values[0m[0;34m([0m[0mkey[0m[0;34m)[0m[0;34m.[0m[0m_values[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1683 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1684 [0;31m            [0;32mraise[0m [0mKeyError[0m[0;34m([0m[0mkey[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1685 [0;31m[0;34m[0m[0m
[0m[0;32m   1686 [0;31m        [0;31m# Check for duplicates[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> ll
[1;32m   1640 [0m    [0;34m@[0m[0mfinal[0m[0;34m[0m[0;34m[0m[0m
[1;32m   1641 [0m    [0;32mdef[0m [0m_get_label_or_level_values[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mkey[0m[0;34m:[0m [0mstr[

ipdb> up
> [0;32m/opt/anaconda3/envs/metis/lib/python3.8/site-packages/pandas/core/reshape/merge.py[0m(1033)[0;36m_get_merge_keys[0;34m()[0m
[0;32m   1031 [0;31m                    [0;32mif[0m [0;32mnot[0m [0mis_rkey[0m[0;34m([0m[0mrk[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1032 [0;31m                        [0;32mif[0m [0mrk[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1033 [0;31m                            [0mright_keys[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0mright[0m[0;34m.[0m[0m_get_label_or_level_values[0m[0;34m([0m[0mrk[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1034 [0;31m                        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1035 [0;31m                            [0;31m# work-around for merge_asof(right_index=True)[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> up
> [0;32m/opt/anaconda3/envs

In [9]:
# url = "https://www.sports-reference.com/cfb/players/kion-wilson-1.html"

# response = requests.get(url)

# bs3 = BeautifulSoup(response.text)

In [30]:
def get_college_data(bs_obj, debug = False):
    """
    Summarise a player's college career from a parsed college stats page.

    Parameters
    ----------
    bs_obj : parsed page; its first table with class "stats_table" is read.
    debug : when True, print the collected values before returning.

    Returns
    -------
    {} when the page has no such table. Otherwise a dict with
      "conf", "cname", "cname_link" : conference, school name and school
          link taken from the last row that provides them ("" if none did)
      "cseasons" : number of rows processed
      "cgames"   : sum of the numeric 'g' cells

    Only <tr> elements without a class attribute are considered, and the
    first of those is skipped.
    """
    tbl = bs_obj.find("table", class_="stats_table")

    #No tbl available so return {}
    if tbl is None:
        return {}

    rows = tbl.find_all("tr", attrs={"class": False})

    games = 0
    seasons = 0

    # Defaults so that a table without data rows still produces a result
    # instead of failing on unbound names.
    conf = ""
    cname = ""
    cname_link = ""

    for row in rows[1:]:
        seasons += 1

        games_cell = row.find('td', attrs={"data-stat": "g"})
        if games_cell is not None and games_cell.text.isnumeric():
            games += int(games_cell.text)

        school_cell = row.find('td', attrs={'data-stat': "school_name"})
        if school_cell is not None:
            cname = school_cell.text
            # A school cell without an anchor has no link to report.
            cname_link = school_cell.a.get('href') if school_cell.a is not None else ""

        conf_cell = row.find('td', attrs={'data-stat': "conf_abbr"})
        if conf_cell is not None:
            conf = conf_cell.text

    if debug:
        print("College Conference:  {}".format(conf))
        print("College Name:        {}".format(cname))
        print("Cname link {}".format(cname_link))
        print("College Seasons:     {}".format(seasons))
        print("College Games:       {}".format(games))

    odict = {"conf" : conf,
             "cname" : cname,
             "cname_link" : cname_link,
             "cseasons" : seasons,
             "cgames" : games}

    return odict


In [31]:
def process_player_file(name, url, filestr):
    """
    Return the college summary for one player, caching the raw HTTP response.

    Parameters
    ----------
    name : player name, used only in the progress message.
    url : address of the player's college stats page.
    filestr : path of the pickle file that caches the response for *url*.

    Returns
    -------
    The dict produced by get_college_data() for the page.
    """
    if os.path.exists(filestr):
        #if html file has already been downloaded, use it
        print(f"Player {name} already downloaded; loading from file")
        # NOTE: unpickling executes arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(filestr, "rb") as file:
            response = pickle.load(file)
    else:
        #otherwise pull the data from the url
        response = requests.get(url)
        # Create the per-year cache directory on first use; without it the
        # write below fails on a fresh checkout.
        cache_dir = os.path.dirname(filestr)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(filestr, "wb") as file:
            pickle.dump(response, file)
        print(f"Player {name} downloaded and saved to file")

    soup = BeautifulSoup(response.text, "html5lib")
    return get_college_data(soup)

In [107]:
def load_cdat_by_draftyr(year):
    """
    Collect college data for every row of the global ``players`` frame
    whose DraftYr equals *year*.

    For a row with a non-blank college_link the data comes from
    process_player_file(), cached under "players/<year>/"; otherwise the
    record starts empty. Every record is then tagged with "player",
    "year", "clink" and "plink".

    Returns a DataFrame with one row per matching player.
    """
    cache_dir = f"players/{year}/"
    draft_class = players[players.DraftYr == year]

    records = []
    for _, pick in draft_class.iterrows():
        college_url = pick.college_link
        player_name = pick.player
        player_url = pick.player_link
        # cache file name: the link's basename with .html swapped for .pkl
        cache_file = cache_dir + os.path.basename(college_url).replace('.html', '.pkl')

        if college_url.strip() == '':
            record = {}
        else:
            record = process_player_file(player_name, college_url, cache_file)

        record.update(player=player_name, year=year, clink=college_url, plink=player_url)
        records.append(record)

    return pd.DataFrame(records)

In [284]:
# Build one college-data frame per draft year (1990 through 2010) and stack
# them into a single frame, collegedf.
draft_ls = []

for year in range(1990,2011):
    # progress marker; load_cdat_by_draftyr prints one line per player below it
    print(f"***{year}***")
    df_coll = load_cdat_by_draftyr(year)
    draft_ls.append(df_coll)
    
collegedf = pd.concat(draft_ls)

# Optional export of the combined frame (currently disabled):
#collegedf.to_csv('collegedf2.csv')

collegedf.head()

***1990***
Player Jeff George already downloaded; loading from file
Player Blair Thomas already downloaded; loading from file
Player Keith McCants already downloaded; loading from file
Player Junior Seau already downloaded; loading from file
Player Mark Carrier already downloaded; loading from file
Player Andre Ware already downloaded; loading from file
Player Chris Singleton already downloaded; loading from file
Player James Francis already downloaded; loading from file
Player Percy Snow already downloaded; loading from file
Player Emmitt Smith already downloaded; loading from file
Player Darrell Thompson already downloaded; loading from file
Player Steve Broussard already downloaded; loading from file
Player Ben Smith already downloaded; loading from file
Player Rodney Hampton already downloaded; loading from file
Player Dexter Carter already downloaded; loading from file
Player Alexander Wright already downloaded; loading from file
Player Reggie Rembert already downloaded; loading f

Player Anthony Thompson already downloaded; loading from file
Player Chris Ellison already downloaded; loading from file
Player Derrick Kelson already downloaded; loading from file
Player Terry Anthony already downloaded; loading from file
Player Tommie Stowers already downloaded; loading from file
Player Reginald Warnsley already downloaded; loading from file
Player Daryl Reed already downloaded; loading from file
Player Carnel Smith already downloaded; loading from file
Player John Hudson already downloaded; loading from file
Player Tim Downing already downloaded; loading from file
Player Myron Jones already downloaded; loading from file
Player Shawn McCarthy already downloaded; loading from file
Player Darrell Davis already downloaded; loading from file
Player Donnie Riley already downloaded; loading from file
Player Ventson Donelson already downloaded; loading from file
Player Anthony Cooney already downloaded; loading from file
Player John Gromos already downloaded; loading from f

Player Jerry Evans already downloaded; loading from file
Player Cedric Jackson already downloaded; loading from file
Player Tim Bruton already downloaded; loading from file
Player Mike Dingle already downloaded; loading from file
Player Gary Brown already downloaded; loading from file
Player Larry Horton already downloaded; loading from file
Player Roland Smith already downloaded; loading from file
Player Tony Hargain already downloaded; loading from file
Player O'Neil Glenn already downloaded; loading from file
Player Gerald Hudson already downloaded; loading from file
Player Damon Mays already downloaded; loading from file
Player Howard Griffith already downloaded; loading from file
Player Anthony Wallace already downloaded; loading from file
Player Bruce McGonnigal already downloaded; loading from file
Player Shawn Wiggins already downloaded; loading from file
Player Shane Garrett already downloaded; loading from file
Player Chuck Weatherspoon already downloaded; loading from file
P

Player Deems May already downloaded; loading from file
Player Derek Ware already downloaded; loading from file
Player Russ Campbell already downloaded; loading from file
Player David Wilson already downloaded; loading from file
Player Ken Swilling already downloaded; loading from file
Player Kevin Smith already downloaded; loading from file
Player Kevin Smith already downloaded; loading from file
Player Erick Anderson already downloaded; loading from file
Player Scottie Graham already downloaded; loading from file
Player Elbert Turner already downloaded; loading from file
Player Dave Moore already downloaded; loading from file
Player Jon Bostick already downloaded; loading from file
Player Calvin Holmes already downloaded; loading from file
Player Jason Belser already downloaded; loading from file
Player Anthony McDowell already downloaded; loading from file
Player Eric Blount already downloaded; loading from file
Player Darren Perry already downloaded; loading from file
Player Scott L

Player Kevin Williams already downloaded; loading from file
Player Chuck Belin already downloaded; loading from file
Player John Henry Mills already downloaded; loading from file
Player Chris Gray already downloaded; loading from file
Player Everett Lindsay already downloaded; loading from file
Player Tyrone Hughes already downloaded; loading from file
Player Rich Griffith already downloaded; loading from file
Player Doug Evans already downloaded; loading from file
Player Lawrence Hatch already downloaded; loading from file
Player Richie Anderson already downloaded; loading from file
Player Dave Hoffmann already downloaded; loading from file
Player Greg Jeffries already downloaded; loading from file
Player Deral Boykin already downloaded; loading from file
Player Mitch Lyons already downloaded; loading from file
Player Melvin Bonner already downloaded; loading from file
Player Darryl Morrison already downloaded; loading from file
Player Carlos Etheredge already downloaded; loading from

Player A.J. Ofodile already downloaded; loading from file
Player Tony Vinson already downloaded; loading from file
Player Lamont Warren already downloaded; loading from file
Player Steve Hawkins already downloaded; loading from file
Player Rickey Brady already downloaded; loading from file
Player Max Lane already downloaded; loading from file
Player Jay Kearney already downloaded; loading from file
Player Lloyd Hill already downloaded; loading from file
Player Robert Strait already downloaded; loading from file
Player Terry Samuels already downloaded; loading from file
Player Ruffin Hamilton already downloaded; loading from file
Player Derrell Mitchell already downloaded; loading from file
Player Jim Miller already downloaded; loading from file
Player Jocelyn Borgella already downloaded; loading from file
Player Anthony Daigle already downloaded; loading from file
Player Lee Gissendaner already downloaded; loading from file
Player Darren Studstill already downloaded; loading from file


Player Kevin Hickman already downloaded; loading from file
Player Steve Strahan already downloaded; loading from file
Player Jerry Colquitt already downloaded; loading from file
Player Cory Schlesinger already downloaded; loading from file
Player Kenny Gales already downloaded; loading from file
Player Jeff Kopp already downloaded; loading from file
Player Dino Philyaw already downloaded; loading from file
Player Terrell Davis already downloaded; loading from file
Player Craig Whelihan already downloaded; loading from file
Player Barron Miles already downloaded; loading from file
Player Eddie Goines already downloaded; loading from file
Player Charles Way already downloaded; loading from file
Player Fred McCrary already downloaded; loading from file
Player Chad Cota already downloaded; loading from file
Player Kevin Bouie already downloaded; loading from file
Player C.J. Richardson already downloaded; loading from file
Player Billy Williams already downloaded; loading from file
Player 

Player Leon Neal already downloaded; loading from file
Player Tony Johnson already downloaded; loading from file
Player Stephen Pitts already downloaded; loading from file
Player Phillip Riley already downloaded; loading from file
Player Hayward Clay already downloaded; loading from file
Player Spence Fischer already downloaded; loading from file
Player Toderick Malone already downloaded; loading from file
Player T.J. Cunningham already downloaded; loading from file
Player Chris Hayes already downloaded; loading from file
Player Lovett Purnell already downloaded; loading from file
Player Mike Archie already downloaded; loading from file
Player Reggie Rusk already downloaded; loading from file
Player Freddie Bradley already downloaded; loading from file
Player Adrian Robinson already downloaded; loading from file
Player L.T. Levine already downloaded; loading from file
Player Brian Gragert already downloaded; loading from file
Player Sean Manuel already downloaded; loading from file
Pla

Player Tony Graziani already downloaded; loading from file
Player Terry Battle already downloaded; loading from file
Player Koy Detmer already downloaded; loading from file
Player Carlos Jones already downloaded; loading from file
Player Chris Miller already downloaded; loading from file
Player William Carr already downloaded; loading from file
Player Chris Bayne already downloaded; loading from file
Player Mike Adams already downloaded; loading from file
Player Byron Capers already downloaded; loading from file
Player Pat Fitzgerald already downloaded; loading from file
Player Pat Fitzgerald already downloaded; loading from file
Player Kris Mangum already downloaded; loading from file
Player Jerald Sowell already downloaded; loading from file
Player Marcus Harris already downloaded; loading from file
Player Wally Richardson already downloaded; loading from file
***1998***
Player Peyton Manning already downloaded; loading from file
Player Ryan Leaf already downloaded; loading from file

Player Chester Burnett already downloaded; loading from file
Player Brandon Tolbert already downloaded; loading from file
Player Ryan Thelwell already downloaded; loading from file
Player Eric Warfield already downloaded; loading from file
Player Chad Overhauser already downloaded; loading from file
Player Eddie Watson already downloaded; loading from file
Player Nate Wayne already downloaded; loading from file
Player Damian Vaughn already downloaded; loading from file
Player Tarik Smith already downloaded; loading from file
Player Ernest Blackwell already downloaded; loading from file
Player Tony Darden already downloaded; loading from file
Player Pat Tillman already downloaded; loading from file
Player Jim Turner already downloaded; loading from file
Player Corey Gaines already downloaded; loading from file
Player Moses Moreno already downloaded; loading from file
Player Ron Janes already downloaded; loading from file
Player Kio Sanford already downloaded; loading from file
Player Ja

Player Andre Weathers already downloaded; loading from file
Player Dennis McKinley already downloaded; loading from file
Player Madre Hill already downloaded; loading from file
Player Jed Weaver already downloaded; loading from file
Player Hunter Smith already downloaded; loading from file
Player Anthony Poindexter already downloaded; loading from file
Player Tim Alexander already downloaded; loading from file
Player Billy Miller already downloaded; loading from file
Player Chad Kelsay already downloaded; loading from file
Player Phil Glover already downloaded; loading from file
Player JoJuan Armour already downloaded; loading from file
Player Michael Bishop already downloaded; loading from file
Player Kris Brown already downloaded; loading from file
Player Mike Lucky already downloaded; loading from file
Player Sheldon Jackson already downloaded; loading from file
Player Autry Denson already downloaded; loading from file
Player Justin Swift already downloaded; loading from file
Player

Player Mike Anderson already downloaded; loading from file
Player David Gibson already downloaded; loading from file
Player Michael Hawthorne already downloaded; loading from file
Player Emanuel Smith already downloaded; loading from file
Player Robaire Smith already downloaded; loading from file
Player Matt Bowen already downloaded; loading from file
Player Tom Brady already downloaded; loading from file
Player Sherrod Gideon already downloaded; loading from file
Player Todd Husak already downloaded; loading from file
Player Jason Gavadza already downloaded; loading from file
Player Tim Rattay already downloaded; loading from file
Player Mike Green already downloaded; loading from file
Player Mike Green already downloaded; loading from file
Player Jarious Jackson already downloaded; loading from file
Player Sekou Sanyika already downloaded; loading from file
Player Rashidi Barnes already downloaded; loading from file
Player Casey Tisdale already downloaded; loading from file
Player Mo

Player John Markham already downloaded; loading from file
Player Chris Barnes already downloaded; loading from file
Player Hakim Akbar already downloaded; loading from file
Player Michael Jameson already downloaded; loading from file
Player Bobby Newcombe already downloaded; loading from file
Player Cedrick Wilson already downloaded; loading from file
Player Josh Booty already downloaded; loading from file
Player Jason Glenn already downloaded; loading from file
Player Jameel Cook already downloaded; loading from file
Player Dee Brown already downloaded; loading from file
Player Josh Heupel already downloaded; loading from file
Player Tony Driver already downloaded; loading from file
Player Rashad Holman already downloaded; loading from file
Player Ellis Wyms already downloaded; loading from file
Player Rick Crowell already downloaded; loading from file
Player Kevin Kasper already downloaded; loading from file
Player Dan Alexander already downloaded; loading from file
Player Jason Doer

Player John Owens already downloaded; loading from file
Player Justin Bannan already downloaded; loading from file
Player Bobby Gray already downloaded; loading from file
Player Andra Davis already downloaded; loading from file
Player Herb Haygood already downloaded; loading from file
Player Kyle Johnson already downloaded; loading from file
Player Kevin McCadam already downloaded; loading from file
Player Jason McAddley already downloaded; loading from file
Player Jake Schifino already downloaded; loading from file
Player Nick Greisen already downloaded; loading from file
Player Ramon Walker already downloaded; loading from file
Player Terry Jones already downloaded; loading from file
Player Aaron Kampman already downloaded; loading from file
Player Jermaine Phillips already downloaded; loading from file
Player Kurt Kittner already downloaded; loading from file
Player Andre Lott already downloaded; loading from file
Player Robert Royal already downloaded; loading from file
Player Omar

Player Donald Strickland already downloaded; loading from file
Player Julian Battle already downloaded; loading from file
Player Chris Brown already downloaded; loading from file
Player Chris Brown already downloaded; loading from file
Player Angelo Crowell already downloaded; loading from file
Player Billy McMullen already downloaded; loading from file
Player Justin Fargas already downloaded; loading from file
Player Chris Simms already downloaded; loading from file
Player Dennis Weathersby already downloaded; loading from file
Player Artose Pinner already downloaded; loading from file
Player Todd Johnson already downloaded; loading from file
Player Domanick Williams already downloaded; loading from file
Player Bradie James already downloaded; loading from file
Player George Wrighster already downloaded; loading from file
Player Onterrio Smith already downloaded; loading from file
Player Shaun McDonald already downloaded; loading from file
Player DeJuan Groce already downloaded; loadi

Player Chris Perry already downloaded; loading from file
Player Jason Babin already downloaded; loading from file
Player Chris Gamble already downloaded; loading from file
Player Michael Jenkins already downloaded; loading from file
Player Kevin Jones already downloaded; loading from file
Player Rashaun Woods already downloaded; loading from file
Player Ben Watson already downloaded; loading from file
Player Karlos Dansby already downloaded; loading from file
Player Igor Olshansky already downloaded; loading from file
Player Teddy Lehman already downloaded; loading from file
Player Daryl Smith already downloaded; loading from file
Player Ben Troupe already downloaded; loading from file
Player Tatum Bell already downloaded; loading from file
Player Travis LaBoy already downloaded; loading from file
Player Julius Jones already downloaded; loading from file
Player Bob Sanders already downloaded; loading from file
Player Jake Grove already downloaded; loading from file
Player Dontarrious T

Player Derrick Pope already downloaded; loading from file
Player Jacques Reeves already downloaded; loading from file
Player Donnie Jones already downloaded; loading from file
Player Matt Mauck already downloaded; loading from file
Player David Kimball already downloaded; loading from file
Player Jared Clauss already downloaded; loading from file
Player Michael Gaines already downloaded; loading from file
Player Christian Morton already downloaded; loading from file
Player Trevor Johnson already downloaded; loading from file
Player Derrick Ward already downloaded; loading from file
Player Rashad Washington already downloaded; loading from file
Player Erik Jensen already downloaded; loading from file
Player Colby Bockwoldt already downloaded; loading from file
Player Sean McHugh already downloaded; loading from file
Player Bruce Perry already downloaded; loading from file
Player Derek Abney already downloaded; loading from file
Player Courtney Anderson already downloaded; loading from f

Player Lance Mitchell already downloaded; loading from file
Player Ben Emanuel already downloaded; loading from file
Player Tyjuan Hagler already downloaded; loading from file
Player Rasheed Marshall already downloaded; loading from file
Player Anttaj Hawthorne already downloaded; loading from file
Player Nick Speegle already downloaded; loading from file
Player Bo Scaife already downloaded; loading from file
Player Mike Montgomery already downloaded; loading from file
Player Chris Harris already downloaded; loading from file
Player Cedric Houston already downloaded; loading from file
Player Jared Newberry already downloaded; loading from file
Player Bill Swancutt already downloaded; loading from file
Player Chad Owens already downloaded; loading from file
Player Eric Moore already downloaded; loading from file
Player Will Svitek already downloaded; loading from file
Player Jovan Haye already downloaded; loading from file
Player Tab Perry already downloaded; loading from file
Player Da

Player Cory Rodgers already downloaded; loading from file
Player Ko Simpson already downloaded; loading from file
Player Garrett Mills already downloaded; loading from file
Player Jason Avant already downloaded; loading from file
Player Leon Williams already downloaded; loading from file
Player Demetrius Williams already downloaded; loading from file
Player Victor Adeyanju already downloaded; loading from file
Player Will Blackmon already downloaded; loading from file
Player Stephen Tulloch already downloaded; loading from file
Player Leon Washington already downloaded; loading from file
Player Stephen Gostkowski already downloaded; loading from file
Player Brandon Marshall already downloaded; loading from file
Player Jamar Williams already downloaded; loading from file
Player Nate Salley already downloaded; loading from file
Player Alan Zemaitis already downloaded; loading from file
Player Domata Peko already downloaded; loading from file
Player Barry Cofield already downloaded; loadi

Player Robert Meachem already downloaded; loading from file
Player Joe Staley already downloaded; loading from file
Player Ben Grubbs already downloaded; loading from file
Player Craig Davis already downloaded; loading from file
Player Greg Olsen already downloaded; loading from file
Player Anthony Gonzalez already downloaded; loading from file
Player Alan Branch already downloaded; loading from file
Player Paul Posluszny already downloaded; loading from file
Player Arron Sears already downloaded; loading from file
Player Kevin Kolb already downloaded; loading from file
Player Eric Weddle already downloaded; loading from file
Player Zach Miller already downloaded; loading from file
Player Justin Blalock already downloaded; loading from file
Player John Beck already downloaded; loading from file
Player Chris Houston already downloaded; loading from file
Player Tony Ugoh already downloaded; loading from file
Player Drew Stanton already downloaded; loading from file
Player Sidney Rice alr

Player Mike Richardson already downloaded; loading from file
Player Jordan Palmer already downloaded; loading from file
Player Ryan Smith already downloaded; loading from file
Player Prescott Burgess already downloaded; loading from file
Player Corey Hilliard already downloaded; loading from file
Player Jordan Kent already downloaded; loading from file
Player Oscar Lua already downloaded; loading from file
Player Chase Pittman already downloaded; loading from file
Player Chris Denman already downloaded; loading from file
Player Ben Patrick already downloaded; loading from file
Player Tyler Ecker already downloaded; loading from file
Player Zach Diles already downloaded; loading from file
Player Kelvin Smith already downloaded; loading from file
Player Marvin Mitchell already downloaded; loading from file
Player Trumaine McBride already downloaded; loading from file
Player Derek Schouman already downloaded; loading from file
Player Michael Johnson already downloaded; loading from file
P

Player Martin Rucker already downloaded; loading from file
Player Dwight Lowery already downloaded; loading from file
Player Reggie Corner already downloaded; loading from file
Player Dre Moore already downloaded; loading from file
Player Kenny Iwebema already downloaded; loading from file
Player Quintin Demps already downloaded; loading from file
Player Xavier Adibi already downloaded; loading from file
Player Jack Williams already downloaded; loading from file
Player Craig Steltz already downloaded; loading from file
Player Red Bryant already downloaded; loading from file
Player Tashard Choice already downloaded; loading from file
Player Bryan Kehl already downloaded; loading from file
Player Justin Tryon already downloaded; loading from file
Player Lavelle Hawkins already downloaded; loading from file
Player Jacob Tamme already downloaded; loading from file
Player Keenan Burton already downloaded; loading from file
Player Jonathan Wilhite already downloaded; loading from file
Player

Player Alphonso Smith already downloaded; loading from file
Player Rey Maualuga already downloaded; loading from file
Player Eben Britton already downloaded; loading from file
Player Ron Brace already downloaded; loading from file
Player Darius Butler already downloaded; loading from file
Player Jairus Byrd already downloaded; loading from file
Player Everette Brown already downloaded; loading from file
Player Pat White already downloaded; loading from file
Player Clint Sintim already downloaded; loading from file
Player Connor Barwin already downloaded; loading from file
Player Mike Mitchell already downloaded; loading from file
Player Darcel McBath already downloaded; loading from file
Player Max Unger already downloaded; loading from file
Player Mohamed Massaquoi already downloaded; loading from file
Player Andy Levitre already downloaded; loading from file
Player David Veikune already downloaded; loading from file
Player LeSean McCoy already downloaded; loading from file
Player Phi

Player Ellis Lankster already downloaded; loading from file
Player Eddie Williams already downloaded; loading from file
Player Pat McAfee already downloaded; loading from file
Player Troy Nolan already downloaded; loading from file
Player Demetrius Byrd already downloaded; loading from file
Player A.Q. Shipley already downloaded; loading from file
Player Lydon Murtha already downloaded; loading from file
Player Manuel Johnson already downloaded; loading from file
Player Moise Fokou already downloaded; loading from file
Player Jamarca Sanford already downloaded; loading from file
Player Julian Edelman already downloaded; loading from file
Player Sammie Stroughter already downloaded; loading from file
Player Zack Follett already downloaded; loading from file
Player Jaimie Thomas already downloaded; loading from file
Player Jake O'Connell already downloaded; loading from file
Player Stoney Woodson already downloaded; loading from file
Player Ryan Durand already downloaded; loading from fi

Player Eric Norwood already downloaded; loading from file
Player E.J. Wilson already downloaded; loading from file
Player Jason Fox already downloaded; loading from file
Player Jacques McClendon already downloaded; loading from file
Player O'Brien Schofield already downloaded; loading from file
Player Roddrick Muckelroy already downloaded; loading from file
Player Michael Hoomanawanui already downloaded; loading from file
Player Kam Chancellor already downloaded; loading from file
Player Ricky Sapp already downloaded; loading from file
Player Dominique Franks already downloaded; loading from file
Player Kendrick Lewis already downloaded; loading from file
Player Perrish Cox already downloaded; loading from file
Player Walter McFadden already downloaded; loading from file
Player John Conner already downloaded; loading from file
Player Ed Wang already downloaded; loading from file
Player Joshua Moore already downloaded; loading from file
Player Cameron Sheffield already downloaded; loadi

Unnamed: 0,conf,cname,cname_link,cseasons,cgames,player,year,clink,plink
0,Big Ten,Illinois,/cfb/schools/illinois/1989.html,3.0,34.0,Jeff George,1990,http://www.sports-reference.com/cfb/players/je...,/players/G/GeorJe00.htm
1,Ind,Penn State,/cfb/schools/penn-state/1989.html,4.0,44.0,Blair Thomas,1990,http://www.sports-reference.com/cfb/players/bl...,/players/T/ThomBl00.htm
2,,,,,,Cortez Kennedy,1990,,/players/K/KennCo00.htm
3,SEC,Alabama,/cfb/schools/alabama/1989.html,2.0,22.0,Keith McCants,1990,http://www.sports-reference.com/cfb/players/ke...,/players/M/McCaKe21.htm
4,Pac-10,USC,/cfb/schools/southern-california/1989.html,1.0,12.0,Junior Seau,1990,http://www.sports-reference.com/cfb/players/ju...,/players/S/SeauJu00.htm


In [289]:
# Persist the college-stats frame to disk; with pandas' defaults the index is written as the first CSV column.
collegedf.to_csv('df/collegedf3.csv')

In [288]:
# Spot-check 34 random rows of the college data.
# random_state pins the draw so a re-run of the notebook shows the same rows.
collegedf.sample(34, random_state=0)

Unnamed: 0,conf,cname,cname_link,cseasons,cgames,player,year,clink,plink
190,Big 12,Kansas,/cfb/schools/kansas/2009.html,3.0,37.0,Dezmon Briscoe,2010,http://www.sports-reference.com/cfb/players/de...,
207,,,,,,Frank Walker,2003,,
83,Big West,Northern Illinois,/cfb/schools/northern-illinois/1993.html,2.0,22.0,LeShon Johnson,1994,http://www.sports-reference.com/cfb/players/le...,/players/J/JohnLe01.htm
168,,,,,,Dwayne White,1990,,/players/W/WhitDw20.htm
240,,,,,,Sean Morey,1999,,/players/M/MoreSe00.htm
22,Big Ten,Michigan,/cfb/schools/michigan/1994.html,2.0,24.0,Ty Law,1995,http://www.sports-reference.com/cfb/players/ty...,/players/L/LawxTy00.htm
174,ACC,Florida State,/cfb/schools/florida-state/2007.html,3.0,36.0,Geno Hayes,2008,http://www.sports-reference.com/cfb/players/ge...,
173,Pac-10,UCLA,/cfb/schools/ucla/2004.html,4.0,46.0,Ben Emanuel,2005,http://www.sports-reference.com/cfb/players/be...,
136,,,,,,Joe Horn,1996,,/players/H/HornJo00.htm
141,,,,,,Terry Charles,2002,,


In [299]:
# Check dtypes and per-column non-null counts (shows how many rows actually carry college data).
collegedf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5493 entries, 0 to 253
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   conf        3361 non-null   object 
 1   cname       3361 non-null   object 
 2   cname_link  3361 non-null   object 
 3   cseasons    3361 non-null   float64
 4   cgames      3361 non-null   float64
 5   player      5493 non-null   object 
 6   year        5493 non-null   int64  
 7   clink       5493 non-null   object 
 8   plink       2691 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 429.1+ KB


In [302]:
# Give both frames a shared "year2" join key.
# players.year is copied unchanged -- NOTE(review): it is compared against strings such as
# '2001' elsewhere in this notebook, so it is presumably already str; confirm.
players["year2"] = players["year"]
# collegedf.year is numeric, so cast it to str to make it comparable with the players key.
collegedf["year2"] = collegedf["year"].astype(str)

In [303]:
# Left-join the college data onto the draft records by player name and draft year (string key).
# NOTE(review): (player, year2) is not a unique key -- same-named players drafted in the same
# year match each other's college rows, so new_df can end up with more rows than players.
# Confirm this is acceptable or join on a stricter key.
new_df = players.merge(
    collegedf,
    on=['player', 'year2'],
    how='left',
)

In [304]:
# Inspect the merged frame: its row count and per-column non-null counts show how the join behaved.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5619 entries, 0 to 5618
Data columns (total 37 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   draft_round       5619 non-null   object 
 1   draft_pick        5619 non-null   object 
 2   team              5619 non-null   object 
 3   player            5619 non-null   object 
 4   pos_x             5619 non-null   object 
 5   age               5619 non-null   object 
 6   year_max          5619 non-null   object 
 7   g                 5619 non-null   object 
 8   college_id        5619 non-null   object 
 9   player_link       2787 non-null   object 
 10  college_link      5619 non-null   object 
 11  year_x            5619 non-null   object 
 12  playernametolink  5619 non-null   object 
 13  school_name       4493 non-null   object 
 14  pos_y             4493 non-null   object 
 15  height            4493 non-null   object 
 16  weight            4493 non-null   object 


In [305]:
# Spot-check 40 random rows of the merged frame.
# random_state pins the draw so a re-run of the notebook shows the same rows.
new_df.sample(40, random_state=0)

Unnamed: 0,draft_round,draft_pick,team,player,pos_x,age,year_max,g,college_id,player_link,...,player_link_y,year2,conf,cname,cname_link,cseasons,cgames,year_y,clink,plink
5165,2,56,IND,Fili Moala,DT,24.0,2014.0,64.0,USC,,...,/players/M/MoalFi99.htm,2009,Pac-10,USC,/cfb/schools/southern-california/2008.html,4.0,51.0,2009,http://www.sports-reference.com/cfb/players/fi...,
2944,5,158,NOR,Austin Wheatley,TE,22.0,2000.0,4.0,Iowa,,...,/players/W/WheaAu20.htm,2000,Big Ten,Iowa,/cfb/schools/iowa/1999.html,3.0,33.0,2000,http://www.sports-reference.com/cfb/players/au...,
1320,1,9,CLE,Antonio Langham,DB,22.0,2000.0,102.0,Alabama,/players/L/LangAn20.htm,...,,1994,SEC,Alabama,/cfb/schools/alabama/1993.html,4.0,44.0,1994,http://www.sports-reference.com/cfb/players/an...,/players/L/LangAn20.htm
4260,5,174,SFO,Rasheed Marshall,WR,24.0,2005.0,12.0,West Virginia,,...,/players/M/MarsRa00.htm,2005,Big East,West Virginia,/cfb/schools/west-virginia/2004.html,4.0,39.0,2005,http://www.sports-reference.com/cfb/players/ra...,
4808,6,204,TEN,Jacob Ford,DE,24.0,2010.0,43.0,Central Arkansas,,...,/players/F/FordJa99.htm,2007,,,,,,2007,,
3195,5,153,NOR,Onomo Ojo,WR,,,,California-Davis,,...,,2001,,,,,,2001,,
5111,1,2,STL,Jason Smith,T,23.0,2012.0,45.0,Baylor,,...,/players/S/SmitJa22.htm,2009,Big 12,Baylor,/cfb/schools/baylor/2007.html,3.0,30.0,2009,http://www.sports-reference.com/cfb/players/ja...,
3785,7,235,DEN,Ahmaad Galloway,RB,,,,Alabama,,...,,2003,SEC,Alabama,/cfb/schools/alabama/2002.html,4.0,38.0,2003,http://www.sports-reference.com/cfb/players/ah...,
1212,5,131,HOU,John Henry Mills,LB,23.0,1999.0,100.0,Wake Forest,/players/M/MillJo00.htm,...,,1993,ACC,Wake Forest,/cfb/schools/wake-forest/1992.html,5.0,54.0,1993,http://www.sports-reference.com/cfb/players/jo...,/players/M/MillJo00.htm
2190,5,141,CHI,Van Hiles,DB,21.0,1997.0,16.0,Kentucky,/players/H/HileVa20.htm,...,,1997,SEC,Kentucky,/cfb/schools/kentucky/1996.html,3.0,29.0,1997,http://www.sports-reference.com/cfb/players/va...,/players/H/HileVa20.htm


In [306]:
# Save the merged draft/college frame; with pandas' defaults the index is written as the first CSV column.
new_df.to_csv('df/newdf.csv')

In [131]:
# Look up a single draft record in players by its profile link.
mask = players["player_link"].eq('/players/G/GeorJe00.htm')
players.loc[mask]

Unnamed: 0,draft_round,draft_pick,team,player,pos_x,age,year_max,g,college_id,player_link,...,weight,wonderlic,forty_yd,bench_reps,vertical,broad_jump,shuttle,cone,height_in,DraftYr
0,1,1,IND,Jeff George,QB,22,2001,131,Illinois,/players/G/GeorJe00.htm,...,,,,,,,,,0,1990


In [130]:
# Look up the matching college row in collegedf by the same profile link.
mask = collegedf["plink"].eq('/players/G/GeorJe00.htm')
collegedf.loc[mask]

Unnamed: 0,conf,cname,cname_link,cseasons,cgames,player,year,clink,plink
0,Big Ten,Illinois,/cfb/schools/illinois/1989.html,3.0,34.0,Jeff George,1990,http://www.sports-reference.com/cfb/players/je...,/players/G/GeorJe00.htm


In [274]:
# Column-level summary of collegedf (row count, dtypes, non-null counts).
collegedf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3929 entries, 0 to 3928
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   conf        1299 non-null   object 
 1   cname       1299 non-null   object 
 2   cname_link  1299 non-null   object 
 3   cseasons    1299 non-null   float64
 4   cgames      1299 non-null   float64
 5   player      0 non-null      float64
 6   year        3929 non-null   int64  
 7   clink       3929 non-null   object 
 8   plink       3929 non-null   object 
dtypes: float64(3), int64(1), object(5)
memory usage: 276.4+ KB


In [94]:
# Number of distinct player names present in the college data.
collegedf["player"].nunique()

327

In [95]:
# Rows per player name; counts above one mean a name occurs on several rows.
collegedf["player"].value_counts()

James Williams        84
Anthony Thompson      42
Ron Lewis             42
Pat Newman            21
Clemente Gordon       21
                      ..
Ray Agnew             21
Mohammed Elewonibi    21
Donnie Salum          21
Sean Smith            21
Eddie Miles           21
Name: player, Length: 327, dtype: int64

In [96]:
# Rows per profile link in the college data.
collegedf["plink"].value_counts()

                           2121
/players/W/WillJa23.htm      42
/players/W/WillJa24.htm      42
/players/R/RobiJe20.htm      21
/players/R/RobiJu20.htm      21
                           ... 
/players/M/MitcSc00.htm      21
/players/D/DougDe20.htm      21
/players/B/BrouSt00.htm      21
/players/S/StayJo20.htm      21
/players/P/PattMa21.htm      21
Name: plink, Length: 230, dtype: int64

In [162]:
# Spot-check 20 random draft records.
# random_state pins the draw so a re-run of the notebook shows the same rows.
players.sample(20, random_state=0)


Unnamed: 0,draft_round,draft_pick,team,player,pos_x,age,year_max,g,college_id,player_link,...,weight,wonderlic,forty_yd,bench_reps,vertical,broad_jump,shuttle,cone,height_in,DraftYr
19,1,19,ATL,Tony Smith,RB,22.0,1994.0,33.0,Southern Miss,/players/S/SmitTo00.htm,...,189.0,,9.99,,,,9.99,9.99,0,1992
62,2,63,GNB,Brandon Jackson,RB,21.0,2012.0,54.0,Nebraska,/players/J/JackBr00.htm,...,,,,,,,,,0,2007
201,7,202,CIN,Marcus Parker,RB,,,,Virginia Tech,,...,244.0,,4.69,18.0,36.0,111.0,4.36,7.97,0,1998
52,2,53,MIA,Todd Wade,T,23.0,2007.0,99.0,Mississippi,/players/W/WadeTo20.htm,...,,,,,,,,,0,2000
212,8,211,CIN,Mike Dingle,RB,22.0,1991.0,8.0,South Carolina,/players/D/DingMi00.htm,...,240.0,,4.85,,31.0,106.0,4.71,9.99,0,1991
34,2,35,STL,James Laurinaitis,LB,22.0,2016.0,117.0,Ohio St.,/players/L/LaurJa99.htm,...,,,,,,,,,0,2009
128,4,129,NYG,Guy Whimper,DE,23.0,2013.0,78.0,East Carolina,/players/W/WhimGu20.htm,...,,,,,,,,,0,2006
36,2,37,ATL,Curtis Lofton,LB,22.0,2015.0,128.0,Oklahoma,/players/L/LoftCu99.htm,...,,,,,,,,,0,2008
19,1,20,NOR,Irv Smith,TE,21.0,1999.0,95.0,Notre Dame,/players/S/SmitIr00.htm,...,255.0,,4.81,16.0,31.5,118.0,4.46,9.99,0,1993
66,3,67,HOU,Chris Sanders,WR,23.0,2001.0,97.0,Ohio St.,/players/S/SandCh00.htm,...,184.0,,4.54,,38.0,121.0,4.09,9.99,0,1995


In [133]:
# Summarise the merged frame; compare its row count with players to spot row multiplication from the join.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 620977 entries, 0 to 620976
Data columns (total 34 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   draft_round   620977 non-null  object 
 1   draft_pick    620977 non-null  object 
 2   team          620977 non-null  object 
 3   player_x      620977 non-null  object 
 4   pos_x         620977 non-null  object 
 5   age           620977 non-null  object 
 6   year_max      620977 non-null  object 
 7   g             620977 non-null  object 
 8   college_id    620977 non-null  object 
 9   player_link   620977 non-null  object 
 10  college_link  620977 non-null  object 
 11  year_x        248190 non-null  object 
 12  school_name   375956 non-null  object 
 13  pos_y         375956 non-null  object 
 14  height        375956 non-null  object 
 15  weight        375956 non-null  object 
 16  wonderlic     248190 non-null  object 
 17  forty_yd      375956 non-null  object 
 18  benc

In [123]:
# Number of distinct profile links in the merged frame.
new_df["player_link"].nunique()

4687

In [136]:
# Count rows of players that are exact duplicates of an earlier row.
# The vectorised .sum() avoids iterating the boolean Series in Python;
# int() keeps the result a plain Python int, as the builtin sum() produced.
int(players.duplicated().sum())

0

In [139]:
# Count rows of collegedf that are exact duplicates of an earlier row.
# The vectorised .sum() avoids iterating the boolean Series in Python;
# int() keeps the result a plain Python int, as the builtin sum() produced.
int(collegedf.duplicated().sum())

24

In [141]:
# Show the rows that duplicated() flags as repeats of an earlier row.
collegedf.loc[collegedf.duplicated()]

Unnamed: 0,conf,cname,cname_link,cseasons,cgames,player,year,clink,plink
16,,,,,,James Williams,1990,,/players/W/WillJa23.htm
159,,,,,,James Williams,1990,,/players/W/WillJa24.htm
30,Ind,Florida State,/cfb/schools/florida-state/1990.html,4.0,39.0,Reggie Johnson,1991,http://www.sports-reference.com/cfb/players/re...,/players/J/JohnRe00.htm
207,,,,,,Reggie Johnson,1991,,
248,Pac-10,UCLA,/cfb/schools/ucla/1990.html,2.0,22.0,Scott Miller,1991,http://www.sports-reference.com/cfb/players/sc...,/players/M/MillSc00.htm
17,SWC,Texas A&M,/cfb/schools/texas-am/1991.html,4.0,46.0,Kevin Smith,1992,http://www.sports-reference.com/cfb/players/ke...,/players/S/SmitKe26.htm
20,Ind,Southern Mississippi,/cfb/schools/southern-mississippi/1991.html,3.0,33.0,Tony Smith,1992,http://www.sports-reference.com/cfb/players/to...,/players/S/SmitTo00.htm
161,Ind,Notre Dame,/cfb/schools/notre-dame/1991.html,3.0,33.0,Tony Smith,1992,http://www.sports-reference.com/cfb/players/to...,
188,Pac-10,UCLA,/cfb/schools/ucla/1991.html,3.0,34.0,Kevin Smith,1992,http://www.sports-reference.com/cfb/players/ke...,/players/S/SmitKe24.htm
44,,,,,,Chad Brown,1993,,/players/B/BrowCh04.htm


In [145]:
# Rows per profile link in players; a count above one means the link appears on several rows.
players["player_link"].value_counts()

                           785
/players/G/GreeMi00.htm      2
/players/B/BrowCh03.htm      2
/players/W/WillJa23.htm      2
/players/S/SmitAl02.htm      2
                          ... 
/players/F/FonoTo20.htm      1
/players/M/MahaSe20.htm      1
/players/A/AdibXa99.htm      1
/players/M/McCoJa99.htm      1
/players/H/HawkAr20.htm      1
Name: player_link, Length: 4687, dtype: int64

In [143]:
# Column-level summary of players (row count, dtypes, non-null counts).
players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5493 entries, 0 to 253
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   draft_round   5493 non-null   object
 1   draft_pick    5493 non-null   object
 2   team          5493 non-null   object
 3   player        5493 non-null   object
 4   pos_x         5493 non-null   object
 5   age           5493 non-null   object
 6   year_max      5493 non-null   object
 7   g             5493 non-null   object
 8   college_id    5493 non-null   object
 9   player_link   5493 non-null   object
 10  college_link  5493 non-null   object
 11  year          1986 non-null   object
 12  school_name   4298 non-null   object
 13  pos_y         4298 non-null   object
 14  height        4298 non-null   object
 15  weight        4298 non-null   object
 16  wonderlic     1986 non-null   object
 17  forty_yd      4298 non-null   object
 18  bench_reps    4298 non-null   object
 19  vertica

In [148]:
# Show every players row that carries this profile link, to inspect repeated entries for one player.
mask = players["player_link"].eq('/players/W/WillJa23.htm')
players.loc[mask]

Unnamed: 0,draft_round,draft_pick,team,player,pos_x,age,year_max,g,college_id,player_link,...,weight,wonderlic,forty_yd,bench_reps,vertical,broad_jump,shuttle,cone,height_in,DraftYr
15,1,16,BUF,James Williams,DB,23,1996,70,Fresno St.,/players/W/WillJa23.htm,...,172,,4.34,9,38.0,129,4.03,9.99,0,1990
16,1,16,BUF,James Williams,DB,23,1996,70,Fresno St.,/players/W/WillJa23.htm,...,223,,4.74,19,32.0,114,4.44,9.99,0,1990


In [149]:
# Name-only lookup: lists every row for a drafted 'James Williams', regardless of profile link.
mask = players["player"].eq('James Williams')
players.loc[mask]

Unnamed: 0,draft_round,draft_pick,team,player,pos_x,age,year_max,g,college_id,player_link,...,weight,wonderlic,forty_yd,bench_reps,vertical,broad_jump,shuttle,cone,height_in,DraftYr
15,1,16,BUF,James Williams,DB,23,1996,70,Fresno St.,/players/W/WillJa23.htm,...,172,,4.34,9.0,38.0,129,4.03,9.99,0,1990
16,1,16,BUF,James Williams,DB,23,1996,70,Fresno St.,/players/W/WillJa23.htm,...,223,,4.74,19.0,32.0,114,4.44,9.99,0,1990
158,6,158,NOR,James Williams,LB,21,1999,137,Mississippi St.,/players/W/WillJa24.htm,...,172,,4.34,9.0,38.0,129,4.03,9.99,0,1990
159,6,158,NOR,James Williams,LB,21,1999,137,Mississippi St.,/players/W/WillJa24.htm,...,223,,4.74,19.0,32.0,114,4.44,9.99,0,1990
174,6,175,SEA,James Williams,WR,22,2003,30,Marshall,/players/W/WillJa01.htm,...,180,,4.59,,36.0,123,4.16,7.22,71,2000


In [239]:
# Re-check players after further processing: row count, dtypes and per-column non-null counts.
players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29748 entries, 0 to 902
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   draft_round         29748 non-null  object
 1   draft_pick          29748 non-null  object
 2   team                29748 non-null  object
 3   player              2691 non-null   object
 4   pos_x               29748 non-null  object
 5   age                 29748 non-null  object
 6   year_max            29748 non-null  object
 7   g                   29748 non-null  object
 8   college_id          29748 non-null  object
 9   player_link         29748 non-null  object
 10  college_link        29748 non-null  object
 11  year                2691 non-null   object
 12  playernametolink_x  29748 non-null  object
 13  school_name         27163 non-null  object
 14  pos_y               27163 non-null  object
 15  height              27163 non-null  object
 16  weight              2716

In [240]:
# Summarise only the rows whose year equals the string '2005'.
players.loc[players["year"] == '2005'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   draft_round         0 non-null      object
 1   draft_pick          0 non-null      object
 2   team                0 non-null      object
 3   player              0 non-null      object
 4   pos_x               0 non-null      object
 5   age                 0 non-null      object
 6   year_max            0 non-null      object
 7   g                   0 non-null      object
 8   college_id          0 non-null      object
 9   player_link         0 non-null      object
 10  college_link        0 non-null      object
 11  year                0 non-null      object
 12  playernametolink_x  0 non-null      object
 13  school_name         0 non-null      object
 14  pos_y               0 non-null      object
 15  height              0 non-null      object
 16  weight              0 non-null      ob

In [242]:
# Distribution of the year column, with missing values counted as their own bucket.
players["year"].value_counts(dropna=False)

NaN     27057
1992      339
1991      336
1990      332
1996      255
1999      252
1995      248
1997      241
1998      240
1993      227
1994      221
Name: year, dtype: int64

In [186]:
# Distribution of 40-yard-dash values, with missing values counted as their own bucket.
players["forty_yd"].value_counts(dropna=False)

NaN     1195
9.99     390
4.62      80
4.50      77
4.59      76
        ... 
5.74       1
5.77       1
5.69       1
4.22       1
5.59       1
Name: forty_yd, Length: 150, dtype: int64

In [188]:
# Distribution of weight values, with missing values counted as their own bucket.
players["weight"].value_counts(dropna=False)

NaN    1195
197      57
194      49
236      48
199      47
       ... 
339       1
370       1
348       1
358       1
366       1
Name: weight, Length: 213, dtype: int64

In [189]:
# Sanity check: rows with combine measurements (4298 non-null) plus rows without (1195 NaN)
# should add up to the total number of rows in players (5493).
4298 + 1195

5493

In [198]:
# Summarise only the rows whose year equals the string '2001'.
players.loc[players["year"] == '2001'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 245 entries, 0 to 244
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   draft_round   245 non-null    object
 1   draft_pick    245 non-null    object
 2   team          245 non-null    object
 3   player        245 non-null    object
 4   pos_x         245 non-null    object
 5   age           245 non-null    object
 6   year_max      245 non-null    object
 7   g             245 non-null    object
 8   college_id    245 non-null    object
 9   player_link   245 non-null    object
 10  college_link  245 non-null    object
 11  year          245 non-null    object
 12  school_name   205 non-null    object
 13  pos_y         205 non-null    object
 14  height        205 non-null    object
 15  weight        205 non-null    object
 16  wonderlic     0 non-null      object
 17  forty_yd      205 non-null    object
 18  bench_reps    205 non-null    object
 19  vertical

In [201]:
# Rows for year '2001' that have no 40-yard-dash value recorded.
players.loc[players["year"].eq('2001') & players["forty_yd"].isna()]

Unnamed: 0,draft_round,draft_pick,team,player,pos_x,age,year_max,g,college_id,player_link,...,weight,wonderlic,forty_yd,bench_reps,vertical,broad_jump,shuttle,cone,height_in,DraftYr
48,2,49,NYJ,LaMont Jordan,RB,22.0,2009.0,114.0,Maryland,/players/J/JordLa00.htm,...,,,,,,,,,0,2001
63,3,64,ARI,Adrian Wilson,DB,21.0,2012.0,181.0,North Carolina St.,/players/W/WilsAd99.htm,...,,,,,,,,,0,2001
77,3,78,NYG,Will Peterson,DB,22.0,2010.0,97.0,West. Illinois,/players/P/PeteWi20.htm,...,,,,,,,,,0,2001
97,4,98,ARI,Bill Gramatica,K,23.0,2004.0,34.0,South Florida,/players/G/gramabil01.htm,...,,,,,,,,,0,2001
101,4,102,ATL,Matt Stewart,LB,22.0,2006.0,93.0,Vanderbilt,/players/S/StewMa21.htm,...,,,,,,,,,0,2001
115,4,116,STL,Milton Wynn,WR,22.0,2002.0,4.0,Washington St.,/players/W/WynnMi00.htm,...,,,,,,,,,0,2001
125,4,126,BAL,Ed Hartwell,LB,23.0,2006.0,77.0,West. Illinois,/players/H/HartEd20.htm,...,,,,,,,,,0,2001
127,4,128,SEA,Floyd Womack,T,22.0,2010.0,120.0,Mississippi St.,/players/W/WomaFl20.htm,...,,,,,,,,,0,2001
140,5,141,KAN,Bill Baber,TE,22.0,2004.0,30.0,Virginia,/players/B/BabeBi00.htm,...,,,,,,,,,0,2001
152,5,153,NOR,Onomo Ojo,WR,,,,California-Davis,,...,,,,,,,,,0,2001


In [238]:
# Summarise only the rows whose year equals the string '2001'.
players.loc[players["year"] == '2001'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   draft_round         0 non-null      object
 1   draft_pick          0 non-null      object
 2   team                0 non-null      object
 3   player              0 non-null      object
 4   pos_x               0 non-null      object
 5   age                 0 non-null      object
 6   year_max            0 non-null      object
 7   g                   0 non-null      object
 8   college_id          0 non-null      object
 9   player_link         0 non-null      object
 10  college_link        0 non-null      object
 11  year                0 non-null      object
 12  playernametolink_x  0 non-null      object
 13  school_name         0 non-null      object
 14  pos_y               0 non-null      object
 15  height              0 non-null      object
 16  weight              0 non-null      ob

In [None]:
# Display the players frame (the notebook's rich repr abbreviates long frames).
players