In [1]:
#First, import the libraries we will be using for scraping
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

In [2]:
def get_html_soup(url):
    """Download a page and parse it into a BeautifulSoup tree.

    Keyword arguments:
    url -- Address of the page to fetch

    Returns:
    BeautifulSoup: the response body parsed with the "lxml" parser
    """
    # The request carries an explicit User-Agent header instead of urllib's default.
    req = Request(url, headers={'User-Agent' : "Magic Browser"})
    # The context manager closes the HTTP response as soon as the body has
    # been read; previously the response object was never closed.
    with urlopen(req) as html:
        soup = BeautifulSoup(html.read(), "lxml")
    return soup

In [3]:
def get_player_list_items(soup):
    """Collect the player <li> elements from the page's portal container.

    Keyword arguments:
    soup -- Parsed page exposing BeautifulSoup-style find/find_all

    Returns:
    The 'portal-list_itm' <li> elements found inside the 'portal-main' div,
    or an empty list when the page has no 'portal-main' div.
    """
    portal_main = soup.find('div', 'portal-main')
    if portal_main is None:
        # Without the container there is nothing to list; calling find_all
        # on None would raise AttributeError.
        return []
    return portal_main.find_all('li', 'portal-list_itm')

In [4]:
def get_arrow_index_pos(institution_div_contents):
    """Locate the first 'svg' node within a sequence of nodes.

    Keyword arguments:
    institution_div_contents -- Sequence of nodes that each expose a .name attribute

    Returns:
    int position of the first node whose name is 'svg', or None if there is none
    """
    svg_positions = (
        position
        for position, node in enumerate(institution_div_contents)
        if node.name == 'svg'
    )
    # next() with a default gives None when no node matched.
    return next(svg_positions, None)
        

In [5]:
def get_entry_exit_institutions(institution_div_contents, arrow_idx_pos):
    """Read the institution names on either side of the arrow node.

    Every element other than the arrow is searched for an <img>; the image's
    'title' attribute is taken as the institution name. Elements that yield
    no <img>, or whose <img> has no 'title', are skipped. When several
    elements on the same side carry a title, the last one wins.

    Keyword arguments:
    institution_div_contents -- Child nodes of the transfer-institution div
    arrow_idx_pos -- Index of the arrow node, or None when no arrow was found

    Returns:
    tuple: (entry_institution, exit_institution); either value may be None
    """
    entry_institution = None
    exit_institution = None

    # Without an arrow there is no way to tell entry from exit. The old bare
    # `except` hid the TypeError from `idx < None` and produced this same result.
    if arrow_idx_pos is None:
        return (entry_institution, exit_institution)

    for idx, item in enumerate(institution_div_contents):
        if idx == arrow_idx_pos:
            continue

        finder = getattr(item, 'find', None)
        if finder is None:
            continue
        # For text nodes `find` is str.find and returns an int, which has no
        # `attrs`; for tags without an <img> it returns None. Both are skipped.
        image_tag = finder('img')
        attrs = getattr(image_tag, 'attrs', None)
        if not attrs or 'title' not in attrs:
            continue

        institution_name = attrs['title']
        if idx < arrow_idx_pos:
            entry_institution = institution_name
        else:
            exit_institution = institution_name
    return (entry_institution, exit_institution)
            

In [6]:
def get_transfer_institution_info(list_item):
    """Receive the html element that includes player information and isolate transfer information

    Keyword arguments:
    list_item -- The html element for one player, expected to contain a
                 'transfer-institution' div

    Returns:
    tuple: (entry_institution, exit_institution). Each value is a string, or
    None when it cannot be determined, including when the list item has no
    transfer-institution div at all.
    """
    institution_transfer_div = list_item.find('div', 'transfer-institution cb')
    if institution_transfer_div is None:
        institution_transfer_div = list_item.find('div', 'transfer-institution')
    if institution_transfer_div is None:
        # Neither class variant is present: report both schools as unknown
        # instead of failing with AttributeError on `.contents`.
        return (None, None)

    #Strip any list elements that are empty from the list
    tag_content_list = [item for item in institution_transfer_div.contents if str(item).strip()]
    arrow_idx_pos = get_arrow_index_pos(tag_content_list)
    return get_entry_exit_institutions(tag_content_list, arrow_idx_pos)
    

In [7]:
#Seasons to scrape; each season has its own transfer portal page
year_list = ['2018', '2019', '2020']

#One URL per season, in the same order as year_list
url_list = [
    "".join(("https://247sports.com/Season/", year, "-Football/TransferPortal/"))
    for year in year_list
]


In [8]:
#Scrape every season's transfer portal page into parallel lists.
#Each player contributes exactly one value to every list, so the lists stay
#the same length and can be combined into a dataframe column by column.
#Consider refactoring this for loop using more methods and using list comprehensions.
years = []
player_names = []
entry_institutions = []
exit_institutions = []
positions = []

#year_list and url_list are defined in the previous cell; they are paired
#here so each page is tagged with the season it belongs to.
for year, url in zip(year_list, url_list):
    #Get the html soup
    soup = get_html_soup(url)
    list_items_players = get_player_list_items(soup)
    for item in list_items_players:
        #Isolate the player's name; list items without a player div are skipped
        player_div = item.find('div', 'player')
        if player_div is None:
            continue
        player_name = player_div.find('a').contents[0]

        #Isolate the transfer institutions
        entry_institution, exit_institution = get_transfer_institution_info(item)

        #Isolate the position. A missing position div is recorded as None;
        #previously nothing was appended in that case, which left `positions`
        #shorter than the other lists.
        position_div = item.find('div', 'position')
        if position_div is None:
            position = None
        else:
            position = position_div.text.strip()
            #The 'PRO' and 'DUAL' labels are both recorded as 'QB'
            if position in ('PRO', 'DUAL'):
                position = 'QB'

        #Append to all lists together so a row is never half-recorded
        player_names.append(player_name)
        entry_institutions.append(entry_institution)
        exit_institutions.append(exit_institution)
        positions.append(position)
        years.append(year)
            

In [9]:
#Combine the scraped lists into a single pandas dataframe
import pandas as pd

#Column name -> list of values; the key order sets the column order
value_lists_dict = dict(
    PLAYER_NAME=player_names,
    POSITION=positions,
    SCHOOL_AT_ENTRY=entry_institutions,
    SCHOOL_AT_EXIT=exit_institutions,
    YEAR=years,
)

df = pd.DataFrame(data=value_lists_dict)
    

In [10]:
#Preview the first 10 rows of the scraped dataframe
df.head(10)  

Unnamed: 0,PLAYER_NAME,POSITION,SCHOOL_AT_ENTRY,SCHOOL_AT_EXIT,YEAR
0,Kyle Lindquist,QB,Fresno State,,2018
1,Tre Nixon,WR,Ole Miss,UCF,2018
2,Grant Miles,TE,New Mexico State,Arizona State,2018
3,Jonathan Johnson,WR,Morgan State,East Carolina,2018
4,Donny Navarro,WR,Valparaiso,Illinois,2018
5,Casey Tucker,OT,Stanford,Arizona State,2018
6,Jack DeFoor,OT,Ole Miss,Georgia Tech,2018
7,Jimmy Hogan,SDE,Rutgers,Temple,2018
8,Malik Burns,SDE,Temple,Western Illinois,2018
9,Dion Goldbourne,SDE,Maryland,UT Martin,2018


In [11]:
#Row-wise flag: True when the entry school and the exit school compare equal
df['REMAINED_WITH_ORIGINAL_SCHOOL'] = [
    bool(entry_school == exit_school)
    for entry_school, exit_school in zip(df['SCHOOL_AT_ENTRY'], df['SCHOOL_AT_EXIT'])
]

#Row-wise flag: True when an exit school was recorded (value is not None)
df['PORTAL_EXITED'] = df['SCHOOL_AT_EXIT'].map(lambda exit_school: exit_school is not None)

#Good Source: https://stackoverflow.com/questions/30953299/pandas-if-row-in-column-a-contains-x-write-y-to-row-in-column-b

In [12]:
#Preview the first 10 rows, now including the two derived flag columns
df.head(10)

Unnamed: 0,PLAYER_NAME,POSITION,SCHOOL_AT_ENTRY,SCHOOL_AT_EXIT,YEAR,REMAINED_WITH_ORIGINAL_SCHOOL,PORTAL_EXITED
0,Kyle Lindquist,QB,Fresno State,,2018,False,False
1,Tre Nixon,WR,Ole Miss,UCF,2018,False,True
2,Grant Miles,TE,New Mexico State,Arizona State,2018,False,True
3,Jonathan Johnson,WR,Morgan State,East Carolina,2018,False,True
4,Donny Navarro,WR,Valparaiso,Illinois,2018,False,True
5,Casey Tucker,OT,Stanford,Arizona State,2018,False,True
6,Jack DeFoor,OT,Ole Miss,Georgia Tech,2018,False,True
7,Jimmy Hogan,SDE,Rutgers,Temple,2018,False,True
8,Malik Burns,SDE,Temple,Western Illinois,2018,False,True
9,Dion Goldbourne,SDE,Maryland,UT Martin,2018,False,True


In [13]:
#Pool every school seen at entry or at exit into one set.
#Use this for tying the conferences to the school information.
#NOTE: the scraped lists may hold None for unknown schools, so the set may too.
unique_schools = set(entry_institutions + exit_institutions)
print(len(unique_schools), unique_schools, sep='\n')

246
{'Akron', 'San Diego', 'Sacramento State', 'Jacksonville', 'Appalachian State', 'San Jose State', 'Portland State', 'Nicholls State', 'Northwestern State', 'TCU', 'Western Kentucky', 'Jacksonville State', 'Missouri', 'Massachusetts', 'Colgate', 'Stanford', 'UNLV', 'Hampton', 'New Hampshire', 'Old Dominion', 'Rutgers', 'Elon', 'East Tennessee State', 'Princeton', 'South Carolina State', 'Bethune-Cookman', 'Oklahoma', 'BYU', 'Arkansas', 'Charlotte', 'Navy', 'The Citadel', 'Jackson State', 'Cal Poly', 'Columbia', 'Memphis', 'UT Martin', 'Wyoming', 'Saint Francis (PA)', 'Kansas State', 'Western Carolina', 'Michigan', 'Furman', 'Charleston Southern', 'Texas State', 'Indiana State', 'Montana', 'Howard', 'Butler', 'Maine', 'Eastern Kentucky', 'Hawaii', 'Northern Iowa', 'Texas', 'Rhode Island', 'Oklahoma State', 'Georgia Tech', 'Georgetown', 'Washington', 'Grand Valley State', 'Texas Southern', 'Maryland', 'Professional', 'Liberty', 'Notre Dame', 'Miami (OH)', 'Youngstown State', 'Fresno S

In [18]:
#Load my list of schools with their conferences from csv and
#keep it as a list of (school, conference) pairs
file_name = 'schools_with_conferences.csv'
school_conference_df = pd.read_csv(file_name)

school_conference_list = [
    (school_name, conference_name)
    for school_name, conference_name in zip(
        school_conference_df['SCHOOL_NAME'],
        school_conference_df['CONFERENCE_NAME'],
    )
]

print(school_conference_list)

[('Boston College', 'ACC'), ('Clemson', 'ACC'), ('Duke', 'ACC'), ('Florida State', 'ACC'), ('Georgia Tech', 'ACC'), ('Louisville', 'ACC'), ('Miami', 'ACC'), ('NC State', 'ACC'), ('North Carolina', 'ACC'), ('Pittsburgh', 'ACC'), ('Syracuse', 'ACC'), ('Virginia', 'ACC'), ('Virginia Tech', 'ACC'), ('Wake Forest', 'ACC'), ('Baylor', 'Big 12'), ('Iowa State', 'Big 12'), ('Kansas', 'Big 12'), ('Kansas State', 'Big 12'), ('Oklahoma', 'Big 12'), ('Oklahoma State', 'Big 12'), ('TCU', 'Big 12'), ('Texas', 'Big 12'), ('Texas Tech', 'Big 12'), ('West Virginia', 'Big 12'), ('Illinois', 'Big Ten'), ('Indiana', 'Big Ten'), ('Iowa', 'Big Ten'), ('Maryland', 'Big Ten'), ('Michigan', 'Big Ten'), ('Michigan State', 'Big Ten'), ('Minnesota', 'Big Ten'), ('Nebraska', 'Big Ten'), ('Northwestern', 'Big Ten'), ('Ohio State', 'Big Ten'), ('Penn State', 'Big Ten'), ('Purdue', 'Big Ten'), ('Rutgers', 'Big Ten'), ('Wisconsin', 'Big Ten'), ('Alabama', 'SEC'), ('Arkansas', 'SEC'), ('Auburn', 'SEC'), ('Florida', 'SE

In [19]:
#Map the conference information to my original dataframe

from collections import defaultdict

#I don't need the school_conference dataframe anymore so delete from memory
del school_conference_df

#School -> conference lookup; any school missing from the list resolves to "Unknown"
school_conference_dict = defaultdict(lambda: "Unknown", school_conference_list)

#Add one conference column per school column (entry first, then exit)
for school_column, conference_column in (
    ('SCHOOL_AT_ENTRY', 'CONFERENCE_AT_ENTRY'),
    ('SCHOOL_AT_EXIT', 'CONFERENCE_AT_EXIT'),
):
    df[conference_column] = df[school_column].map(lambda x: school_conference_dict[x])

In [20]:
#Preview the first 10 rows, now including the conference columns
df.head(10)

Unnamed: 0,PLAYER_NAME,POSITION,SCHOOL_AT_ENTRY,SCHOOL_AT_EXIT,YEAR,REMAINED_WITH_ORIGINAL_SCHOOL,PORTAL_EXITED,CONFERENCE_AT_ENTRY,CONFERENCE_AT_EXIT
0,Kyle Lindquist,QB,Fresno State,,2018,False,False,Mountain West,Unknown
1,Tre Nixon,WR,Ole Miss,UCF,2018,False,True,SEC,American Athletic
2,Grant Miles,TE,New Mexico State,Arizona State,2018,False,True,FBS Independents,Pac-12
3,Jonathan Johnson,WR,Morgan State,East Carolina,2018,False,True,Mid-Eastern Athletic Conference,American Athletic
4,Donny Navarro,WR,Valparaiso,Illinois,2018,False,True,Pioneer Football League,Big Ten
5,Casey Tucker,OT,Stanford,Arizona State,2018,False,True,Pac-12,Pac-12
6,Jack DeFoor,OT,Ole Miss,Georgia Tech,2018,False,True,SEC,ACC
7,Jimmy Hogan,SDE,Rutgers,Temple,2018,False,True,Big Ten,American Athletic
8,Malik Burns,SDE,Temple,Western Illinois,2018,False,True,American Athletic,Missouri Valley Football Conference
9,Dion Goldbourne,SDE,Maryland,UT Martin,2018,False,True,Big Ten,Ohio Valley


In [21]:
#Create an additional dataframe that further filters our results
#Discard rows where REMAINED_WITH_ORIGINAL_SCHOOL == True
#Discard rows where EITHER SCHOOL_AT_ENTRY OR SCHOOL_AT_EXIT is None

#`&` binds tighter than `==`, so the earlier `a & b & c == False` was read as
#`(a & b & c) == False` and selected the rows that FAILED the checks. Negating
#the flag with `~` keeps each condition separate and the mask correct by itself.
df_verified_transfer_records = df[
    df.SCHOOL_AT_ENTRY.notnull()
    & df.SCHOOL_AT_EXIT.notnull()
    & ~df.REMAINED_WITH_ORIGINAL_SCHOOL
]

#Also drop any row that still has a missing value in one of the other columns
df_verified_transfer_records = df_verified_transfer_records.dropna()

print(len(df_verified_transfer_records))

df_verified_transfer_records.head(10)


1366


Unnamed: 0,PLAYER_NAME,POSITION,SCHOOL_AT_ENTRY,SCHOOL_AT_EXIT,YEAR,REMAINED_WITH_ORIGINAL_SCHOOL,PORTAL_EXITED,CONFERENCE_AT_ENTRY,CONFERENCE_AT_EXIT
1,Tre Nixon,WR,Ole Miss,UCF,2018,False,True,SEC,American Athletic
2,Grant Miles,TE,New Mexico State,Arizona State,2018,False,True,FBS Independents,Pac-12
3,Jonathan Johnson,WR,Morgan State,East Carolina,2018,False,True,Mid-Eastern Athletic Conference,American Athletic
4,Donny Navarro,WR,Valparaiso,Illinois,2018,False,True,Pioneer Football League,Big Ten
5,Casey Tucker,OT,Stanford,Arizona State,2018,False,True,Pac-12,Pac-12
6,Jack DeFoor,OT,Ole Miss,Georgia Tech,2018,False,True,SEC,ACC
7,Jimmy Hogan,SDE,Rutgers,Temple,2018,False,True,Big Ten,American Athletic
8,Malik Burns,SDE,Temple,Western Illinois,2018,False,True,American Athletic,Missouri Valley Football Conference
9,Dion Goldbourne,SDE,Maryland,UT Martin,2018,False,True,Big Ten,Ohio Valley
10,Casey Williams,DT,Temple,Stony Brook,2018,False,True,American Athletic,Colonial Athletic Association


In [22]:
#Write out the subset of verified transfer records to csv
#(relative path; `index` is left at its default, so the dataframe index is written as the first column)
df_verified_transfer_records.to_csv('transfer_portal_activity_verified_records.csv')

In [133]:
#Disabled cell: everything below sits inside a triple-quoted string, so none of it runs.
#The quoted code would write each non-None entry of unique_schools to
#unique_schools.txt, one school per line.
'''
school_file = open("unique_schools.txt","w") 
for school in unique_schools:
    if school is not None:
        school_file.write(school + '\n')
school_file.close()
'''

In [23]:
#Write the full dataframe (all scraped rows plus the derived columns) to csv
df.to_csv('transfer_portal_activity.csv')