# 247 Transfer Portal Scraping Data

In [176]:
#import relevant packages
import requests as rq
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import os
import numpy as np

In [177]:
# Season whose transfer portal page is scraped.
s = 2021
# Assemble the season-specific portal URL: fixed prefix + season year + fixed suffix.
url_parts = ['https://247sports.com/Season/', str(s), '-Football/TransferPortal/']
url = ''.join(url_parts)

# Request the page while sending a desktop-browser User-Agent string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
# Without a timeout requests may wait forever on an unresponsive server.
page = rq.get(url, headers=headers, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would otherwise yield an empty or misleading dataset.
page.raise_for_status()

# Parse the downloaded HTML so it can be searched for player entries.
soup = bs(page.content, 'html.parser')

In [178]:
# Every <li> carrying the 'portal-list_itm' class is treated as one player entry.
transfer_players = soup.find_all('li', attrs={'class': 'portal-list_itm'})

In [179]:
# One accumulator per output column; the generator yields a fresh, independent
# list for each of the twelve names, so no two names share the same list.
(
    player_link_list,
    player_name_list,
    height_in_list,
    weight_list,
    transfer_stars_list,
    transfer_rating_list,
    hs_stars_list,
    hs_rating_list,
    position_list,
    eligibility_list,
    old_school_list,
    new_school_list,
) = ([] for _ in range(12))

In [180]:
# Captures the alt text of each <img> tag (whose first attribute is alt) in the
# serialized HTML of an element. Compiled once instead of on every iteration.
# NOTE(review): because the text comes from serialized HTML, special characters
# such as '&' may appear entity-escaped in the captured names — confirm whether
# downstream consumers expect that.
_SCHOOL_ALT_RE = re.compile(r'img alt="(.+?)"')


def _stars_and_rating(rating_span):
    """Return ``(stars, rating)`` parsed from one rating ``<span>``.

    ``stars`` is the number of times 'yellow' occurs in the span's HTML.
    ``rating`` is the stripped text of the nested ``span.score`` with its last
    four characters removed (presumably a fixed-width suffix — confirm against
    the page markup), or None when no such span exists.
    """
    score_elem = rating_span.find('span', class_='score')
    rating = None if score_elem is None else score_elem.text[:-4].strip()
    return str(rating_span).count('yellow'), rating


# Extract one row of values per player entry and append it to the column lists.
# Any piece of markup that is missing or malformed yields None for the affected
# value(s) rather than aborting the whole scrape.
for transfer_player in transfer_players:
    player_elem = transfer_player.find('div', class_='player')

    # Name and profile link both come from the first anchor in the player div.
    anchor = None if player_elem is None else player_elem.find('a')
    player_link = None if anchor is None else anchor.get('href')
    player_name = None if anchor is None else anchor.text.strip()

    # Metrics text is split as "<height> / <weight>", with height as "<feet>-<inches>".
    metrics_elem = None if player_elem is None else player_elem.find('div', class_='metrics')
    measurables = None if metrics_elem is None else metrics_elem.text.strip()

    height = None
    weight = None
    if measurables is not None:
        metric_parts = measurables.split('/')
        height = metric_parts[0].strip()
        # A metrics string without '/' has no weight component.
        weight = metric_parts[1].strip() if len(metric_parts) > 1 else None

    height_in = None
    if height is not None:
        height_parts = height.split('-')
        try:
            feet = height_parts[0].strip()
            inches = height_parts[1].strip()
            height_in = float(feet) * 12 + float(inches)
        except (IndexError, ValueError):
            # Missing '-' separator or non-numeric parts: height is unknown.
            height_in = None

    rating_elem = None if player_elem is None else player_elem.find('div', class_='rating')
    rating_spans = [] if rating_elem is None else rating_elem.find_all('span')

    transfer_stars = None
    transfer_rating = None
    hs_stars = None
    hs_rating = None
    if len(rating_spans) == 16:
        # 16 spans: transfer rating read from span 0, high-school rating from span 8.
        transfer_stars, transfer_rating = _stars_and_rating(rating_spans[0])
        hs_stars, hs_rating = _stars_and_rating(rating_spans[8])
    elif len(rating_spans) == 8:
        # 8 spans: only a high-school rating is read, from span 0.
        hs_stars, hs_rating = _stars_and_rating(rating_spans[0])
    if hs_rating == 'N/A':
        hs_rating = None

    position_elem = transfer_player.find('div', class_='position')
    position = None if position_elem is None else position_elem.text.strip()

    eligibility_elem = transfer_player.find('div', class_='eligibility not-tbd')
    eligibility = None if eligibility_elem is None else eligibility_elem.text.strip()

    # School names are the image alt texts inside the first
    # 'transfer-institution' div: one name means origin only, two mean origin
    # then destination; any other count is treated as unknown.
    transfer_elem = transfer_player.find_all('div', class_='transfer-institution')
    old_school = None
    new_school = None
    if transfer_elem:
        schools = _SCHOOL_ALT_RE.findall(str(transfer_elem[0]))
        if len(schools) == 1:
            old_school = schools[0]
        elif len(schools) == 2:
            old_school, new_school = schools

    player_link_list.append(player_link)
    player_name_list.append(player_name)
    height_in_list.append(height_in)
    weight_list.append(weight)
    transfer_stars_list.append(transfer_stars)
    transfer_rating_list.append(transfer_rating)
    hs_stars_list.append(hs_stars)
    hs_rating_list.append(hs_rating)
    position_list.append(position)
    eligibility_list.append(eligibility)
    old_school_list.append(old_school)
    new_school_list.append(new_school)

In [181]:
# Pair each output column name with its accumulated values; the order of the
# pairs determines the column order of the resulting frame.
column_pairs = [
    ('url', player_link_list),
    ('name', player_name_list),
    ('height_in', height_in_list),
    ('weight', weight_list),
    ('position', position_list),
    ('transfer_stars', transfer_stars_list),
    ('transfer_rating', transfer_rating_list),
    ('hs_stars', hs_stars_list),
    ('hs_rating', hs_rating_list),
    ('eligibility', eligibility_list),
    ('old_school', old_school_list),
    ('new_school', new_school_list),
]
transfer_247_df = pd.DataFrame(dict(column_pairs))

In [182]:
# Normalise missing values to NaN, discard rows in which every column is
# missing, then renumber the remaining rows from zero.
transfer_247_df = (
    transfer_247_df
    .fillna(value=np.nan)
    .dropna(axis=0, how='all')
    .reset_index(drop=True)
)

In [184]:
# Write the scraped table to disk without the row index. The target directory
# is created first because to_csv does not create missing parent directories.
output_path = 'raw-data/transfer_247_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
transfer_247_df.to_csv(output_path, index=False)