<h1 align="center">Phase 4: Data Collection (Data Scraping with Beautiful Soup)</h1>

In [27]:
#importing modules required by default

import pandas as pd # file management | I-O | data processing
import numpy as np  # linear algebra

import requests # html request handler

from bs4 import BeautifulSoup # html parser | data scraper

import time# for sleep()

import re # regular expression | data cleaning

from collections import defaultdict #for using(defaultdict(list))

In [28]:
def _stat_value(row, stat_name):
    """Return the float held by the ``<td data-stat=stat_name>`` cell of *row*.

    NaN is returned when the cell is absent or its text is blank/non-numeric,
    so one empty table cell does not abort the whole scrape.
    """
    cell = row.find('td', attrs={'data-stat': stat_name})
    if cell is None:
        return float("nan")
    try:
        return float(cell.text)
    except ValueError:
        return float("nan")


def _season_row(container, row_id):
    """Return the first ``<tr id=row_id>`` under *container*, or None.

    Rows are looked up both in the live markup and in any direct string child
    (e.g. an HTML comment) that embeds a "table_outer_container" block.
    """
    if container is None:
        return None
    rows = list(container.findAll("tr"))
    for child in container.children:
        # Only string nodes (comments / text) can carry embedded markup.
        if isinstance(child, str) and "table_outer_container" in child:
            rows.extend(BeautifulSoup(child, "html.parser").findAll("tr"))
    for row in rows:
        if row.attrs.get('id') == row_id:
            return row
    return None


def profile_scrapper(tail, season):
    """Scrape one player's per-game and advanced stats for a single season.

    Parameters
    ----------
    tail : str
        Path of the player page, appended to
        "https://www.basketball-reference.com".
    season : str
        Year string used in the row ids ("per_game.<season>",
        "advanced.<season>"), e.g. "2016".

    Returns
    -------
    dict
        Always contains the keys 'fga', 'fg3a', 'fta', 'per', 'ts_pct',
        'usg_pct', 'bpm' (floats) and 'season' (e.g. "2015-16" for "2016").
        A statistic that cannot be found or parsed is NaN, so every call
        yields the same keys and the caller's columns stay the same length.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched or the server answers with an error
        status.
    ValueError
        If *season* is not an integer string.
    """
    url = "https://www.basketball-reference.com" + tail
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    # Explicit parser: same parse tree on every machine, no "guessed parser" warning.
    soup = BeautifulSoup(res.text, "html.parser")

    nan = float("nan")
    # Key order matches the order in which the values were originally inserted.
    player_data = {
        'fga': nan,
        'fg3a': nan,
        'fta': nan,
        'per': nan,
        'ts_pct': nan,
        'usg_pct': nan,
        'bpm': nan,
        'season': str(int(season)-1) + "-" + season[-2:],
    }

    per_game_row = _season_row(soup.find(attrs={'id': 'all_per_game'}), "per_game." + season)
    if per_game_row is not None:
        player_data['fga'] = _stat_value(per_game_row, 'fga_per_g')
        player_data['fg3a'] = _stat_value(per_game_row, 'fg3a_per_g')
        player_data['fta'] = _stat_value(per_game_row, 'fta_per_g')

    advanced_row = _season_row(soup.find(attrs={'id': 'all_advanced'}), "advanced." + season)
    if advanced_row is not None:
        for stat_name in ('per', 'ts_pct', 'usg_pct', 'bpm'):
            player_data[stat_name] = _stat_value(advanced_row, stat_name)

    return player_data

In [33]:
def _team_win_pct(team_url, default=0.5):
    """Return wins / (wins + losses) read from a team page's "Record" paragraph.

    *default* is returned when no paragraph mentioning "Record" holds a
    "<wins>-<losses>" pair, or when the pair adds up to zero games.
    """
    res = requests.get(team_url, timeout=30)
    res.raise_for_status()
    page = BeautifulSoup(res.text, "html.parser")
    for paragraph in page.findAll("p"):
        if "Record" not in paragraph.text:
            continue
        match = re.search(r"(\d+)\-(\d+)", paragraph.text)
        if match is None:
            continue
        wins, losses = float(match.group(1)), float(match.group(2))
        games = wins + losses
        return wins / games if games else default
    return default


def players_mvp(url, delay=1):
    """Scrape the first ``stats_table`` of an awards page into column lists.

    For every data row the table cells are stored under their ``data-stat``
    name (the 'age' cell is skipped). Two cells trigger extra requests:

    * 'team_id' -- the linked team page is fetched and its win percentage is
      stored under 'win_pct' (0.5 when the link or the record is missing);
    * 'player'  -- the linked player page is scraped with ``profile_scrapper``
      and its values are merged into the row.

    Parameters
    ----------
    url : str
        Awards page URL; the season is taken from the four characters that
        precede the ".html" suffix of its last path segment.
    delay : float, optional
        Seconds to sleep before each follow-up request (default 1).

    Returns
    -------
    collections.defaultdict
        Maps column name to a list with one entry per data row. All lists
        have equal length; a value missing for a row is NaN.

    Raises
    ------
    requests.RequestException
        If a page cannot be fetched or the server answers with an error status.
    ValueError
        If the page holds no ``stats_table`` element, or a plain stat cell is
        not numeric.
    """
    base = "https://www.basketball-reference.com"

    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    table = soup.find(attrs={'class': 'stats_table'})
    if table is None:
        raise ValueError(f"No stats table found at {url}")
    rows = table.findAll("tr")

    season = url.rsplit("/",3)[-1][-9:-5]

    print(f"Current season: {season}")

    win_pct_by_team = {}  # team href -> win pct, so each team page is fetched once
    records = []          # one dict per data row, merged into columns at the end

    for index, row in enumerate(rows):

        print(f" Working on index {index+1} of {len(rows)}")

        data_cells = row.findAll("td")
        if not data_cells:
            continue  # header rows carry only <th> cells

        record = {}
        for cell in data_cells:
            stat = cell.attrs.get('data-stat')
            if stat is None or stat == 'age':
                continue

            anchor = cell.find("a")
            href = anchor.get('href') if anchor is not None else None

            if stat == 'team_id':
                if not href:
                    record['win_pct'] = 0.5  # average when the team link is missing
                    continue
                if href not in win_pct_by_team:
                    time.sleep(delay)
                    win_pct_by_team[href] = _team_win_pct(base + href)
                record['win_pct'] = win_pct_by_team[href]
                continue

            if stat == 'player':
                if href:
                    time.sleep(delay)
                    record.update(profile_scrapper(href, season))
                record[stat] = cell.getText()
            else:
                text = cell.getText() or "0"
                record[stat] = float(text)

        records.append(record)

    # Column order = order of first appearance; pad gaps so lengths always match.
    columns = list(dict.fromkeys(key for record in records for key in record))
    combined_stats = defaultdict(list)
    for record in records:
        for key in columns:
            combined_stats[key].append(record.get(key, float("nan")))

    return combined_stats

In [34]:
# Seasons to scrape: 2016 through 2019 inclusive.
season_list = range(2016, 2020)

# Column name -> values gathered over every scraped season.
mvp_stats = defaultdict(list)

for season in season_list:
    full_url = f"https://www.basketball-reference.com/awards/awards_{str(season)}.html"
    season_stats = players_mvp(full_url)
    # Append this season's columns to the running lists.
    for key, column in season_stats.items():
        mvp_stats[key] += column
        


Current season: 2016
 Working on index 1 of 12
 Working on index 2 of 12
 Working on index 3 of 12
 Working on index 4 of 12
 Working on index 5 of 12
 Working on index 6 of 12
 Working on index 7 of 12
 Working on index 8 of 12
 Working on index 9 of 12
 Working on index 10 of 12
 Working on index 11 of 12
 Working on index 12 of 12
Current season: 2017
 Working on index 1 of 13
 Working on index 2 of 13
 Working on index 3 of 13
 Working on index 4 of 13
 Working on index 5 of 13
 Working on index 6 of 13
 Working on index 7 of 13
 Working on index 8 of 13
 Working on index 9 of 13
 Working on index 10 of 13
 Working on index 11 of 13
 Working on index 12 of 13
 Working on index 13 of 13
Current season: 2018
 Working on index 1 of 15
 Working on index 2 of 15
 Working on index 3 of 15
 Working on index 4 of 15
 Working on index 5 of 15
 Working on index 6 of 15
 Working on index 7 of 15
 Working on index 8 of 15
 Working on index 9 of 15
 Working on index 10 of 15
 Working on index 1

In [35]:
# Turn the accumulated column lists into a table and write it to disk.
data_frame = pd.DataFrame(data=mvp_stats)
data_frame.to_csv(path_or_buf="data_mvp.csv")