# Scrape Combined ND Football Schedules From Sports Reference



# Load Modules Using Import

In [1]:
# import requests
import requests

# import csv
import csv

# import beautifulsoup
from bs4 import BeautifulSoup
from bs4 import Comment

# import pandas
import pandas as pd

# import numpy
import numpy as np

# Get List of Season URLs from Sports Reference

In [2]:
# root url
root = "https://www.sports-reference.com/cfb/schools/notre-dame/"

# year range
years = range(1899, 2021, 1)

# url tag
tag = "-schedule.html"

# empty list for urls
urls = []

# for loop that concatenates full url
for year in years:
    urls.append(root + str(year) + tag)
    
# show sample url
urls[0]

'https://www.sports-reference.com/cfb/schools/notre-dame/1899-schedule.html'

# Create Dictionary That Connects Season Year With URL

In [3]:
test_dict = {years[i]: urls[i] for i in range(len(years))}

test_dict

{1899: 'https://www.sports-reference.com/cfb/schools/notre-dame/1899-schedule.html',
 1900: 'https://www.sports-reference.com/cfb/schools/notre-dame/1900-schedule.html',
 1901: 'https://www.sports-reference.com/cfb/schools/notre-dame/1901-schedule.html',
 1902: 'https://www.sports-reference.com/cfb/schools/notre-dame/1902-schedule.html',
 1903: 'https://www.sports-reference.com/cfb/schools/notre-dame/1903-schedule.html',
 1904: 'https://www.sports-reference.com/cfb/schools/notre-dame/1904-schedule.html',
 1905: 'https://www.sports-reference.com/cfb/schools/notre-dame/1905-schedule.html',
 1906: 'https://www.sports-reference.com/cfb/schools/notre-dame/1906-schedule.html',
 1907: 'https://www.sports-reference.com/cfb/schools/notre-dame/1907-schedule.html',
 1908: 'https://www.sports-reference.com/cfb/schools/notre-dame/1908-schedule.html',
 1909: 'https://www.sports-reference.com/cfb/schools/notre-dame/1909-schedule.html',
 1910: 'https://www.sports-reference.com/cfb/schools/notre-dame/1

# Create Dictionary of Pandas DataFrames with Each Year's Schedule

NOTE: The 1899-1935 seasons do not include an AP poll summary table  before the game information and results. The `if-else` statement in the `for` loop accounts for this varying page table structure.

In [4]:
# empty dictionary
schedules = {}

# for loop that loads schedule as pandas data frame and saves dataframe as dictionary value associated with season year key
for key, value in test_dict.items():
    df_name = key
    if key >= 1899 | key <= 1935 or key == 1962 or key ==1963 or key == 2007 or key ==2008 or key == 2010:
        try:
            dfs = pd.read_html(value)
            schedule = dfs[0]
            schedule.loc[:, 'Season'] = key
            schedules[df_name] = schedule
        except:
            continue
    else:
        try:
            dfs = pd.read_html(value)
            schedule = dfs[1]
            schedule.loc[:, 'Season'] = key
            schedules[df_name] = schedule
        except:
            continue

# Combine Dictionary of DataFrames using `pd.concat`

In [5]:
# create combined_schedule df
combined_schedule = pd.concat(schedules.values(), ignore_index=True)

# show df
combined_schedule

Unnamed: 0,G,Date,Day,School,Unnamed: 4,Opponent,Conf,Unnamed: 7,Pts,Opp,W,L,T,Streak,Notes,Season,Time,Unnamed: 5,Unnamed: 8
0,1,"Sep 27, 1899",Wed,Notre Dame,,Englewood High,Non-Major,W,29,5,1,0,0.0,W 1,,1899,,,
1,2,"Sep 30, 1899",Sat,Notre Dame,,Michigan State,Non-Major,W,40,0,2,0,0.0,W 2,,1899,,,
2,3,"Oct 4, 1899",Wed,Notre Dame,@,Chicago,Western,L,6,23,2,1,0.0,L 1,,1899,,,
3,4,"Oct 14, 1899",Sat,Notre Dame,,Lake Forest,Ind,W,38,0,3,1,0.0,W 1,,1899,,,
4,5,"Oct 18, 1899",Wed,Notre Dame,@,Michigan,Western,L,0,12,3,2,0.0,L 1,,1899,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,8,"Nov 14, 2020",Sat,(2) Notre Dame,,Boston College,ACC,,45,31,8,0,,W 8,,2020,3:30 PM,@,W
969,9,"Nov 27, 2020",Fri,(2) Notre Dame,,(25) North Carolina,ACC,,31,17,9,0,,W 9,,2020,3:30 PM,@,W
970,10,"Dec 5, 2020",Sat,(2) Notre Dame,,Syracuse,ACC,,45,21,10,0,,W 10,,2020,2:30 PM,,W
971,11,"Dec 19, 2020",Sat,(2) Notre Dame,,(4) Clemson,ACC,,10,34,10,1,,L 1,ACC Championship Game,2020,4:00 PM,N,L


# Data Cleaning/Wrangling

In [6]:
# rename columns
combined_schedule.rename(columns = {'Unnamed: 4': 'Location', 'Unnamed: 7': 'Result'}, inplace= True)

# show updated dataframe
combined_schedule

Unnamed: 0,G,Date,Day,School,Location,Opponent,Conf,Result,Pts,Opp,W,L,T,Streak,Notes,Season,Time,Unnamed: 5,Unnamed: 8
0,1,"Sep 27, 1899",Wed,Notre Dame,,Englewood High,Non-Major,W,29,5,1,0,0.0,W 1,,1899,,,
1,2,"Sep 30, 1899",Sat,Notre Dame,,Michigan State,Non-Major,W,40,0,2,0,0.0,W 2,,1899,,,
2,3,"Oct 4, 1899",Wed,Notre Dame,@,Chicago,Western,L,6,23,2,1,0.0,L 1,,1899,,,
3,4,"Oct 14, 1899",Sat,Notre Dame,,Lake Forest,Ind,W,38,0,3,1,0.0,W 1,,1899,,,
4,5,"Oct 18, 1899",Wed,Notre Dame,@,Michigan,Western,L,0,12,3,2,0.0,L 1,,1899,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,8,"Nov 14, 2020",Sat,(2) Notre Dame,,Boston College,ACC,,45,31,8,0,,W 8,,2020,3:30 PM,@,W
969,9,"Nov 27, 2020",Fri,(2) Notre Dame,,(25) North Carolina,ACC,,31,17,9,0,,W 9,,2020,3:30 PM,@,W
970,10,"Dec 5, 2020",Sat,(2) Notre Dame,,Syracuse,ACC,,45,21,10,0,,W 10,,2020,2:30 PM,,W
971,11,"Dec 19, 2020",Sat,(2) Notre Dame,,(4) Clemson,ACC,,10,34,10,1,,L 1,ACC Championship Game,2020,4:00 PM,N,L


In [7]:
# take non-null values from second Location column ('Unnamed: 5') and assign to cells in 'Location' column
combined_schedule["Location"] = np.where(combined_schedule["Unnamed: 5"].notnull(), combined_schedule["Unnamed: 5"], combined_schedule["Location"])

# take non-null values from second Result column ('Unnamed: 8') and assign to cells in 'Result' column
combined_schedule["Result"] = np.where(combined_schedule["Unnamed: 8"].notnull(), combined_schedule["Unnamed: 8"], combined_schedule["Result"])

# show updated dataframe
combined_schedule

Unnamed: 0,G,Date,Day,School,Location,Opponent,Conf,Result,Pts,Opp,W,L,T,Streak,Notes,Season,Time,Unnamed: 5,Unnamed: 8
0,1,"Sep 27, 1899",Wed,Notre Dame,,Englewood High,Non-Major,W,29,5,1,0,0.0,W 1,,1899,,,
1,2,"Sep 30, 1899",Sat,Notre Dame,,Michigan State,Non-Major,W,40,0,2,0,0.0,W 2,,1899,,,
2,3,"Oct 4, 1899",Wed,Notre Dame,@,Chicago,Western,L,6,23,2,1,0.0,L 1,,1899,,,
3,4,"Oct 14, 1899",Sat,Notre Dame,,Lake Forest,Ind,W,38,0,3,1,0.0,W 1,,1899,,,
4,5,"Oct 18, 1899",Wed,Notre Dame,@,Michigan,Western,L,0,12,3,2,0.0,L 1,,1899,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,8,"Nov 14, 2020",Sat,(2) Notre Dame,@,Boston College,ACC,W,45,31,8,0,,W 8,,2020,3:30 PM,@,W
969,9,"Nov 27, 2020",Fri,(2) Notre Dame,@,(25) North Carolina,ACC,W,31,17,9,0,,W 9,,2020,3:30 PM,@,W
970,10,"Dec 5, 2020",Sat,(2) Notre Dame,,Syracuse,ACC,W,45,21,10,0,,W 10,,2020,2:30 PM,,W
971,11,"Dec 19, 2020",Sat,(2) Notre Dame,N,(4) Clemson,ACC,L,10,34,10,1,,L 1,ACC Championship Game,2020,4:00 PM,N,L


In [8]:
# remove second location column
del combined_schedule['Unnamed: 5']

# remove second result column
del combined_schedule['Unnamed: 8']

# show updated dataframe
combined_schedule

Unnamed: 0,G,Date,Day,School,Location,Opponent,Conf,Result,Pts,Opp,W,L,T,Streak,Notes,Season,Time
0,1,"Sep 27, 1899",Wed,Notre Dame,,Englewood High,Non-Major,W,29,5,1,0,0.0,W 1,,1899,
1,2,"Sep 30, 1899",Sat,Notre Dame,,Michigan State,Non-Major,W,40,0,2,0,0.0,W 2,,1899,
2,3,"Oct 4, 1899",Wed,Notre Dame,@,Chicago,Western,L,6,23,2,1,0.0,L 1,,1899,
3,4,"Oct 14, 1899",Sat,Notre Dame,,Lake Forest,Ind,W,38,0,3,1,0.0,W 1,,1899,
4,5,"Oct 18, 1899",Wed,Notre Dame,@,Michigan,Western,L,0,12,3,2,0.0,L 1,,1899,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,8,"Nov 14, 2020",Sat,(2) Notre Dame,@,Boston College,ACC,W,45,31,8,0,,W 8,,2020,3:30 PM
969,9,"Nov 27, 2020",Fri,(2) Notre Dame,@,(25) North Carolina,ACC,W,31,17,9,0,,W 9,,2020,3:30 PM
970,10,"Dec 5, 2020",Sat,(2) Notre Dame,,Syracuse,ACC,W,45,21,10,0,,W 10,,2020,2:30 PM
971,11,"Dec 19, 2020",Sat,(2) Notre Dame,N,(4) Clemson,ACC,L,10,34,10,1,,L 1,ACC Championship Game,2020,4:00 PM


In [9]:
# replace @ symbols in Location column with 'Away' string
combined_schedule['Location'] = combined_schedule['Location'].str.replace("@", "Away")

# replace 'N' string in Location column with 'Neutral' string
combined_schedule['Location'] = combined_schedule['Location'].str.replace("N", "Neutral")

# replace NaN values in Location column with 'Home' string
combined_schedule['Location'] = combined_schedule['Location'].fillna("Home")

# show updated dataframe
combined_schedule

Unnamed: 0,G,Date,Day,School,Location,Opponent,Conf,Result,Pts,Opp,W,L,T,Streak,Notes,Season,Time
0,1,"Sep 27, 1899",Wed,Notre Dame,Home,Englewood High,Non-Major,W,29,5,1,0,0.0,W 1,,1899,
1,2,"Sep 30, 1899",Sat,Notre Dame,Home,Michigan State,Non-Major,W,40,0,2,0,0.0,W 2,,1899,
2,3,"Oct 4, 1899",Wed,Notre Dame,Away,Chicago,Western,L,6,23,2,1,0.0,L 1,,1899,
3,4,"Oct 14, 1899",Sat,Notre Dame,Home,Lake Forest,Ind,W,38,0,3,1,0.0,W 1,,1899,
4,5,"Oct 18, 1899",Wed,Notre Dame,Away,Michigan,Western,L,0,12,3,2,0.0,L 1,,1899,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,8,"Nov 14, 2020",Sat,(2) Notre Dame,Away,Boston College,ACC,W,45,31,8,0,,W 8,,2020,3:30 PM
969,9,"Nov 27, 2020",Fri,(2) Notre Dame,Away,(25) North Carolina,ACC,W,31,17,9,0,,W 9,,2020,3:30 PM
970,10,"Dec 5, 2020",Sat,(2) Notre Dame,Home,Syracuse,ACC,W,45,21,10,0,,W 10,,2020,2:30 PM
971,11,"Dec 19, 2020",Sat,(2) Notre Dame,Neutral,(4) Clemson,ACC,L,10,34,10,1,,L 1,ACC Championship Game,2020,4:00 PM


In [10]:
# create duplicate School column to standardize school name
combined_schedule['Standardized_School'] = combined_schedule['School']

# remove rankings from new column
combined_schedule['Standardized_School']= combined_schedule['Standardized_School'].str.replace(r'\d', '')
combined_schedule['Standardized_School']= combined_schedule['Standardized_School'].str.replace(r'[()]', '')

# show updated dataframe
combined_schedule

  combined_schedule['Standardized_School']= combined_schedule['Standardized_School'].str.replace(r'\d', '')
  combined_schedule['Standardized_School']= combined_schedule['Standardized_School'].str.replace(r'[()]', '')


Unnamed: 0,G,Date,Day,School,Location,Opponent,Conf,Result,Pts,Opp,W,L,T,Streak,Notes,Season,Time,Standardized_School
0,1,"Sep 27, 1899",Wed,Notre Dame,Home,Englewood High,Non-Major,W,29,5,1,0,0.0,W 1,,1899,,Notre Dame
1,2,"Sep 30, 1899",Sat,Notre Dame,Home,Michigan State,Non-Major,W,40,0,2,0,0.0,W 2,,1899,,Notre Dame
2,3,"Oct 4, 1899",Wed,Notre Dame,Away,Chicago,Western,L,6,23,2,1,0.0,L 1,,1899,,Notre Dame
3,4,"Oct 14, 1899",Sat,Notre Dame,Home,Lake Forest,Ind,W,38,0,3,1,0.0,W 1,,1899,,Notre Dame
4,5,"Oct 18, 1899",Wed,Notre Dame,Away,Michigan,Western,L,0,12,3,2,0.0,L 1,,1899,,Notre Dame
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,8,"Nov 14, 2020",Sat,(2) Notre Dame,Away,Boston College,ACC,W,45,31,8,0,,W 8,,2020,3:30 PM,Notre Dame
969,9,"Nov 27, 2020",Fri,(2) Notre Dame,Away,(25) North Carolina,ACC,W,31,17,9,0,,W 9,,2020,3:30 PM,Notre Dame
970,10,"Dec 5, 2020",Sat,(2) Notre Dame,Home,Syracuse,ACC,W,45,21,10,0,,W 10,,2020,2:30 PM,Notre Dame
971,11,"Dec 19, 2020",Sat,(2) Notre Dame,Neutral,(4) Clemson,ACC,L,10,34,10,1,,L 1,ACC Championship Game,2020,4:00 PM,Notre Dame


In [11]:
# create duplicate Opponents column to standardize opponent name
combined_schedule['Standardized_Opponent'] = combined_schedule['Opponent']

# remove rankings from new column
combined_schedule['Standardized_Opponent']= combined_schedule['Standardized_Opponent'].str.replace(r'\d', '')
combined_schedule['Standardized_Opponent']= combined_schedule['Standardized_Opponent'].str.replace(r'[()]', '')

# show updated dataframe
combined_schedule

  combined_schedule['Standardized_Opponent']= combined_schedule['Standardized_Opponent'].str.replace(r'\d', '')
  combined_schedule['Standardized_Opponent']= combined_schedule['Standardized_Opponent'].str.replace(r'[()]', '')


Unnamed: 0,G,Date,Day,School,Location,Opponent,Conf,Result,Pts,Opp,W,L,T,Streak,Notes,Season,Time,Standardized_School,Standardized_Opponent
0,1,"Sep 27, 1899",Wed,Notre Dame,Home,Englewood High,Non-Major,W,29,5,1,0,0.0,W 1,,1899,,Notre Dame,Englewood High
1,2,"Sep 30, 1899",Sat,Notre Dame,Home,Michigan State,Non-Major,W,40,0,2,0,0.0,W 2,,1899,,Notre Dame,Michigan State
2,3,"Oct 4, 1899",Wed,Notre Dame,Away,Chicago,Western,L,6,23,2,1,0.0,L 1,,1899,,Notre Dame,Chicago
3,4,"Oct 14, 1899",Sat,Notre Dame,Home,Lake Forest,Ind,W,38,0,3,1,0.0,W 1,,1899,,Notre Dame,Lake Forest
4,5,"Oct 18, 1899",Wed,Notre Dame,Away,Michigan,Western,L,0,12,3,2,0.0,L 1,,1899,,Notre Dame,Michigan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,8,"Nov 14, 2020",Sat,(2) Notre Dame,Away,Boston College,ACC,W,45,31,8,0,,W 8,,2020,3:30 PM,Notre Dame,Boston College
969,9,"Nov 27, 2020",Fri,(2) Notre Dame,Away,(25) North Carolina,ACC,W,31,17,9,0,,W 9,,2020,3:30 PM,Notre Dame,North Carolina
970,10,"Dec 5, 2020",Sat,(2) Notre Dame,Home,Syracuse,ACC,W,45,21,10,0,,W 10,,2020,2:30 PM,Notre Dame,Syracuse
971,11,"Dec 19, 2020",Sat,(2) Notre Dame,Neutral,(4) Clemson,ACC,L,10,34,10,1,,L 1,ACC Championship Game,2020,4:00 PM,Notre Dame,Clemson


In [12]:
# create new column for converted datetime object
combined_schedule['Standardized_Date'] = pd.to_datetime(combined_schedule['Date'])

combined_schedule

Unnamed: 0,G,Date,Day,School,Location,Opponent,Conf,Result,Pts,Opp,W,L,T,Streak,Notes,Season,Time,Standardized_School,Standardized_Opponent,Standardized_Date
0,1,"Sep 27, 1899",Wed,Notre Dame,Home,Englewood High,Non-Major,W,29,5,1,0,0.0,W 1,,1899,,Notre Dame,Englewood High,1899-09-27
1,2,"Sep 30, 1899",Sat,Notre Dame,Home,Michigan State,Non-Major,W,40,0,2,0,0.0,W 2,,1899,,Notre Dame,Michigan State,1899-09-30
2,3,"Oct 4, 1899",Wed,Notre Dame,Away,Chicago,Western,L,6,23,2,1,0.0,L 1,,1899,,Notre Dame,Chicago,1899-10-04
3,4,"Oct 14, 1899",Sat,Notre Dame,Home,Lake Forest,Ind,W,38,0,3,1,0.0,W 1,,1899,,Notre Dame,Lake Forest,1899-10-14
4,5,"Oct 18, 1899",Wed,Notre Dame,Away,Michigan,Western,L,0,12,3,2,0.0,L 1,,1899,,Notre Dame,Michigan,1899-10-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,8,"Nov 14, 2020",Sat,(2) Notre Dame,Away,Boston College,ACC,W,45,31,8,0,,W 8,,2020,3:30 PM,Notre Dame,Boston College,2020-11-14
969,9,"Nov 27, 2020",Fri,(2) Notre Dame,Away,(25) North Carolina,ACC,W,31,17,9,0,,W 9,,2020,3:30 PM,Notre Dame,North Carolina,2020-11-27
970,10,"Dec 5, 2020",Sat,(2) Notre Dame,Home,Syracuse,ACC,W,45,21,10,0,,W 10,,2020,2:30 PM,Notre Dame,Syracuse,2020-12-05
971,11,"Dec 19, 2020",Sat,(2) Notre Dame,Neutral,(4) Clemson,ACC,L,10,34,10,1,,L 1,ACC Championship Game,2020,4:00 PM,Notre Dame,Clemson,2020-12-19


In [13]:
combined_schedule.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973 entries, 0 to 972
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   G                      973 non-null    int64         
 1   Date                   973 non-null    object        
 2   Day                    973 non-null    object        
 3   School                 973 non-null    object        
 4   Location               973 non-null    object        
 5   Opponent               973 non-null    object        
 6   Conf                   973 non-null    object        
 7   Result                 973 non-null    object        
 8   Pts                    973 non-null    int64         
 9   Opp                    973 non-null    int64         
 10  W                      973 non-null    int64         
 11  L                      973 non-null    int64         
 12  T                      662 non-null    float64       
 13  Strea

In [14]:
# list with desired column order
column_names = ['G', 'Season', 'Standardized_Date', 'Date', 'Day', 'Time', 'School', 'Standardized_School', 'Location', 'Opponent', 'Standardized_Opponent', 'Conf', 'Result', 'Pts', 'Opp', 'W', 'L', 'T', 'Streak', 'Notes']

# reorder columns
combined_schedule = combined_schedule.reindex(columns = column_names)

# show updated dataframe
combined_schedule

Unnamed: 0,G,Season,Standardized_Date,Date,Day,Time,School,Standardized_School,Location,Opponent,Standardized_Opponent,Conf,Result,Pts,Opp,W,L,T,Streak,Notes
0,1,1899,1899-09-27,"Sep 27, 1899",Wed,,Notre Dame,Notre Dame,Home,Englewood High,Englewood High,Non-Major,W,29,5,1,0,0.0,W 1,
1,2,1899,1899-09-30,"Sep 30, 1899",Sat,,Notre Dame,Notre Dame,Home,Michigan State,Michigan State,Non-Major,W,40,0,2,0,0.0,W 2,
2,3,1899,1899-10-04,"Oct 4, 1899",Wed,,Notre Dame,Notre Dame,Away,Chicago,Chicago,Western,L,6,23,2,1,0.0,L 1,
3,4,1899,1899-10-14,"Oct 14, 1899",Sat,,Notre Dame,Notre Dame,Home,Lake Forest,Lake Forest,Ind,W,38,0,3,1,0.0,W 1,
4,5,1899,1899-10-18,"Oct 18, 1899",Wed,,Notre Dame,Notre Dame,Away,Michigan,Michigan,Western,L,0,12,3,2,0.0,L 1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,8,2020,2020-11-14,"Nov 14, 2020",Sat,3:30 PM,(2) Notre Dame,Notre Dame,Away,Boston College,Boston College,ACC,W,45,31,8,0,,W 8,
969,9,2020,2020-11-27,"Nov 27, 2020",Fri,3:30 PM,(2) Notre Dame,Notre Dame,Away,(25) North Carolina,North Carolina,ACC,W,31,17,9,0,,W 9,
970,10,2020,2020-12-05,"Dec 5, 2020",Sat,2:30 PM,(2) Notre Dame,Notre Dame,Home,Syracuse,Syracuse,ACC,W,45,21,10,0,,W 10,
971,11,2020,2020-12-19,"Dec 19, 2020",Sat,4:00 PM,(2) Notre Dame,Notre Dame,Neutral,(4) Clemson,Clemson,ACC,L,10,34,10,1,,L 1,ACC Championship Game


# Write DataFrame to CSV File

In [15]:
# write combined_schedules dataframe to csv file
combined_schedule.to_csv('combined_nd_schedules.csv', index=False)