# NFL Ratings Data

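### Scrapes NFL game results from Pro Football Reference and exports them, together with team IDs and random trial ratings, to CSV.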

In [1]:
import pandas as pd
import numpy as np
import random
from functools import reduce
import os
# Move the working directory up one level; presumably so the relative "./data/..." paths
# used later in this notebook resolve against the project root -- TODO confirm.
# NOTE(review): the target is relative to the current directory, so re-running this cell
# without restarting the kernel moves up one more level each time.
os.chdir("../")
os.getcwd()

'/Users/jakesingleton/Documents/projects/football'

In [2]:
# For reproducibility. The trial ratings are drawn with the standard-library `random`
# module, which np.random.seed() does not affect, so both generators are seeded.
np.random.seed(9)
random.seed(9)

# All 32 teams. The order is mostly, but not strictly, alphabetical (e.g. "Las Vegas Raiders"
# follows "New York Jets"). A team's position in this list becomes its ID, so do not re-sort.
teams = ["Arizona Cardinals", "Atlanta Falcons", "Baltimore Ravens", "Buffalo Bills", "Carolina Panthers", "Chicago Bears", "Cincinnati Bengals",
        "Cleveland Browns", "Dallas Cowboys", "Denver Broncos", "Detroit Lions", "Green Bay Packers", "Houston Texans", "Indianapolis Colts",
        "Jacksonville Jaguars", "Kansas City Chiefs", "Miami Dolphins", "Minnesota Vikings", "New England Patriots", "New Orleans Saints",
        "New York Giants", "New York Jets", "Las Vegas Raiders", "Philadelphia Eagles", "Pittsburgh Steelers", "Los Angeles Rams", "Los Angeles Chargers",
        "San Francisco 49ers", "Seattle Seahawks", "Tampa Bay Buccaneers", "Tennessee Titans", "Washington Football Team"]

# Generates list of 32 numbers in range [-10, 10] with average 0. To be used for initial trial ratings
# https://stackoverflow.com/questions/39435481/how-to-generate-numbers-in-range-with-specific-average-with-python
def gen_avg(expected_avg = 0, n = 32, a = -10, b = 10):
    """Return a list of n random integers in [a, b] whose mean equals expected_avg.

    Uses rejection sampling: n integers are drawn with random.randint(a, b) and the
    whole list is redrawn until its mean is exactly expected_avg.

    Parameters
    ----------
    expected_avg : int or float
        Mean the returned list must have.
    n : int
        Number of integers to generate; must be positive.
    a, b : int
        Inclusive lower and upper bounds of each integer.

    Returns
    -------
    list of int

    Raises
    ------
    ValueError
        If n is not positive, or if no list of n integers in [a, b] can have mean
        expected_avg (the sampling loop would otherwise never terminate).
    """
    if n <= 0:
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    # The mean can only be hit if some integer total s within [a * n, b * n] satisfies
    # s / n == expected_avg. Any such total lies next to expected_avg * n.
    nearest_total = round(expected_avg * n)
    reachable = any(a * n <= total <= b * n and total / n == expected_avg
                    for total in (nearest_total - 1, nearest_total, nearest_total + 1))
    if not reachable:
        raise ValueError("no list of {} integers in [{}, {}] has average {!r}".format(n, a, b, expected_avg))

    while True:
        l = [random.randint(a, b) for _ in range(n)]

        if sum(l) / n == expected_avg:
            return l
        
# Three independent trial rating lists (overall, offense, defense), drawn in that order
ratings, off_ratings, def_ratings = gen_avg(), gen_avg(), gen_avg()

# Team names and their IDs (list position) first, then one column per rating list
teams = (pd.DataFrame({"Team": teams, "ID": np.arange(32)})
         .assign(**{"Rating": ratings, "Off._Rating": off_ratings, "Def._Rating": def_ratings}))
display(teams.head())
teams.to_csv("./data/team_ids_and_ratings.csv")  # Export for Excel use

Unnamed: 0,Team,ID,Rating,Off._Rating,Def._Rating
0,Arizona Cardinals,0,6,7,-6
1,Atlanta Falcons,1,1,1,9
2,Baltimore Ravens,2,-2,5,-3
3,Buffalo Bills,3,-3,-2,6
4,Carolina Panthers,4,-9,6,1


In [3]:
# Scraping game results (week 1 only, to walk through the cleaning steps)

# CHANGE year IF NECESSARY
year = 2021
url = 'https://www.pro-football-reference.com/years/' + str(year) + '/week_1.htm'
df = pd.read_html(url)  # List with one data frame per HTML table found on the page

# Cleaning
# Take every other table among the first 32 and stack them; each table keeps its own row labels
game_results = pd.concat(df[:32:2])
# Drop the column labelled 2 (useless 3rd col) and every row labelled 0 (the date rows)
game_results = game_results.drop(columns = 2).drop(index = 0)
game_results.head(6)

Unnamed: 0,0,1
1,Dallas Cowboys,29
2,Tampa Bay Buccaneers,31
1,Pittsburgh Steelers,23
2,Buffalo Bills,16
1,Philadelphia Eagles,32
2,Atlanta Falcons,6


In [4]:
# Put the two rows of each game side-by-side: every consecutive pair of rows is flattened into one row
# Idea from: https://stackoverflow.com/questions/57763470/pandas-get-second-row-and-put-it-at-the-end-of-first-row-and-automatically-cre

game_results = pd.DataFrame([game_results.iloc[start:start + 2].to_numpy().ravel()
                             for start in range(0, len(game_results), 2)])
# Flattened columns are 0/1 = first row's team/score and 2/3 = second row's team/score.
# Put the second row (home) first, give the columns names, and make the scores ints
game_results = (game_results[[2, 0, 3, 1]]
                .rename(columns = {2: "Home Team", 0: "Away Team", 3: "Home Score", 1: "Away Score"})
                .astype({"Home Score": "int64", "Away Score": "int64"}))


In [5]:
# Margin = home score minus away score: negative when the away team won, 0 for a tie
game_results["Margin"] = game_results["Home Score"].sub(game_results["Away Score"])
game_results.head(n = 5)

Unnamed: 0,Home Team,Away Team,Home Score,Away Score,Margin
0,Tampa Bay Buccaneers,Dallas Cowboys,31,29,2
1,Buffalo Bills,Pittsburgh Steelers,16,23,-7
2,Atlanta Falcons,Philadelphia Eagles,6,32,-26
3,Washington Football Team,Los Angeles Chargers,16,20,-4
4,Tennessee Titans,Arizona Cardinals,13,38,-25


In [6]:
# Scrapes one week's page and returns its games as a tidy data frame (one row per game)
def clean_scores(url_string, appropriate_idx):
    """Read the tables at url_string and return one row per game.

    Parameters
    ----------
    url_string : str
        Address of the page to read; passed straight to pd.read_html().
    appropriate_idx : int
        Exclusive upper bound of the table slice. Every other table from position 0 up to
        this bound is treated as a game table; the caller sets it from the number of teams
        that played that week.

    Returns
    -------
    pandas.DataFrame
        Columns "Home Team", "Away Team", "Home Score", "Away Score" (int64 scores) and
        "Margin" (home score minus away score; negative means the away team won, 0 a tie).
    """
    tables = pd.read_html(url_string)

    # Stack the game tables, then drop the column labelled 2 and the rows labelled 0 (date rows)
    stacked = pd.concat(tables[:appropriate_idx:2]).drop(columns = 2).drop(index = 0)

    # Each game now occupies two consecutive rows; flatten every pair into a single row
    paired_rows = [stacked.iloc[start:start + 2].to_numpy().ravel()
                   for start in range(0, len(stacked), 2)]

    # Flattened columns 0/1 are the first row's team/score, 2/3 the second row's (home) team/score
    games = pd.DataFrame(paired_rows)[[2, 0, 3, 1]]
    games = (games.rename(columns = {2: "Home Team", 0: "Away Team", 3: "Home Score", 1: "Away Score"})
             .astype({"Home Score": "int64", "Away Score": "int64"}))

    games["Margin"] = games["Home Score"] - games["Away Score"]
    return games

In [7]:
# Call clean_scores() on every completed week of the season

# CHANGE THIS NUMBER TO THE NUMBER OF COMPLETED WEEKS
num_weeks = 4

# CHANGE if necessary: one entry per week, giving how many teams participated that week
# (32 when every team plays). Each entry is passed to clean_scores() for the matching week.
appropriate_idx = [32, 32, 32, 32]

# Fail fast, before any page is requested, if the two settings above were not updated together
if len(appropriate_idx) < num_weeks:
    raise ValueError("appropriate_idx has " + str(len(appropriate_idx)) + " entries but num_weeks is "
                     + str(num_weeks) + "; add one entry per completed week")

# Container for data frames
scores = []

for i in range(1, num_weeks + 1):
    url = 'https://www.pro-football-reference.com/years/' + str(year) + '/week_' + str(i) + '.htm'
    week_i_scores = clean_scores(url, appropriate_idx[i - 1])
    week_i_scores["Week"] = i  # Remember which week each game belongs to
    scores.append(week_i_scores)

In [8]:
# Stack the weekly data frames into one, then number the games 1, 2, ... in scraped order
scores = pd.concat(scores)
scores["Game #"] = range(1, len(scores) + 1)
scores.head(n = 5)

Unnamed: 0,Home Team,Away Team,Home Score,Away Score,Margin,Week,Game #
0,Tampa Bay Buccaneers,Dallas Cowboys,31,29,2,1,1
1,Buffalo Bills,Pittsburgh Steelers,16,23,-7,1,2
2,Atlanta Falcons,Philadelphia Eagles,6,32,-26,1,3
3,Washington Football Team,Los Angeles Chargers,16,20,-4,1,4
4,Tennessee Titans,Arizona Cardinals,13,38,-25,1,5


In [9]:
# Sanity check: the number of scraped games should be half the total of appropriate_idx
len(scores) == np.asarray(appropriate_idx).sum() / 2

True

In [10]:
# Merge with teams df to get team ID numbers. Requires 2 merges (home team, then away team)

# An inner merge silently drops any game whose team name is missing from teams,
# so check the names first and stop with a clear message instead of losing games.
team_ids = teams.loc[:, ["Team", "ID"]]
unknown_teams = (set(scores["Home Team"]) | set(scores["Away Team"])) - set(team_ids["Team"])
if unknown_teams:
    raise ValueError("Team names in scores that are missing from teams: " + ", ".join(sorted(map(str, unknown_teams))))

# Only the Team/ID columns are merged in, renamed up front so no suffixes or later drops are needed.
# validate = "many_to_one" makes pandas raise if a team name appears more than once in teams.
scores_merged = (scores.merge(team_ids.rename(columns = {"Team": "Home Team", "ID": "ID_Home"}),
                              how = "inner", on = "Home Team", validate = "many_to_one")
                .merge(team_ids.rename(columns = {"Team": "Away Team", "ID": "ID_Away"}),
                       how = "inner", on = "Away Team", validate = "many_to_one")
                .loc[:, ["Week", "Game #", "Home Team", "Away Team", "ID_Home", "ID_Away", "Home Score", "Away Score", "Margin"]]
                .sort_values(by = "Game #", axis = 0)  # Merging can reorder rows; restore game order
                .reset_index(drop = True))
scores_merged.head()

Unnamed: 0,Week,Game #,Home Team,Away Team,ID_Home,ID_Away,Home Score,Away Score,Margin
0,1,1,Tampa Bay Buccaneers,Dallas Cowboys,29,8,31,29,2
1,1,2,Buffalo Bills,Pittsburgh Steelers,3,24,16,23,-7
2,1,3,Atlanta Falcons,Philadelphia Eagles,1,23,6,32,-26
3,1,4,Washington Football Team,Los Angeles Chargers,31,26,16,20,-4
4,1,5,Tennessee Titans,Arizona Cardinals,30,0,13,38,-25


In [11]:
scores_merged.tail(n = 5)  # Last rows: confirms the most recent scraped week (and playoff games, once scraped) made it in

Unnamed: 0,Week,Game #,Home Team,Away Team,ID_Home,ID_Away,Home Score,Away Score,Margin
59,4,60,Los Angeles Rams,Arizona Cardinals,25,0,20,37,-17
60,4,61,Green Bay Packers,Pittsburgh Steelers,11,24,27,17,10
61,4,62,Denver Broncos,Baltimore Ravens,9,2,7,23,-16
62,4,63,New England Patriots,Tampa Bay Buccaneers,18,29,17,19,-2
63,4,64,Los Angeles Chargers,Las Vegas Raiders,26,22,28,14,14


Looks good!

In [12]:
# Write the merged games to ./data (without the row index) so Excel solver can use them
scores_merged.to_csv(path_or_buf = "./data/" + str(year) + "_nfl_game_data.csv", index = False)

In [13]:
scores_merged.tail(n = 5)  # Last rows of the frame that was exported

Unnamed: 0,Week,Game #,Home Team,Away Team,ID_Home,ID_Away,Home Score,Away Score,Margin
59,4,60,Los Angeles Rams,Arizona Cardinals,25,0,20,37,-17
60,4,61,Green Bay Packers,Pittsburgh Steelers,11,24,27,17,10
61,4,62,Denver Broncos,Baltimore Ravens,9,2,7,23,-16
62,4,63,New England Patriots,Tampa Bay Buccaneers,18,29,17,19,-2
63,4,64,Los Angeles Chargers,Las Vegas Raiders,26,22,28,14,14
