In [85]:
import requests
from datetime import datetime
import pandas as pd

# Define the NHL API endpoint for the schedule endpoint
api_url = "https://statsapi.web.nhl.com/api/v1/schedule"

# Define a list of seasons you want to retrieve data for
start_season = 20002001
current_year = datetime.now().year
current_month = currentMonth = datetime.now().month

if current_month >= 8:
    end_season = int(str(current_year) + str(current_year + 1))
else:
    end_season = int(str(current_year - 1) + str(current_year))

seasons = []
for i in range(start_season, end_season + 10001, 10001):
    seasons.append(str(i))

# Initialize an empty list to store game data
all_games = []

# Loop through each season and retrieve game data
for season in seasons:
    # Define parameters for the API request for the current season
    params = {
        "hydrate": "team,linescore,game(content(media(epg))),broadcasts(all)",
        "site": "en_nhl",
        "season": season,
    }

    # Send a GET request to the NHL API
    response = requests.get(api_url, params=params)

    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()

        # Extract game data from the response
        games = []
        for date in data["dates"]:
            for game in date["games"]:
                games.append(game)

        # Add a "Season" column to the game data
        for game in games:
            game["Season"] = season

        # Extend the list of all games with games from the current season
        all_games.extend(games)
    else:
        print(f"Failed to retrieve data for season {season}. Status code:", response.status_code)

# Create a Pandas DataFrame from all the game data
df = pd.DataFrame(all_games)

In [86]:
df['game_id'] = df['gamePk'].astype(str)
df.rename(columns = {'Season' : 'season', 'gameType' : 'type', 'gameDate' : 'date_time_GMT'}, inplace = True)

def extract_away_team_id(row):
    return row['away']['team']['id']

def extract_home_team_id(row):
    return row['home']['team']['id']

def extract_away_goals(row):
    return row['away']['score']

def extract_home_goals(row):
    return row['home']['score']

def extract_game_status(row):
    return row['detailedState']

# Apply the extraction functions to create new columns
df['away_team_id'] = df['teams'].apply(extract_away_team_id)
df['home_team_id'] = df['teams'].apply(extract_home_team_id)
df['away_goals'] = df['teams'].apply(extract_away_goals)
df['home_goals'] = df['teams'].apply(extract_home_goals)
df['game_status'] = df['status'].apply(extract_game_status)

# Calculate the outcome based on goals
df['outcome'] = df.apply(lambda row: 'Home Win' if row['home_goals'] > row['away_goals'] else 'Away Win' if row['away_goals'] > row['home_goals'] else 'Tie', axis=1)

df_game = df[['game_id', 'season', 'type', 'date_time_GMT', 'away_team_id', 'home_team_id', 'away_goals', 'home_goals', 'outcome', 'game_status']]
df_game

Unnamed: 0,game_id,season,season.1,type,date_time_GMT,away_team_id,home_team_id,away_goals,home_goals,outcome,game_status
0,2000010201,20002001,20002001,PR,2000-09-14T00:00:00Z,23,3264,0,0,Tie,Final
1,2000010202,20002001,20002001,PR,2000-09-16T00:00:00Z,23,3212,0,0,Tie,Final
2,2000020001,20002001,20002001,R,2000-10-04T23:00:00Z,21,25,2,2,Tie,Final
3,2000020002,20002001,20002001,R,2000-10-05T23:00:00Z,9,6,4,4,Tie,Final
4,2000020003,20002001,20002001,R,2000-10-05T23:00:00Z,16,7,2,4,Home Win,Final
...,...,...,...,...,...,...,...,...,...,...,...
31429,2023021308,20232024,20232024,R,2024-04-19T00:00:00Z,23,52,0,0,Tie,Scheduled
31430,2023021309,20232024,20232024,R,2024-04-19T01:00:00Z,28,20,0,0,Tie,Scheduled
31431,2023021310,20232024,20232024,R,2024-04-19T01:30:00Z,22,21,0,0,Tie,Scheduled
31432,2023021311,20232024,20232024,R,2024-04-19T02:00:00Z,24,54,0,0,Tie,Scheduled


In [89]:
import os
os.chdir('/Users/jdmcatee/Desktop/Sports Betting')
df_game.to_csv('nhl_historical_game_data.csv') 

In [238]:
import requests
import pandas as pd
seasons = ['20202021', '20212022', '20222023',]
# Define the NHL API endpoint for all NHL teams
api_url = "https://statsapi.web.nhl.com/api/v1/teams"

# Send a GET request to the NHL API to retrieve data about all NHL teams
response = requests.get(api_url)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()

    # Extract the team IDs from the response
    team_ids = [team["id"] for team in data["teams"]]

    # Display the list of team IDs
    print("Most Recent Team IDs:", team_ids)

else:
    print("Failed to retrieve data. Status code:", response.status_code)

# Create an empty list to store skater data
skater_data = []
    
# Iterate through each season
for season in seasons:
    # Iterate through each team ID
    for team_id in team_ids:
        # Define the NHL API endpoint for a specific team's game roster
        api_url = f"https://statsapi.web.nhl.com/api/v1/teams/{team_id}?expand=team.roster"

        # Send a GET request to the NHL API to retrieve the team's roster
        response = requests.get(api_url)

        # Check if the request was successful
        if response.status_code == 200:
            data = response.json()

            # Extract the team's roster
            roster = data["teams"][0]["roster"]["roster"]

            # Iterate through each player on the roster
            for player in roster:
                player_id = player["person"]["id"]
                player_name = player["person"]["fullName"]

                # Define the NHL API endpoint for player game logs with the season parameter
                player_game_url = f"https://statsapi.web.nhl.com/api/v1/people/{player_id}/stats?stats=gameLog&season={season}"

                # Send a GET request to the NHL API to retrieve player game logs
                response = requests.get(player_game_url)

                # Check if the request was successful
                if response.status_code == 200:
                    player_game_data = response.json()


                # Extract skater statistics for each game
                for game_stats in player_game_data["stats"][0]["splits"]:
                    game_data = {
                        "team_id": team_id,
                        "team_name": data["teams"][0]["name"],
                        "opponent": game_stats["opponent"]["name"],
                        "player_id": player_id,
                        "player_name": player_name,
                        "game_id": game_stats["game"]["gamePk"],
                        "season": game_stats["season"],
                        "goals": game_stats["stat"].get("goals"),
                        "assists": game_stats["stat"].get("assists"),
                        "shots": game_stats["stat"].get("shots"),
                        "hits": game_stats["stat"].get("hits"),
                        "powerPlayGoals": game_stats["stat"].get("powerPlayGoals"),
                        "powerPlayAssists": game_stats["stat"].get("powerPlayAssists"),
                        "time_on_ice": game_stats["stat"]["timeOnIce"],
                        # Add more statistics as needed
                    }
                    skater_data.append(game_data)

    else:
        print(f"Failed to retrieve data for team ID {team_id}. Status code:", response.status_code)

# Create a Pandas DataFrame from the skater data
df_game_skater_stats = pd.DataFrame(skater_data)

# Display the DataFrame
print(df_game_skater_stats.head())

ConnectionError: HTTPSConnectionPool(host='statsapi.web.nhl.com', port=443): Max retries exceeded with url: /api/v1/teams (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x12bce8f50>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [214]:
import pandas as pd
import requests

seasons = ['20202021', '20212022', '20222023',]

# Define the NHL API endpoints
NHL_API_URL = "https://statsapi.web.nhl.com/api/v1/"
SCHEDULE_ENDPOINT = "schedule"
GAME_STATS_ENDPOINT = "game/{game_id}/boxscore"

# Function to fetch game IDs for a specific season and game type
def fetch_game_ids(season, game_type):
    endpoint = f"{SCHEDULE_ENDPOINT}?season={season}&gameType={game_type}"
    url = f"{NHL_API_URL}{endpoint}"

    response = requests.get(url)
    data = response.json()

    game_id_map = {}

    for date in data["dates"]:
        for game in date["games"]:
            game_id_map[game["gamePk"]] = [game["teams"]["away"]["team"]["id"], game["teams"]["home"]["team"]["id"]]
    return game_id_map

fetch_game_ids('20212022', 'R')

# Function to fetch game teams stats for a specific game ID and team ID
def fetch_game_teams_stats(game_id, team_id, home_or_away):
    endpoint = f"{GAME_STATS_ENDPOINT}".format(game_id=game_id)
    url = f"{NHL_API_URL}{endpoint}"

    response = requests.get(url)
    data = response.json()

    home_or_away = home_or_away
    
    # Calculate if the team won
    home_goals = data["teams"]["home"]["teamStats"]["teamSkaterStats"]["goals"]
    away_goals = data["teams"]["away"]["teamStats"]["teamSkaterStats"]["goals"]
    won = home_goals > away_goals

    game_teams_stats = {
        "game_id": game_id,
        "team_id": team_id,
        "HoA": home_or_away,
        "won": won,
        # "settled_in": data["decisions"]["winner"] if won else data["decisions"]["loser"],
        "head_coach": data["teams"][home_or_away]["coaches"][0]["person"]["fullName"],
        "goals": home_goals if home_or_away == "home" else away_goals,
        "shots": data["teams"][home_or_away]["teamStats"]["teamSkaterStats"]["shots"],
        "hits": data["teams"][home_or_away]["teamStats"]["teamSkaterStats"].get("hits"),
        "pim": data["teams"][home_or_away]["teamStats"]["teamSkaterStats"]["pim"],
    }

    return game_teams_stats

# Specify the seasons and game types you want to retrieve data for
game_types = ["R", "P"]  # "R" for regular season, "P" for playoff

# Fetch game teams stats data for the specified seasons and game types
all_data = []

for season in seasons:
    for game_type in game_types:
        game_id_map = fetch_game_ids(season, game_type)
        for game_id, team_ids in game_id_map.items():
            
            # Fetch stats for both home and away teams
            home_team_id = fetch_game_teams_stats(game_id, team_ids[1], "home")
            away_team_id = fetch_game_teams_stats(game_id, team_ids[0], "away")

            # Append the stats for each team to the list
            all_data.append(home_team_id)
            all_data.append(away_team_id)

# Create a DataFrame from the extracted data
df_game_team_stats = pd.DataFrame(all_data)

# Define the CSV file path
csv_file_path = 'game_teams_stats.csv'

try:
    existing_df = pd.read_csv(csv_file_path)
except FileNotFoundError:
    existing_df = pd.DataFrame()

# Append the new data to the existing data
combined_df_team_stats = existing_df.append(df_game_team_stats, ignore_index=True)

# Remove duplicates based on the unique identifier ('id' and 'point' in this case)
combined_df_team_stats.drop_duplicates(subset=['game_id', 'team_id'], keep='last', inplace=True)

# Save the updated data to the CSV file
combined_df_team_stats.to_csv(csv_file_path, index=False)

# Display the updated DataFrame
print(combined_df_team_stats)

Data saved to game_teams_stats.csv


In [218]:
df_kaggle_game = pd.read_csv('game.csv')
df_kaggle_game

Unnamed: 0,game_id,season,type,date_time_GMT,away_team_id,home_team_id,away_goals,home_goals,outcome,home_rink_side_start,venue,venue_link,venue_time_zone_id,venue_time_zone_offset,venue_time_zone_tz
0,2016020045,20162017,R,2016-10-19T00:30:00Z,4,16,4,7,home win REG,right,United Center,/api/v1/venues/null,America/Chicago,-5,CDT
1,2017020812,20172018,R,2018-02-07T00:00:00Z,24,7,4,3,away win OT,left,KeyBank Center,/api/v1/venues/null,America/New_York,-4,EDT
2,2015020314,20152016,R,2015-11-24T01:00:00Z,21,52,4,1,away win REG,right,MTS Centre,/api/v1/venues/null,America/Winnipeg,-5,CDT
3,2015020849,20152016,R,2016-02-17T00:00:00Z,52,12,1,2,home win REG,right,PNC Arena,/api/v1/venues/null,America/New_York,-4,EDT
4,2017020586,20172018,R,2017-12-30T03:00:00Z,20,24,1,2,home win REG,left,Honda Center,/api/v1/venues/null,America/Los_Angeles,-7,PDT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26300,2018030415,20182019,P,2019-06-07T00:00:00Z,19,6,2,1,away win REG,left,TD Garden,/api/v1/venues/5085,America/New_York,-5,EST
26301,2018030416,20182019,P,2019-06-10T00:00:00Z,6,19,5,1,away win REG,left,Enterprise Center,/api/v1/venues/5076,America/Chicago,-6,CST
26302,2018030416,20182019,P,2019-06-10T00:00:00Z,6,19,5,1,away win REG,left,Enterprise Center,/api/v1/venues/5076,America/Chicago,-6,CST
26303,2018030417,20182019,P,2019-06-13T00:00:00Z,19,6,4,1,away win REG,left,TD Garden,/api/v1/venues/5085,America/New_York,-5,EST


In [232]:
df_game_j = pd.read_csv('nhl_historical_skater_stats.csv')
df_game_j[df_game_j.powerPlayAssists != 0]

Unnamed: 0.1,Unnamed: 0,team_id,team_name,opponent,player_id,player_name,game_id,season,goals,assists,shots,hits,powerPlayGoals,powerPlayAssists,time_on_ice
0,0,12,Carolina Hurricanes,St. Louis Blues,8470613,Brent Burns,2003021226,20032004,0.0,1.0,3.0,2.0,0.0,,20:28
1,1,12,Carolina Hurricanes,Dallas Stars,8470613,Brent Burns,2003021210,20032004,0.0,0.0,2.0,4.0,0.0,,19:11
2,2,12,Carolina Hurricanes,Colorado Avalanche,8470613,Brent Burns,2003021195,20032004,0.0,0.0,4.0,1.0,0.0,,16:51
3,3,12,Carolina Hurricanes,Detroit Red Wings,8470613,Brent Burns,2003021181,20032004,0.0,0.0,2.0,0.0,0.0,,17:16
4,4,12,Carolina Hurricanes,Anaheim Ducks,8470613,Brent Burns,2003021175,20032004,0.0,0.0,1.0,0.0,0.0,,13:56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294830,294830,55,Seattle Kraken,Chicago Blackhawks,8483524,Shane Wright,2022020087,20222023,0.0,0.0,0.0,0.0,0.0,,05:51
294831,294831,55,Seattle Kraken,Colorado Avalanche,8483524,Shane Wright,2022020073,20222023,0.0,0.0,0.0,1.0,0.0,,08:42
294832,294832,55,Seattle Kraken,St. Louis Blues,8483524,Shane Wright,2022020058,20222023,0.0,1.0,0.0,0.0,0.0,,06:36
294833,294833,55,Seattle Kraken,Carolina Hurricanes,8483524,Shane Wright,2022020047,20222023,0.0,0.0,1.0,0.0,0.0,,06:50


In [236]:
df_skater_kaggle = pd.read_csv('game_skater_stats.csv')
df_skater_kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 945830 entries, 0 to 945829
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   game_id               945830 non-null  int64  
 1   player_id             945830 non-null  int64  
 2   team_id               945830 non-null  int64  
 3   timeOnIce             945830 non-null  int64  
 4   assists               945830 non-null  int64  
 5   goals                 945830 non-null  int64  
 6   shots                 945830 non-null  int64  
 7   hits                  547723 non-null  float64
 8   powerPlayGoals        945830 non-null  int64  
 9   powerPlayAssists      945830 non-null  int64  
 10  penaltyMinutes        945830 non-null  int64  
 11  faceOffWins           945830 non-null  int64  
 12  faceoffTaken          945830 non-null  int64  
 13  takeaways             547723 non-null  float64
 14  giveaways             547723 non-null  float64
 15  