# NFL Teams DataFrame

Build a comprehensive teams dataset from the 2024 schedule data.

In [None]:
import pandas as pd

# Read every 2024 game from the schedule CSV, report the row count,
# and preview the first rows
schedule_df = pd.read_csv('nfl_2024_schedule.csv')
game_count = len(schedule_df)
print(f"Loaded schedule: {game_count} games")
schedule_df.head()

In [None]:
# Collect the distinct names found in either the home or the away column
home_names = set(schedule_df['home_team'].unique())
away_names = set(schedule_df['away_team'].unique())
all_teams = home_names.union(away_names)

# One row per team, ordered alphabetically by name
teams_df = pd.DataFrame({'team_name': sorted(all_teams)})

In [None]:
# Build opponent lists for each team
def get_team_opponents(team_name, schedule_df):
    """Return the list of teams that `team_name` plays in `schedule_df`.

    Opponents from games where the team is listed in 'home_team' come
    first (in schedule order), followed by opponents from games where it
    is listed in 'away_team'. A team faced more than once appears once
    per game.
    """
    is_host = schedule_df['home_team'] == team_name
    is_visitor = schedule_df['away_team'] == team_name

    # When hosting, the opponent is the away side, and vice versa
    visiting_opponents = schedule_df.loc[is_host, 'away_team'].tolist()
    hosting_opponents = schedule_df.loc[is_visitor, 'home_team'].tolist()

    return [*visiting_opponents, *hosting_opponents]

# Attach each team's 2024 opponent list as a new column
teams_df['opponents_2024'] = teams_df['team_name'].map(
    lambda name: get_team_opponents(name, schedule_df)
)


teams_df.head(5)

In [None]:
# Scrape NFL team data from Wikipedia table - handle complex merged cell structure
import requests
from bs4 import BeautifulSoup
import re

# Team nicknames used to spot a team cell in rows that carry fewer cells
# than a full row (see _parse_teams_table).
_TEAM_NICKNAMES = (
    'Bills', 'Patriots', 'Jets', 'Dolphins', 'Ravens', 'Bengals',
    'Browns', 'Steelers', 'Texans', 'Colts', 'Jaguars', 'Titans',
    'Broncos', 'Chiefs', 'Raiders', 'Chargers', 'Cowboys', 'Giants',
    'Eagles', 'Commanders', 'Bears', 'Lions', 'Packers', 'Vikings',
    'Falcons', 'Panthers', 'Saints', 'Buccaneers', 'Cardinals',
    'Rams', '49ers', 'Seahawks',
)
_CONFERENCE_NAMES = ('AFC', 'NFC')
_DIVISION_NAMES = ('East', 'West', 'North', 'South')


def _clean_cell_text(cell):
    """Return a cell's text without numeric footnote markers, '*' or '†'."""
    text = cell.get_text().strip()
    # Single backslashes are required here: in a raw string, r'\\[' is the
    # regex "literal backslash, then open a character class", which never
    # matches a footnote marker such as "[1]".
    text = re.sub(r'\[\d+\]', '', text)
    return text.replace('*', '').replace('†', '').strip()


def _team_record(fields, conference, division):
    """Build one team dict from six values plus the current conference/division.

    `fields` must hold, in order: team, city, stadium, capacity,
    first season, head coach.
    """
    team, city, stadium, capacity, first_season, head_coach = fields
    return {
        'team': team,
        'conference': conference,
        'division': division,
        'city': city,
        'stadium': stadium,
        'capacity': capacity,
        'first_season': first_season,
        'head_coach': head_coach,
    }


def _parse_teams_table(table):
    """Extract team dicts from one table, carrying conference/division forward.

    Rows with 8+ cells are read positionally (conference, division, then
    the six team fields). Rows with 6-7 cells are scanned for a cell
    containing a known nickname, and the following cells are taken as the
    remaining fields, padded with '' when the row runs out.
    """
    teams_data = []
    current_conference = None
    current_division = None

    for row in table.find_all('tr'):
        cells = row.find_all(['td', 'th'])
        if not cells:
            continue

        cell_texts = [_clean_cell_text(cell) for cell in cells]

        # Skip header row
        if 'Conference' in cell_texts or 'Division' in cell_texts:
            continue

        if len(cell_texts) >= 8:
            if cell_texts[0] in _CONFERENCE_NAMES:
                # Row that opens a new conference (and its first division)
                current_conference = cell_texts[0]
                current_division = cell_texts[1]
            elif (current_conference and cell_texts[0] == ''
                    and cell_texts[1] in _DIVISION_NAMES):
                # Continuation row: same conference, new division
                current_division = cell_texts[1]
            else:
                # Any other 8+ cell row is ignored
                continue

            team = cell_texts[2]
            if team and team != 'Team':
                teams_data.append(
                    _team_record(cell_texts[2:8], current_conference, current_division)
                )

        elif len(cell_texts) >= 6 and current_conference:
            # Shorter row: locate the team cell by nickname and read the
            # fields that follow it; only the first match per row is used.
            for idx, cell_text in enumerate(cell_texts):
                if any(nickname in cell_text for nickname in _TEAM_NICKNAMES):
                    fields = (cell_texts[idx:idx + 6] + [''] * 6)[:6]
                    teams_data.append(
                        _team_record(fields, current_conference, current_division)
                    )
                    break

    return teams_data


def scrape_nfl_teams_wikipedia():
    """Scrape NFL team info from the Wikipedia NFL article's teams table.

    Fetches the page, looks through its 'wikitable' tables for one whose
    text contains both 'Arizona Cardinals' and 'Conference', and parses it
    into team records. Progress and failures are reported with print().

    Returns:
        A DataFrame with columns team, conference, division, city, stadium,
        capacity, first_season and head_coach (all text as scraped),
        de-duplicated on 'team'; or None when the request fails, the
        response status is not 200, or no matching table yields any team.
    """
    url = "https://en.wikipedia.org/wiki/National_Football_League"

    print(f"Scraping {url}...")
    # NOTE(review): the site may reject default client User-Agent strings;
    # consider sending a descriptive User-Agent header - confirm.
    try:
        # A timeout keeps the notebook from hanging on a stalled connection
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report transport errors the same way as a bad status code
        print(f"❌ Failed to fetch Wikipedia: {exc}")
        return None

    if response.status_code != 200:
        print(f"❌ Failed to fetch Wikipedia: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    for i, table in enumerate(soup.find_all('table', {'class': 'wikitable'})):
        table_text = table.get_text()
        if 'Arizona Cardinals' not in table_text or 'Conference' not in table_text:
            continue

        print(f"✓ Found NFL teams table (table {i})")
        teams_data = _parse_teams_table(table)

        # A matching table that yields nothing does not stop the search
        if teams_data:
            df = pd.DataFrame(teams_data)
            # Remove duplicates if any
            df = df.drop_duplicates(subset=['team'])
            print(f"✓ Scraped {len(df)} teams from Wikipedia")
            return df

    print("❌ Could not find NFL teams table")
    return None

# Run the scraper and summarize what came back
scraped_teams = scrape_nfl_teams_wikipedia()

if scraped_teams is not None:
    # "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
    print("\n✓ Successfully scraped NFL teams:")
    print(scraped_teams[['team', 'conference', 'division', 'stadium', 'head_coach']].head(10))
    print(f"\nTotal teams: {len(scraped_teams)}")
    
    # Check if we got all 32 teams
    if len(scraped_teams) == 32:
        print("✓ Got all 32 NFL teams!")
    else:
        print(f"⚠️  Only got {len(scraped_teams)} teams, expected 32")
        
else:
    print("❌ Wikipedia scraping failed")

In [None]:
# Parse city and state from the scraped Wikipedia data
def parse_city_state(city_string):
    """Split a location such as 'Baltimore, Maryland' into (city, state).

    Returns a 2-tuple of stripped strings. Only the first two
    comma-separated parts are used. Input without a comma yields
    (stripped input, ''), and empty, whitespace-only or None input
    yields ('', '').
    """
    cleaned = city_string.strip() if city_string else ''
    if cleaned == '':
        return '', ''

    pieces = cleaned.split(',')
    if len(pieces) < 2:
        # No comma present: treat the whole value as the city
        return cleaned, ''

    return pieces[0].strip(), pieces[1].strip()

# Split the scraped 'city' text into separate stadium_city / stadium_state columns
if scraped_teams is not None:
    # Each row becomes a two-element Series, so apply() yields a two-column frame
    city_state_pairs = scraped_teams['city'].apply(
        lambda raw_city: pd.Series(parse_city_state(raw_city))
    )
    scraped_teams[['stadium_city', 'stadium_state']] = city_state_pairs

In [None]:
# Preview the schedule-derived teams table (team_name, opponents_2024)
teams_df.head()

In [None]:
# Order the scraped teams alphabetically and renumber the rows from 0
scraped_teams = scraped_teams.sort_values(by='team').reset_index(drop=True)
scraped_teams.head(10)

In [None]:
# Left-join the schedule-derived opponent lists onto the scraped team rows;
# teams whose names do not match keep a missing opponents_2024 value
opponent_lookup = teams_df[['team_name', 'opponents_2024']]
merged = scraped_teams.merge(
    opponent_lookup,
    how='left',
    left_on='team',
    right_on='team_name',
)
# The join key from the right side duplicates 'team', so drop it
teams = merged.drop(columns=['team_name'])

matched_count = teams['opponents_2024'].notna().sum()
print("Successfully merged opponents_2024 column")
print(f"Scraped teams shape: {teams.shape}")
print(f"Teams with opponents data: {matched_count}/32")

teams.head()