## Exploratory analysis of the datasets

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

### Players

In [2]:
# Load the players table from the NFL Big Data Bowl 2025 data folder.
players_df = pd.read_csv(
    filepath_or_buffer='./nfl-big-data-bowl-2025/players.csv'
)

In [3]:
players_df.head()

Unnamed: 0,nflId,height,weight,birthDate,collegeName,position,displayName
0,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady
1,29550,6-4,328,1982-01-22,Arkansas,T,Jason Peters
2,29851,6-2,225,1983-12-02,California,QB,Aaron Rodgers
3,30842,6-6,267,1984-05-19,UCLA,TE,Marcedes Lewis
4,33084,6-4,217,1985-05-17,Boston College,QB,Matt Ryan


In [7]:
# Mean of the `weight` column for every distinct `position`;
# the result is a Series indexed by position.
player_weight_grouped_by_position = (
    players_df
    .groupby('position')['weight']
    .mean()
)

In [17]:
# The series holds the mean weight per position: idxmax()/idxmin() return the
# position label, max()/min() the matching mean weight. The second line reports
# the minimum, so it must be labelled "lowest" (it previously said "highest").
print(f"Position with the highest average weight is {player_weight_grouped_by_position.idxmax()} with a weight of {round(player_weight_grouped_by_position.max(), 2)} lbs.")
print(f"Position with the lowest average weight is {player_weight_grouped_by_position.idxmin()} with a weight of {round(player_weight_grouped_by_position.min(), 2)} lbs.")

Position with the highest weight is NT with a weight of 322.88 lbs.
Position with the highest weight is CB with a weight of 193.05 lbs.


In [19]:
# Convert height into inches
def convert_ft_to_inches(height: str) -> int:
    """Convert a "feet-inches" height string into total inches.

    Args:
        height: Height formatted as "<feet>-<inches>", e.g. "6-4" or "5-11".

    Returns:
        The height in inches as an integer (feet * 12 + inches).

    Raises:
        ValueError: If ``height`` is not two integers separated by a single "-".
    """
    # Split on the separator instead of indexing single characters, so that
    # two-digit inch values such as "5-11" are read as 11 rather than 1.
    feet, inches = height.split('-')
    return int(feet) * 12 + int(inches)


In [22]:
# Add a numeric height column by converting every `height` string element-wise.
players_df['height_in_inches'] = players_df['height'].map(convert_ft_to_inches)

In [23]:
players_df.head()

Unnamed: 0,nflId,height,weight,birthDate,collegeName,position,displayName,height_in_inches
0,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady,76
1,29550,6-4,328,1982-01-22,Arkansas,T,Jason Peters,76
2,29851,6-2,225,1983-12-02,California,QB,Aaron Rodgers,74
3,30842,6-6,267,1984-05-19,UCLA,TE,Marcedes Lewis,78
4,33084,6-4,217,1985-05-17,Boston College,QB,Matt Ryan,76


In [59]:
# Scatter of weight (x) against height in inches (y), one marker per row of players_df.
# `custom_data` attaches the name and position columns to each marker so the
# hover template below can reference them as customdata[0] / customdata[1].
# Letting px attach them keeps the values aligned with the plotted rows, instead
# of overwriting `text`/`customdata` on the trace from the whole DataFrame.
fig = px.scatter(
    players_df,
    x="weight",
    y="height_in_inches",
    custom_data=['displayName', 'position'],
)

fig.update_layout(
    xaxis_title="Weight (lbs)",
    yaxis_title="Height (inches)",
    # x=0.5 with xanchor='center' centres the title horizontally.
    title={
        'text': 'Comparison of Player Height and Weight',
        'x': 0.5,
        'xanchor': 'center',
    },
)

# Custom hover box: bold player name, then position, weight and height.
fig.update_traces(
    hovertemplate='<b>%{customdata[0]}</b>' +
                  '<br>Position: %{customdata[1]}' +
                  '<br>Weight: %{x} lbs ' +
                  '<br>Height: %{y} inches'
)

fig.show()

### Games

In [61]:
# Load the games table from the NFL Big Data Bowl 2025 data folder.
games_df = pd.read_csv(
    filepath_or_buffer='./nfl-big-data-bowl-2025/games.csv'
)

In [62]:
games_df.head()

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore
0,2022090800,2022,1,9/8/2022,20:20:00,LA,BUF,10,31
1,2022091100,2022,1,9/11/2022,13:00:00,ATL,NO,26,27
2,2022091101,2022,1,9/11/2022,13:00:00,CAR,CLE,24,26
3,2022091102,2022,1,9/11/2022,13:00:00,CHI,SF,19,10
4,2022091103,2022,1,9/11/2022,13:00:00,CIN,PIT,20,23


How many seasons are stored in this data?

In [66]:
games_df['season'].unique()

array([2022])

In [68]:
games_df['week'].unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

It seems that only the 2022 season is stored, and it covers weeks 1 through 9.

Which team has the most wins?

### Plays

In [None]:
# Load the plays table from the NFL Big Data Bowl 2025 data folder.
plays_df = pd.read_csv(
    filepath_or_buffer='./nfl-big-data-bowl-2025/plays.csv'
)