In [1]:
import requests
import pandas as pd

# MLB Stats API endpoint
base_url = "https://statsapi.mlb.com/api/v1/stats"

# Parameters for player hitting stats
params = {
    "stats": "season",       
    "group": "hitting",      
    "limit": 100,            
    "offset": 0,             
    "season": 2024,          
    "sportIds": 1            
}

all_players = []

while True:
    print(f"Fetching offset: {params['offset']}")
    res = requests.get(base_url, params=params)
    data = res.json()

    # Extract player records
    players = data.get("stats", [])[0].get("splits", [])
    if not players:
        break  # no more data

    for p in players:
        stat = p.get("stat", {})
        stat["player"] = p.get("player", {}).get("fullName", "")
        stat["team"] = p.get("team", {}).get("name", "")
        all_players.append(stat)

    # Move to next page
    params["offset"] += params["limit"]

# Convert to DataFrame
df = pd.DataFrame(all_players)

# Save to CSV
df.to_csv("mlb_hitting_stats.csv", index=False)
print("✅ Data saved to mlb_hitting_stats.csv")



Fetching offset: 0
Fetching offset: 100
Fetching offset: 200
✅ Data saved to mlb_hitting_stats.csv


In [2]:
df

Unnamed: 0,gamesPlayed,groundOuts,airOuts,runs,doubles,triples,homeRuns,strikeOuts,baseOnBalls,intentionalWalks,...,rbi,leftOnBase,sacBunts,sacFlies,babip,groundOutsToAirouts,catchersInterference,atBatsPerHomeRun,player,team
0,161,122,205,125,45,11,32,106,57,9,...,109,181,0,8,.354,0.60,0,19.88,Bobby Witt Jr.,Kansas City Royals
1,159,185,140,98,44,1,30,96,72,12,...,103,225,0,4,.342,1.32,0,20.53,Vladimir Guerrero Jr.,Toronto Blue Jays
2,158,82,128,122,36,1,58,171,133,20,...,144,238,0,2,.367,0.64,1,9.64,Aaron Judge,New York Yankees
3,150,199,212,83,32,3,4,29,24,3,...,46,181,2,1,.324,0.94,0,159.25,Luis Arraez,San Diego Padres
4,159,124,158,134,38,7,54,162,81,10,...,130,236,0,5,.336,0.78,3,11.78,Shohei Ohtani,Los Angeles Dodgers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,157,163,148,50,24,0,17,128,41,1,...,46,254,1,7,.249,1.10,0,32.41,Orlando Arcia,Atlanta Braves
125,131,80,130,59,14,0,26,149,53,1,...,82,233,0,6,.250,0.62,2,17.27,Rhys Hoskins,Milwaukee Brewers
126,136,74,152,73,21,7,18,137,48,1,...,58,201,1,1,.262,0.49,0,25.50,Daulton Varsho,Toronto Blue Jays
127,138,102,110,60,20,2,17,188,38,0,...,49,231,3,5,.296,0.93,0,29.24,Zack Gelof,Oakland Athletics


In [3]:
%pip install plotly

import numpy as np
import plotly.express as px 

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd

df = pd.read_csv("mlb_hitting_stats.csv")
df.head() 

Unnamed: 0,gamesPlayed,groundOuts,airOuts,runs,doubles,triples,homeRuns,strikeOuts,baseOnBalls,intentionalWalks,...,rbi,leftOnBase,sacBunts,sacFlies,babip,groundOutsToAirouts,catchersInterference,atBatsPerHomeRun,player,team
0,161,122,205,125,45,11,32,106,57,9,...,109,181,0,8,0.354,0.6,0,19.88,Bobby Witt Jr.,Kansas City Royals
1,159,185,140,98,44,1,30,96,72,12,...,103,225,0,4,0.342,1.32,0,20.53,Vladimir Guerrero Jr.,Toronto Blue Jays
2,158,82,128,122,36,1,58,171,133,20,...,144,238,0,2,0.367,0.64,1,9.64,Aaron Judge,New York Yankees
3,150,199,212,83,32,3,4,29,24,3,...,46,181,2,1,0.324,0.94,0,159.25,Luis Arraez,San Diego Padres
4,159,124,158,134,38,7,54,162,81,10,...,130,236,0,5,0.336,0.78,3,11.78,Shohei Ohtani,Los Angeles Dodgers


In [5]:
## data exploration
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   gamesPlayed           129 non-null    int64  
 1   groundOuts            129 non-null    int64  
 2   airOuts               129 non-null    int64  
 3   runs                  129 non-null    int64  
 4   doubles               129 non-null    int64  
 5   triples               129 non-null    int64  
 6   homeRuns              129 non-null    int64  
 7   strikeOuts            129 non-null    int64  
 8   baseOnBalls           129 non-null    int64  
 9   intentionalWalks      129 non-null    int64  
 10  hits                  129 non-null    int64  
 11  hitByPitch            129 non-null    int64  
 12  avg                   129 non-null    float64
 13  atBats                129 non-null    int64  
 14  obp                   129 non-null    float64
 15  slg                   1

In [6]:

df.describe()

Unnamed: 0,gamesPlayed,groundOuts,airOuts,runs,doubles,triples,homeRuns,strikeOuts,baseOnBalls,intentionalWalks,...,plateAppearances,totalBases,rbi,leftOnBase,sacBunts,sacFlies,babip,groundOutsToAirouts,catchersInterference,atBatsPerHomeRun
count,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,...,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0
mean,147.434109,131.705426,149.565891,77.379845,27.620155,2.449612,21.674419,128.48062,53.503876,2.635659,...,611.403101,238.860465,75.178295,220.333333,0.79845,4.302326,0.299147,0.917132,0.310078,32.066202
std,10.514755,27.934746,30.478693,17.209279,6.985693,2.588922,9.134552,33.990371,18.710844,3.450403,...,60.108693,46.86829,18.977556,32.873653,1.470437,2.412871,0.029717,0.271869,0.836646,27.094796
min,116.0,72.0,82.0,46.0,11.0,0.0,2.0,29.0,15.0,0.0,...,507.0,155.0,32.0,131.0,0.0,0.0,0.225,0.44,0.0,9.64
25%,142.0,114.0,129.0,65.0,23.0,1.0,16.0,103.0,41.0,1.0,...,561.0,206.0,62.0,199.0,0.0,2.0,0.279,0.73,0.0,21.59
50%,150.0,128.0,148.0,74.0,27.0,2.0,20.0,127.0,52.0,2.0,...,619.0,231.0,74.0,221.0,0.0,4.0,0.3,0.89,0.0,26.05
75%,155.0,150.0,170.0,85.0,31.0,3.0,26.0,156.0,65.0,3.0,...,654.0,261.0,86.0,237.0,1.0,6.0,0.318,1.1,0.0,33.94
max,162.0,199.0,233.0,134.0,48.0,14.0,58.0,218.0,133.0,20.0,...,735.0,411.0,144.0,291.0,9.0,13.0,0.37,1.9,5.0,237.5


In [7]:
df.describe(include='object')

Unnamed: 0,stolenBasePercentage,player,team
count,129.0,129,129
unique,57.0,129,30
top,1.0,Bobby Witt Jr.,Chicago Cubs
freq,20.0,1,7


In [8]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
124    False
125    False
126    False
127    False
128    False
Length: 129, dtype: bool

In [9]:
df.isna().sum()

gamesPlayed             0
groundOuts              0
airOuts                 0
runs                    0
doubles                 0
triples                 0
homeRuns                0
strikeOuts              0
baseOnBalls             0
intentionalWalks        0
hits                    0
hitByPitch              0
avg                     0
atBats                  0
obp                     0
slg                     0
ops                     0
caughtStealing          0
stolenBases             0
stolenBasePercentage    0
groundIntoDoublePlay    0
numberOfPitches         0
plateAppearances        0
totalBases              0
rbi                     0
leftOnBase              0
sacBunts                0
sacFlies                0
babip                   0
groundOutsToAirouts     0
catchersInterference    0
atBatsPerHomeRun        0
player                  0
team                    0
dtype: int64

In [10]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols 

['stolenBasePercentage', 'player', 'team']

In [11]:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols

['gamesPlayed',
 'groundOuts',
 'airOuts',
 'runs',
 'doubles',
 'triples',
 'homeRuns',
 'strikeOuts',
 'baseOnBalls',
 'intentionalWalks',
 'hits',
 'hitByPitch',
 'avg',
 'atBats',
 'obp',
 'slg',
 'ops',
 'caughtStealing',
 'stolenBases',
 'groundIntoDoublePlay',
 'numberOfPitches',
 'plateAppearances',
 'totalBases',
 'rbi',
 'leftOnBase',
 'sacBunts',
 'sacFlies',
 'babip',
 'groundOutsToAirouts',
 'catchersInterference',
 'atBatsPerHomeRun']

In [12]:
for col in categorical_cols:
    print(col)
    print(df[col].nunique())
    print(df[col].unique())
    print('-' * 100) 

stolenBasePercentage
57
['.721' '.500' '1.000' '.750' '.937' '.826' '.759' '.786' '.842' '.706'
 '.875' '.889' '.636' '.829' '.727' '.818' '.815' '.840' '.---' '.769'
 '.714' '.854' '.833' '.846' '.879' '.828' '.838' '.800' '.857' '.692'
 '.545' '.688' '.667' '.807' '.957' '.000' '.571' '.767' '.893' '.778'
 '.864' '.909' '.625' '.867' '.609' '.882' '.655' '.600' '.914' '.400'
 '.647' '.696' '.949' '.814' '.793' '.941' '.533']
----------------------------------------------------------------------------------------------------
player
129
['Bobby Witt Jr.' 'Vladimir Guerrero Jr.' 'Aaron Judge' 'Luis Arraez'
 'Shohei Ohtani' 'Yordan Alvarez' 'Marcell Ozuna' 'Yainer Diaz'
 'Trea Turner' 'Jose Altuve' 'Brent Rooker' 'Jackson Merrill'
 'Steven Kwan' 'Ketel Marte' 'Mookie Betts' 'Juan Soto' 'Bryce Harper'
 'Jarren Duran' 'Seiya Suzuki' 'Freddie Freeman' 'Luis García Jr.'
 'Gunnar Henderson' 'William Contreras' 'Yandy Díaz' 'Jurickson Profar'
 'Alec Bohm' 'José Ramírez' 'Lourdes Gurriel Jr.' '

In [13]:
# Convert 'stolenBasePercentage' to numeric
df['stolenBasePercentage'] = pd.to_numeric(
    df['stolenBasePercentage'].apply(lambda x: '0' + x if isinstance(x, str) and x.startswith('.') else x),
    errors='coerce'
)

# Add to numerical_cols 
if 'stolenBasePercentage' not in numerical_cols:
    numerical_cols.append('stolenBasePercentage') 

In [14]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   gamesPlayed           129 non-null    int64  
 1   groundOuts            129 non-null    int64  
 2   airOuts               129 non-null    int64  
 3   runs                  129 non-null    int64  
 4   doubles               129 non-null    int64  
 5   triples               129 non-null    int64  
 6   homeRuns              129 non-null    int64  
 7   strikeOuts            129 non-null    int64  
 8   baseOnBalls           129 non-null    int64  
 9   intentionalWalks      129 non-null    int64  
 10  hits                  129 non-null    int64  
 11  hitByPitch            129 non-null    int64  
 12  avg                   129 non-null    float64
 13  atBats                129 non-null    int64  
 14  obp                   129 non-null    float64
 15  slg                   1

In [15]:
df['stolenBasePercentage'].isna().mean()*100 

np.float64(2.3255813953488373)

In [16]:
import plotly.express as px

for col in numerical_cols:
    fig = px.histogram(df, x=col, nbins=20, title=f"Distribution of {col}")
    fig.show() 

In [17]:
import plotly.graph_objects as go

for col in numerical_cols:
    fig = go.Figure()
    fig.add_trace(go.Box(y=df[col], name=col, boxpoints='outliers'))
    fig.update_layout(title=f"Boxplot of {col}", yaxis_title=col)
    fig.show()

In [18]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols

['player', 'team']

In [19]:
import plotly.express as px

for col in categorical_cols:
    value_counts = df[col].value_counts().reset_index()
    value_counts.columns = [col, 'count']
    fig = px.bar(
        value_counts,
        x=col,
        y='count',
        title=f"Bar Chart of {col}",
        labels={col: col, 'count': 'Count'}
    )
    fig.show()

In [20]:
import plotly.express as px

# Compute the correlation matrix for numerical columns
corr_matrix = df[numerical_cols].corr()

fig = px.imshow(
    corr_matrix,
    labels=dict(x="Features", y="Features", color="Correlation"),
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    color_continuous_scale="RdBu",
    zmin=-1, zmax=1,
    title="Correlation Matrix Heatmap"
)
fig.update_layout(width=900, height=900)
fig.show() 

In [21]:
# Remove outliers from numerical 
Q1 = df[numerical_cols].quantile(0.25)
Q3 = df[numerical_cols].quantile(0.75)
IQR = Q3 - Q1

# Keep only rows without outliers 
df_clean = df[~((df[numerical_cols] < (Q1 - 1.5 * IQR)) | (df[numerical_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

df_clean.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 58 entries, 7 to 128
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   gamesPlayed           58 non-null     int64  
 1   groundOuts            58 non-null     int64  
 2   airOuts               58 non-null     int64  
 3   runs                  58 non-null     int64  
 4   doubles               58 non-null     int64  
 5   triples               58 non-null     int64  
 6   homeRuns              58 non-null     int64  
 7   strikeOuts            58 non-null     int64  
 8   baseOnBalls           58 non-null     int64  
 9   intentionalWalks      58 non-null     int64  
 10  hits                  58 non-null     int64  
 11  hitByPitch            58 non-null     int64  
 12  avg                   58 non-null     float64
 13  atBats                58 non-null     int64  
 14  obp                   58 non-null     float64
 15  slg                   58 non-

In [22]:
# Fill missing values in 'stolenBasePercentage' with the column mean
df_clean['stolenBasePercentage'] = df_clean['stolenBasePercentage'].fillna(df_clean['stolenBasePercentage'].mean()) 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [23]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58 entries, 7 to 128
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   gamesPlayed           58 non-null     int64  
 1   groundOuts            58 non-null     int64  
 2   airOuts               58 non-null     int64  
 3   runs                  58 non-null     int64  
 4   doubles               58 non-null     int64  
 5   triples               58 non-null     int64  
 6   homeRuns              58 non-null     int64  
 7   strikeOuts            58 non-null     int64  
 8   baseOnBalls           58 non-null     int64  
 9   intentionalWalks      58 non-null     int64  
 10  hits                  58 non-null     int64  
 11  hitByPitch            58 non-null     int64  
 12  avg                   58 non-null     float64
 13  atBats                58 non-null     int64  
 14  obp                   58 non-null     float64
 15  slg                   58 non-

In [24]:
df = df_clean.copy() 

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58 entries, 7 to 128
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   gamesPlayed           58 non-null     int64  
 1   groundOuts            58 non-null     int64  
 2   airOuts               58 non-null     int64  
 3   runs                  58 non-null     int64  
 4   doubles               58 non-null     int64  
 5   triples               58 non-null     int64  
 6   homeRuns              58 non-null     int64  
 7   strikeOuts            58 non-null     int64  
 8   baseOnBalls           58 non-null     int64  
 9   intentionalWalks      58 non-null     int64  
 10  hits                  58 non-null     int64  
 11  hitByPitch            58 non-null     int64  
 12  avg                   58 non-null     float64
 13  atBats                58 non-null     int64  
 14  obp                   58 non-null     float64
 15  slg                   58 non-

In [26]:
%pip install scipy



Note: you may need to restart the kernel to use updated packages.


In [27]:
%pip install scikit-learn



Note: you may need to restart the kernel to use updated packages.


In [28]:
df

Unnamed: 0,gamesPlayed,groundOuts,airOuts,runs,doubles,triples,homeRuns,strikeOuts,baseOnBalls,intentionalWalks,...,rbi,leftOnBase,sacBunts,sacFlies,babip,groundOutsToAirouts,catchersInterference,atBatsPerHomeRun,player,team
7,148,191,120,70,29,3,16,107,24,1,...,84,268,0,8,0.338,1.59,0,36.56,Yainer Diaz,Houston Astros
9,153,153,171,94,31,0,20,119,47,2,...,65,193,0,0,0.337,0.89,0,31.4,Jose Altuve,Houston Astros
18,132,72,139,74,27,6,21,160,63,3,...,73,220,0,4,0.37,0.52,0,24.38,Seiya Suzuki,Chicago Cubs
20,140,150,124,58,25,1,18,86,27,2,...,70,197,0,1,0.31,1.21,0,27.78,Luis García Jr.,Washington Nationals
22,155,175,117,99,37,2,23,139,78,5,...,92,224,0,3,0.33,1.5,0,25.87,William Contreras,Milwaukee Brewers
27,133,117,157,72,22,2,18,101,29,1,...,75,220,0,5,0.313,0.75,0,28.5,Lourdes Gurriel Jr.,Arizona Diamondbacks
29,153,163,185,65,34,3,14,81,47,0,...,73,228,0,5,0.3,0.88,0,41.93,Brendan Donovan,St. Louis Cardinals
30,156,158,140,73,29,3,24,156,57,3,...,88,231,0,3,0.33,1.13,0,25.92,Bryan Reynolds,Pittsburgh Pirates
32,148,141,124,80,29,4,21,121,39,4,...,79,200,1,2,0.32,1.14,0,25.14,Jackson Chourio,Milwaukee Brewers
33,152,130,195,107,39,1,33,127,56,0,...,91,223,0,3,0.295,0.67,0,18.73,Francisco Lindor,New York Mets


In [29]:
from scipy.stats import skew
import numpy as np
from sklearn.preprocessing import PowerTransformer

# numerical_cols is already defined in previous cells
# If not, re-define it here:
# numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Use the existing numerical_cols variable from previous cells
skewed_cols = [col for col in numerical_cols if abs(skew(df[col].dropna())) > 0.75]

# Apply log1p transformation to handle positive skewness
for col in skewed_cols:
    # Only apply if all values are >= 0
    if (df[col] >= 0).all():
        df[col] = np.log1p(df[col])
    else:
        # For columns with negative values, use a different transformation (e.g., Yeo-Johnson)
        pt = PowerTransformer(method='yeo-johnson')
        df[col] = pt.fit_transform(df[[col]])

# Check skewness after transformation
{col: skew(df[col].dropna()) for col in skewed_cols} 

{'caughtStealing': np.float64(-0.05257666509746663),
 'stolenBases': np.float64(-0.062322313313329075),
 'sacBunts': np.float64(1.7865692875075025)}

In [30]:
import plotly.express as px

for col in numerical_cols:
    fig = px.histogram(df_clean, x=col, nbins=20, title=f"Distribution of {col} (Cleaned Data)")
    fig.show()

In [31]:
df.to_csv("mlb_hitting_stats_cleaned.csv", index=False)
print("✅ Cleaned data saved to mlb_hitting_stats_cleaned.csv") 

✅ Cleaned data saved to mlb_hitting_stats_cleaned.csv


In [32]:
# what is the average stolen base percentage by team?
team_sb_eff = df.groupby("team")["stolenBasePercentage"].mean().sort_values(ascending=False).reset_index()

fig = px.bar(team_sb_eff, x="team", y="stolenBasePercentage",
             title="Average Stolen Base Percentage by Team",
             labels={"stolenBasePercentage": "Stolen Base %"})
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show() 

In [33]:
# is there a relationship between games played and stolen bases?
fig = px.scatter(df, x="gamesPlayed", y="stolenBases",
                 hover_name="player", color="team",
                 title="Stolen Bases vs Games Played")
fig.show() 


In [34]:
#What is the home run rate (homeRuns / atBats) for each player?
df["homeRunRate"] = df["homeRuns"] / df["atBats"]

top_power_hitters = df[df["atBats"] > 100].sort_values("homeRunRate", ascending=False).head(10)

fig = px.bar(top_power_hitters, x="player", y="homeRunRate", color="team",
             title="Top 10 Power Hitters (HR Rate, Min 100 AB)")
fig.show()


In [35]:
# Is there a relationship between batting average and on-base percentage?
fig = px.scatter(df, x="avg", y="obp", 
                 color="team", hover_name="player",
                 title="Batting Average vs On-Base Percentage")
fig.show()


In [36]:
%pip install streamlit 



Note: you may need to restart the kernel to use updated packages.


In [38]:
%%writefile mlb_analysis_app.py
import streamlit as st
import pandas as pd
import plotly.express as px

# Load data
df = pd.read_csv("mlb_hitting_stats_cleaned.csv")

# Calculate derived columns if needed
df["homeRunRate"] = df["homeRuns"] / df["atBats"]

# Sidebar navigation
st.sidebar.title("MLB Hitting Stats Explorer")
menu = st.sidebar.radio("Select Analysis Type:", ["Univariate Analysis", "Bivariate Analysis"])

st.title("⚾ MLB Hitting Stats Dashboard")

if menu == "Univariate Analysis":
    st.header("Univariate Analysis")
    plot_type = st.sidebar.selectbox("Choose Plot Type:", ["Histogram", "Boxplot", "Bar Chart"])

    if plot_type == "Histogram":
        numeric_col = st.selectbox("Select numerical column:", df.select_dtypes(include='number').columns)
        fig = px.histogram(df, x=numeric_col, nbins=30, title=f"Distribution of {numeric_col}")
        st.plotly_chart(fig)

    elif plot_type == "Boxplot":
        numeric_col = st.selectbox("Select numerical column:", df.select_dtypes(include='number').columns)
        fig = px.box(df, y=numeric_col, title=f"Boxplot of {numeric_col}")
        st.plotly_chart(fig)

    elif plot_type == "Bar Chart":
        cat_col = st.selectbox("Select categorical column:", df.select_dtypes(include='object').columns)
        bar_data = df[cat_col].value_counts().reset_index()
        bar_data.columns = [cat_col, "count"]
        fig = px.bar(bar_data, x=cat_col, y="count", title=f"Bar Chart of {cat_col}")
        st.plotly_chart(fig)

elif menu == "Bivariate Analysis":
    st.header("Bivariate Analysis")
    analysis_option = st.sidebar.selectbox("Select Analysis Question:", [
        "Average Stolen Base Percentage by Team",
        "Stolen Bases vs Games Played",
        "Top 10 Power Hitters",
        "Batting Average vs On-Base Percentage"
    ])

    if analysis_option == "Average Stolen Base Percentage by Team":
        team_sb_eff = df.groupby("team")["stolenBasePercentage"].mean().sort_values(ascending=False).reset_index()
        fig = px.bar(team_sb_eff, x="team", y="stolenBasePercentage",
                     title="Average Stolen Base Percentage by Team",
                     labels={"stolenBasePercentage": "Stolen Base %"})
        fig.update_layout(xaxis={'categoryorder':'total descending'})
        st.plotly_chart(fig)

    elif analysis_option == "Stolen Bases vs Games Played":
        fig = px.scatter(df, x="gamesPlayed", y="stolenBases",
                         hover_name="player", color="team",
                         title="Stolen Bases vs Games Played")
        st.plotly_chart(fig)

    elif analysis_option == "Top 10 Power Hitters":
        top_power_hitters = df[df["atBats"] > 100].sort_values("homeRunRate", ascending=False).head(10)
        fig = px.bar(top_power_hitters, x="player", y="homeRunRate", color="team",
                     title="Top 10 Power Hitters (HR Rate, Min 100 AB)")
        st.plotly_chart(fig)

    elif analysis_option == "Batting Average vs On-Base Percentage":
        fig = px.scatter(df, x="avg", y="obp", 
                         color="team", hover_name="player",
                         title="Batting Average vs On-Base Percentage")
        st.plotly_chart(fig) 


Overwriting mlb_analysis_app.py
