### Define Functions & Import Packages

In [1]:
from pybaseball import statcast
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
from pybaseball import statcast_sprint_speed
warnings.simplefilter(action='ignore', category=FutureWarning)


def get_existing_columns(df, columns):
    """Returns a list of columns that exist in the DataFrame."""
    return [col for col in columns if col in df.columns]


### Load in Data

In [None]:
# Enable cache to avoid repeated downloads
from pybaseball import cache
cache.enable()

# Get Statcast data for a sample month
data = statcast(start_dt="2024-04-01", end_dt="2024-04-30")
data.head()


### Filter for singles and key features for now

In [3]:
# Filter to singles only
singles = data[data['events'] == 'single']

# Keep useful columns
columns = [
    'player_name', 'player_id', 'events', 'launch_speed', 'launch_angle',
    'hit_distance_sc', 'hc_x', 'hc_y', 'hit_location', 'bb_type', 'spin_rate',
    'release_speed', 'pitch_type', 'stand',
    'fielder_8', 'fielder_8_x', 'fielder_8_y', 'fielder_9_x', 'fielder_9_y',
    'fielder_7_x', 'fielder_7_y', 'fielder_6_x', 'fielder_6_y',
    'inning', 'inning_topbot', 'outs_when_up', 'balls', 'strikes',
    'bat_score', 'fld_score',
    'on_1b', 'on_2b', 'on_3b',
    'home_team', 'away_team',
    'day_night', 'venue_id', 'game_date', 'game_type',
    'weather_temp', 'weather_wind', 'temp', 'wind_speed', 'wind_direction'
]

existing_cols = get_existing_columns(singles, columns)
singles_data = singles[existing_cols]

# Merge Sprint and Speed data 
# Load sprint speed data for the year
sprint = statcast_sprint_speed(2024)[['last_name, first_name', 'sprint_speed']]
sprint = sprint.rename(columns={'last_name, first_name': 'player_name'})


# Merge with singles on player_id
singles_data = singles_data.merge(sprint, on='player_name', how='left')

### Exploratory visualization

In [None]:
sns.histplot(singles_data['launch_speed'], bins=30)
plt.title('Launch Speed Distribution for Singles')
plt.show()

sns.scatterplot(x='launch_angle', y='launch_speed', data=singles_data)
plt.title('Launch Angle vs Launch Speed for Singles')
plt.show()


In [31]:
import pandas as pd
import numpy as np
from pybaseball import statcast, statcast_sprint_speed, cache, playerid_lookup
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns


# Enable cache to avoid repeated downloads
cache.enable()

def estimate_sprint_speed_from_physical_attributes(hits_data):
    """
    Estimate sprint speed based on player physical attributes (height/weight) when actual data unavailable
    Uses general athletic performance correlations
    """
    from pybaseball import playerid_lookup
    
    print("Attempting to get player physical attributes for sprint speed estimation...")
    unique_players = hits_data['player_name'].unique()
    
    # Try to get player physical data
    player_attributes = []
    
    for player_name in unique_players:
        try:
            # Split name for lookup
            name_parts = player_name.split()
            if len(name_parts) >= 2:
                first_name = name_parts[0]
                last_name = " ".join(name_parts[1:])
                
                # Look up player
                player_info = playerid_lookup(last_name, first_name)
                
                if len(player_info) > 0:
                    # Get the most recent entry
                    player = player_info.iloc[0]
                    
                    # Extract height and weight if available
                    height = getattr(player, 'height', None) if hasattr(player, 'height') else None
                    weight = getattr(player, 'weight', None) if hasattr(player, 'weight') else None
                    
                    # Convert height to inches if it's in feet-inches format
                    height_inches = None
                    if height and isinstance(height, str):
                        try:
                            if "'" in height or "-" in height:
                                # Format like "6'2" or "6-2"
                                height_clean = height.replace("'", "-").replace('"', '')
                                if "-" in height_clean:
                                    feet, inches = height_clean.split("-")
                                    height_inches = int(feet) * 12 + int(inches)
                                else:
                                    height_inches = int(height_clean)
                            else:
                                height_inches = int(height)
                        except:
                            height_inches = None
                    elif height and isinstance(height, (int, float)):
                        height_inches = height
                    
                    player_attributes.append({
                        'player_name': player_name,
                        'height_inches': height_inches,
                        'weight': weight
                    })
                else:
                    player_attributes.append({
                        'player_name': player_name,
                        'height_inches': None,
                        'weight': None
                    })
            else:
                player_attributes.append({
                    'player_name': player_name,
                    'height_inches': None,
                    'weight': None
                })
        except Exception as e:
            # If lookup fails, add with null values
            player_attributes.append({
                'player_name': player_name,
                'height_inches': None,
                'weight': None
            })
    
    player_df = pd.DataFrame(player_attributes)
    
    # Estimate sprint speed based on physical attributes
    def estimate_speed(row):
        height = row['height_inches']
        weight = row['weight']
        
        # MLB average sprint speed is approximately 27 ft/s
        base_speed = 27.0
        
        if pd.isna(height) and pd.isna(weight):
            # No data available, return MLB average
            return base_speed
        
        # Height factor: Taller players tend to be slightly faster due to longer strides
        # But very tall players may be slower. Optimal around 6'0-6'2 (72-74 inches)
        if not pd.isna(height):
            if height <= 66:  # Under 5'6"
                height_factor = -0.5
            elif height <= 70:  # 5'6" to 5'10"
                height_factor = 0.3
            elif height <= 74:  # 5'10" to 6'2"
                height_factor = 0.8
            elif height <= 78:  # 6'2" to 6'6"
                height_factor = 0.2
            else:  # Over 6'6"
                height_factor = -0.8
        else:
            height_factor = 0
        
        # Weight factor: Generally, lighter players are faster
        # But need enough muscle mass. Optimal around 180-200 lbs for speed
        if not pd.isna(weight):
            if weight <= 170:
                weight_factor = 0.5
            elif weight <= 190:
                weight_factor = 1.0
            elif weight <= 210:
                weight_factor = 0.3
            elif weight <= 230:
                weight_factor = -0.5
            else:
                weight_factor = -1.2
        else:
            weight_factor = 0
        
        # BMI consideration if both height and weight available
        bmi_factor = 0
        if not pd.isna(height) and not pd.isna(weight):
            bmi = (weight / (height ** 2)) * 703  # BMI formula
            if bmi <= 22:
                bmi_factor = 0.3
            elif bmi <= 25:
                bmi_factor = 0.1
            elif bmi <= 28:
                bmi_factor = -0.2
            else:
                bmi_factor = -0.8
        
        # Combine factors
        estimated_speed = base_speed + height_factor + weight_factor + bmi_factor
        
        # Add some realistic variation
        estimated_speed += np.random.normal(0, 0.5)
        
        # Keep within reasonable bounds (22-32 ft/s for MLB players)
        estimated_speed = np.clip(estimated_speed, 22, 32)
        
        return round(estimated_speed, 1)
    
    player_df['sprint_speed'] = player_df.apply(estimate_speed, axis=1)
    
    print(f"Estimated sprint speeds for {len(player_df)} players")
    print(f"Average estimated speed: {player_df['sprint_speed'].mean():.1f} ft/s")
    
    return player_df[['player_name', 'sprint_speed']]

def get_existing_columns(df, desired_columns):
    """Helper function to get only existing columns from a list"""
    return [col for col in desired_columns if col in df.columns]

def calculate_hit_angle(hc_x, hc_y):
    """
    Calculate hit angle from home plate coordinates
    hc_x, hc_y are Statcast coordinates where (125.42, 199.33) is home plate
    """
    # Adjust coordinates relative to home plate
    x_adj = hc_x - 125.42
    y_adj = hc_y - 199.33
    
    # Calculate angle in degrees (0 = straight up middle, negative = left field, positive = right field)
    angle = np.degrees(np.arctan2(x_adj, y_adj))
    return angle

def calculate_fielder_distance(hit_x, hit_y, fielder_x, fielder_y):
    """Calculate distance between hit location and fielder position"""
    if pd.isna(fielder_x) or pd.isna(fielder_y):
        return np.nan
    return np.sqrt((hit_x - fielder_x)**2 + (hit_y - fielder_y)**2)

def prepare_training_data(start_date, end_date):
    """
    Prepare training data by getting both singles and doubles to train the model
    """
    print(f"Loading Statcast data from {start_date} to {end_date}...")
    data = statcast(start_dt=start_date, end_dt=end_date)
    
    # Filter to singles and doubles only
    hits = data[data['events'].isin(['single', 'double'])].copy()
    
    # Keep useful columns
    columns = [
        'player_name', 'player_id', 'events', 'launch_speed', 'launch_angle',
        'hit_distance_sc', 'hc_x', 'hc_y', 'hit_location', 'bb_type', 'spin_rate',
        'release_speed', 'pitch_type', 'stand',
        'fielder_7', 'fielder_8', 'fielder_9',
        'inning', 'inning_topbot', 'outs_when_up', 'balls', 'strikes',
        'bat_score', 'fld_score',
        'on_1b', 'on_2b', 'on_3b',
        'home_team', 'away_team',
        'day_night', 'venue_id', 'game_date', 'game_type',
        'weather_temp', 'weather_wind', 'temp', 'wind_speed', 'wind_direction'
    ]
    
    existing_cols = get_existing_columns(hits, columns)
    hits_data = hits[existing_cols].copy()
    
    # Load sprint speed data
    print("Loading sprint speed data...")
    try:
        sprint = statcast_sprint_speed(2024)
        if 'last_name, first_name' in sprint.columns:
            sprint = sprint[['last_name, first_name', 'sprint_speed']].rename(
                columns={'last_name, first_name': 'player_name'})
        else:
            # Handle different column naming
            name_cols = [col for col in sprint.columns if 'name' in col.lower()]
            if name_cols:
                sprint = sprint[[name_cols[0], 'sprint_speed']].rename(
                    columns={name_cols[0]: 'player_name'})
    except Exception as e:
        print(f"Error loading sprint speed: {e}")
        # Create estimated sprint speed data based on player physical attributes
        sprint = estimate_sprint_speed_from_physical_attributes(hits_data)
    
    # Merge with sprint speed
    hits_data = hits_data.merge(sprint, on='player_name', how='left')
    
    return hits_data

def engineer_features(df):
    """
    Create features that might predict whether a single could have been a double
    """
    df = df.copy()
    
    # Create target variable (1 if double, 0 if single)
    df['is_double'] = (df['events'] == 'double').astype(int)
    
    # Hit angle features
    df['hit_angle'] = calculate_hit_angle(df['hc_x'], df['hc_y'])
    df['abs_hit_angle'] = abs(df['hit_angle'])
    
    # Categorize hit direction
    df['hit_direction'] = pd.cut(df['hit_angle'], 
                                bins=[-180, -30, 30, 180], 
                                labels=['left_field', 'center_field', 'right_field'])
    
    # Distance and speed features
    df['exit_velocity_squared'] = df['launch_speed'] ** 2
    df['launch_angle_abs'] = abs(df['launch_angle'])
    
    # Situational features
    df['runners_on_base'] = (~df['on_1b'].isna()).astype(int) + \
                           (~df['on_2b'].isna()).astype(int) + \
                           (~df['on_3b'].isna()).astype(int)
    
    df['late_inning'] = (df['inning'] >= 7).astype(int)
    df['close_game'] = (abs(df['bat_score'] - df['fld_score']) <= 2).astype(int)
    
    # Fill missing sprint speed with median
    median_sprint = df['sprint_speed'].median()
    df['sprint_speed'] = df['sprint_speed'].fillna(median_sprint)
    
    # Speed tier
    df['speed_tier'] = pd.cut(df['sprint_speed'], 
                             bins=[0, 26, 28, 35], 
                             labels=['slow', 'average', 'fast'])
    
    # Ball type features
    df['is_line_drive'] = (df['bb_type'] == 'line_drive').astype(int)
    df['is_ground_ball'] = (df['bb_type'] == 'ground_ball').astype(int)
    
    return df

def assess_training_data_adequacy(df):
    """
    Assess whether we have enough training data for reliable modeling
    """
    print("="*50)
    print("TRAINING DATA ASSESSMENT")
    print("="*50)
    
    # Basic counts
    total_hits = len(df)
    singles_count = len(df[df['events'] == 'single'])
    doubles_count = len(df[df['events'] == 'double'])
    
    print(f"Total hits: {total_hits:,}")
    print(f"Singles: {singles_count:,}")
    print(f"Doubles: {doubles_count:,}")
    print(f"Singles/Doubles ratio: {singles_count/doubles_count:.1f}:1")
    
    # Class balance assessment
    if doubles_count / total_hits < 0.1:
        print("⚠️  WARNING: Very few doubles in dataset - may lead to poor model performance")
    elif doubles_count / total_hits < 0.2:
        print("⚠️  CAUTION: Relatively few doubles - consider expanding date range")
    else:
        print("✅ Good class balance for modeling")
    
    # Sample size guidelines
    print(f"\nSample Size Assessment:")
    if total_hits < 1000:
        print("❌ INSUFFICIENT: Need at least 1,000 hits for basic modeling")
        recommended_days = int((1000 / total_hits) * 30) if total_hits > 0 else 90
        print(f"   Recommendation: Expand to ~{recommended_days} days of data")
    elif total_hits < 5000:
        print("⚠️  MINIMAL: 1K-5K hits may work but results less reliable")
        print("   Recommendation: Expand to 60-90 days for better results")
    elif total_hits < 15000:
        print("✅ ADEQUATE: Good sample size for initial modeling")
    else:
        print("✅ EXCELLENT: Large sample size should provide reliable results")
    
    # Feature completeness
    key_features = ['launch_speed', 'launch_angle', 'hit_distance_sc', 'sprint_speed']
    print(f"\nFeature Completeness:")
    
    for feature in key_features:
        if feature in df.columns:
            missing_pct = df[feature].isna().mean() * 100
            if missing_pct < 10:
                status = "✅"
            elif missing_pct < 25:
                status = "⚠️ "
            else:
                status = "❌"
            print(f"  {status} {feature}: {missing_pct:.1f}% missing")
        else:
            print(f"  ❌ {feature}: Column not found")
    
    # Unique players
    unique_players = df['player_name'].nunique() if 'player_name' in df.columns else 0
    print(f"\nUnique players: {unique_players}")
    if unique_players < 100:
        print("⚠️  Limited player diversity - consider expanding date range")
    else:
        print("✅ Good player diversity")
    
    return {
        'total_hits': total_hits,
        'singles_count': singles_count,
        'doubles_count': doubles_count,
        'unique_players': unique_players,
        'adequate_sample': total_hits >= 1000,
        'good_balance': doubles_count / total_hits >= 0.15
    }

def build_stretch_probability_model(df):
    """
    Build a model to predict the probability that a single could have been stretched to a double
    """
    # Assess training data first
    data_assessment = assess_training_data_adequacy(df)
    
    if not data_assessment['adequate_sample']:
        print("\n❌ Insufficient training data. Model may not be reliable.")
        print("Consider expanding your date range before proceeding.")
        return None, None, None
    
    # Feature columns for modeling
    feature_cols = [
        'launch_speed', 'launch_angle', 'hit_distance_sc', 'sprint_speed',
        'hit_angle', 'abs_hit_angle', 'exit_velocity_squared', 'launch_angle_abs',
        'runners_on_base', 'outs_when_up', 'late_inning', 'close_game',
        'is_line_drive', 'is_ground_ball'
    ]
    
    # Get available features
    available_features = [col for col in feature_cols if col in df.columns]
    
    # Prepare data
    model_data = df[available_features + ['is_double']].dropna()
    
    if len(model_data) == 0:
        print("No data available for modeling after removing missing values")
        return None, None, None
    
    print(f"\nModel training data: {len(model_data):,} hits after removing missing values")
    
    X = model_data[available_features]
    y = model_data['is_double']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    print(f"Training set: {len(X_train):,} hits")
    print(f"Test set: {len(X_test):,} hits")
    
    # Train Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    rf_model.fit(X_train, y_train)
    
    # Evaluate model
    y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
    y_pred = rf_model.predict(X_test)
    auc_score = roc_auc_score(y_test, y_pred_proba)
    
    print(f"\n📊 MODEL PERFORMANCE:")
    print(f"AUC Score: {auc_score:.3f}")
    
    # Performance interpretation
    if auc_score < 0.6:
        print("❌ Poor model performance - may not be useful for predictions")
    elif auc_score < 0.7:
        print("⚠️  Fair model performance - use predictions cautiously")
    elif auc_score < 0.8:
        print("✅ Good model performance - predictions should be reliable")
    else:
        print("✅ Excellent model performance - high confidence in predictions")
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': available_features,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Learning curve assessment
    train_scores = []
    test_scores = []
    sample_sizes = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
    
    for size in sample_sizes:
        if size == 1.0:
            X_temp, y_temp = X_train, y_train
        else:
            X_temp, _, y_temp, _ = train_test_split(X_train, y_train, train_size=size, random_state=42, stratify=y_train)
        
        temp_model = RandomForestClassifier(n_estimators=50, random_state=42, class_weight='balanced')
        temp_model.fit(X_temp, y_temp)
        
        train_pred = temp_model.predict_proba(X_temp)[:, 1]
        test_pred = temp_model.predict_proba(X_test)[:, 1]
        
        train_scores.append(roc_auc_score(y_temp, train_pred))
        test_scores.append(roc_auc_score(y_test, test_pred))
    
    # Check if more data would help
    if len(train_scores) >= 2:
        recent_improvement = test_scores[-1] - test_scores[-2]
        if recent_improvement > 0.01:
            print("📈 Model still improving with more data - consider expanding dataset")
        else:
            print("📊 Model performance plateauing - current data size likely sufficient")
    
    return rf_model, feature_importance, available_features

def analyze_stretchable_singles(df, model, features):
    """
    Analyze singles that could potentially have been doubles
    """
    # Filter to singles only
    singles = df[df['events'] == 'single'].copy()
    
    if model is None or len(singles) == 0:
        print("No singles data or model available for analysis")
        return pd.DataFrame()
    
    # Get available features for prediction
    available_features = [f for f in features if f in singles.columns]
    singles_features = singles[available_features].fillna(singles[available_features].median())
    
    # Predict probability of being a double
    singles['stretch_probability'] = model.predict_proba(singles_features)[:, 1]
    
    # Add stretch potential categories
    singles['stretch_potential'] = pd.cut(singles['stretch_probability'],
                                        bins=[0, 0.3, 0.6, 1.0],
                                        labels=['Low', 'Medium', 'High'])
    
    return singles

def generate_insights(singles_analysis):
    """
    Generate insights from the stretchable singles analysis
    """
    if len(singles_analysis) == 0:
        return "No data available for analysis"
    
    insights = []
    
    # Overall statistics
    total_singles = len(singles_analysis)
    high_stretch = len(singles_analysis[singles_analysis['stretch_potential'] == 'High'])
    medium_stretch = len(singles_analysis[singles_analysis['stretch_potential'] == 'Medium'])
    
    insights.append(f"Total Singles Analyzed: {total_singles}")
    insights.append(f"High Stretch Potential: {high_stretch} ({high_stretch/total_singles*100:.1f}%)")
    insights.append(f"Medium Stretch Potential: {medium_stretch} ({medium_stretch/total_singles*100:.1f}%)")
    
    # Top players with stretchable singles
    if 'player_name' in singles_analysis.columns:
        top_players = singles_analysis[singles_analysis['stretch_potential'] == 'High'].groupby('player_name').size().sort_values(ascending=False).head(5)
        insights.append("\nTop Players with High-Stretch Singles:")
        for player, count in top_players.items():
            insights.append(f"  {player}: {count} singles")
    
    # Speed analysis
    if 'sprint_speed' in singles_analysis.columns:
        avg_speed_high = singles_analysis[singles_analysis['stretch_potential'] == 'High']['sprint_speed'].mean()
        avg_speed_low = singles_analysis[singles_analysis['stretch_potential'] == 'Low']['sprint_speed'].mean()
        insights.append(f"\nAverage Sprint Speed:")
        insights.append(f"  High Stretch Potential: {avg_speed_high:.1f} ft/s")
        insights.append(f"  Low Stretch Potential: {avg_speed_low:.1f} ft/s")
    
    return "\n".join(insights)

# Main analysis function
def run_singles_analysis(start_date="2024-04-01", end_date="2024-04-30"):
    """
    Run the complete singles stretching analysis
    """
    try:
        # Prepare data
        print("Step 1: Preparing training data...")
        hits_data = prepare_training_data(start_date, end_date)
        
        print("Step 2: Engineering features...")
        hits_with_features = engineer_features(hits_data)
        
        print("Step 3: Building stretch probability model...")
        model, feature_importance, features = build_stretch_probability_model(hits_with_features)
        
        if model is not None:
            print("\nFeature Importance:")
            print(feature_importance.head(10))
        
        print("Step 4: Analyzing stretchable singles...")
        singles_analysis = analyze_stretchable_singles(hits_with_features, model, features)
        
        print("Step 5: Generating insights...")
        insights = generate_insights(singles_analysis)
        print("\n" + "="*50)
        print("ANALYSIS RESULTS")
        print("="*50)
        print(insights)
        
        return {
            'model': model,
            'feature_importance': feature_importance,
            'singles_analysis': singles_analysis,
            'insights': insights
        }
        
    except Exception as e:
        print(f"Error in analysis: {e}")
        return None

# Example usage
if __name__ == "__main__":
    # Run the analysis
    results = run_singles_analysis("2024-04-01", "2024-04-30")
    
    # Access results
    if results:
        singles_with_stretch = results['singles_analysis']
        
        # Show top stretchable singles
        if len(singles_with_stretch) > 0:
            print("\nTop 10 Most Stretchable Singles:")
            top_stretch = singles_with_stretch.nlargest(10, 'stretch_probability')[
                ['player_name', 'launch_speed', 'sprint_speed', 'hit_distance_sc', 'stretch_probability']
            ]
            print(top_stretch.to_string(index=False))

Step 1: Preparing training data...
Loading Statcast data from 2024-04-01 to 2024-04-30...
This is a large query, it may take a moment to complete


100%|██████████| 30/30 [00:00<00:00, 73.48it/s]


Loading sprint speed data...
Step 2: Engineering features...
Step 3: Building stretch probability model...
TRAINING DATA ASSESSMENT
Total hits: 5,442
Singles: 4,216
Doubles: 1,226
Singles/Doubles ratio: 3.4:1
✅ Good class balance for modeling

Sample Size Assessment:
✅ ADEQUATE: Good sample size for initial modeling

Feature Completeness:
  ✅ launch_speed: 0.1% missing
  ✅ launch_angle: 0.1% missing
  ✅ hit_distance_sc: 0.2% missing
  ✅ sprint_speed: 0.0% missing

Unique players: 510
✅ Good player diversity

Model training data: 5,427 hits after removing missing values
Training set: 4,341 hits
Test set: 1,086 hits

📊 MODEL PERFORMANCE:
AUC Score: 0.944
✅ Excellent model performance - high confidence in predictions
📊 Model performance plateauing - current data size likely sufficient

Feature Importance:
                  feature  importance
5           abs_hit_angle    0.261215
2         hit_distance_sc    0.222513
4               hit_angle    0.129275
0            launch_speed    0.088

In [33]:
import pandas as pd
import numpy as np
from pybaseball import statcast, statcast_sprint_speed, cache, playerid_lookup, playerid_reverse_lookup
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns


def debug_sprint_speed_data():
    """
    Debug function to understand the sprint speed data structure
    """
    try:
        sprint = statcast_sprint_speed(2024)
        print("="*50)
        print("SPRINT SPEED DATA DEBUG")
        print("="*50)
        print(f"Columns: {list(sprint.columns)}")
        print(f"Shape: {sprint.shape}")
        print(f"Data types:")
        print(sprint.dtypes)
        print("\nFirst 5 rows:")
        print(sprint.head())
        
        # Check for different name formats
        name_cols = [col for col in sprint.columns if 'name' in col.lower()]
        print(f"\nName columns found: {name_cols}")
        
        if 'player_id' in sprint.columns:
            print(f"\nPlayer IDs sample: {sprint['player_id'].head().tolist()}")
        
        return sprint
    except Exception as e:
        print(f"Error loading sprint speed data: {e}")
        return None

# Add this function call before the main analysis

def get_batter_names(data):
    """
    Convert batter IDs to batter names using reverse lookup
    """
    print("Converting batter IDs to names...")
    
    # Get unique batter IDs
    unique_batters = data['batter'].dropna().unique()
    
    # Create a dictionary to store batter ID -> name mappings
    batter_names = {}
    
    for batter_id in unique_batters:
        try:
            # Use reverse lookup to get player info from ID
            player_info = playerid_reverse_lookup([int(batter_id)], key_type='mlbam')
            if len(player_info) > 0:
                # Get first and last name
                first_name = player_info.iloc[0]['name_first']
                last_name = player_info.iloc[0]['name_last']
                full_name = f"{first_name} {last_name}"
                batter_names[batter_id] = full_name
            else:
                batter_names[batter_id] = f"Unknown Player {batter_id}"
        except Exception as e:
            print(f"Could not lookup batter ID {batter_id}: {e}")
            batter_names[batter_id] = f"Unknown Player {batter_id}"
    
    # Map batter names to the data
    data['batter_name'] = data['batter'].map(batter_names)
    
    print(f"Successfully mapped {len(batter_names)} batter names")
    return data

def estimate_sprint_speed_from_physical_attributes(hits_data):
    """
    Estimate sprint speed based on player physical attributes (height/weight) when actual data unavailable
    Uses general athletic performance correlations
    """
    from pybaseball import playerid_lookup
    
    print("Attempting to get player physical attributes for sprint speed estimation...")
    unique_players = hits_data['batter_name'].unique()  # Changed from player_name to batter_name
    
    # Try to get player physical data
    player_attributes = []
    
    for player_name in unique_players:
        try:
            # Split name for lookup
            name_parts = player_name.split()
            if len(name_parts) >= 2:
                first_name = name_parts[0]
                last_name = " ".join(name_parts[1:])
                
                # Look up player
                player_info = playerid_lookup(last_name, first_name)
                
                if len(player_info) > 0:
                    # Get the most recent entry
                    player = player_info.iloc[0]
                    
                    # Extract height and weight if available
                    height = getattr(player, 'height', None) if hasattr(player, 'height') else None
                    weight = getattr(player, 'weight', None) if hasattr(player, 'weight') else None
                    
                    # Convert height to inches if it's in feet-inches format
                    height_inches = None
                    if height and isinstance(height, str):
                        try:
                            if "'" in height or "-" in height:
                                # Format like "6'2" or "6-2"
                                height_clean = height.replace("'", "-").replace('"', '')
                                if "-" in height_clean:
                                    feet, inches = height_clean.split("-")
                                    height_inches = int(feet) * 12 + int(inches)
                                else:
                                    height_inches = int(height_clean)
                            else:
                                height_inches = int(height)
                        except:
                            height_inches = None
                    elif height and isinstance(height, (int, float)):
                        height_inches = height
                    
                    player_attributes.append({
                        'batter_name': player_name,  # Changed from player_name to batter_name
                        'height_inches': height_inches,
                        'weight': weight
                    })
                else:
                    player_attributes.append({
                        'batter_name': player_name,  # Changed from player_name to batter_name
                        'height_inches': None,
                        'weight': None
                    })
            else:
                player_attributes.append({
                    'batter_name': player_name,  # Changed from player_name to batter_name
                    'height_inches': None,
                    'weight': None
                })
        except Exception as e:
            # If lookup fails, add with null values
            player_attributes.append({
                'batter_name': player_name,  # Changed from player_name to batter_name
                'height_inches': None,
                'weight': None
            })
    
    player_df = pd.DataFrame(player_attributes)
    
    # Estimate sprint speed based on physical attributes
    def estimate_speed(row):
        height = row['height_inches']
        weight = row['weight']
        
        # MLB average sprint speed is approximately 27 ft/s
        base_speed = 27.0
        
        if pd.isna(height) and pd.isna(weight):
            # No data available, return MLB average
            return base_speed
        
        # Height factor: Taller players tend to be slightly faster due to longer strides
        # But very tall players may be slower. Optimal around 6'0-6'2 (72-74 inches)
        if not pd.isna(height):
            if height <= 66:  # Under 5'6"
                height_factor = -0.5
            elif height <= 70:  # 5'6" to 5'10"
                height_factor = 0.3
            elif height <= 74:  # 5'10" to 6'2"
                height_factor = 0.8
            elif height <= 78:  # 6'2" to 6'6"
                height_factor = 0.2
            else:  # Over 6'6"
                height_factor = -0.8
        else:
            height_factor = 0
        
        # Weight factor: Generally, lighter players are faster
        # But need enough muscle mass. Optimal around 180-200 lbs for speed
        if not pd.isna(weight):
            if weight <= 170:
                weight_factor = 0.5
            elif weight <= 190:
                weight_factor = 1.0
            elif weight <= 210:
                weight_factor = 0.3
            elif weight <= 230:
                weight_factor = -0.5
            else:
                weight_factor = -1.2
        else:
            weight_factor = 0
        
        # BMI consideration if both height and weight available
        bmi_factor = 0
        if not pd.isna(height) and not pd.isna(weight):
            bmi = (weight / (height ** 2)) * 703  # BMI formula
            if bmi <= 22:
                bmi_factor = 0.3
            elif bmi <= 25:
                bmi_factor = 0.1
            elif bmi <= 28:
                bmi_factor = -0.2
            else:
                bmi_factor = -0.8
        
        # Combine factors
        estimated_speed = base_speed + height_factor + weight_factor + bmi_factor
        
        # Add some realistic variation
        estimated_speed += np.random.normal(0, 0.5)
        
        # Keep within reasonable bounds (22-32 ft/s for MLB players)
        estimated_speed = np.clip(estimated_speed, 22, 32)
        
        return round(estimated_speed, 1)
    
    player_df['sprint_speed'] = player_df.apply(estimate_speed, axis=1)
    
    print(f"Estimated sprint speeds for {len(player_df)} players")
    print(f"Average estimated speed: {player_df['sprint_speed'].mean():.1f} ft/s")
    
    return player_df[['batter_name', 'sprint_speed']]  # Changed from player_name to batter_name

def get_existing_columns(df, desired_columns):
    """Helper function to get only existing columns from a list"""
    return [col for col in desired_columns if col in df.columns]

def calculate_hit_angle(hc_x, hc_y):
    """
    Calculate hit angle from home plate coordinates
    hc_x, hc_y are Statcast coordinates where (125.42, 199.33) is home plate
    """
    # Adjust coordinates relative to home plate
    x_adj = hc_x - 125.42
    y_adj = hc_y - 199.33
    
    # Calculate angle in degrees (0 = straight up middle, negative = left field, positive = right field)
    angle = np.degrees(np.arctan2(x_adj, y_adj))
    return angle

def calculate_fielder_distance(hit_x, hit_y, fielder_x, fielder_y):
    """Calculate distance between hit location and fielder position"""
    if pd.isna(fielder_x) or pd.isna(fielder_y):
        return np.nan
    return np.sqrt((hit_x - fielder_x)**2 + (hit_y - fielder_y)**2)

def prepare_training_data(start_date, end_date):
    """
    Prepare training data by getting both singles and doubles to train the model
    """
    print(f"Loading Statcast data from {start_date} to {end_date}...")
    data = statcast(start_dt=start_date, end_dt=end_date)
    
    # Convert batter IDs to names FIRST
    data = get_batter_names(data)
    
    # Filter to singles and doubles only
    hits = data[data['events'].isin(['single', 'double'])].copy()
    
    # Keep useful columns - note we now use 'batter_name' instead of 'player_name'
    columns = [
        'batter_name', 'batter', 'events', 'launch_speed', 'launch_angle',
        'hit_distance_sc', 'hc_x', 'hc_y', 'hit_location', 'bb_type', 'spin_rate',
        'release_speed', 'pitch_type', 'stand',
        'fielder_7', 'fielder_8', 'fielder_9',
        'inning', 'inning_topbot', 'outs_when_up', 'balls', 'strikes',
        'bat_score', 'fld_score',
        'on_1b', 'on_2b', 'on_3b',
        'home_team', 'away_team',
        'day_night', 'venue_id', 'game_date', 'game_type',
        'weather_temp', 'weather_wind', 'temp', 'wind_speed', 'wind_direction'
    ]
    
    existing_cols = get_existing_columns(hits, columns)
    hits_data = hits[existing_cols].copy()
    
    # Load sprint speed data
    print("Loading sprint speed data...")
    try:
        sprint = statcast_sprint_speed(2024)
        
        # Debug: Print column names and sample data
        print(f"Sprint speed data columns: {list(sprint.columns)}")
        print(f"Sprint speed data shape: {sprint.shape}")
        if len(sprint) > 0:
            print("Sample sprint speed data:")
            print(sprint.head())
        
        # Try different approaches to match names
        sprint_processed = None
        
        # Approach 1: Check for 'last_name, first_name' format
        if 'last_name, first_name' in sprint.columns:
            print("Using 'last_name, first_name' format")
            def convert_name_format(name_str):
                if pd.isna(name_str) or ',' not in str(name_str):
                    return name_str
                parts = str(name_str).split(', ')
                if len(parts) == 2:
                    return f"{parts[1].strip()} {parts[0].strip()}"
                return name_str
            
            sprint['batter_name'] = sprint['last_name, first_name'].apply(convert_name_format)
            sprint_processed = sprint[['batter_name', 'sprint_speed']].copy()
        
        # Approach 2: Use player_id to match with our batter IDs
        elif 'player_id' in sprint.columns and 'batter' in hits_data.columns:
            print("Using player_id matching approach")
            # Merge by player ID instead of name
            sprint_processed = sprint[['player_id', 'sprint_speed']].rename(columns={'player_id': 'batter'})
            # This will be merged on 'batter' field instead of 'batter_name'
        
        # Approach 3: Look for any name columns
        else:
            name_cols = [col for col in sprint.columns if 'name' in col.lower()]
            print(f"Found name columns: {name_cols}")
            if name_cols:
                sprint_processed = sprint[[name_cols[0], 'sprint_speed']].rename(
                    columns={name_cols[0]: 'batter_name'})
        
        # If we have processed sprint data, try to merge it
        if sprint_processed is not None:
            if 'batter_name' in sprint_processed.columns:
                print(f"Merging sprint speed by batter_name. Sprint data has {len(sprint_processed)} records")
                hits_data = hits_data.merge(sprint_processed, on='batter_name', how='left')
            elif 'batter' in sprint_processed.columns:
                print(f"Merging sprint speed by batter ID. Sprint data has {len(sprint_processed)} records")
                hits_data = hits_data.merge(sprint_processed, on='batter', how='left')
            
            # Check merge success
            sprint_matches = hits_data['sprint_speed'].notna().sum()
            total_hits = len(hits_data)
            match_rate = sprint_matches / total_hits * 100
            print(f"Sprint speed merge success: {sprint_matches}/{total_hits} ({match_rate:.1f}%)")
            
            if match_rate < 10:
                print("⚠️ Low sprint speed match rate. Falling back to estimation.")
                raise Exception("Low match rate")
        else:
            raise Exception("Could not process sprint speed data format")
            
    except Exception as e:
        print(f"Error loading/processing sprint speed: {e}")
        print("Falling back to estimated sprint speed based on player physical attributes")
        # Create estimated sprint speed data based on player physical attributes
        estimated_sprint = estimate_sprint_speed_from_physical_attributes(hits_data)
        hits_data = hits_data.merge(estimated_sprint, on='batter_name', how='left')
        print(f"Using estimated sprint speeds for {hits_data['sprint_speed'].notna().sum()} players")
    
    # Merge with sprint speed - this section is now handled above in the try/except block
    # hits_data = hits_data.merge(sprint, on='batter_name', how='left')
    
    return hits_data

def engineer_features(df):
    """
    Create features that might predict whether a single could have been a double
    """
    df = df.copy()
    
    # Create target variable (1 if double, 0 if single)
    df['is_double'] = (df['events'] == 'double').astype(int)
    
    # Hit angle features
    df['hit_angle'] = calculate_hit_angle(df['hc_x'], df['hc_y'])
    df['abs_hit_angle'] = abs(df['hit_angle'])
    
    # Categorize hit direction
    df['hit_direction'] = pd.cut(df['hit_angle'], 
                                bins=[-180, -30, 30, 180], 
                                labels=['left_field', 'center_field', 'right_field'])
    
    # Distance and speed features
    df['exit_velocity_squared'] = df['launch_speed'] ** 2
    df['launch_angle_abs'] = abs(df['launch_angle'])
    
    # Situational features
    df['runners_on_base'] = (~df['on_1b'].isna()).astype(int) + \
                           (~df['on_2b'].isna()).astype(int) + \
                           (~df['on_3b'].isna()).astype(int)
    
    df['late_inning'] = (df['inning'] >= 7).astype(int)
    df['close_game'] = (abs(df['bat_score'] - df['fld_score']) <= 2).astype(int)
    
    # Fill missing sprint speed with median
    median_sprint = df['sprint_speed'].median()
    df['sprint_speed'] = df['sprint_speed'].fillna(median_sprint)
    
    # Speed tier
    df['speed_tier'] = pd.cut(df['sprint_speed'], 
                             bins=[0, 26, 28, 35], 
                             labels=['slow', 'average', 'fast'])
    
    # Ball type features
    df['is_line_drive'] = (df['bb_type'] == 'line_drive').astype(int)
    df['is_ground_ball'] = (df['bb_type'] == 'ground_ball').astype(int)
    
    return df

def assess_training_data_adequacy(df):
    """
    Assess whether we have enough training data for reliable modeling
    """
    print("="*50)
    print("TRAINING DATA ASSESSMENT")
    print("="*50)
    
    # Basic counts
    total_hits = len(df)
    singles_count = len(df[df['events'] == 'single'])
    doubles_count = len(df[df['events'] == 'double'])
    
    print(f"Total hits: {total_hits:,}")
    print(f"Singles: {singles_count:,}")
    print(f"Doubles: {doubles_count:,}")
    print(f"Singles/Doubles ratio: {singles_count/doubles_count:.1f}:1")
    
    # Class balance assessment
    if doubles_count / total_hits < 0.1:
        print("⚠️  WARNING: Very few doubles in dataset - may lead to poor model performance")
    elif doubles_count / total_hits < 0.2:
        print("⚠️  CAUTION: Relatively few doubles - consider expanding date range")
    else:
        print("✅ Good class balance for modeling")
    
    # Sample size guidelines
    print(f"\nSample Size Assessment:")
    if total_hits < 1000:
        print("❌ INSUFFICIENT: Need at least 1,000 hits for basic modeling")
        recommended_days = int((1000 / total_hits) * 30) if total_hits > 0 else 90
        print(f"   Recommendation: Expand to ~{recommended_days} days of data")
    elif total_hits < 5000:
        print("⚠️  MINIMAL: 1K-5K hits may work but results less reliable")
        print("   Recommendation: Expand to 60-90 days for better results")
    elif total_hits < 15000:
        print("✅ ADEQUATE: Good sample size for initial modeling")
    else:
        print("✅ EXCELLENT: Large sample size should provide reliable results")
    
    # Feature completeness
    key_features = ['launch_speed', 'launch_angle', 'hit_distance_sc', 'sprint_speed']
    print(f"\nFeature Completeness:")
    
    for feature in key_features:
        if feature in df.columns:
            missing_pct = df[feature].isna().mean() * 100
            if missing_pct < 10:
                status = "✅"
            elif missing_pct < 25:
                status = "⚠️ "
            else:
                status = "❌"
            print(f"  {status} {feature}: {missing_pct:.1f}% missing")
        else:
            print(f"  ❌ {feature}: Column not found")
    
    # Unique players
    unique_players = df['batter_name'].nunique() if 'batter_name' in df.columns else 0
    print(f"\nUnique batters: {unique_players}")
    if unique_players < 100:
        print("⚠️  Limited batter diversity - consider expanding date range")
    else:
        print("✅ Good batter diversity")
    
    return {
        'total_hits': total_hits,
        'singles_count': singles_count,
        'doubles_count': doubles_count,
        'unique_players': unique_players,
        'adequate_sample': total_hits >= 1000,
        'good_balance': doubles_count / total_hits >= 0.15
    }

def build_stretch_probability_model(df):
    """
    Build a model to predict the probability that a single could have been stretched to a double
    """
    # Assess training data first
    data_assessment = assess_training_data_adequacy(df)
    
    if not data_assessment['adequate_sample']:
        print("\n❌ Insufficient training data. Model may not be reliable.")
        print("Consider expanding your date range before proceeding.")
        return None, None, None
    
    # Feature columns for modeling
    feature_cols = [
        'launch_speed', 'launch_angle', 'hit_distance_sc', 'sprint_speed',
        'hit_angle', 'abs_hit_angle', 'exit_velocity_squared', 'launch_angle_abs',
        'runners_on_base', 'outs_when_up', 'late_inning', 'close_game',
        'is_line_drive', 'is_ground_ball'
    ]
    
    # Get available features
    available_features = [col for col in feature_cols if col in df.columns]
    
    # Prepare data
    model_data = df[available_features + ['is_double']].dropna()
    
    if len(model_data) == 0:
        print("No data available for modeling after removing missing values")
        return None, None, None
    
    print(f"\nModel training data: {len(model_data):,} hits after removing missing values")
    
    X = model_data[available_features]
    y = model_data['is_double']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    print(f"Training set: {len(X_train):,} hits")
    print(f"Test set: {len(X_test):,} hits")
    
    # Train Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    rf_model.fit(X_train, y_train)
    
    # Evaluate model
    y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
    y_pred = rf_model.predict(X_test)
    auc_score = roc_auc_score(y_test, y_pred_proba)
    
    print(f"\n📊 MODEL PERFORMANCE:")
    print(f"AUC Score: {auc_score:.3f}")
    
    # Performance interpretation
    if auc_score < 0.6:
        print("❌ Poor model performance - may not be useful for predictions")
    elif auc_score < 0.7:
        print("⚠️  Fair model performance - use predictions cautiously")
    elif auc_score < 0.8:
        print("✅ Good model performance - predictions should be reliable")
    else:
        print("✅ Excellent model performance - high confidence in predictions")
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': available_features,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Learning curve assessment
    train_scores = []
    test_scores = []
    sample_sizes = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
    
    for size in sample_sizes:
        if size == 1.0:
            X_temp, y_temp = X_train, y_train
        else:
            X_temp, _, y_temp, _ = train_test_split(X_train, y_train, train_size=size, random_state=42, stratify=y_train)
        
        temp_model = RandomForestClassifier(n_estimators=50, random_state=42, class_weight='balanced')
        temp_model.fit(X_temp, y_temp)
        
        train_pred = temp_model.predict_proba(X_temp)[:, 1]
        test_pred = temp_model.predict_proba(X_test)[:, 1]
        
        train_scores.append(roc_auc_score(y_temp, train_pred))
        test_scores.append(roc_auc_score(y_test, test_pred))
    
    # Check if more data would help
    if len(train_scores) >= 2:
        recent_improvement = test_scores[-1] - test_scores[-2]
        if recent_improvement > 0.01:
            print("📈 Model still improving with more data - consider expanding dataset")
        else:
            print("📊 Model performance plateauing - current data size likely sufficient")
    
    return rf_model, feature_importance, available_features

def analyze_stretchable_singles(df, model, features):
    """
    Analyze singles that could potentially have been doubles
    """
    # Filter to singles only
    singles = df[df['events'] == 'single'].copy()
    
    if model is None or len(singles) == 0:
        print("No singles data or model available for analysis")
        return pd.DataFrame()
    
    # Get available features for prediction
    available_features = [f for f in features if f in singles.columns]
    singles_features = singles[available_features].fillna(singles[available_features].median())
    
    # Predict probability of being a double
    singles['stretch_probability'] = model.predict_proba(singles_features)[:, 1]
    
    # Add stretch potential categories
    singles['stretch_potential'] = pd.cut(singles['stretch_probability'],
                                        bins=[0, 0.3, 0.6, 1.0],
                                        labels=['Low', 'Medium', 'High'])
    
    return singles

def generate_insights(singles_analysis):
    """
    Generate insights from the stretchable singles analysis
    """
    if len(singles_analysis) == 0:
        return "No data available for analysis"
    
    insights = []
    
    # Overall statistics
    total_singles = len(singles_analysis)
    high_stretch = len(singles_analysis[singles_analysis['stretch_potential'] == 'High'])
    medium_stretch = len(singles_analysis[singles_analysis['stretch_potential'] == 'Medium'])
    
    insights.append(f"Total Singles Analyzed: {total_singles}")
    insights.append(f"High Stretch Potential: {high_stretch} ({high_stretch/total_singles*100:.1f}%)")
    insights.append(f"Medium Stretch Potential: {medium_stretch} ({medium_stretch/total_singles*100:.1f}%)")
    
    # Top players with stretchable singles
    if 'batter_name' in singles_analysis.columns:  # Changed from player_name to batter_name
        top_players = singles_analysis[singles_analysis['stretch_potential'] == 'High'].groupby('batter_name').size().sort_values(ascending=False).head(5)
        insights.append("\nTop Batters with High-Stretch Singles:")
        for player, count in top_players.items():
            insights.append(f"  {player}: {count} singles")
    
    # Speed analysis
    if 'sprint_speed' in singles_analysis.columns:
        avg_speed_high = singles_analysis[singles_analysis['stretch_potential'] == 'High']['sprint_speed'].mean()
        avg_speed_low = singles_analysis[singles_analysis['stretch_potential'] == 'Low']['sprint_speed'].mean()
        insights.append(f"\nAverage Sprint Speed:")
        insights.append(f"  High Stretch Potential: {avg_speed_high:.1f} ft/s")
        insights.append(f"  Low Stretch Potential: {avg_speed_low:.1f} ft/s")
    
    return "\n".join(insights)

# Main analysis function
def run_singles_analysis(start_date="2024-04-01", end_date="2024-04-30"):
    """
    Run the complete singles stretching analysis
    """
    try:
        # Prepare data
        print("Step 1: Preparing training data...")
        hits_data = prepare_training_data(start_date, end_date)
        
        print("Step 2: Engineering features...")
        hits_with_features = engineer_features(hits_data)
        
        print("Step 3: Building stretch probability model...")
        model, feature_importance, features = build_stretch_probability_model(hits_with_features)
        
        if model is not None:
            print("\nFeature Importance:")
            print(feature_importance.head(10))
        
        print("Step 4: Analyzing stretchable singles...")
        singles_analysis = analyze_stretchable_singles(hits_with_features, model, features)
        
        print("Step 5: Generating insights...")
        insights = generate_insights(singles_analysis)
        print("\n" + "="*50)
        print("ANALYSIS RESULTS")
        print("="*50)
        print(insights)
        
        return {
            'model': model,
            'feature_importance': feature_importance,
            'singles_analysis': singles_analysis,
            'insights': insights
        }
        
    except Exception as e:
        print(f"Error in analysis: {e}")
        return None

# Example usage
if __name__ == "__main__":
    # Debug sprint speed data first
    print("Debugging sprint speed data structure...")
    debug_sprint_speed_data()
    
    # Run the analysis
    results = run_singles_analysis("2024-04-01", "2024-04-30")
    
    # Access results
    if results:
        singles_with_stretch = results['singles_analysis']
        
        # Show top stretchable singles
        if len(singles_with_stretch) > 0:
            print("\nTop 10 Most Stretchable Singles:")
            top_stretch = singles_with_stretch.nlargest(10, 'stretch_probability')[
                ['batter_name', 'launch_speed', 'sprint_speed', 'hit_distance_sc', 'stretch_probability']  # Changed from player_name to batter_name
            ]
            print(top_stretch.to_string(index=False))

Debugging sprint speed data structure...
SPRINT SPEED DATA DEBUG
Columns: ['last_name, first_name', 'player_id', 'team_id', 'team', 'position', 'age', 'competitive_runs', 'bolts', 'hp_to_1b', 'sprint_speed']
Shape: (566, 10)
Data types:
last_name, first_name     object
player_id                  int64
team_id                    int64
team                      object
position                  object
age                        int64
competitive_runs           int64
bolts                    float64
hp_to_1b                 float64
sprint_speed             float64
dtype: object

First 5 rows:
  last_name, first_name  player_id  team_id team position  age  \
0       Witt Jr., Bobby     677951      118   KC       SS   24   
1          Rojas, Johan     679032      143  PHI       CF   23   
2      De La Cruz, Elly     682829      113  CIN       SS   22   
3     Fitzgerald, Tyler     666149      137   SF       SS   26   
4        Clase, Jonatan     682729      141  TOR       LF   22   

   comp

100%|██████████| 30/30 [00:00<00:00, 64.42it/s]


Converting batter IDs to names...
Successfully mapped 460 batter names
Loading sprint speed data...
Sprint speed data columns: ['last_name, first_name', 'player_id', 'team_id', 'team', 'position', 'age', 'competitive_runs', 'bolts', 'hp_to_1b', 'sprint_speed']
Sprint speed data shape: (566, 10)
Sample sprint speed data:
  last_name, first_name  player_id  team_id team position  age  \
0       Witt Jr., Bobby     677951      118   KC       SS   24   
1          Rojas, Johan     679032      143  PHI       CF   23   
2      De La Cruz, Elly     682829      113  CIN       SS   22   
3     Fitzgerald, Tyler     666149      137   SF       SS   26   
4        Clase, Jonatan     682729      141  TOR       LF   22   

   competitive_runs  bolts  hp_to_1b  sprint_speed  
0               298  156.0      4.10          30.5  
1               176   78.0      4.24          30.1  
2               249   81.0      4.21          30.0  
3                99   47.0      4.30          30.0  
4               