In [10]:
# Analyzes correlations, statistical significance, and win percentages for NBA playoff series data

# --- Imports ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sp_stats  # Use distinct alias to avoid conflict with stats list

# --- Load and Preprocess Data ---
def load_and_preprocess_data(file_path):
    """Read the series CSV, keep rows with advanced stats, and add derived columns.

    Rows missing either ``Winner_OffRtg`` or ``Loser_OffRtg`` are dropped, and
    the shape of the remaining frame is printed.

    Args:
        file_path: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        A ``(frame, stats)`` tuple. ``frame`` carries one ``Diff_<stat>`` column
        (winner minus loser) per entry of ``stats``, plus ``Diff_Seed``
        (loser seed minus winner seed) and ``Series_Margin`` (winner wins
        minus loser wins). ``stats`` is the list of stat names used.
    """
    required_advanced = ['Winner_OffRtg', 'Loser_OffRtg']
    series = pd.read_csv(file_path).dropna(subset=required_advanced)
    print(f"Filtered DataFrame shape (post-1996-97): {series.shape}")

    # Stat suffixes; each must exist as both a Winner_ and a Loser_ column.
    stats = [
        "W", "L", "OffRtg", "DefRtg", "NetRtg", "AST%", "AST/TO", "AST Ratio",
        "OREB%", "DREB%", "REB%", "TOV%", "eFG%", "TS%", "PACE", "PIE", "POSS",
    ]

    # Winner-minus-loser gap for every stat, keyed by its output column name.
    stat_gaps = {
        f"Diff_{name}": series[f"Winner_{name}"] - series[f"Loser_{name}"]
        for name in stats
    }

    # Seed gap is loser minus winner, so a positive value means the winner
    # held the numerically lower seed; the margin is the win-count difference.
    series = series.assign(
        **stat_gaps,
        Diff_Seed=series["Loser_Seed"] - series["Winner_Seed"],
        Series_Margin=series["Winner_Wins"] - series["Loser_Wins"],
    )

    return series, stats

# --- Correlation and Statistical Significance Analysis ---
def _pearson_p_value(r, n_pairs):
    """Return the two-tailed p-value for a Pearson r computed from n_pairs points."""
    dof = n_pairs - 2
    if dof <= 0 or np.isnan(r):
        # Too few observations, or an undefined correlation (e.g. a constant column).
        return float("nan")
    if abs(r) >= 1.0:
        # Perfect correlation: the t-statistic is infinite, so p is exactly 0.
        return 0.0
    t_stat = r * np.sqrt(dof / (1.0 - r**2))
    # Use the survival function rather than 1 - cdf: for large |t| the
    # subtraction cancels to exactly 0.0 and hides very small p-values.
    return float(2.0 * sp_stats.t.sf(abs(t_stat), df=dof))


def calculate_correlations(master_df, stats):
    """Print and return Pearson correlations of stat differences with series outcomes.

    For each outcome column (``Winner_Wins`` and ``Series_Margin``) every
    ``Diff_<stat>`` column, plus ``Diff_Seed``, is correlated with the outcome.
    Results are printed sorted by absolute correlation, followed by the stat
    with the strongest correlation.

    Args:
        master_df: DataFrame containing ``Winner_Wins``, ``Series_Margin``,
            ``Diff_Seed`` and a ``Diff_<stat>`` column for each entry of ``stats``.
        stats: Iterable of stat names (without the ``Diff_`` prefix).

    Returns:
        dict mapping outcome name -> stat name -> ``{"r": float, "p": float}``.
        ``r`` and ``p`` are NaN when the correlation is undefined.
    """
    n = len(master_df)
    print(f"Sample size: {n}")

    stat_names = list(stats) + ["Seed"]
    results = {}

    for outcome_name in ("Winner_Wins", "Series_Margin"):
        outcome = master_df[outcome_name]
        print(f"\nCorrelation and Statistical Significance ({outcome_name}):")
        correlations = {}
        p_values = {}

        for stat in stat_names:
            diff = master_df[f"Diff_{stat}"]
            r = diff.corr(outcome, method="pearson")
            # Series.corr ignores rows where either value is missing, so the
            # degrees of freedom must come from the same pairwise-complete count.
            n_pairs = int((diff.notna() & outcome.notna()).sum())
            correlations[stat] = r
            p_values[stat] = _pearson_p_value(r, n_pairs)

        # Strongest correlations first; undefined (NaN) correlations go last
        # instead of disturbing the sort order.
        sorted_correlations = sorted(
            correlations.items(),
            key=lambda item: -1.0 if np.isnan(item[1]) else abs(item[1]),
            reverse=True,
        )

        for stat, r in sorted_correlations:
            p = p_values[stat]
            sig = "Yes" if p < 0.05 else "No"
            print(f"{stat}: r = {r:.4f}, p = {p:.4f}, Significant (p < 0.05)? {sig}")

        top_stat, top_r = sorted_correlations[0]
        print(f"\nStat with highest correlation: {top_stat} (r = {top_r:.4f}, p = {p_values[top_stat]:.4f})")

        results[outcome_name] = {
            stat: {"r": float(correlations[stat]), "p": p_values[stat]}
            for stat in stat_names
        }

    return results

# --- Higher Stat Wins Analysis ---
def higher_stat_wins(master_df, stats):
    """Print and return how often the team with the better stat won the series.

    For most stats "better" means the winner's value is strictly greater than
    the loser's. For ``DefRtg``, ``L``, ``TOV%`` and ``Seed`` the comparison is
    reversed (winner strictly lower). Ties count as not won. The input frame
    is not modified.

    Args:
        master_df: DataFrame with ``Winner_<stat>`` / ``Loser_<stat>`` columns
            for every entry of ``stats``, plus ``Winner_Seed`` / ``Loser_Seed``.
        stats: Iterable of stat names.

    Returns:
        dict mapping stat name -> ``{"Wins": int, "Percentage": float}``.
        The percentage is NaN when ``master_df`` has no rows.
    """
    lower_is_better = {"DefRtg", "L", "TOV%", "Seed"}
    total_series = len(master_df)
    results = {}

    for stat in stats:
        winner_vals = master_df[f"Winner_{stat}"]
        loser_vals = master_df[f"Loser_{stat}"]
        # Keep the comparison in a local Series so the caller's DataFrame is
        # not polluted with a scratch column.
        if stat in lower_is_better:
            favoured_won = winner_vals < loser_vals
        else:
            favoured_won = winner_vals > loser_vals

        wins = int(favoured_won.sum())
        pct = (wins / total_series) * 100 if total_series else float("nan")
        results[stat] = {"Wins": wins, "Percentage": pct}

    # Sort results by percentage
    sorted_results = sorted(results.items(), key=lambda x: x[1]["Percentage"], reverse=True)

    # Display results
    print("\nSeries won by team with the higher stat (1996-97 onward, sorted by percentage):")
    for stat, data in sorted_results:
        print(f"{stat}: {data['Wins']} wins, {data['Percentage']:.2f}%")

    # Higher seed wins specifically
    higher_seed_wins = int((master_df["Winner_Seed"] < master_df["Loser_Seed"]).sum())
    higher_seed_win_percentage = (
        (higher_seed_wins / total_series) * 100 if total_series else float("nan")
    )
    print(f"\nHigher seed wins: {higher_seed_wins} ({higher_seed_win_percentage:.2f}%)")

    return results

# --- Overlap and Series Margin Analysis ---
def _save_margin_scatter(master_df, diff_col, label, file_name, output_dir=None):
    """Scatter-plot one Diff_ column against Series_Margin and save it as a PNG."""
    import os  # local import: only needed when a custom output_dir is given

    if output_dir is None:
        # Default location, spelled exactly as the original hard-coded paths.
        png_path = r"C:\Users\jonla\NBA_Playoffs_Series_Predictor\Visualizations" + "\\" + file_name
    else:
        png_path = os.path.join(output_dir, file_name)

    plt.figure(figsize=(8, 6))
    try:
        plt.scatter(master_df[diff_col], master_df["Series_Margin"], alpha=0.5)
        plt.xlabel(f"{label} Difference")
        plt.ylabel("Series Margin")
        plt.title(f"{label} Difference vs. Series Margin")
        plt.grid(True)
        plt.savefig(png_path)
        print(f"{file_name} saved to: ", png_path)
    finally:
        # Close the figure even if saving fails so it does not stay open.
        plt.close()


def overlap_and_margin_analysis(master_df, output_dir=None):
    """Report overlap of NetRtg/PIE/Seed advantages and plot differences vs. margin.

    Prints how many series were won by a team that had the higher NetRtg, the
    higher PIE and the numerically lower seed all at once, then the mean
    ``Series_Margin`` of the series won by the team holding each individual
    advantage. Finally saves two scatter plots (``Diff_NetRtg`` and
    ``Diff_PIE`` against ``Series_Margin``).

    Args:
        master_df: DataFrame with Winner_/Loser_ columns for NetRtg, PIE and
            Seed, plus ``Diff_NetRtg``, ``Diff_PIE`` and ``Series_Margin``.
        output_dir: Directory for the PNG files. ``None`` (default) keeps the
            original hard-coded Visualizations directory.
    """
    total_series = len(master_df)

    # Boolean masks: True where the series winner held the advantage.
    advantage = {
        "NetRtg": master_df["Winner_NetRtg"] > master_df["Loser_NetRtg"],
        "PIE": master_df["Winner_PIE"] > master_df["Loser_PIE"],
        "Seed": master_df["Winner_Seed"] < master_df["Loser_Seed"],
    }

    overlap = int((advantage["NetRtg"] & advantage["PIE"] & advantage["Seed"]).sum())
    # Divide by the actual number of series rather than a hard-coded row count.
    overlap_pct = overlap / total_series * 100 if total_series else float("nan")
    print(f"\nNetRtg, PIE, and Seed all higher: {overlap} ({overlap_pct:.2f}%)")

    # Average Series Margin for key stats
    for stat in ["NetRtg", "PIE", "Seed"]:
        avg_margin = master_df[advantage[stat]]["Series_Margin"].mean()
        print(f"Avg Series Margin ({stat}): {avg_margin:.2f}")

    _save_margin_scatter(master_df, "Diff_NetRtg", "NetRtg",
                         "netrtg_vs_series_margin.png", output_dir)
    _save_margin_scatter(master_df, "Diff_PIE", "PIE",
                         "PIE_vs_series_margin.png", output_dir)

# --- Main Execution ---
def main():
    """Run the full pipeline: load the series CSV, then run each analysis in turn."""
    csv_path = r"C:\Users\jonla\NBA_Playoffs_Series_Predictor\Data\Master_DF.csv"

    # Build the filtered frame together with the list of stat names it covers.
    series_df, stat_names = load_and_preprocess_data(csv_path)

    # The two stat-driven analyses share a signature; the overlap analysis
    # only needs the frame.
    for analysis in (calculate_correlations, higher_stat_wins):
        analysis(series_df, stat_names)
    overlap_and_margin_analysis(series_df)

    # Persisting the augmented frame is currently disabled:
    #master_df.to_csv("master_df_with_differences.csv", index=False)
    #print("\nSaved updated DataFrame with stat differences to 'master_df_with_differences.csv'")


if __name__ == "__main__":
    main()

Filtered DataFrame shape (post-1996-97): (410, 52)
Sample size: 410

Correlation and Statistical Significance (Winner_Wins):
Seed: r = -0.1048, p = 0.0338, Significant (p < 0.05)? Yes
W: r = -0.0926, p = 0.0610, Significant (p < 0.05)? No
L: r = 0.0918, p = 0.0634, Significant (p < 0.05)? No
DefRtg: r = 0.0873, p = 0.0775, Significant (p < 0.05)? No
PIE: r = -0.0860, p = 0.0821, Significant (p < 0.05)? No
NetRtg: r = -0.0671, p = 0.1751, Significant (p < 0.05)? No
AST Ratio: r = 0.0613, p = 0.2156, Significant (p < 0.05)? No
DREB%: r = -0.0478, p = 0.3347, Significant (p < 0.05)? No
AST%: r = 0.0442, p = 0.3718, Significant (p < 0.05)? No
eFG%: r = 0.0421, p = 0.3947, Significant (p < 0.05)? No
AST/TO: r = 0.0330, p = 0.5047, Significant (p < 0.05)? No
PACE: r = 0.0301, p = 0.5427, Significant (p < 0.05)? No
TS%: r = 0.0266, p = 0.5916, Significant (p < 0.05)? No
TOV%: r = 0.0261, p = 0.5988, Significant (p < 0.05)? No
REB%: r = -0.0246, p = 0.6193, Significant (p < 0.05)? No
POSS: r =