In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# Load the per-inning score table. Later cells read the 'Team' and 'Game'
# columns and the inning columns labelled '1', '2', ... from this frame.
data = pd.read_csv('inningScore.csv')

# Quick look at the first rows, then at the available column labels.
for preview in (data.head(), data.columns):
    print(preview)

In [None]:
def Sum_to_n_inning(To_which_inning: int, frame: "pd.DataFrame | None" = None, ax=None):
    """
    Plot a histogram of the cumulative run differences through the specified inning.

    For every row, the runs in inning columns ``'1'`` .. ``str(To_which_inning)``
    are summed (non-numeric entries are coerced to NaN and skipped by the sum).
    Within each ``Game`` the previous row's total is then subtracted from the
    current row's total, so a game with two rows contributes one signed value:
    second row minus first row. The original author describes this as
    "home - away", which assumes the away team is listed first -- confirm
    against the CSV.

    The input frame is NOT modified; all work happens on derived objects.

    ## Parameters
        `To_which_inning`: the last inning (inclusive) included in the cumulative sum

        `frame`: table to analyse; defaults to the notebook-level `data` frame

        `ax`: matplotlib axes to draw on; defaults to the current axes

    ## Return
        None
    """
    source = data if frame is None else frame
    inning_cols = [str(i) for i in range(1, To_which_inning + 1)]

    # Convert on a copy so the shared frame keeps its original values and
    # does not silently gain an 'InningSum' column on every call.
    runs = source[inning_cols].apply(pd.to_numeric, errors='coerce')
    totals = source[['Game']].assign(InningSum=runs.sum(axis=1))

    # Signed difference between consecutive rows of the same game; the first
    # row of each game has no predecessor and is dropped as NaN.
    run_diff = totals.groupby('Game')['InningSum'].diff().dropna().rename('SumDiff')

    if ax is None:
        ax = plt.gca()
    ax.set_xlabel('Difference of scores')
    ax.set_title(f'Difference of scores at the end of Inning {To_which_inning}')
    sns.histplot(run_diff, bins=20, kde=True, ax=ax)


def neg_sum_to_n_inning(To_which_inning: int, equal: bool=True, frame: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """
    Return the games whose cumulative run difference through the specified
    inning is negative (or, by default, non-positive).

    For every row, the runs in inning columns ``'1'`` .. ``str(To_which_inning)``
    are summed (non-numeric entries are coerced to NaN and skipped by the sum).
    Within each ``Game`` the previous row's total is subtracted from the current
    row's total, so a game with two rows yields one signed value: second row
    minus first row. The original author describes this as "home - away",
    which assumes the away team is listed first -- confirm against the CSV.

    The input frame is NOT modified; all work happens on derived objects.

    ## Parameters
        `To_which_inning`: the last inning (inclusive) included in the cumulative sum

        `equal`: if True keep rows with `SumDiff <= 0` (ties included),
        otherwise keep only rows with `SumDiff < 0`

        `frame`: table to analyse; defaults to the notebook-level `data` frame

    ## Return
        A `pd.DataFrame` with columns `Game` and `SumDiff`, indexed like the
        input rows that produced each difference.
    """
    source = data if frame is None else frame
    inning_cols = [str(i) for i in range(1, To_which_inning + 1)]

    # Convert on a copy so the shared frame keeps its original values and
    # does not silently gain an 'InningSum' column on every call.
    runs = source[inning_cols].apply(pd.to_numeric, errors='coerce')
    totals = source[['Game']].assign(InningSum=runs.sum(axis=1))

    # Signed difference between consecutive rows of the same game; the first
    # row of each game has no predecessor, gets NaN, and is dropped.
    diff_data = (
        totals[['Game']]
        .assign(SumDiff=totals.groupby('Game')['InningSum'].diff())
        .dropna()
    )

    if equal:
        keep = diff_data['SumDiff'] <= 0
    else:
        keep = diff_data['SumDiff'] < 0
    return diff_data[keep]

## Plot the run difference to the end of inning 1-8

In [None]:
"""
Draw the cumulative run-difference histogram for innings 1-8 on a 2x4 grid.
"""
# One panel per inning 1-8, laid out as 2 rows x 4 columns.
fig, axs = plt.subplots(2, 4, figsize=(16, 8))

for inning, ax in enumerate(axs.flat, start=1):
    # Sum_to_n_inning draws on the current axes and returns None, so the
    # panel is selected first and the return value is not kept.
    plt.sca(ax)
    Sum_to_n_inning(inning)

plt.tight_layout()  # Adjust layout to prevent overlap
plt.show()

### Plot Heat map

In [None]:
def sum_to_n_inning(To_which_inning: int, frame: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """
    Compute the per-game cumulative run difference through the specified inning.

    For every row, the runs in inning columns ``'1'`` .. ``str(To_which_inning)``
    are summed (non-numeric entries are coerced to NaN and skipped by the sum).
    Within each ``Game`` the previous row's total is subtracted from the current
    row's total, so a game with two rows yields one signed value: second row
    minus first row. The original author describes this as "home - away",
    which assumes the away team is listed first -- confirm against the CSV.

    The input frame is NOT modified; all work happens on derived objects.

    ## Parameters
        `To_which_inning`: the last inning (inclusive) included in the cumulative sum

        `frame`: table to analyse; defaults to the notebook-level `data` frame

    ## Return
        A `pd.DataFrame` with columns `Game`, `SumDiff` and `Inning`, where
        `Inning` is `To_which_inning` on every row.
    """
    source = data if frame is None else frame
    inning_cols = [str(i) for i in range(1, To_which_inning + 1)]

    # Convert on a copy so the shared frame keeps its original values and
    # does not silently gain an 'InningSum' column on every call.
    runs = source[inning_cols].apply(pd.to_numeric, errors='coerce')
    totals = source[['Game']].assign(InningSum=runs.sum(axis=1))

    # Signed difference between consecutive rows of the same game; the first
    # row of each game has no predecessor, gets NaN, and is dropped.
    diff_data = (
        totals[['Game']]
        .assign(SumDiff=totals.groupby('Game')['InningSum'].diff())
        .dropna()
    )

    # assign() returns a new frame, so no column is set on a filtered slice.
    return diff_data.assign(Inning=To_which_inning)

# Heat map: inning on the x-axis, cumulative run difference through that
# inning on the y-axis, cell value = number of games.
innings = [str(inning) for inning in range(1, 10)]

# One frame of (Game, SumDiff, Inning) per inning 1..9, stacked into one table.
sum_diff_data = [sum_to_n_inning(inning) for inning in range(1, 10)]
concatenated_data = pd.concat(sum_diff_data)

# Number of games observed for every (Inning, SumDiff) combination.
count_data = (
    concatenated_data
    .groupby(['Inning', 'SumDiff'])
    .size()
    .reset_index(name='Count')
)

# Reshape to a grid: one row per SumDiff value, one column per inning.
pivot_data = count_data.pivot(index='SumDiff', columns='Inning', values='Count')

# Draw the annotated heat map on a fresh figure.
plt.figure(figsize=(12, 8))
heat_ax = sns.heatmap(pivot_data, cmap='coolwarm', annot=True, fmt=".0f", linewidths=.5)
heat_ax.set_title('Number of games by inning and difference of final scores')
heat_ax.set_xlabel('Inning')
heat_ax.set_ylabel('Difference of final scores')
plt.show()

## Get the run difference to the end of inning 1-8 (only negative results)

In [None]:
# For innings 1-8, collect the games whose cumulative run difference is <= 0
# (default `equal=True`; pass `False` as second argument for strictly < 0)
# and report the mean of those differences.
neg_diff_data_list = []
for inning in range(1, 9):
    trailing_games = neg_sum_to_n_inning(inning)
    trailing_mean = trailing_games['SumDiff'].mean()
    print(f'(negative) mean diff. at {inning}-th inning: {trailing_mean}')
    neg_diff_data_list.append(trailing_games)

# Show the inning-1 subset as the cell output.
neg_diff_data_list[0]

## Run difference at the End of the Game

In [None]:
# Inning columns '1' .. '19' that contribute to the final score.
all_inning = [str(inning) for inning in range(1, 20)]

# Missing entries and '-' placeholders count as zero runs; anything else that
# is not numeric is coerced to NaN and skipped by the row sum below.
data[all_inning] = (
    data[all_inning]
    .replace({np.nan: 0, '-': 0})
    .apply(pd.to_numeric, errors='coerce')
)

data["final_score"] = data[all_inning].sum(axis=1, numeric_only=True)

# Within each game: current row's final score minus the previous row's.
data['ScoreDiff'] = data.groupby('Game')['final_score'].diff()

# Final score difference: one row per game (the first row of every game has
# no predecessor, so its NaN difference is dropped).
ScoreDiff_data = data[['Game', 'ScoreDiff']].dropna()

print(ScoreDiff_data.shape)
print(ScoreDiff_data['ScoreDiff'].value_counts())

hist_ax = sns.histplot(ScoreDiff_data['ScoreDiff'], bins=100, kde=True)
hist_ax.set_xlabel('Score difference')
hist_ax.set_ylabel('Count')
hist_ax.set_title('Final score difference (home - away)')

plt.show()

## Split final run differences by the $k$-th inning results (innings 1–8)

In [None]:
"""
A set of final score differences (home - away)
`score_diff_data_set[i]` corresponds to `i+1`-th inning
"""
score_diff_data_set = []
fig, axs = plt.subplots(2, 4, figsize=(16, 8))  # 2 rows, 4 columns

for idx in range(8):
    panel = plt.subplot(2, 4, idx + 1)  # select the panel for inning idx + 1

    # Final score differences of the games selected for this inning.
    selected_games = neg_diff_data_list[idx]['Game']
    subset = ScoreDiff_data[ScoreDiff_data['Game'].isin(selected_games)]
    score_diff_data_set.append(subset)

    score_diff_mean = subset['ScoreDiff'].mean()
    print('score diff mean: ', score_diff_mean)
    print('sample size:', subset.shape[0])

    # Annotate the panel with the subset mean, then draw the histogram.
    panel.text(
        0.02, 0.95, f'Mean: {score_diff_mean:.2f}',
        transform=panel.transAxes, color='red', fontsize=12, verticalalignment='top',
    )
    panel.set_xlabel('Score difference')
    panel.set_ylabel('Count')
    panel.set_title(f'Score difference at the end of the game\n(home - away)\n(Home is trailing at the {idx+1}-th inning)')
    sns.histplot(subset['ScoreDiff'], bins=100, kde=True, ax=panel)

plt.tight_layout()
plt.show()

## Calculate confidence intervals (Student's $t$-distribution)

In [None]:
from scipy import stats

# Two-sided t-based confidence interval for the mean final score difference
# of each inning subset: centre = sample mean, scale = standard error,
# degrees of freedom = sample size - 1.
confidence = 0.95
confidence_intervals = []
for i in range(8):
    final_diffs = score_diff_data_set[i]['ScoreDiff']
    sample_size = score_diff_data_set[i].shape[0]
    lower, upper = stats.t.interval(
        confidence=confidence,
        df=sample_size - 1,
        loc=final_diffs.mean(),
        scale=stats.sem(final_diffs),
    )
    confidence_intervals.append((lower, upper))
    print(f'CI of final run difference (for {i + 1}-th inning): ({lower:.3f}, {upper:.3f})')

In [None]:
# One x position per interval: 1 .. number of intervals.
x_positions = range(1, len(confidence_intervals) + 1)

# Split the (lower, upper) pairs into two parallel lists.
lower_bounds = [low for low, _ in confidence_intervals]
upper_bounds = [high for _, high in confidence_intervals]

# Midpoint of each interval, used as the plotted centre.
means = [(low + high) / 2 for low, high in confidence_intervals]

# Half-width of each interval (centre minus lower bound), used as a
# symmetric error bar.
half_widths = [centre - low for centre, low in zip(means, lower_bounds)]

fig, ci_ax = plt.subplots(figsize=(10, 6))
ci_ax.errorbar(x_positions, means, yerr=half_widths,
               fmt='o', ecolor='red', capsize=5, linestyle='None')

ci_ax.set_xticks(x_positions)
ci_ax.set_xlabel('Home team being trailing at the $k$-th inning')
ci_ax.set_ylabel('Mean difference of final scores')
ci_ax.set_title('Confidence Intervals of mean difference of final scores')
ci_ax.grid(True)
plt.show()


## Goodness-of-fit

There are five goodness-of-fit tests against different distributions:
1. Normal
2. Poisson
3. Exponential
4. Bimodal (located in `bimodal_distribution_test.py`)
5. Folded normal

In [None]:
# Perform a Kolmogorov-Smirnov normality test on ScoreDiff, comparing it with
# a normal distribution whose mean and standard deviation are taken from the
# sample itself.
from scipy import stats

# NOTE(review): this rebinds `data` from the score DataFrame to a Series.
# Later cells read `data` as this Series, and earlier cells can no longer be
# re-run without reloading the CSV.
data = ScoreDiff_data['ScoreDiff']

sample_mean = data.mean()
sample_std = data.std()
stat, p_val = stats.kstest(data, 'norm', args=(sample_mean, sample_std))

print(f'mean: {sample_mean}; variance: {data.var()}')
print('normally distributed' if p_val > 0.05 else 'not normally distributed')

In [None]:
'''
Test the absolute score differences against a Poisson and an exponential
distribution (Kolmogorov-Smirnov).

A Poisson distribution has variance equal to its mean; an exponential
distribution has variance equal to the square of its mean. The printed mean
and variance show how far the sample is from either relationship.
'''

# Absolute final score differences, taken straight from ScoreDiff_data so the
# cell does not depend on what the name `data` currently refers to.
abs_data = ScoreDiff_data['ScoreDiff'].abs()

print(f'mean: {abs_data.mean()}; variance: {abs_data.var()}')

# Poisson with its rate set to the sample mean.
stat, p_val = stats.kstest(abs_data, stats.poisson.cdf, [abs_data.mean()])
if p_val > 0.05:
    print('poisson distribution')
else:
    print('not poisson distribution')


# scipy parameterizes expon by `scale` = 1 / rate, i.e. the distribution mean,
# so the sample mean itself is the scale estimate. The previous
# `1 / abs_data.mean()` passed the rate where the scale is expected.
scale = abs_data.mean()
stat, p_val = stats.kstest(abs_data, stats.expon.cdf, [0, scale])
if p_val > 0.05:
    print('exponentially distributed')
else:
    print('not exponentially distributed')

In [None]:
"""
Test whether our data fits "folded normal distribution".

It turns out that the result is NO.
"""

# A folded normal (with loc fixed at 0) only has support on x >= 0, so the
# fit, the plotted range and the KS test must all use the ABSOLUTE score
# differences, the same sample shown in the histogram. Earlier versions of
# this cell fitted and tested the signed differences; any conclusion drawn
# from that run should be re-checked against the output below.
abs_score_diff = ScoreDiff_data['ScoreDiff'].abs()

# Plot histogram of the data
plt.hist(abs_score_diff, bins=100, density=True, alpha=0.6, color='g', label='Data Histogram')

# Fit the data to a folded normal distribution (location fixed at 0)
param = stats.foldnorm.fit(abs_score_diff, floc=0)

print(f'estimated param: {param}')

# Get the PDF of the fitted folded normal distribution over the observed range
x = np.linspace(0, abs_score_diff.max(), 1000)
pdf_fitted = stats.foldnorm.pdf(x, *param)
plt.plot(x, pdf_fitted, 'r-', label='Fitted Folded Normal PDF')

plt.xlabel('Absolute value of score differences')
plt.ylabel('Probability density')
plt.legend()
plt.show()

# Kolmogorov-Smirnov test of the absolute differences against the fitted distribution
D, p_value_ks = stats.kstest(abs_score_diff, 'foldnorm', args=param)
print(f'Kolmogorov-Smirnov Test: D={D}, p-value={p_value_ks}')

# Interpretation
alpha = 0.05  # Common significance level
if p_value_ks > alpha:
    print("Fail to reject the null hypothesis: The data follows the folded normal distribution.")
else:
    print("Reject the null hypothesis: The data does not follow the folded normal distribution.")