In [2]:
import pandas as pd
import warnings
import numpy as np
from scipy import stats

# Suppress all warnings
warnings.filterwarnings('ignore')


# Load the tournament_player_stats dataframe
tournament_player_stats = pd.read_csv('tournament_player_stats.csv')
# Load the matchwise_player_stats dataframe
data = pd.read_csv('ball_to_ball_data.csv')

In [3]:
# Filter data for two specific teams
team1_strike_rates = tournament_player_stats[tournament_player_stats['team'] == 'RCB']['strike_rate']
team2_strike_rates = tournament_player_stats[tournament_player_stats['team'] == 'SRH']['strike_rate']

print(team1_strike_rates)
print(team2_strike_rates)

4      100.00
6        0.00
10     127.27
40     100.00
46     143.26
56       0.00
65     206.67
66     188.44
72     154.70
80     183.82
89     120.93
91       0.00
122    133.33
129    177.13
152    120.00
169    150.00
176    176.19
183     50.00
194     33.33
199    176.92
203      0.00
207    161.62
Name: strike_rate, dtype: float64
0      168.52
2      205.96
8       83.33
25      79.31
35     144.68
54     192.86
70     172.92
85     138.46
86       0.00
87     125.71
90     112.28
112    142.92
151    116.67
153    129.52
177      0.00
184    146.02
189      0.00
190     92.31
191    140.00
198      0.00
Name: strike_rate, dtype: float64


In [4]:
# Calculate means and standard deviations
mean_team1 = np.mean(team1_strike_rates)
mean_team2 = np.mean(team2_strike_rates)
std_team1 = np.std(team1_strike_rates, ddof=1)
std_team2 = np.std(team2_strike_rates, ddof=1)

print("Mean Strike rate of RCB:",mean_team1)
print("Mean Strike rate of SRH:",mean_team2)
print("Standard Deviation Strike rate of RCB:",std_team1)
print("Standard Deviation Strike rate of SRH:",std_team1)

Mean Strike rate of RCB: 113.80045454545453
Mean Strike rate of SRH: 109.57350000000001
Standard Deviation Strike rate of RCB: 69.18045274960762
Standard Deviation Strike rate of SRH: 69.18045274960762


In [5]:
alpha = 0.05 #level of significance

# Z-test two-tailed
n_team1 = len(team1_strike_rates) #sample size
n_team2 = len(team2_strike_rates)
z_score = (mean_team1 - mean_team2) / np.sqrt((std_team1**2 / n_team1) + (std_team2**2 / n_team2))
print('Z-Score:', np.abs(z_score))

p_value_z = stats.norm.sf(abs(z_score)) * 2  # Two-tailed
print('P-Value:', p_value_z)

# Critical Z-Score
z_critical = stats.norm.ppf(1-alpha/2)
print('Critical Z-Score:',z_critical)

Z-Score: 0.2044322304124337
P-Value: 0.8380157457487278
Critical Z-Score: 1.959963984540054


In [6]:
# Two-Sample T-test
t_score, p_value_t = stats.ttest_ind(team1_strike_rates, team2_strike_rates)

alpha = 0.05 #level of significance

# Compute the degrees of freedom (df) (n_A-1)+(n_b-1)
df = len(team1_strike_rates)+len(team2_strike_rates)-2
 
# Calculate the critical t-value
# ppf is used to find the critical t-value for a two-tailed test
critical_t = stats.t.ppf(1 - alpha/2, df)

print(f'T-test: T-score = {t_score}, P-Value: {p_value_t} ,Critical-T:{critical_t}')

T-test: T-score = 0.20378214285831195, P-Value: 0.8395569798321503 ,Critical-T:2.021075390306273


In [7]:
# ANOVA
teams = tournament_player_stats['team'].unique()
strike_rates_by_team = [tournament_player_stats[tournament_player_stats['team'] == team]['strike_rate'] for team in teams]

f_statistic, p_value_anova = stats.f_oneway(*strike_rates_by_team)

print(f'ANOVA: F-statistic = {f_statistic}, P-value = {p_value_anova}')

ANOVA: F-statistic = 0.6059373973907172, P-value = 0.7911750050659204


In [8]:
# Create early and late over categories
data['over_category'] = np.where(data['over'] <= 10, 'early', 'late')
print(data['over_category'])

# Calculate the number of dismissals for a specific type in early and late overs
dismissal_type = 'caught'
early_dismissals = data[(data['over_category'] == 'early') & (data['wicket_type'] == dismissal_type)].shape[0]
late_dismissals = data[(data['over_category'] == 'late') & (data['wicket_type'] == dismissal_type)].shape[0]
total_early = data[data['over_category'] == 'early'].shape[0]
total_late = data[data['over_category'] == 'late'].shape[0]

print(early_dismissals)
print(late_dismissals)
print(total_early)
print(total_late)

0        early
1        early
2        early
3        early
4        early
         ...  
17051    early
17052    early
17053     late
17054     late
17055     late
Name: over_category, Length: 17056, dtype: object
272
372
8813
8243


In [9]:
# Calculate proportions
p_early = early_dismissals / total_early
p_late = late_dismissals / total_late

print(p_early)
print(p_late)

0.030863497106547146
0.045129200533786246


In [10]:
# Z-test
std_pooled = np.sqrt((p_early * (1 - p_early) / total_early) + (p_late * (1 - p_late) / total_late))
z_score = (p_early - p_late) / std_pooled
p_value_z = stats.norm.sf(abs(z_score)) * 2  # Two-tailed

print(f'Z-test: Z-score = {z_score}, P-Value: {p_value_z}')

Z-test: Z-score = -4.85842961982554, P-Value: 1.183204524704062e-06


In [11]:
# T-test
t_score, p_value_t = stats.ttest_ind_from_stats(mean1=p_early, std1=np.sqrt(p_early * (1 - p_early) / total_early), nobs1=total_early, mean2=p_late, std2=np.sqrt(p_late * (1 - p_late) / total_late), nobs2=total_late)

print(f'T-test: T-score = {t_score}, P-value = {p_value_t}')

T-test: T-score = -450.01405191463357, P-value = 0.0


In [12]:
# Create a contingency table
contingency_table = pd.crosstab(data['wicket_type'], data['over_category'])

# Chi-Square Test
chi2_stat, p_value_chi, dof, expected = stats.chi2_contingency(contingency_table)

print(f'Chi-Square Test: Chi2 Stat = {chi2_stat}, P-value = {p_value_chi}')


Chi-Square Test: Chi2 Stat = 14.907685867154184, P-value = 0.010764151017700542
