# **Hypothesis Testing**

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# **Z-test with one mean**

Suppose an e-commerce platform receives an average of 100 visitors per day (known population mean). We want to test if the average number of visitors for a recent sample of 40 days is significantly different from this known average. Verify the claim at significance level alpha = 0.05.

In [None]:
pop_mean = 100
pop_sd = 15
visitors = np.random.randint(150,size= 40)
print(visitors)

sample_mean = np.mean(visitors)
sample_size = len(visitors)

z_score_calculated = (sample_mean - pop_mean) / (pop_sd / np.sqrt(sample_size))

alpha = 0.05
z_critical = stats.norm.ppf(1 - alpha / 2)

print(f"Calculated Z-Score: {z_score_calculated:.3f}")
print(f"Critical Z-Value: ±{z_critical:.3f}")

if z_score_calculated > z_critical or z_score_calculated < -z_critical:
    print("Conclusion: Reject the null hypothesis. The sample mean is significantly different from the population mean.")
else:
    print("Conclusion: Fail to reject the null hypothesis. The sample mean is not significantly different from the population mean.")

[ 59 105  44 107  53   0  17  45  58  88 132  92  48  16  86  14  87  15
  26  35 128 102   2  76  66 111  50  39  28  56  38  61   2  13  89  94
  51   6  15  19]
Calculated Z-Score: -19.258
Critical Z-Value: ±1.960
Conclusion: Reject the null hypothesis. The sample mean is significantly different from the population mean.


In [None]:

alpha = 0.05

z_critical_right = stats.norm.ppf(1 - alpha)

z_critical_left = stats.norm.ppf(alpha)

z_critical_two = stats.norm.ppf(1 - alpha / 2)

print(f"Z-Critical (Right-Tailed Test) for alpha={alpha} :  {z_critical_right:.3f}")
print(f"Z-Critical (Left-Tailed Test) for alpha={alpha} :  {z_critical_left:.3f}")
print(f"Z-Critical (Two-Tailed Test) for alpha={alpha} :  {z_critical_two:.3f}")

Z-Critical (Right-Tailed Test) for alpha=0.05 :  1.645
Z-Critical (Left-Tailed Test) for alpha=0.05 :  -1.645
Z-Critical (Two-Tailed Test) for alpha=0.05 :  1.960


In [None]:
z_score = 1.96

probability = stats.norm.cdf(z_score)

print(f"The probability for Z = {z_score} is: {probability}")

The probability for Z = 1.96 is: 0.9750021048517795


# **Z -test for Two Mean**

In [None]:
campaign1_data = np.random.randint(120, 175, size=35)
print(campaign1_data)

campaign2_data = np.random.randint(150, 285, size=35)
print(campaign2_data)

camp1_mean = np.mean(campaign1_data)
camp1_std = np.std(campaign1_data)
n1 = len(campaign1_data)

camp2_mean = np.mean(campaign2_data)
camp2_std = np.std(campaign2_data)
n2 = len(campaign2_data)

z_score_calculated = (camp1_mean - camp2_mean) / np.sqrt(((camp1_std ** 2) / n1) + ((camp2_std ** 2) / n2))

alpha = 0.05

z_critical = stats.norm.ppf(1 - alpha / 2)

print(f"Campaign 1 Mean: {camp1_mean}")
print(f"Campaign 2 Mean: {camp2_mean}")
print(f"Z-Score: {z_score_calculated}")
print(f"Critical Z-Value: {z_critical}")

if z_score_calculated > z_critical or z_score_calculated < -z_critical:
    print("Reject the null hypothesis: There is a significant difference in the average number of daily visitors between the two campaigns.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference in the average number of daily visitors between the two campaigns.")

[121 170 171 125 121 134 136 174 168 173 161 165 146 150 138 159 136 131
 171 150 166 139 138 167 140 173 174 156 166 124 159 167 157 155 124]
[245 243 170 173 205 182 192 211 261 241 225 202 208 196 212 175 157 166
 153 220 245 195 153 216 245 248 192 158 211 220 157 176 164 161 163]
Campaign 1 Mean: 151.57142857142858
Campaign 2 Mean: 198.31428571428572
Z-Score: -7.557076624020908
Critical Z-Value: 1.959963984540054
Reject the null hypothesis: There is a significant difference in the average number of daily visitors between the two campaigns.


# **T-test with single mean**

An eComm platform believes that the average number of daily visitors is 150. Test if the average number of visitors for a sample of 20 days is significantly different from this value.

In [None]:
visitors = np.random.randint(120, 174, size=20)
print(visitors)
pop_mean = 150
sample_mean = np.mean(visitors)
sample_std = np.std(visitors, ddof=1)
n = len(visitors)
df = n - 1
t_calc, _ = stats.ttest_1samp(visitors, popmean=pop_mean)
alpha = 0.05
t_critical = stats.t.ppf(1 - alpha / 2, df)
print(f"Sample Mean: {sample_mean}")
print(f"Sample Standard Deviation: {sample_std}")
print(f"T-Statistic: {t_calc}")
print(f"Critical T-Value: {t_critical}")
if abs(t_calc) > t_critical:
    print("Reject the null hypothesis: The sample mean is significantly different from the population mean.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference between the sample mean and the population mean.")


[159 149 164 126 147 139 125 137 150 168 135 168 167 170 167 149 128 132
 155 128]
Sample Mean: 148.15
Sample Standard Deviation: 16.023913051239518
T-Statistic: -0.5163190470575606
Critical T-Value: 2.093024054408263
Fail to reject the null hypothesis: There is no significant difference between the sample mean and the population mean.


# **T- test with two sample**

In [12]:
data = {
    'Gender': ['Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male',
               'Female'],
    'Marks': [77, 89, 89, 91, 76, 85, 79, 78, 93, 88, 91, 77,78 ]
}

df = pd.DataFrame(data)

print(df)

male_marks = df[df['Gender'] == 'Male']['Marks']
female_marks = df[df['Gender'] == 'Female']['Marks']

print(male_marks)
print(female_marks)

mean_male = male_marks.mean()
mean_female = female_marks.mean()

std_male = male_marks.std(ddof=1)
std_female = female_marks.std(ddof=1)

n_male = len(male_marks)
n_female = len(female_marks)

pooled_variance = ((std_male ** 2) / n_male) + ((std_female ** 2) / n_female)

t_statistic = (mean_male - mean_female) / np.sqrt(pooled_variance)

df = ((std_male ** 2 / n_male + std_female ** 2 / n_female) ** 2) / \
     ((std_male ** 2 / n_male) ** 2 / (n_male - 1) + (std_female ** 2 / n_female) ** 2 / (n_female - 1))

alpha = 0.05

t_critical = stats.t.ppf(1 - alpha / 2, df)

print(f"Male Group Mean: {mean_male}")
print(f"Female Group Mean: {mean_female}")
print(f"T-Statistic: {t_statistic}")
print(f"Degrees of Freedom: {df}")
print(f"Critical T-Value: {t_critical}")

if abs(t_statistic) > t_critical:
    print("Reject the null hypothesis: There is a significant difference in the means of Marks for males and females.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference in the means of Marks for males and females.")

    Gender  Marks
0   Female     77
1     Male     89
2   Female     89
3   Female     91
4     Male     76
5     Male     85
6     Male     79
7     Male     78
8   Female     93
9   Female     88
10    Male     91
11    Male     77
12  Female     78
1     89
4     76
5     85
6     79
7     78
10    91
11    77
Name: Marks, dtype: int64
0     77
2     89
3     91
8     93
9     88
12    78
Name: Marks, dtype: int64
Male Group Mean: 82.14285714285714
Female Group Mean: 86.0
T-Statistic: -1.0662141241223617
Degrees of Freedom: 10.2325086604953
Critical T-Value: 2.2212951975049116
Fail to reject the null hypothesis: There is no significant difference in the means of Marks for males and females.


# **F test**

In [13]:
import pandas as pd
from scipy.stats import f, f_oneway

data = {
    "Shipping Option": ["Standard", "Express", "Same-Day", "Standard", "Express", "Same-Day","Standard", "Express", "Same-Day"],
    "Purchase Amounts": [50, 70, 90, 55, 75, 85, 60, 80, 95]
}
df = pd.DataFrame(data)
standard_shipping = df[df["Shipping Option"] == "Standard"]["Purchase Amounts"]
express_shipping = df[df["Shipping Option"] == "Express"]["Purchase Amounts"]
same_day_shipping = df[df["Shipping Option"] == "Same-Day"]["Purchase Amounts"]
t = 3
n = len(df)
df_between = t - 1
df_within = n - t
f_statistic, _ = f_oneway(standard_shipping, express_shipping, same_day_shipping)
alpha = 0.05
f_critical = stats.f.ppf(1 - alpha, df_between, df_within)
print(f"F-Statistic: {f_statistic}")
print(f"F-Critical Value: {f_critical}")

if f_statistic > f_critical:
    print("Reject the null hypothesis: There are significant differences between the group means.")
else:
    print("Fail to reject the null hypothesis: There are no significant differences between the group means.")

F-Statistic: 37.00000000000006
F-Critical Value: 5.143252849784718
Reject the null hypothesis: There are significant differences between the group means.
