In [None]:
"""
Q1. Calculate the 95% confidence interval for a sample of data with a mean of 50 and a standard deviation
of 5 using Python. Interpret the results.
"""
import scipy.stats as stats
import numpy as np

# Summary statistics supplied by the question.
mean = 50
std = 5
n = 30  # the question gives no sample size; 30 is an assumed value

# Standard error of the mean.
se = std / np.sqrt(n)

# `std` is treated as a sample estimate, so the two-sided critical value is
# taken from Student's t-distribution with n - 1 degrees of freedom.
confidence = 0.95
t_value = stats.t.ppf((1 + confidence) / 2, df=n - 1)

# Half-width of the interval around the sample mean.
margin = t_value * se
lower = mean - margin
upper = mean + margin

# Report the interval and interpret it, as the question requires. Printing
# (rather than ending on a bare expression) also works outside a notebook.
print(f"{confidence:.0%} Confidence Interval: ({lower:.2f}, {upper:.2f})")
print(f"\nInterpretation: We are {confidence:.0%} confident that the true population mean")
print(f"lies between {lower:.2f} and {upper:.2f} (assuming a sample size of {n})")


In [None]:
"""
Q2. Conduct a chi-square goodness of fit test to determine if the distribution of colors of M&Ms in a bag
matches the expected distribution of 20% blue, 20% orange, 20% green, 10% yellow, 10% red, and 20%
brown. Use Python to perform the test with a significance level of 0.05.

"""
import numpy as np
from scipy.stats import chisquare

# ----------------------------------------
# ENTER YOUR OBSERVED COUNTS HERE
# Example: Suppose you counted M&Ms in a bag
observed = np.array([22, 18, 20, 10, 8, 22])
# Colors: [Blue, Orange, Green, Yellow, Red, Brown]
# ----------------------------------------

# Significance level stated in the question.
alpha = 0.05

# Expected proportions, in the same color order as `observed`.
expected_proportions = np.array([0.20, 0.20, 0.20, 0.10, 0.10, 0.20])

# Scale the proportions by the observed total so that the expected counts
# and the observed counts add up to the same number of candies.
expected = expected_proportions * observed.sum()

# Chi-square goodness-of-fit test of the observed counts against `expected`.
chi_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

print("Chi-square Statistic:", chi_stat)
print("p-value:", p_value)
print("Significance level:", alpha)

# Decision at the stated significance level.
if p_value < alpha:
    print("\nConclusion: Reject null hypothesis. The color distribution differs")
    print("from the expected distribution.")
else:
    print("\nConclusion: Fail to reject null hypothesis. The color distribution is")
    print("consistent with the expected distribution.")


In [None]:
"""

Q3. Use Python to calculate the chi-square statistic and p-value for a contingency table with the following
data:

         Group A  Group B

Outcome 1   20     15
Outcome 2   10     25
Outcome 3   15     20


"""
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from scipy.stats import chi2

# Section banner for this question's console output.
rule_q3 = "=" * 60
print("\n" + rule_q3)
print("Q3: Chi-Square Test for Contingency Table")
print(rule_q3)

# Observed counts: one row per outcome, one column per group.
outcome_labels_q3 = ['Outcome 1', 'Outcome 2', 'Outcome 3']
group_labels_q3 = ['Group A', 'Group B']
contingency_table = np.array([[20, 15], [10, 25], [15, 20]])

# Chi-square test of independence between outcome and group.
chi2_stat, p_val, dof, expected_freq = stats.chi2_contingency(contingency_table)

# Show the observed table with readable labels.
labelled_table_q3 = pd.DataFrame(
    contingency_table,
    index=outcome_labels_q3,
    columns=group_labels_q3,
)
print("Contingency Table:")
print(labelled_table_q3)

# Test summary.
print("\n".join([
    f"\nChi-square statistic: {chi2_stat:.4f}",
    f"P-value: {p_val:.4f}",
    f"Degrees of freedom: {dof}",
    "\nExpected frequencies:",
]))
print(expected_freq)

# Decision at the 0.05 level.
verdict_q3 = (
    "There is a significant association between groups and outcomes."
    if p_val < 0.05
    else "No significant association between groups and outcomes."
)
print("\nConclusion: " + verdict_q3)


In [None]:
""" Q4. A study of the prevalence of smoking in a population of 500 individuals found that 60 individuals
smoked. Use Python to calculate the 95% confidence interval for the true proportion of individuals in the
population who smoke.
"""

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from scipy.stats import chi2

# Section banner for this question's console output.
rule_q4 = "=" * 60
print("\n" + rule_q4)
print("Q4: 95% CI for Proportion of Smokers")
print(rule_q4)

# Survey counts and the resulting sample proportion.
n_sample, x_smokers = 500, 60
p_hat = x_smokers / n_sample

# Normal-approximation interval: p_hat +/- z * sqrt(p_hat * (1 - p_hat) / n).
z_critical = stats.norm.ppf(0.975)  # two-sided 95% critical value
se = np.sqrt(p_hat * (1 - p_hat) / n_sample)
half_width_q4 = z_critical * se
ci_lower_prop = p_hat - half_width_q4
ci_upper_prop = p_hat + half_width_q4

# Report the inputs, the interval and its interpretation.
print("\n".join([
    f"Sample size: {n_sample}",
    f"Number of smokers: {x_smokers}",
    f"Sample proportion: {p_hat:.4f}",
    f"95% Confidence Interval: ({ci_lower_prop:.4f}, {ci_upper_prop:.4f})",
    "\nInterpretation: We are 95% confident that the true proportion of",
    f"smokers in the population is between {ci_lower_prop:.2%} and {ci_upper_prop:.2%}",
]))

In [None]:
"""
Q5. Calculate the 90% confidence interval for a sample of data with a mean of 75 and a standard deviation
of 12 using Python. Interpret the results.

"""
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from scipy.stats import chi2

print("\n" + "=" * 60)
print("Q5: 90% Confidence Interval for Mean")
print("=" * 60)

# Summary statistics supplied by the question.
mean_q5 = 75
std_dev_q5 = 12
n_q5 = 30  # Assuming sample size
confidence_level_q5 = 0.90

# The standard deviation is a sample estimate, so the critical value comes
# from Student's t-distribution with n - 1 degrees of freedom (the same
# approach the Q1 cell uses) rather than from the standard normal.
alpha_q5 = 1 - confidence_level_q5
df_q5 = n_q5 - 1
t_critical_q5 = stats.t.ppf(1 - alpha_q5 / 2, df_q5)

# Margin of error = critical value * standard error of the mean.
margin_error_q5 = t_critical_q5 * (std_dev_q5 / np.sqrt(n_q5))
ci_lower_q5 = mean_q5 - margin_error_q5
ci_upper_q5 = mean_q5 + margin_error_q5

print(f"Sample Mean: {mean_q5}")
print(f"Standard Deviation: {std_dev_q5}")
print(f"Sample Size: {n_q5}")
print(f"90% Confidence Interval: ({ci_lower_q5:.2f}, {ci_upper_q5:.2f})")
print(f"\nInterpretation: We are 90% confident that the true population mean")
print(f"lies between {ci_lower_q5:.2f} and {ci_upper_q5:.2f}")

In [None]:
"""
Q6. Use Python to plot the chi-square distribution with 10 degrees of freedom. Label the axes and shade the
area corresponding to a chi-square statistic of 15.

"""
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from scipy.stats import chi2

# Section banner for this question's console output.
rule_q6 = "=" * 60
print("\n" + rule_q6)
print("Q6: Plot Chi-Square Distribution")
print(rule_q6)

df = 10  # degrees of freedom
chi_square_value = 15

# Density curve evaluated on an evenly spaced grid over [0, 30].
x = np.linspace(0, 30, 1000)
y = chi2.pdf(x, df)

# Grid points at or beyond the statistic: the upper tail to be shaded.
in_tail_q6 = x >= chi_square_value
x_fill = x[in_tail_q6]
y_fill = chi2.pdf(x_fill, df)

# Legend entries for the curve, the shaded tail and the marker line.
curve_label_q6 = f'Chi-square distribution (df={df})'
tail_label_q6 = f'Area for χ² ≥ {chi_square_value}'
marker_label_q6 = f'χ² = {chi_square_value}'

# Draw the curve, shade the tail, and mark the statistic with a dashed line.
plt.figure(figsize=(10, 6))
plt.plot(x, y, 'b-', linewidth=2, label=curve_label_q6)
plt.fill_between(x_fill, y_fill, alpha=0.3, color='red', label=tail_label_q6)
plt.axvline(x=chi_square_value, color='r', linestyle='--', label=marker_label_q6)

# Axis labels, title, legend and light grid.
axis_font_q6 = {'fontsize': 12}
plt.xlabel('Chi-square value', **axis_font_q6)
plt.ylabel('Probability Density', **axis_font_q6)
plt.title(f'Chi-Square Distribution (df = {df})', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()

# Write the figure to disk before displaying it.
plt.savefig('chi_square_plot.png', dpi=300, bbox_inches='tight')
print("Plot saved as 'chi_square_plot.png'")
plt.show()

# Upper-tail probability of the statistic (the area shaded in the plot).
p_value_q6 = 1 - chi2.cdf(chi_square_value, df)
print(f"\nP-value for χ² = {chi_square_value} with df = {df}: {p_value_q6:.4f}")


In [None]:
"""
Q7. A random sample of 1000 people was asked if they preferred Coke or Pepsi. Of the sample, 520
preferred Coke. Calculate a 99% confidence interval for the true proportion of people in the population who
prefer Coke.

"""
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from scipy.stats import chi2

# Section banner for this question's console output.
rule_q7 = "=" * 60
print("\n" + rule_q7)
print("Q7: 99% CI for Proportion (Coke vs Pepsi)")
print(rule_q7)

# Survey counts and the resulting sample proportion.
n_q7, x_coke = 1000, 520
p_hat_q7 = x_coke / n_q7

# Normal-approximation interval: p_hat +/- z * sqrt(p_hat * (1 - p_hat) / n).
z_critical_q7 = stats.norm.ppf(0.995)  # two-sided 99% critical value
se_q7 = np.sqrt(p_hat_q7 * (1 - p_hat_q7) / n_q7)
half_width_q7 = z_critical_q7 * se_q7
ci_lower_q7 = p_hat_q7 - half_width_q7
ci_upper_q7 = p_hat_q7 + half_width_q7

# Report the inputs, the interval and its interpretation.
print("\n".join([
    f"Sample size: {n_q7}",
    f"Number preferring Coke: {x_coke}",
    f"Sample proportion: {p_hat_q7:.4f}",
    f"99% Confidence Interval: ({ci_lower_q7:.4f}, {ci_upper_q7:.4f})",
    "\nInterpretation: We are 99% confident that the true proportion of",
    f"people preferring Coke is between {ci_lower_q7:.2%} and {ci_upper_q7:.2%}",
]))


In [None]:
"""
Q8. A researcher hypothesizes that a coin is biased towards tails. They flip the coin 100 times and observe
45 tails. Conduct a chi-square goodness of fit test to determine if the observed frequencies match the
expected frequencies of a fair coin. Use a significance level of 0.05.

"""

# Section banner for this question's console output.
rule_q8 = "=" * 60
print("\n" + rule_q8)
print("Q8: Chi-Square Test for Biased Coin")
print(rule_q8)

# Observed outcomes; heads are whatever flips were not tails.
n_flips, observed_tails = 100, 45
observed_heads = n_flips - observed_tails

# A fair coin splits the flips evenly between the two faces.
expected_heads = expected_tails = n_flips * 0.5

# Counts in [heads, tails] order for both arrays.
observed_q8 = np.array([observed_heads, observed_tails])
expected_q8 = np.array([expected_heads, expected_tails])

# Goodness-of-fit test of the observed counts against the fair-coin counts.
chi2_stat_q8, p_val_q8 = stats.chisquare(f_obs=observed_q8, f_exp=expected_q8)

print("\n".join([
    f"Number of flips: {n_flips}",
    f"Observed: Heads = {observed_heads}, Tails = {observed_tails}",
    f"Expected (fair coin): Heads = {expected_heads}, Tails = {expected_tails}",
    f"\nChi-square statistic: {chi2_stat_q8:.4f}",
    f"P-value: {p_val_q8:.4f}",
    "Significance level: 0.05",
]))

# Decision at the 0.05 level.
verdict_q8 = (
    "Reject null hypothesis. The coin appears to be biased."
    if p_val_q8 < 0.05
    else "Fail to reject null hypothesis. No evidence coin is biased."
)
print("\nConclusion: " + verdict_q8)

In [None]:
"""
Q9. A study was conducted to determine if there is an association between smoking status (smoker or
non-smoker) and lung cancer diagnosis (yes or no). The results are shown in the contingency table below.
Conduct a chi-square test for independence to determine if there is a significant association between
smoking status and lung cancer diagnosis.

              Lung Cancer: Yes   Lung Cancer: No

Smoker                60            140
Non-smoker            30            170
"""

# Section banner for this question's console output.
rule_q9 = "=" * 60
print("\n" + rule_q9)
print("Q9: Chi-Square Test - Smoking and Lung Cancer")
print(rule_q9)

# Observed counts: rows are [Smoker, Non-smoker], columns are [Yes, No].
smoking_cancer = np.array([[60, 140], [30, 170]])

# Chi-square test of independence. This is a 2x2 table (one degree of
# freedom), for which SciPy applies Yates' continuity correction by default.
chi2_stat_q9, p_val_q9, dof_q9, expected_q9 = stats.chi2_contingency(smoking_cancer)

# Show the observed table with readable labels.
df_q9 = pd.DataFrame(
    smoking_cancer,
    index=['Smoker', 'Non-smoker'],
    columns=['Lung Cancer: Yes', 'Lung Cancer: No'],
)
print("Contingency Table:")
print(df_q9)

# Test summary.
print("\n".join([
    f"\nChi-square statistic: {chi2_stat_q9:.4f}",
    f"P-value: {p_val_q9:.4f}",
    f"Degrees of freedom: {dof_q9}",
    "Significance level: 0.05",
]))

# Decision at the 0.05 level.
significant_q9 = p_val_q9 < 0.05
if significant_q9:
    print("\nConclusion: There IS a significant association between smoking")
    print("status and lung cancer diagnosis.")
else:
    print("\nConclusion: No significant association between smoking and lung cancer.")

In [None]:
"""
Q10. A study was conducted to determine if the proportion of people who prefer milk chocolate, dark
chocolate, or white chocolate is different in the U.S. versus the U.K. A random sample of 500 people from
the U.S. and a random sample of 500 people from the U.K. were surveyed. The results are shown in the
contingency table below. Conduct a chi-square test for independence to determine if there is a significant
association between chocolate preference and country of origin.

Use a significance level of 0.01.

             Milk Chocolate Dark Chocolate White Chocolate

U.S. (n=500)       200           150          150
U.K. (n=500)       225           175          100

"""
# Section banner for this question's console output.
rule_q10 = "=" * 60
print("\n" + rule_q10)
print("Q10: Chi-Square Test - Chocolate Preference by Country")
print(rule_q10)

# Observed counts: rows are [U.S., U.K.], columns are [Milk, Dark, White].
chocolate_pref = np.array([[200, 150, 150], [225, 175, 100]])

# Chi-square test of independence between country and preference.
chi2_stat_q10, p_val_q10, dof_q10, expected_q10 = stats.chi2_contingency(chocolate_pref)

# Show the observed table with readable labels.
df_q10 = pd.DataFrame(
    chocolate_pref,
    index=['U.S.', 'U.K.'],
    columns=['Milk Chocolate', 'Dark Chocolate', 'White Chocolate'],
)
print("Contingency Table:")
print(df_q10)

# Test summary.
print("\n".join([
    f"\nChi-square statistic: {chi2_stat_q10:.4f}",
    f"P-value: {p_val_q10:.4f}",
    f"Degrees of freedom: {dof_q10}",
    "Significance level: 0.01",
]))

# Decision at the stricter 0.01 level this question asks for.
significant_q10 = p_val_q10 < 0.01
if significant_q10:
    print("\nConclusion: There IS a significant association between chocolate")
    print("preference and country of origin.")
else:
    print("\nConclusion: No significant association between chocolate preference")
    print("and country of origin.")

In [None]:
"""
Q11. A random sample of 30 people was selected from a population with an unknown mean and standard
deviation. The sample mean was found to be 72 and the sample standard deviation was found to be 10.
Conduct a hypothesis test to determine if the population mean is significantly different from 70. Use a
significance level of 0.05.

"""

# Section banner for this question's console output.
rule_q11 = "=" * 60
print("\n" + rule_q11)
print("Q11: Hypothesis Test for Population Mean")
print(rule_q11)

# Sample summary, the hypothesized mean, and the significance level.
sample_mean, sample_std, n_q11 = 72, 10, 30
hypothesized_mean = 70
alpha_q11 = 0.05

# Test statistic: distance of the sample mean from the hypothesized mean,
# measured in standard errors.
se_q11 = sample_std / np.sqrt(n_q11)
t_stat = (sample_mean - hypothesized_mean) / se_q11

# Two-tailed p-value and critical value from Student's t with n - 1
# degrees of freedom.
df_q11 = n_q11 - 1
p_value_q11 = 2 * (1 - stats.t.cdf(abs(t_stat), df_q11))
t_critical = stats.t.ppf(1 - alpha_q11/2, df_q11)

# Inputs, hypotheses and test results.
print("\n".join([
    f"Sample mean: {sample_mean}",
    f"Sample standard deviation: {sample_std}",
    f"Sample size: {n_q11}",
    f"Hypothesized population mean: {hypothesized_mean}",
    f"Significance level: {alpha_q11}",
    f"\nNull Hypothesis (H0): μ = {hypothesized_mean}",
    f"Alternative Hypothesis (H1): μ ≠ {hypothesized_mean}",
    f"\nT-statistic: {t_stat:.4f}",
    f"Degrees of freedom: {df_q11}",
    f"P-value: {p_value_q11:.4f}",
    f"Critical t-value: ±{t_critical:.4f}",
]))

# Pick the wording for the decision, then print the two conclusion lines.
if p_value_q11 < alpha_q11:
    decision_q11, relation_q11, verb_q11 = "Reject", "<", "IS"
else:
    decision_q11, relation_q11, verb_q11 = "Fail to reject", ">", "is NOT"
print(f"\nConclusion: {decision_q11} null hypothesis (p-value = {p_value_q11:.4f} {relation_q11} {alpha_q11})")
print(f"The population mean {verb_q11} significantly different from {hypothesized_mean}.")

# Closing banner for the whole notebook.
print("\n" + rule_q11)
print("ALL QUESTIONS COMPLETED")
print(rule_q11)