In [8]:
from scipy.stats import ttest_1samp
from scipy.stats import ttest_ind
from scipy.stats import chi2
from scipy import stats
from statsmodels.stats import weightstats as stests
from statsmodels.formula.api import ols
import numpy as np
import pandas as pd
import statsmodels.api as sm

## Hypothesis testing

### One sampled t-test
Determine if the average height of samples are 162 cm or not. Use alpha = 0.05
* H0: u = 162
* Ha: u != 162

In [9]:
df_heights = pd.DataFrame(np.genfromtxt(r'../../../data/csv/heights.csv')).rename(columns = {0 : 'heights'})

In [76]:
# Getting the mean
var_mean_df_heights = df_heights.mean().values[0]
print(f'Heights mean: {var_mean_df_heights}')

# Getting the p-value
var_pval_df_heights = ttest_1samp(df_heights, 162)[1]
print(f'Heights p-values: {var_pval_df_heights} ' + ('reject the null hypothesis' if var_pval_df_heights < 0.05 else 'fail to reject the null hypothesis'))

Heights mean: 163.54
Heights p-values: [0.33774547] fail to reject the null hypothesis


### Two sampled t-test
Determine if the sales of a time period 1 is statistically different from the sales of another time period 2. Use alpha = 0.05
* H0: u1 - u2 = 0 -> H0: u1 = u2
* Ha: u1 - u2 != 0 -> Ha: u1 != u2 

In [42]:
df_unitsales = pd.DataFrame(dict(
    week1 = np.genfromtxt(r'../../../data/csv/unitsales1.csv'),
    week2 = np.genfromtxt(r'../../../data/csv/unitsales2.csv'),
))

df_unitsales.head(5)

Unnamed: 0,week1,week2
0,170.0,169.0
1,169.0,164.0
2,177.0,173.0
3,179.0,172.0
4,166.0,161.0


In [47]:
# Getting the mean
var_mean_week1_df_unitsales = df_unitsales['week1'].mean()
var_mean_week2_df_unitsales = df_unitsales['week2'].mean()

# Getting the standard deviation
var_std_week1_df_unitsales = df_unitsales['week1'].std()
var_std_week2_df_unitsales = df_unitsales['week2'].std()

In [48]:
print('Unit sales')
print(f'Week 1 mean: {var_mean_week1_df_unitsales}')
print(f'Week 1 std: {var_std_week1_df_unitsales}')
print(f'Week 2 mean: {var_mean_week2_df_unitsales}')
print(f'Week 2 std: {var_std_week2_df_unitsales}')

Unit sales
Week 1 mean: 170.6
Week 1 std: 6.021390442177194
Week 2 mean: 166.13333333333333
Week 2 std: 7.268981719477304


In [62]:
# Getting p-value using independent ttest
var_pval_df_unitsales = ttest_ind(df_unitsales['week1'], df_unitsales['week2'])[1]

print(f'Unit sales p-value: {var_pval_df_unitsales} ' + ('reject null hypothesis' if var_pval_df_unitsales < 0.05 else 'fail to reject null hypothesis'))

Unit sales p-value: 0.0774950425746379 fail to reject null hypothesis


### Paired sampled t-Test
Determine if the performance of a time period is 1 is statistically different from the sales of another time period 2.
* H0: u1 - u2 = 0 -> H0: u1 = u2
* Ha: u1 - u2 != 0 -> Ha: u1 != u2 

In [67]:
df_pairedperf = pd.read_csv(r'../../../data/csv/pairedperformance.csv')
df_pairedperf.head()

Unnamed: 0,perf_before,perf_after
0,62,62
1,67,82
2,78,65
3,67,60
4,63,73


In [77]:
# Getting p-value using t-test on two related samples
var_pval_df_pairedperf = stats.ttest_rel(df_pairedperf['perf_before'], df_pairedperf['perf_after'])[1]
print(f'Paired performance p-value: {var_pval_df_pairedperf} ' + ('reject the null hypothesis' if var_pval_df_pairedperf < 0.05 else 'fail to reject the null hypothesis'))

Paired performance p-value: 0.4178997657102256 fail to reject the null hypothesis


### One sample z-test
* Determine if the mean of the population is some number based on the samples.
* Determine if the average performance scores of samples is 65 or not. Use alpha = 0.05
  * H0: u = 65
  * Ha: u != 65

In [82]:
df_performances = pd.read_csv(r'../../../data/csv/performancez.csv')
df_performances.head()

Unnamed: 0,perf_before,perf_after
0,62,62
1,67,82
2,78,65
3,67,60
4,63,73


In [89]:
# Getting the p-value
var_pval_1samp_df_performances = stests.ztest((df_performances['perf_before']), x2 = None, value = 65)[1]
print(f'Performance 1samp p-value: {var_pval_1samp_df_performances} ' + ('reject the null hypothesis' if var_pval_1samp_df_performances < 0.05 else 'fail to reject the null hypothesis'))

Performance p-value: 0.0007847889688732645 reject the null hypothesis


### Two sample z-test
* Similar to two sample t-test, determine two independent group of samples if their sample means are equal or not.
* Determine if the average performance scores of the two samples are equal or not. Use alpha = 0.05
  * H0: u1 - u2 = 0
  * Ha: u1 - u2 != 0

In [92]:
var_pval_2samp_df_performances = stests.ztest(x1 = df_performances['perf_before'], x2 = df_performances['perf_after'], value = 0, alternative = 'two-sided')[1]
print(f'Performance 2samp p-value: {var_pval_2samp_df_performances} ' + ('reject null hypothesis' if var_pval_2samp_df_performances < 0.05 else 'fail to reject null hypothesis'))

Performance 2samp p-value: 0.217033050353992 fail to reject null hypothesis


### One way f-test
* This tells us where two or more groups are similar or not based on their similarity and f-score.
* Three plant categories and their weights are recorded.
* Determine if all groups are similar or not. Use alpha = 0.05

In [96]:
df_plantgrowth = pd.read_csv(r'../../../data/csv/plantgrowth.csv', index_col=0)
df_plantgrowth.head()

Unnamed: 0,weight,group
1,5.61,ctrl
2,6.41,ctrl
3,6.15,ctrl
4,8.11,ctrl
5,6.23,ctrl


In [138]:
# Get p-value using f-test one way
var_pval_1w_df_plantgrowth = stats.f_oneway(
    df_plantgrowth.groupby('group').get_group('ctrl')['weight'],
    df_plantgrowth.groupby('group').get_group('trt1')['weight'],
    df_plantgrowth.groupby('group').get_group('trt2')['weight'],
)[1]

print(f'Plantgrowth p-value: {var_pval_1w_df_plantgrowth} ' + ('reject null hypothesis' if var_pval_1w_df_plantgrowth < 0.05 else 'fail to reject null hypothesis'))

Plantgrowth p-value: 0.19684951887149554 fail to reject null hypothesis


### Two way f-test
* Investigate the grand mean crop yield of the data set crop yield.
* As well as the mean crop yield by each factor.
* As well as the factors grouped together.

In [144]:
df_cropyield = pd.read_csv(f'../../../data/csv/cropyield.csv')

In [162]:
model_df_cropyield = ols('Yield ~ C(Fert)*C(Water)', df_cropyield).fit()
print(f'Overall model_df_cropyield F({model_df_cropyield.df_model:.0f}, {model_df_cropyield.df_resid:.0f}) = {model_df_cropyield.fvalue:.3f}, p = {model_df_cropyield.f_pvalue:.4f}')

res_df_cropyield = sm.stats.anova_lm(model_df_cropyield, typ = 2)
res_df_cropyield

Overall model_df_cropyield F(3, 16) = 4.112, p = 0.0243


Unnamed: 0,sum_sq,df,F,PR(>F)
C(Fert),69.192,1.0,5.766,0.028847
C(Water),63.368,1.0,5.280667,0.035386
C(Fert):C(Water),15.488,1.0,1.290667,0.272656
Residual,192.0,16.0,,


### Chi-square test
* In an election survey, voters may be classified by sex [male or female] and voting preference [democrat, republican, or independent]
* Test for independence of these two variables to determine if sex is related to voting preference.
  * H0: The two samples [voting preference and sex] are independent
  * Ha: There is independency between the samples [voting preference and sex]

In [186]:
df_elecsurvey = pd.read_csv(r'../../../data/csv/chi-test.csv')
contigency_table_df_elecsurvey = pd.crosstab(df_elecsurvey['Gender'], df_elecsurvey['Voting Preference'])

contigency_table_df_elecsurvey

Voting Preference,Democrat,Independent,Republican
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,3,2,2
Male,4,1,3


In [226]:
# Observed values
var_observedval_df_elecsurvey = contigency_table_df_elecsurvey.values
var_observedval_df_elecsurvey

array([[3, 2, 2],
       [4, 1, 3]])

In [281]:
# Expected values
var_expectedval_df_elecsurvey = stats.chi2_contingency(contigency_table_df_elecsurvey)[3]
var_expectedval_df_elecsurvey

array([[3.26666667, 1.4       , 2.33333333],
       [3.73333333, 1.6       , 2.66666667]])

In [268]:
# Getting degrees of freedom
var_ddof_df_elecsurvey = ((len(contigency_table_df_elecsurvey.columns)-1) * (len(contigency_table_df_elecsurvey.index)-1))
print(f'Degrees of freedom: {var_ddof_df_elecsurvey}')
var_alpha_df_elecsurvey = 0.05

Degrees of freedom: 2


In [303]:
chi_square_df_elecsurvey = sum([(o-e)**2/e for o,e in zip(var_observedval_df_elecsurvey, var_expectedval_df_elecsurvey)])
var_chi_square_stat_df_elecsurvey = sum(chi_square_df_elecsurvey)
print(f'Chi-square statistic: {var_chi_square_stat_df_elecsurvey}')

var_criticalval_df_elecsurvey = chi2.ppf(q = 1 - var_alpha_df_elecsurvey, df = var_ddof_df_elecsurvey)
print(f'Critical value: {var_criticalval_df_elecsurvey}')

var_pval_df_elecsurvey = chi2.cdf(x = var_chi_square_stat_df_elecsurvey, df = var_ddof_df_elecsurvey)
print(f'P-value : {var_pval_df_elecsurvey}')

Chi-square statistic: 0.6122448979591839
Critical value: 5.991464547107979
P-value : 0.26370354481366337


In [310]:
print(f'Significance level: {var_alpha_df_elecsurvey}')
print(f'Degrees of freedom: {var_ddof_df_elecsurvey}')
print(f'Chi-square statistic: {var_chi_square_stat_df_elecsurvey}')
print(f'Critical value: {var_criticalval_df_elecsurvey}')
print(f'P-value: {var_pval_df_elecsurvey}')

Significance level: 0.05
Degrees of freedom: 2
Chi-square statistic: 0.6122448979591839
Critical value: 5.991464547107979
P-value: 0.26370354481366337


In [317]:
# Chi square statistic interpretation
print(f'Using chi-square statistic: {var_chi_square_stat_df_elecsurvey}')
print(f'Reject H0, there is a relationship between categorical cariables' if var_chi_square_stat_df_elecsurvey >= var_criticalval_df_elecsurvey else 'Retain H0, there is no relationship between categorical variables')

# Using p-value
print(f'\nUsing p-value: {var_pval_df_elecsurvey}')
print(f'Reject H0, there is a relationship between categorical cariables' if var_pval_df_elecsurvey <= var_alpha_df_elecsurvey else 'Retain H0, there is no relationship between categorical variables')

Using chi-square statistic: 0.6122448979591839
Retain H0, there is no relationship between categorical variables

Using p-value: 0.26370354481366337
Retain H0, there is no relationship between categorical variables
