# Loading the dataset 

This data was requested through the [BreweryDB](https://www.brewerydb.com/developers/apps) API, and the final working data is saved as a CSV file.

In [1]:
# Data analysis packages:
import random

import numpy as np
import pandas as pd
import scipy.stats as stats

# Visualization packages:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

%load_ext autoreload
%autoreload 2

<a id='sec1.3'></a>
## 1.3. Understanding the data
The dataset used in this analysis is available at [BreweryDB](https://www.brewerydb.com/developers/apps).

To define the hypotheses that will guide this analysis, we first need to know which information (attributes) is available in the dataset. The first lines of the dataset are therefore shown below:

In [2]:
## Read the saved working data into a DataFrame
beers = pd.read_csv('dirty_data.csv')

## Report the dataset dimensions, then preview the first rows.
# One column is subtracted from the column count — presumably an index
# column written into the CSV; TODO confirm against the file.
print(f'This dataset has {beers.shape[0]} samples.', f'and {beers.shape[1]-1} columns')
beers.head()

FileNotFoundError: File b'dirty_data.csv' does not exist

In [None]:
# print('Beer style count:', beers.style_name.value_counts())
# Non-null ABV values for the two styles being compared, as NumPy arrays.
pilsener = np.array(beers.loc[beers['style_name'] == 'American-Style Pilsener', 'abv'].dropna())
# ipa == India Pale Ale
ipa = np.array(beers.loc[beers['style_name'] == 'American-Style India Pale Ale', 'abv'].dropna())
print('>> There are {} American-Style Pilsener records in the dataset.'.format(pilsener.size))
print('>> There are {} American-Style IPA records in the dataset.'.format(ipa.size))

# ABV comparison: mean, population standard deviation (ddof=0) and size per style
print('\nABV metrix for American-Style Pilsener and India Pale Ale (IPA)')
print('----------------')
print('American-Style Pilsener mean:', round(np.mean(pilsener), 3))
print('American-Style Pilsener standard divation:', round(np.std(pilsener), 3))
print('American-Style Pilsener size:', pilsener.size)
print('American-Style India Pale Ale mean:', round(np.mean(ipa), 3))
print('American-Style India Pale Ale standard divation:', round(np.std(ipa), 3))
print('American-Style India Pale Ale size:', ipa.size)

The five steps to performing a hypothesis test are:

1) Set up null and alternative hypotheses   
2) Choose a significance level   
3) Calculate the test statistic   
4) Determine the critical or p-value (find the rejection region)   
5) Compare t-value with critical t-value to reject or fail to reject the null hypothesis 


## The Null Hypothesis

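In general, the null hypothesis states that there is no difference between two groups (for example, between subjects taking a placebo and subjects taking the treatment drug). Here the two groups are the two beer styles: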

>**$H_{0}$: The mean ABV difference between American-Style Pilsener and American-Style India Pale Ale is zero, i.e. $\mu_{\text{Pilsener}} = \mu_{\text{IPA}}$.**

## The Alternate Hypothesis

>**$H_{1}$ (2-tailed): The mean ABV difference between American-Style Pilsener and American-Style India Pale Ale is different from zero.**

>**$H_{1}$ (1-tailed, >): The mean ABV difference between American-Style Pilsener and American-Style India Pale Ale is greater than zero.**

>**$H_{1}$ (1-tailed, <): The mean ABV difference between American-Style Pilsener and American-Style India Pale Ale is less than zero.**

In [None]:
## Draw 50 ABV values from each style (Pilsener first, then IPA)
import numpy as np

# Seed NumPy's global generator so both draws are reproducible.
np.random.seed(2019)
# np.random.choice samples with replacement by default.
pilsener_sample, ipa_sample = (
    np.random.choice(values, size=50) for values in (pilsener, ipa)
)
print(np.mean(ipa_sample))
print(np.mean(pilsener_sample))

In [None]:
## modules
def sample_variance(sample):
    """Return the unbiased sample variance (n - 1 denominator) of ``sample``.

    Parameters
    ----------
    sample : array-like of numbers
        NumPy arrays as well as plain lists/tuples are accepted; the input is
        converted with ``np.asarray`` so the subtraction below is elementwise.

    Returns
    -------
    float
        Sum of squared deviations from the mean divided by ``len(sample) - 1``.
        With fewer than two observations the denominator is zero, so the
        result is not meaningful.
    """
    # Coerce first: a plain Python list does not support `list - float`.
    values = np.asarray(sample)
    deviations = values - np.mean(values)
    return np.sum(deviations**2)/(len(sample) - 1)
def pooled_variance(sample1, sample2):
    """Return the pooled variance of two samples.

    Each sample's variance (from ``sample_variance``) is weighted by its
    degrees of freedom, ``len(sample) - 1``, and the weighted sum is divided
    by the total degrees of freedom.
    """
    dof_1 = len(sample1) - 1
    dof_2 = len(sample2) - 1
    weighted_sum = dof_1 * sample_variance(sample1) + dof_2 * sample_variance(sample2)
    return weighted_sum/(dof_1 + dof_2)
def twosample_tstatistic(expr, ctrl):
    """Return the pooled-variance two-sample t-statistic for ``expr`` vs ``ctrl``.

    The statistic is ``mean(expr) - mean(ctrl)`` divided by the standard error
    ``sqrt(pooled_variance * (1/n_expr + 1/n_ctrl))``, so it is positive when
    ``expr`` has the larger mean.
    """
    mean_difference = np.mean(expr) - np.mean(ctrl)
    size_term = (1/len(expr)) + (1/len(ctrl))
    standard_error = np.sqrt(pooled_variance(expr, ctrl) * size_term)

    return mean_difference/standard_error

In [None]:
## Calculate the t statistic and the one-sided critical t value
t_stat = twosample_tstatistic(ipa_sample, pilsener_sample)
print('t-statistic value:', t_stat)
# Degrees of freedom for the pooled test are n1 + n2 - 2; deriving them from
# the samples keeps the critical value correct if the sample size changes
# (this is 98 for the two samples of 50 drawn above).
dof = len(ipa_sample) + len(pilsener_sample) - 2
# Upper-tail critical value at the 5% significance level.
t_crit = stats.t.ppf(q=1-0.05, df=dof)
print('t-critial value:', t_crit)

In [None]:
# Visualize t and p_value
import scipy.stats as stats

def visualize_t(t_stat, n_control, n_experimental):
    """Plot the t-distribution density and mark ``t_stat`` and ``-t_stat`` on it.

    The density uses ``n_control + n_experimental - 2`` degrees of freedom and
    is drawn on 500 evenly spaced points between -4 and 4. Two dashed black
    vertical lines are drawn at ``+t_stat`` and ``-t_stat``. The figure is
    shown immediately and nothing is returned.
    """
    dof = n_control + n_experimental - 2
    grid = np.linspace(-4, 4, 500)
    # Standard t density (location 0, scale 1) evaluated on the grid.
    density = stats.t.pdf(grid, dof, 0, 1)

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(grid, density, linewidth=3, color='darkred')

    # Mark the observed statistic and its mirror image.
    for position in (t_stat, -t_stat):
        ax.axvline(position, color='black', linestyle='--', lw=5)
    plt.show()
    return None

In [None]:
# Only the sum of the two sizes is used (for the degrees of freedom), so the
# control/experimental assignment does not affect the plot.
visualize_t(
    t_stat,
    n_control=len(ipa_sample),
    n_experimental=len(pilsener_sample),
)

### Welch's t-test

In [None]:
# Welch's t-test on the two 50-value ABV samples, using the project-local
# `test_moduls` helper (presumably yields a two-sided p-value — TODO confirm
# against test_moduls).
import test_moduls
test_moduls.p_value_welch_ttest(ipa_sample, pilsener_sample, two_sided=True)

In [None]:
## Same summary as the ABV cell, applied to the IBU column
# Non-null IBU values for the two styles, as NumPy arrays.
pilsener_ibu = np.array(beers.loc[beers['style_name'] == 'American-Style Pilsener', 'ibu'].dropna())
# ipa == India Pale Ale
ipa_ibu = np.array(beers.loc[beers['style_name'] == 'American-Style India Pale Ale', 'ibu'].dropna())
print('>> There are {} American-Style Pilsener records in the dataset.'.format(pilsener_ibu.size))
print('>> There are {} American-Style IPA records in the dataset.'.format(ipa_ibu.size))

# IBU comparison: mean, population standard deviation (ddof=0) and size per style
print('\nIBU metrix for American-Style Pilsener and India Pale Ale (IPA)')
print('--------------')
print('American-Style Pilsener mean:', round(np.mean(pilsener_ibu), 3))
print('American-Style Pilsener standard divation:', round(np.std(pilsener_ibu), 3))
print('American-Style Pilsener size:', pilsener_ibu.size)
print('American-Style India Pale Ale mean:', round(np.mean(ipa_ibu), 3))
print('American-Style India Pale Ale standard divation:', round(np.std(ipa_ibu), 3))
print('American-Style India Pale Ale size:', ipa_ibu.size)

In [None]:
# List the distinct IBU values recorded for American-Style IPA
# (np.unique returns them sorted).
np.unique(ipa_ibu)

### ABV of American Style Pilsner
American Pilsners, particularly Imperial Pilsners, typically have an alcohol content between 6.5% and 9% alcohol by volume. 


In [None]:
def boxplot(dataset):
    """Draw three side-by-side notched box plots for ABV, gravity and IBU.

    Parameters
    ----------
    dataset : object exposing ``abv``, ``gravity`` and ``ibu`` attributes
        Presumably a pandas DataFrame with those columns — TODO confirm
        against callers.

    The panels do not share a y-axis, and each box also shows the mean marker.
    Nothing is returned.
    """
    # (attribute to plot, tick label, panel title), left to right
    panels = (
        ('abv', 'ABV', 'Alcohol By Volume'),
        ('gravity', 'OG', 'Original Gravity'),
        ('ibu', 'IBU', 'International Bitterness Units'),
    )
    fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(10, 5), sharey=False)

    for ax, (attribute, tick_label, title) in zip(axs, panels):
        ax.boxplot(getattr(dataset, attribute), labels=[tick_label], notch=True, showmeans=True)
        ax.set_title(title)

In [None]:
# Draw 50 Pilsener ABV values with replacement.
samp_pilsener = np.array(random.choices(pilsener, k=50))
# Standardize the full Pilsener ABV array in this cell so the plot below does
# not depend on a variable that is only created further down the notebook.
pilsener_stand = (pilsener - pilsener.mean()) / pilsener.std()
sns.set(color_codes=True)
sns.set(rc={'figure.figsize':(10,5)})
# kdeplot draws the same density curve as the deprecated distplot(hist=False).
sns.kdeplot(pilsener_stand)

In [None]:
import test_moduls as test
# The original referenced `array1`, which is never defined. Build the IBU
# values of the beers whose ABV is exactly 5.0 directly from `beers`, so the
# cell runs on a fresh kernel.
five_abv_ibu = np.asarray(beers.loc[beers['abv'] == 5.0, 'ibu'].dropna())
# Draw 50 of those IBU values with replacement.
sampling_from_five_ibu = np.array(random.choices(five_abv_ibu, k=50))

In [None]:
# Reference mean for the (currently disabled) one-sample test below —
# presumably the midpoint of the 6.5%-9% ABV range quoted in the markdown
# above; TODO confirm.
mu = 7.75
# Z-score the Pilsener ABV values. The vectorized form computes the mean and
# standard deviation once instead of once per element.
pilsener_stand = (pilsener - pilsener.mean()) / pilsener.std()
# test.ttest(pilsener, mu, 0.05)



In [None]:
# Number of records per beer style, most frequent first.
beers['style_name'].value_counts()

In [None]:
# Keep this three-column frame under its own name: `pilsener` is the 1-D ABV
# array that the sampling cells rely on, and rebinding it to a DataFrame
# breaks `random.choices(pilsener, ...)` and the rounded `.mean()` prints.
pilsener_vitals = beers[beers['style_name'] == 'American-Style Pilsener'][['abv', 'ibu', 'gravity']]
pilsener_vitals.gravity.value_counts()

In [None]:
import random
random.seed(2019)
# sampling with replacement: 3000 ABV values per style
pilsener_sampling = np.array(random.choices(pilsener, k=3000))
print('sampling with choices():', len(pilsener_sampling))

ipa_sampling = np.array(random.choices(ipa, k=3000))
print('sampling with choices():', len(ipa_sampling))

# The original combined an f-string prefix with an empty `{}` placeholder,
# which is a SyntaxError; a plain str.format call is what was intended.
print('This is for the {} samples'.format(len(ipa_sampling)))
print('---------------------------')
print('American-Style Pilsener mean:',round(pilsener_sampling.mean(),3))
print('American-Style Pilsener standard divation:', round(pilsener_sampling.std(),3))
print('American-Style Pilsener size', len(pilsener_sampling))
print('American-Style India Pale Ale mean', round(ipa_sampling.mean(),3))
print('American-Style India Pale Ale standard divation:', round(ipa_sampling.std(),3))
print('American-Style India Pale Ale size', len(ipa_sampling))


# Overlay the two sample densities for visual inspection of means and spreads.
# kdeplot draws the same curve as the deprecated distplot(hist=False).
sns.set(color_codes=True)
sns.set(rc={'figure.figsize':(10,5)})
sns.kdeplot(pilsener_sampling)
sns.kdeplot(ipa_sampling)

### Welch test

In [None]:
# Welch's t-test on the two 3000-value resamples, using the project-local
# `test_moduls` helper (presumably yields a two-sided p-value — TODO confirm
# against test_moduls).
import test_moduls
test_moduls.p_value_welch_ttest(pilsener_sampling, ipa_sampling, two_sided=True)


### Standardizing the data

In [None]:
# Z-score each resample. The vectorized form computes each mean and standard
# deviation once rather than once per element.
ipa_lst = (ipa_sampling - ipa_sampling.mean()) / ipa_sampling.std()
pilsener_lst = (pilsener_sampling - pilsener_sampling.mean()) / pilsener_sampling.std()

# Report the actual sample size; the original text hard-coded "50" although
# the resamples standardized here hold 3000 values each.
print('This is after standerdization of the {} samples'.format(len(ipa_lst)))
print('---------------------------')
print('American-Style Pilsener mean:',round(pilsener_lst.mean(),3))
print('American-Style Pilsener standard divation:', round(pilsener_lst.std(),3))
print('American-Style Pilsener size', len(pilsener_lst))
print('American-Style India Pale Ale mean', round(ipa_lst.mean(),3))
print('American-Style India Pale Ale standard divation:', round(ipa_lst.std(),3))
print('American-Style India Pale Ale size', len(ipa_lst))

# Overlay the two standardized densities for visual inspection.
# kdeplot draws the same curve as the deprecated distplot(hist=False).
sns.set(color_codes=True)
sns.set(rc={'figure.figsize':(10,5)})
sns.kdeplot(pilsener_lst)
sns.kdeplot(ipa_lst)

In [None]:
# Inspect the raw values of the resampled IPA ABV array.
print(ipa_sampling)

In [None]:
# Number of records per beer style, most frequent first.
beers['style_name'].value_counts()

In [None]:
# Restrict to the two columns of interest, then keep only rows where both
# IBU and ABV are present.
new_beer = beers[['ibu', 'abv']]
nb = new_beer.dropna()
# nb.info()
print(nb['ibu'].value_counts())

In [None]:
# Frequency of each ABV value among the complete (IBU and ABV present) rows.
print(nb['abv'].value_counts())

In [None]:
# Split the complete rows into two groups by exact ABV value.
# NOTE: despite its name, `nb_ten` holds the rows with abv == 4.5.
nb_five = nb.loc[nb['abv'] == 5.0]
nb_ten = nb.loc[nb['abv'] == 4.5]
print('ABV counnts are', nb_five['abv'].value_counts())
print('ABV counnts are', nb_ten['abv'].value_counts())

In [None]:
# Display the abv == 4.5 group (the name says "ten", but the filter that
# builds it uses 4.5).
nb_ten

## Two-independent sample t-test

In [None]:
# get a sample of 50 from the two independent groups
import random

array_5_ibu, array_10_ibu = np.array(nb_five.ibu), np.array(nb_ten.ibu)
# Sampling with replacement. The original drew from `array1`/`array2`, which
# are never defined; the arrays built on the line above are the intended sources.
sampling_from_five_ibu = np.array(random.choices(array_5_ibu, k=50))
sampling_from_ten_ibu = np.array(random.choices(array_10_ibu, k=50))
# Report the real source sizes instead of hard-coded counts, and label the
# second group correctly (the original said "five" for both).
print('sampling fifty items from a {} items of five ibu list'.format(len(array_5_ibu)), len(sampling_from_five_ibu))
print('sampling fifty items from a {} items of ten ibu list'.format(len(array_10_ibu)), len(sampling_from_ten_ibu))

In [None]:
# Mean, population standard deviation (ddof=0) and size of each 50-value IBU sample.
print('Five ibu mean', np.mean(sampling_from_five_ibu))
print('Five ibu standard divation', np.std(sampling_from_five_ibu))
print('Five ibu size', sampling_from_five_ibu.size)
print('Ten ibu mean', np.mean(sampling_from_ten_ibu))
print('Ten ibu standard divation', np.std(sampling_from_ten_ibu))
print('Ten ibu size', sampling_from_ten_ibu.size)

In [None]:
# `array2` is never defined in this notebook; the second group's IBU values
# live in `array_10_ibu`.
array_10_ibu

In [None]:
import cleaning_data as cd

In [None]:
# cleaning dataset and pulling only the three columns
# NOTE(review): `cd.beer_vital_stats` comes from the project-local
# `cleaning_data` module, so which three columns it keeps is not visible here
# (presumably abv, ibu and gravity — TODO confirm).
beer_vital_stats = cd.beer_vital_stats(beers)
beer_vital_stats.head()

In [None]:
# Summarize the cleaned data (assuming a pandas DataFrame, this lists column
# dtypes and non-null counts).
beer_vital_stats.info()

In [None]:
# Check the box plot to see the data is uniform
# NOTE(review): `visualization` is a project-local module; its `boxplot`
# presumably matches the `boxplot` function defined earlier in this notebook
# — TODO confirm.
import visualization as vis
vis.boxplot(beer_vital_stats)

Check the variability (dispersion) of the data using a box plot.

In [None]:
# Smallest and largest value of each column, printed one summary after the other.
print(beer_vital_stats.min(), beer_vital_stats.max(), sep='\n')

In [None]:
# Frequency of each IBU value in the cleaned data.
beer_vital_stats['ibu'].value_counts()

## IBU Hypothesis

In [None]:
# Preview the first three rows of the raw data.
beers.head(3)

In [None]:
# Non-null IBU values for the two styles, as NumPy arrays
# (the same selection as the earlier IBU summary cell).
pilsener_ibu = np.array(beers.loc[beers['style_name'] == 'American-Style Pilsener', 'ibu'].dropna())
# ipa == India Pale Ale
ipa_ibu = np.array(beers.loc[beers['style_name'] == 'American-Style India Pale Ale', 'ibu'].dropna())

# IBU comparison: mean, population standard deviation (ddof=0) and size per style
print('Working data information')
print('--------------------------')
print('American-Style Pilsener mean:', round(np.mean(pilsener_ibu), 3))
print('American-Style Pilsener standard divation:', round(np.std(pilsener_ibu), 3))
print('American-Style Pilsener size', pilsener_ibu.size)
print('American-Style India Pale Ale mean', round(np.mean(ipa_ibu), 3))
print('American-Style India Pale Ale standard divation:', round(np.std(ipa_ibu), 3))
print('American-Style India Pale Ale size', ipa_ibu.size)

In [None]:
# Overlay the IBU densities of the two styles to inspect how their means and
# spreads compare.
sns.set(color_codes=True)
sns.set(rc={'figure.figsize':(10,5)})
# kdeplot draws the same density curve as distplot(hist=False), which is
# deprecated in current seaborn releases.
sns.kdeplot(pilsener_ibu)
sns.kdeplot(ipa_ibu)

In [None]:
# Frequency of each recorded IBU value across all beer styles.
beers['ibu'].value_counts()

In [None]:
# Frequency of each recorded gravity value across all beer styles.
beers['gravity'].value_counts()

In [None]:
# Distinct IBU values among beers whose gravity is exactly 1.051.
og_ibu = beers.loc[beers['gravity'] == 1.051, 'ibu']
og_ibu.unique()

In [None]:
# IBU and gravity columns for the beers whose ABV is exactly 4.5, followed by
# a summary of that subset.
abv = beers.loc[beers['abv'] == 4.5, ['ibu', 'gravity']]
abv.info()