In [1]:
import pandas as pd
import numpy as np

# to make this notebook's output stable across runs
np.random.seed(123)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [2]:
mosquito = pd.read_csv("/home/isaac/Fundamentals_of_Data_Science_Certificate/3251 - Stats/A1/mosquitos_data.csv")

In [3]:
mosquito.head(50)

Unnamed: 0,Response,Treatment
0,27,Beer
1,20,Beer
2,21,Beer
3,26,Beer
4,27,Beer
5,31,Beer
6,24,Beer
7,21,Beer
8,20,Beer
9,19,Beer


In [4]:
df_beer = mosquito[mosquito["Treatment"] == 'Beer']
df_water = mosquito[mosquito["Treatment"] == 'Water']

In [5]:
df_beer.head(10)

Unnamed: 0,Response,Treatment
0,27,Beer
1,20,Beer
2,21,Beer
3,26,Beer
4,27,Beer
5,31,Beer
6,24,Beer
7,21,Beer
8,20,Beer
9,19,Beer


In [6]:
trace0 = go.Box(
    x=df_beer["Treatment"],
    y=df_beer["Response"],
    name='Beer'
)

trace1 = go.Box(
    x=df_water['Treatment'],
    y=df_water['Response'],
    name='Water'
)
    
data = [trace0, trace1]

layout = go.Layout(
    yaxis=dict(
        title='Number of mosquitos in each group'
    ),
    boxmode='group'
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='mosquitos-beer-water')

As we can see from the side-by-side bar graph, the median of mosquito bites for those who consume beer (24) is greater than the median of mosquito bites for those who consume water (20).

All of the statistical values for beer are greater than that of the water suggesting that the beer consumption leads to a higher bites from mosquitos.

Distribution for water is skewed to the left as given by the higher distance between median and Q1; this skewness suggests that the dataset is predominantly filled by values that have a low number of mosquito bites. Distribution for beer is quite symmetric and seems to be generally normally distributed given by nearly equal distance between MIn, Q1, Median, Q3, and Max.

In [11]:
import plotly.figure_factory as ff

# Add histogram data
x1 = df_beer["Response"]
x2 = df_water["Response"]

# Group data together
hist_data = [x1, x2]

group_labels = ['Beer', 'Water']

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, bin_size=.2)

# Plot!
py.iplot(fig, filename='Distplot with Multiple Datasets')

In [13]:
import plotly.tools as tls

In [41]:
#First plot
trace0 = go.Histogram(
    x = df_beer["Response"],
    histnorm='probability',
    name="Beer"
)
#Second plot
trace1 = go.Histogram(
    x = df_water["Response"],
    histnorm='probability',
    name="Water"
)

#Creating the grid
fig = tls.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                          subplot_titles=('Beer','Water'))

#setting the figs
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)

fig['layout'].update(showlegend=True, title='Mosquito Bite Distribuition', bargap=0.05)
py.iplot(fig, filename='custom-sized-subplot-with-subplot-titles')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3           -      ]



Just as expected, mosquito bites for water are skewed to the left signifying a greater number of mosquito bites in the left tail.

In [7]:
df_beer.describe()

Unnamed: 0,Response
count,25.0
mean,23.6
std,4.133199
min,17.0
25%,20.0
50%,24.0
75%,27.0
max,31.0


In [8]:
df_water.describe()

Unnamed: 0,Response
count,18.0
mean,19.222222
std,3.67112
min,12.0
25%,16.5
50%,20.0
75%,22.0
max,24.0


Alright, let's talk about the above numbers:

- Beer dataset contains 25 instances and Water dataset contains 18 instances.
- For beer, there seems be nearly the same distance (around 3-4) between Min (17), Q1 (20), Q2 (24), Q3 (27), Max (31) suggesting that the data is symmetrically distributed along the mean 23.6 with the standard deviation of 4.1.
- For water, there seems to be skewness to the left as given by the greater distance between Min (12), Q1 (16.5), and Q2 (20) and a lesser distance of 2 between Q2 (20), Q3 (22), and Max (24).

Let's take a look at the difference between descriptive statistics of beer and water.

In [42]:
df_beer.describe() - df_water.describe()

Unnamed: 0,Response
count,7.0
mean,4.377778
std,0.462079
min,5.0
25%,3.5
50%,4.0
75%,5.0
max,7.0


From the above, we can case and be reassured that beer consumption does indeed lead to higher mosquito bites compared to water as also evident from the bar graph above.

# Let's generate 50000 instances of mean difference

In [18]:
mean_dif_list = []
n_iter = 50000
Ha = 0 
for i in range(n_iter):
    mosquito["Treatment"] = mosquito["Treatment"].transform(np.random.permutation)
    df_beer = mosquito[mosquito["Treatment"] == 'Beer']
    df_water = mosquito[mosquito["Treatment"] == 'Water']
    mean_diff = np.mean(df_beer["Response"]) - np.mean(df_water["Response"])
    mean_dif_list.append(mean_diff)
    if mean_diff >= 4.4:
            Ha +=1
            
print('p = ', Ha/n_iter*100, "%")

p =  0.034 %


In [19]:
mean_dif_list[0:10]

[-0.3044444444444423,
 1.1288888888888877,
 -0.7822222222222202,
 0.2688888888888883,
 1.5111111111111093,
 -0.3999999999999986,
 -0.87777777777778,
 -0.7822222222222202,
 0.173333333333332,
 0.07777777777777928]

In [20]:
#plt.hist(list(mean_dif_list),bins = 250, color='b', alpha=0.3)

In [21]:
import plotly.figure_factory as ff

import numpy as np

# Group data together
##hist_data = [mean_dif_list]

##group_labels = ['Mean Diff Distribution']

# Create distplot with custom bin_size
##fig = ff.create_distplot(hist_data, group_labels, bin_size= 0.1)

# Plot!
##py.iplot(fig, filename='Distplot with One Dataset')

In [22]:
mosquito["Treatment"] = mosquito["Treatment"].transform(np.random.permutation)
mosquito.head(10)

Unnamed: 0,Response,Treatment
0,27,Water
1,20,Beer
2,21,Water
3,26,Water
4,27,Water
5,31,Water
6,24,Beer
7,21,Beer
8,20,Beer
9,19,Water


In [23]:
mosquito["Treatment"] = mosquito["Treatment"].transform(np.random.permutation)
mosquito.head(10)

Unnamed: 0,Response,Treatment
0,27,Water
1,20,Beer
2,21,Beer
3,26,Beer
4,27,Beer
5,31,Beer
6,24,Beer
7,21,Beer
8,20,Water
9,19,Beer


In [24]:
data = [go.Histogram(x=mean_dif_list)]

layout = go.Layout(
    title='Mean Dif Bar Graph',
    xaxis=dict(
        title='Mean'
    ),
    yaxis=dict(
        title='Count'
    ),
    bargap=0.2,
    bargroupgap=0.1
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig,filename='basic histogram')

In [24]:
pd.DataFrame(mean_dif_list).to_csv("mean_dif_list.csv")