# Notebook 10: Statistics

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# This is the main statistics package in python
import scipy.stats

## Probability distributions

Here is the scipy.stats documentation:

https://docs.scipy.org/doc/scipy/reference/stats.html

In [None]:
# Choose the distribution to explore (the standard normal by default)
from scipy.stats import norm, chi2, expon

dist = norm  # swap in e.g. chi2(100) to explore a different distribution

# Plot range: from the 0.1% quantile to the 99.9% quantile of `dist`
x_min = dist.ppf(.001)
x_max = dist.ppf(.999)

# Create gridpoints in x
x = np.linspace(x_min, x_max, 1000)

# Get the values for the probability density
y = dist.pdf(x)

# Plot the density as a filled area between the curve and zero
plt.fill_between(x, y, 0)
# "PDF" abbreviates probability *density* function, matching dist.pdf above
plt.title('Probability Density Function (PDF)');

In [None]:
# A distribution can also be described by its cumulative distribution function;
# evaluate it on the same x grid that was used for the density
y = dist.cdf(x)
plt.plot(x, y)
plt.title(label='Cumulative Distribution Function (CDF)');

In [None]:
# The PPF is the inverse of the CDF: it maps a probability to a quantile.
# It must therefore be evaluated on probabilities in [0, 1], not on the
# x-value grid (outside [0, 1] dist.ppf returns NaN and nothing is drawn).
# The endpoints match the .001/.999 quantiles used to build the x grid.
q = np.linspace(.001, .999, 1000)
y = dist.ppf(q)
plt.plot(q, y)
plt.title('Percent Point Function (PPF)');

In [None]:
# Draw 100 random numbers from `dist`.
# A fixed random_state makes the draw (and every plot built from `samples`)
# reproducible under Restart & Run All; change or remove it to get new draws.
samples = dist.rvs(size=100, random_state=0)

In [None]:
# Histogram of the random draws, using 100 bins
# NOTE(review): distplot is reportedly deprecated in newer seaborn releases;
# confirm the installed version before switching to histplot/displot.
sns.distplot(a=samples, bins=100)

Seaborn tries to estimate the underlying distribution using a rather crude technique.
A more advanced method is implemented in a package called SUFTware (from my lab). It'll not just estimate a probability distribution from data, it will also estimate how uncertain that distribution is.

https://suftware.readthedocs.io

<img src="who.alcohol_consumption.png" alt="Drawing" style="width: 400px;"/>

In [None]:
# Install SUFTware (required by the `import suftware as sw` cell that follows).
# Uncomment the next line and run this cell once if the package is missing.
#!pip install suftware

In [None]:
# SUFTware density estimate, drawn on top of the seaborn histogram
import suftware as sw

# Fit the estimator to the samples, requesting 20 posterior samples
est = sw.DensityEstimator(samples, num_posterior_samples=20)

# Evaluate the posterior-sampled densities on the estimator's own grid
# (est.evaluate(x) is the alternative call that was left disabled here)
x = est.grid
y = est.evaluate_samples(x)

# Draw the seaborn distplot again, then add the SUFTware output to the same axes
sns.distplot(a=samples, bins=100)
plt.plot(x, y);

## Hypothesis testing

In [None]:
# Fetch seaborn's example "tips" dataset and preview its first five rows
tips = sns.load_dataset(name="tips")
tips.head(n=5)

In [None]:
# Nested boxplot: total bill for each day, split by smoker status
sns.boxplot(
    data=tips,
    x="day",
    y="total_bill",
    hue="smoker",
    palette=["m", "g"],
)
sns.despine(trim=True, offset=10)

In [None]:
# Tip expressed as a percentage of the total bill (stored as a new column)
tips['pct'] = tips['tip'].mul(100).div(tips['total_bill'])

# Box plot of the tip percentage, one box per sex
sns.boxplot(data=tips, x="sex", y="pct")
sns.despine(trim=True, offset=10)

In [None]:
# Select the tip percentages for each sex
m_tips = tips.loc[tips['sex'] == 'Male', 'pct']
f_tips = tips.loc[tips['sex'] == 'Female', 'pct']

# Overlay the two distributions on the same axes, labelled for the legend
sns.distplot(a=m_tips, label='male', bins=10)
sns.distplot(a=f_tips, label='female', bins=10)
plt.legend()

In [None]:
# Two-sample t-test comparing male and female tip percentages.
# The notebook imports the package as `scipy.stats`; the bare name `stats`
# is never bound, so the call must go through the fully qualified name.
t, p = scipy.stats.ttest_ind(m_tips, f_tips)
print('Pvalue is ', p)

## Exercise

E10.1: Are smokers' meals less expensive than nonsmokers' meals?

In [None]:
# Answer here

E10.2: Do tips vary significantly by day of the week?

In [None]:
# Answer here