In [3]:
%matplotlib inline
import pandas as pd
from scipy import stats
from pydataset import data


tips = data('tips')

In [4]:
tips.shape


(244, 7)

In [5]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [22]:
tips.dtypes

total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

In [8]:
data('tips', show_doc = True)

tips

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Tipping data

### Description

One waiter recorded information about each tip he received over a period of a
few months working in one restaurant. He collected several variables:

### Usage

    data(tips)

### Format

A data frame with 244 rows and 7 variables

### Details

  * tip in dollars, 

  * bill in dollars, 

  * sex of the bill payer, 

  * whether there were smokers in the party, 

  * day of the week, 

  * time of day, 

  * size of the party. 

In all he recorded 244 tips. The data was reported in a collection of case
studies for business statistics (Bryant & Smith 1995).

### References

Bryant, P. G. and Smith, M (1995) _Practical Data Analysis: Case Studies in
Business Statistics_. Homewood, IL: Richard D. Irwin Publishing:




# chi2

## H0: sex is indep of whether or not someone is a smoker

In [9]:
contingency_table = pd.crosstab(tips.sex, tips.smoker)
contingency_table

smoker,No,Yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,54,33
Male,97,60


In [10]:
test_results = stats.chi2_contingency(contingency_table)
test_results

(0.008763290531773594,
 0.925417020494423,
 1,
 array([[53.84016393, 33.15983607],
        [97.15983607, 59.84016393]]))

In [12]:
_, p, _, expected = test_results

In [13]:
p

0.925417020494423

### p-value is significantly higher than .05 alpha, we fail to reject the null hypthosis that sex is independent of whether or not a customer is a smoker

In [15]:
# Here we'll do some data frame manipulation with pandas to get the two tables
# into a more comparable form
expected = pd.DataFrame(expected, index=['Female', 'Male'], columns=['Non-Smoker', 'Smoker'])

contingency_table.columns = ['Non-Smoker', 'Smoker']
contingency_table.index.name = ''

contingency_table['group'] = 'Actual'
expected['group'] = 'Expected'

(pd.concat([contingency_table, expected])
 .reset_index()
 .rename({'index': 'sex'}, axis=1)
 .set_index(['group', 'sex']))

Unnamed: 0_level_0,Unnamed: 1_level_0,Non-Smoker,Smoker
group,sex,Unnamed: 2_level_1,Unnamed: 3_level_1
Actual,Female,54.0,33.0
Actual,Male,97.0,60.0
Expected,Female,53.840164,33.159836
Expected,Male,97.159836,59.840164


# Pearson R

## Correlation

## H0: There is not linear correlation between the total bill and the tip amount.

In [16]:
test_results = stats.pearsonr(tips.total_bill, tips.tip)
test_results

(0.6757341092113645, 6.692470646863477e-34)

In [17]:
r, p = test_results

print(f'p is {p:.10f}')

p is 0.0000000000


### The p-value is less than a .05 alpha, we reject the null hypothesis that there is no correlation between total bill and the tip amount

# T Test

## 1 Sample T Test

## H0: The average bill for smokers is no different than the population mean.

In [18]:
smokers_total_bills = tips[tips.smoker == 'Yes'].total_bill
overall_total_bill_mean = tips.total_bill.mean()

test_results = stats.ttest_1samp(smokers_total_bills, overall_total_bill_mean)
test_results

Ttest_1sampResult(statistic=0.951796790928544, pvalue=0.3436939512284921)

### because the p-value is higher than a .05 alpha, we fail to reject the null hypothesis that there is no difference between the average smoker's bill and the average population's bill

## 2 Sample T Test

## H0: The average size of the tip left by parties of 2 and parties of 4 is the same.

In [20]:
parties_of_2 = tips[tips['size'] == 2]
parties_of_4 = tips[tips['size'] == 4]
test_results = stats.ttest_ind(parties_of_2.tip, parties_of_4.tip)
test_results

Ttest_indResult(statistic=-7.462130391296251, pvalue=2.924028981378475e-12)

### because the p-value is smaller than a .05 alpha, we reject the null hypothesis that parties of 2 and parties of 4 have the same average tip

## Exercise

## H0: there is no difference in average tip between lunchtime customers and average total population tip

In [25]:
lunchtime = tips[tips['time'] == 'Lunch'].tip

lunchtime.head()

78    4.00
79    3.00
80    2.71
81    3.00
82    3.40
Name: tip, dtype: float64

In [26]:
overall_tip_mean = tips.tip.mean()

overall_tip_mean

2.9982786885245902

In [27]:
test_results = stats.ttest_1samp(lunchtime, overall_tip_mean)
test_results

Ttest_1sampResult(statistic=-1.8484723073745426, pvalue=0.06894738419244793)

### p-value is barely over .05 alpha, we fail to reject the null hypothesis that there is no difference between average tip of a lunchtime customer and average tip of the total population

## H0: there is no difference between the average tip of a lunchtime customer and a dinnertime customer

In [30]:
dinnertime = tips[tips['time'] == 'Dinner'].tip

dinnertime.head()

1    1.01
2    1.66
3    3.50
4    3.31
5    3.61
Name: tip, dtype: float64

In [32]:
test_results = stats.ttest_ind(lunchtime, dinnertime)

test_results

Ttest_indResult(statistic=-1.9062569301202392, pvalue=0.05780153475171558)

### because the p-values is barely higher than a .05 alpha, we fail to reject the null hypothesis that there is no difference in the average tip between dinner and lungh time customers 

## H0: there is no correlation between party size and tip amount

In [34]:
tips['size'] = tips['size'].astype(float)

In [35]:
test_results = stats.pearsonr(tips.size, tips.tip)
test_results

TypeError: object of type 'numpy.int64' has no len()