In [1]:
import numpy as np
import pandas as pd
import statistics

In [2]:
# Roster of the kids; these names become the row labels of the ages table.
kids = ['Greg', 'Marcia', 'Peter', 'Jan', 'Bobby', 'Cindy', 'Oliver']

# One row per kid with a single 'age' column, listed in roster order.
brady = pd.DataFrame(index=kids)
brady['age'] = [14, 12, 11, 10, 8, 6, 8]

# Summary statistics for the age column. The standard deviation is computed
# once with ddof=1 and reused for the standard error (std / sqrt(n)).
ages = brady['age']
age_std = np.std(ages, ddof=1)
print("mean: {}".format(np.mean(ages)))
print("median: {}".format(np.median(ages)))
print("mode: {}".format(statistics.mode(ages)))
print("variance: {}".format(ages.var()))
print("standard deviation: {}".format(age_std))
print("standard error: {}".format(age_std / np.sqrt(len(ages))))

mean: 9.857142857142858
median: 10.0
mode: 8
variance: 7.476190476190475
standard deviation: 2.734262327610589
standard error: 1.0334540197243192


In [3]:
# The mean and median are similar enough that neither is particularly better than the other for describing this dataset. The mode isn't helpful at all.
# Standard deviation is the best value to describe the spread of this set: standard error is irrelevant since the entire population is accounted for, and the variance is expressed in squared units (years squared), so its magnitude cannot be compared directly to the mean.

In [4]:
# Cindy has a birthday and is now 7. The new age is assigned outright rather
# than incremented with `+= 1`, so re-running this cell cannot age her twice
# and the printed statistics stay reproducible.
brady.loc['Cindy', 'age'] = 7

# Same summary statistics as before, recomputed on the updated ages. The
# standard deviation (ddof=1) is computed once and reused for the standard
# error (std / sqrt(n)).
ages = brady['age']
age_std = np.std(ages, ddof=1)
print("mean: {}".format(np.mean(ages)))
print("median: {}".format(np.median(ages)))
print("mode: {}".format(statistics.mode(ages)))
print("variance: {}".format(ages.var()))
print("standard deviation: {}".format(age_std))
print("standard error: {}".format(age_std / np.sqrt(len(ages))))

mean: 10.0
median: 10.0
mode: 8
variance: 6.333333333333333
standard deviation: 2.516611478423583
standard error: 0.9511897312113418


In [5]:
# The conclusions above are unchanged, for the same reasons outlined previously.

In [6]:
# Oliver is replaced by Jessica, who is 1: relabel the existing row, then
# overwrite the age stored under the new label.
brady = brady.rename(index={"Oliver": "Jessica"})
brady.loc['Jessica', 'age'] = 1

ages = brady['age']
age_std = np.std(ages, ddof=1)
print("mean: {}".format(np.mean(ages)))
print("median: {}".format(np.median(ages)))

# Series.mode() returns every value tied for the highest count. When that is
# every distinct age (and there is more than one), all ages are equally
# common and there is no meaningful mode. This is decided from the data
# rather than hard-coded, so the message stays correct if the ages change.
modes = ages.mode()
distinct_ages = ages.nunique()
if distinct_ages > 1 and len(modes) == distinct_ages:
    print("no mode, all values appear equally")
else:
    print("mode: {}".format(", ".join(str(m) for m in modes)))

print("variance: {}".format(ages.var()))
print("standard deviation: {}".format(age_std))
print("standard error: {}".format(age_std / np.sqrt(len(ages))))

mean: 9.0
median: 10.0
no mode, all values appear equally
variance: 18.0
standard deviation: 4.242640687119285
standard error: 1.6035674514745462


In [None]:
# Because an outlier has been introduced, the median is a better measure of central tendency.
# Standard deviation is still the best measure of spread.

In [9]:
# Fan ratings reported by the four sources.
ratings = [20, 23, 17, 5]

# Mean, median and standard deviation (ddof=1), printed one value per line.
print(
    np.mean(ratings),
    np.median(ratings),
    np.std(ratings, ddof=1),
    sep="\n",
)

16.25
18.5
7.88986691902975


In [None]:
# There is clearly an outlier here, so the median is more useful than the mean for this dataset.
# Based on the median, ~18.5% of the respondents are fans of the show.
# However, since the respondents are taken from a biased sample (those who read TV-related magazines are more likely to be fans of any given show), there's no way to know what percentage of Americans in general are Brady Bunch fans.