In [85]:
import pandas as pd 
import numpy as np 

data = np.array([['Greg', 14], ['Marcia', 12], ['Peter', 11], ['Jan', 10], ['Bobby', 8],
['Cindy', 6], ['Oliver', 8]])

df = pd.DataFrame(data, columns = ['Name', 'Age'])
df['Age'] = pd.to_numeric(df['Age'])

print(df)

     Name  Age
0    Greg   14
1  Marcia   12
2   Peter   11
3     Jan   10
4   Bobby    8
5   Cindy    6
6  Oliver    8


In [68]:
# measure of central tendency
age_mean = df.Age.mean()
age_median = df.Age.median()
age_mode = df.Age.mode()

print('Mean: {}, Median: {}, Mode: {}'.format(age_mean, age_median, age_mode))
# in this case, the mean is very useful measure of central tendency as it's very close to the values of median and mode.

Mean: 9.857142857142858, Median: 10.0, Mode: 0    8
dtype: int64


In [69]:
# measure of variance

age_var = np.var(df.Age)
age_std = np.std(df.Age, ddof = 1 )
age_se = age_std / np.sqrt(len(df.Age))

print('Variance : {}, Std: {}, Standard Error: {}'.format(age_var, age_std, age_se))
# The mean and median are close, so standard deviation is good to pick as a measure of variace

Variance : 6.408163265306122, Std: 2.734262327610589, Standard Error: 1.0334540197243192


In [70]:
# Cindy has a birthday
df.loc[5,'Age'] = 7

# updating estimates of centarl tendency
updated_mean = df.Age.mean()
updated_median = df.Age.median()
updated_mode = df.Age.mode()
print('Mean: {}, Median: {}, Mode: {}'.format(updated_mean, updated_median, updated_mode))

# only the mean of the dataset changed. the median and mode are the same as before

Mean: 10.0, Median: 10.0, Mode: 0    8
dtype: int64


In [71]:
# updated estimates of variance

updated_var = np.var(df.Age)
updated_std = np.std(df.Age, ddof = 1 )
updated_se = age_std / np.sqrt(len(df.Age))

print('Variance : {}, Std: {}, Standard Error: {}'.format(updated_var, updated_std, updated_se))

# small changes in variance and standard deviation is observed. Standard Error is the same as before

Variance : 5.428571428571429, Std: 2.516611478423583, Standard Error: 1.0334540197243192


In [72]:
# replacing Oliver(8) with Jessica(1)

df.loc[6:6, 'Name':'Age'] = 'Jessica', 1

In [73]:
# new estimates of central tendency with Jessica(1)

updated2_mean = df.Age.mean()
updated2_median = df.Age.median()
updated2_mode = df.Age.mode()
print('Mean: {}, Median: {}, Mode: {}'.format(updated2_mean, updated2_median, updated2_mode))

# The mean is still good. All values are unique, so mode cannot be useful as a measure of central tendency as well as the median
# falling farther from the min value of the dataset than mean.

Mean: 9.0, Median: 10.0, Mode: 0     1
1     7
2     8
3    10
4    11
5    12
6    14
dtype: int64


In [74]:
# new estimates of variance with Jessica(1)

updated2_var = np.var(df.Age)
updated2_std = np.std(df.Age, ddof = 1 )
updated2_se = age_std / np.sqrt(len(df.Age))

print('Variance : {}, Std: {}, Standard Error: {}'.format(updated2_var, updated2_std, updated2_se))
# Variance tripled and standard deviation doubled. Only Standard error kept the same 

Variance : 15.428571428571429, Std: 4.242640687119285, Standard Error: 1.0334540197243192


In [83]:
# the Brady Bunch is general entertainment show. So we should exclude SciPhi Phanatic fans (narrow interest) and
# work with the rest 3 as they are more general

magazines = np.array([['TV Guide', 20], ['Entertainment Weekly', 23], ['Pop Culture Today', 17]])
data = pd.DataFrame(magazines, columns = ['Magazine', 'Fans %'])
data['Fans %'] = pd.to_numeric(data['Fans %'])
mean = np.mean(data['Fans %'])
print('the answer is: ', mean)

the answer is:  20.0
