# Chapter 4: Measures of dispersion
For additional context, see Chapter 4 of *The Virus of Variation*. Questions? Email James.Lehner@gmail.com or QualityIsBroken@gmail.com.

In [1]:
# Import libraries
from matplotlib import pyplot as plt
from process_improvement import comparison_charts as cc
import numpy as np
import seaborn as sns
import pandas as pd

%matplotlib inline

## Get the data

In [10]:
# Dataset URL (CSV file served from raw.githubusercontent.com)
rate_url = r'https://raw.githubusercontent.com/jimlehner/the-virus-of-variation/refs/heads/main/data/vienna-general-death-to-birth-rates-by-clinic.csv'


def get_data(url=None) -> pd.DataFrame:
    """Read a CSV dataset into a DataFrame.

    Parameters
    ----------
    url : str, path object or file-like object, optional
        Source handed straight to ``pandas.read_csv``. When omitted, the
        module-level ``rate_url`` is used, so ``get_data()`` behaves exactly
        as it did before the parameter existed.

    Returns
    -------
    pd.DataFrame
        The parsed contents of the CSV source.
    """
    # Resolve the default at call time so a later reassignment of rate_url
    # is still honoured.
    source = rate_url if url is None else url
    return pd.read_csv(source)

# Load the raw records
raw_rates = get_data()

# Keep only the columns this chapter works with; the other candidate columns
# (Hospital, Employee, Births, Deaths, Note) are deliberately left out
rate_df = raw_rates[['Clinic', 'Year', 'Rate']]

# Store 'Year' using the pandas 'string' dtype
rate_df = rate_df.assign(Year=rate_df['Year'].astype('string'))

# Preview the first rows
rate_df.head()

Unnamed: 0,Clinic,Year,Rate
0,1st Clinic,1833,5.3
1,1st Clinic,1834,7.7
2,1st Clinic,1835,5.6
3,1st Clinic,1836,7.5
4,1st Clinic,1837,9.1


## Split rate_df into first_clinic_df and second_clinic_df

In [25]:
def _clinic_rates(clinic_name, rate_label):
    """Return the Year/rate rows for one clinic.

    Filters ``rate_df`` to ``clinic_name``, keeps positional rows 11 through 21
    of that clinic's records (described in this notebook as 1844 to 1854),
    resets the index, drops the 'Clinic' column and renames 'Rate' to
    ``rate_label``.
    """
    clinic_rows = rate_df[rate_df['Clinic'] == clinic_name].copy()
    window = clinic_rows.iloc[11:22].reset_index(drop=True)
    return window.drop(columns='Clinic').rename(columns={'Rate': rate_label})


# First clinic, staffed by students and doctors
first_clinic_df = _clinic_rates('1st Clinic', 'Students & Doctors')

# Second clinic, staffed by midwives
second_clinic_df = _clinic_rates('2nd Clinic', 'Midwives')

second_clinic_df

Unnamed: 0,Year,Midwives
0,1844,2.3
1,1845,2.0
2,1846,2.8
3,1847,1.0
4,1848,1.3
5,1849,2.6
6,1850,1.7
7,1851,3.6
8,1852,5.7
9,1853,1.9


## Table 4.1: Death-to-birth rates for the two maternity clinics

In [30]:
# Join the two clinics on their shared 'Year' column, then index the result by year
merged_rates_df = first_clinic_df.merge(second_clinic_df, on='Year')
merged_rates_df = merged_rates_df.set_index('Year')

# Display the merged table
merged_rates_df

Unnamed: 0_level_0,Students & Doctors,Midwives
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1844,8.2,2.3
1845,6.9,2.0
1846,11.4,2.8
1847,5.0,1.0
1848,1.3,1.3
1849,2.7,2.6
1850,2.0,1.7
1851,1.8,3.6
1852,4.0,5.7
1853,2.2,1.9


## Table 4.2: Death-to-birth rates for the two maternity clinics and the associated means

In [48]:
# Column-wise sum, count and mean of merged_rates_df, rounded to one decimal
mean_row_labels = {'sum': 'Sum', 'count': 'Count', 'mean': 'Mean'}
agg_stats_df = merged_rates_df.agg(list(mean_row_labels)).round(1)
agg_stats_df = agg_stats_df.rename(index=mean_row_labels)

# Table 4.2: the yearly rates followed by the summary rows, with the index labelled 'Year'
table_4_2 = pd.concat([merged_rates_df, agg_stats_df]).rename_axis('Year')

# Display the table
table_4_2

Unnamed: 0_level_0,Students & Doctors,Midwives
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1844,8.2,2.3
1845,6.9,2.0
1846,11.4,2.8
1847,5.0,1.0
1848,1.3,1.3
1849,2.7,2.6
1850,2.0,1.7
1851,1.8,3.6
1852,4.0,5.7
1853,2.2,1.9


## Table 4.3: Death-to-birth rates for the two maternity clinics and the associated ranges

In [49]:
# Per-clinic maximum, minimum and range (max - min), rounded to one decimal
column_max = merged_rates_df.max()
column_min = merged_rates_df.min()
range_stats_df = pd.DataFrame({'Max': column_max,
                               'Min': column_min,
                               'Range': column_max - column_min}).T.round(1)

# Table 4.3: the yearly rates followed by the Max/Min/Range rows, with the index labelled 'Year'
table_4_3 = pd.concat([merged_rates_df, range_stats_df]).rename_axis('Year')

# Display the table
table_4_3

Unnamed: 0_level_0,Students & Doctors,Midwives
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1844,8.2,2.3
1845,6.9,2.0
1846,11.4,2.8
1847,5.0,1.0
1848,1.3,1.3
1849,2.7,2.6
1850,2.0,1.7
1851,1.8,3.6
1852,4.0,5.7
1853,2.2,1.9


## Table 4.4: Death-to-birth rates for the two maternity clinics and the associated standard deviations

In [51]:
# Per-clinic standard deviation (pandas default, ddof=1), rounded to one
# decimal and stored as a single row labelled 's'
stdev_stats_df = merged_rates_df.std().round(1).to_frame(name='s').T

# Table 4.4: the yearly rates followed by the 's' row, with the index labelled 'Year'
table_4_4 = pd.concat([merged_rates_df, stdev_stats_df]).rename_axis('Year')

# Display the table
table_4_4

Unnamed: 0_level_0,Students & Doctors,Midwives
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1844,8.2,2.3
1845,6.9,2.0
1846,11.4,2.8
1847,5.0,1.0
1848,1.3,1.3
1849,2.7,2.6
1850,2.0,1.7
1851,1.8,3.6
1852,4.0,5.7
1853,2.2,1.9


## Table 4.5: Death-to-birth rates, moving ranges, and average moving ranges for the clinic run by students and doctors

In [64]:
# Select only students and doctors. Take an explicit copy so that adding a
# column below does not write into a selection of merged_rates_df (which
# triggers pandas' SettingWithCopyWarning).
students_docs_df = merged_rates_df[['Students & Doctors']].copy()

# Moving range: absolute difference between consecutive rows.
# The first row has no predecessor, so its moving range is NaN.
students_docs_df['Moving Range'] = students_docs_df['Students & Doctors'].diff().abs()

# Sum, count and mean of the moving ranges, rounded to one decimal.
# to_frame() keeps the Series name, so the column is still 'Moving Range'.
avemR_stats_df = (students_docs_df['Moving Range']
                  .agg(['sum', 'count', 'mean'])
                  .round(1)
                  .rename(index={'sum': 'Sum',
                                 'count': 'Count',
                                 'mean': 'Ave. mR'})
                  .to_frame())

# Table 4.5: the yearly values followed by the summary rows, with the index labelled 'Year'
table_4_5 = pd.concat([students_docs_df, avemR_stats_df]).rename_axis('Year')

# Show dataframes (a notebook cell renders only the last expression).
# The first name previously read `students_and_doctors_df`, which is never
# defined and raised NameError.
students_docs_df

avemR_stats_df

table_4_5

Unnamed: 0_level_0,Students & Doctors,Moving Range
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1844,8.2,
1845,6.9,1.3
1846,11.4,4.5
1847,5.0,6.4
1848,1.3,3.7
1849,2.7,1.4
1850,2.0,0.7
1851,1.8,0.2
1852,4.0,2.2
1853,2.2,1.8


## Table 4.6: Death-to-birth rates and average moving ranges for both maternity clinics at Vienna General

In [69]:
# Select only midwives. Take an explicit copy so that adding a column below
# does not write into a selection of merged_rates_df (which triggers pandas'
# SettingWithCopyWarning).
midwives_df = merged_rates_df[['Midwives']].copy()

# Moving range: absolute difference between consecutive rows.
# The first row has no predecessor, so its moving range is NaN.
midwives_df['Moving Range'] = midwives_df['Midwives'].diff().abs()

# Show dataframe
midwives_df

Unnamed: 0_level_0,Midwives,Moving Range
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1844,2.3,
1845,2.0,0.3
1846,2.8,0.8
1847,1.0,1.8
1848,1.3,0.3
1849,2.6,1.3
1850,1.7,0.9
1851,3.6,1.9
1852,5.7,2.1
1853,1.9,3.8


In [84]:
# Combine students_docs_df with midwives_df on their shared index.
# Both frames carry a 'Moving Range' column, so pandas appends the default
# '_x' / '_y' suffixes; rename those to clinic-specific labels.
combined_df = (pd.merge(students_docs_df,
                        midwives_df,
                        left_index=True,
                        right_index=True)
               .rename(columns={"Moving Range_x": "mR (Students & Docs)",
                                "Moving Range_y": "mR (Midwives)"}))

# Sum, count and average of each clinic's moving ranges, rounded to two decimals.
# This frame was previously bound only as `combined_stats_df` while the rest of
# the notebook reads `combined_avemR_stats_df`, which raised NameError on a
# fresh run.
combined_avemR_stats_df = (combined_df[['mR (Students & Docs)', 'mR (Midwives)']]
                           .agg(['sum', 'count', 'mean'])
                           .round(2)
                           .rename(index={'sum': 'Sum',
                                          'count': 'Count',
                                          'mean': 'Ave. mR'}))

# Keep the earlier name bound to the same frame for any code that uses it
combined_stats_df = combined_avemR_stats_df

combined_avemR_stats_df

# Table 4.6: the yearly values followed by the summary rows, with the index labelled 'Year'
table_4_6 = (pd.concat([combined_df, combined_avemR_stats_df])
             .rename_axis('Year'))

table_4_6

Unnamed: 0_level_0,Students & Doctors,mR (Students & Docs),Midwives,mR (Midwives)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1844,8.2,,2.3,
1845,6.9,1.3,2.0,0.3
1846,11.4,4.5,2.8,0.8
1847,5.0,6.4,1.0,1.8
1848,1.3,3.7,1.3,0.3
1849,2.7,1.4,2.6,1.3
1850,2.0,0.7,1.7,0.9
1851,1.8,0.2,3.6,1.9
1852,4.0,2.2,5.7,2.1
1853,2.2,1.8,1.9,3.8


## Table 4.7: Death-to-birth rates by clinic with associated mean and median

In [87]:
# Per-clinic mean and median, rounded to one decimal
center_labels = {'mean': 'Mean', 'median': 'Median'}
mean_median_df = merged_rates_df.agg(list(center_labels)).round(1)
mean_median_df = mean_median_df.rename(index=center_labels)

# Table 4.7: the yearly rates followed by the Mean and Median rows
table_4_7 = pd.concat([merged_rates_df, mean_median_df])

table_4_7

Unnamed: 0,Students & Doctors,Midwives
1844,8.2,2.3
1845,6.9,2.0
1846,11.4,2.8
1847,5.0,1.0
1848,1.3,1.3
1849,2.7,2.6
1850,2.0,1.7
1851,1.8,3.6
1852,4.0,5.7
1853,2.2,1.9


## Table 4.8: Measures of dispersion for maternity clinic death-to-birth rates

In [93]:
# Range (R = max - min) and standard deviation (s, pandas default ddof=1)
# for each clinic, rounded to one decimal. Rows are built explicitly rather
# than relying on the '<lambda>' label pandas gives anonymous aggregations.
dispersion_stats_df = pd.DataFrame({
    'R': merged_rates_df.max() - merged_rates_df.min(),
    's': merged_rates_df.std(),
}).T.round(1)

# Moving-range summary computed for Table 4.6, with its columns renamed to
# match the clinic columns used here. It is read through `combined_stats_df`,
# the name the Table 4.6 cell actually binds; the previous reference to
# `combined_avemR_stats_df` raised NameError on a fresh run.
ave_mRs = combined_stats_df.rename(columns={"mR (Students & Docs)": "Students & Doctors",
                                            "mR (Midwives)": "Midwives"})

# Table 4.8: R, s and the average moving range ('Ave. mR' row) side by side
table_4_8 = pd.concat([dispersion_stats_df, ave_mRs.loc[['Ave. mR']]])

table_4_8

Unnamed: 0,Students & Doctors,Midwives
R,10.1,5.2
s,3.4,1.7
Ave. mR,2.9,1.8
