# Chapter 7: Measures of dispersion

For additional context see chapter 7 of [The Virus of Variation & Process Behavior Charts: A Guide for the Perplexed](https://www.brokenquality.com/book).

In [113]:
# Import libraries
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

%matplotlib inline

## Get data

In [114]:
# Dataset URL
dataset_url = r'https://raw.githubusercontent.com/jimlehner/the-virus-of-variation-and-pbcs/refs/heads/main/data/07-anscombe-aerospace-manufacturing-line-data.csv'

# Get data
def get_data(url: "str | None" = None) -> pd.DataFrame:
    """Read the manufacturing-line CSV into a DataFrame.

    Parameters
    ----------
    url : str, optional
        Location of the CSV; anything ``pd.read_csv`` accepts (a URL, a
        local path, or an open file-like object). When omitted, the
        notebook-level ``dataset_url`` is used, looked up at call time.

    Returns
    -------
    pd.DataFrame
        The parsed contents of the CSV.
    """
    # Resolve the default lazily so a later change to dataset_url is honoured
    source = dataset_url if url is None else url
    return pd.read_csv(source)

# Load the dataset into a DataFrame and preview its first five rows
df = get_data()
df.head()

Unnamed: 0,Line,Sample,Value
0,A,1,2.68
1,A,2,2.32
2,A,3,2.53
3,A,4,2.94
4,A,5,2.78


## Pivot the dataframe

In [115]:
# Reshape from long to wide: one row per sample, one column per line
df_pivot = df.pivot(
    columns='Line',
    index='Sample',
    values='Value',
)

# Display the wide table
df_pivot

Line,A,B,C,D
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.68,3.05,2.49,1.19
2,2.32,2.71,1.26,1.92
3,2.53,2.91,4.25,1.57
4,2.94,2.92,3.37,2.95
5,2.78,3.09,2.6,2.82
6,3.32,2.7,2.95,4.68
7,2.41,2.04,2.03,2.75
8,1.42,1.03,1.8,4.17
9,3.61,3.04,2.72,2.85
10,1.61,2.42,2.14,1.3


## Table 7.2: Sum, sample number, and mean of manufacturing lines

In [116]:
# Map each aggregation to the row label used in the table
summary_labels = {'sum': 'Sum', 'count': 'Sample', 'mean': 'Mean'}

# Total, number of samples, and mean per line, rounded to one decimal
summary_df = (
    df_pivot
    .agg(list(summary_labels))
    .round(1)
    .rename(index=summary_labels)
)

# Display the summary table
summary_df

Line,A,B,C,D
Sum,27.5,27.5,27.5,27.5
Sample,11.0,11.0,11.0,11.0
Mean,2.5,2.5,2.5,2.5


## Table 7.3: Range of manufacturing line data

In [117]:
def value_range(values):
    """Return the spread of one line's values: maximum minus minimum."""
    return values.max() - values.min()


# Smallest value, largest value, and their difference per line (one decimal)
range_df = (
    df_pivot
    .agg(['min', 'max', value_range])
    .round(1)
    .rename(index={'min': 'Min', 'max': 'Max', 'value_range': 'Range'})
)

# Display the range table
range_df

Line,A,B,C,D
Min,1.4,1.0,1.3,1.2
Max,3.6,3.1,4.2,4.7
Range,2.2,2.1,3.0,3.5


## Table 7.4: Global standard deviation

In [118]:
# Standard deviation of every line, rounded to one decimal and labelled 's'
stdev_df = (
    df_pivot
    .agg(['std'])
    .round(1)
    .rename(index={'std': 's'})
)

# Display the standard deviation table
stdev_df

Line,A,B,C,D
s,0.7,0.7,0.8,1.2


## Table 7.5: Average moving range of line A

In [119]:
# Keep only line A's measurements, keyed by sample number
is_line_a = df['Line'] == 'A'
line_A_df = df.loc[is_line_a].drop(columns='Line').set_index('Sample')

# Moving range: absolute difference between consecutive values
line_A_df['mR'] = line_A_df['Value'].diff().abs()

# Average of the moving ranges, rounded to two decimals
average_mR = round(line_A_df['mR'].mean(), 2)

# One-row frame holding the average under its LaTeX label
new_row = pd.DataFrame({'Value': [r'$\overline{mR}$'], 'mR': [average_mR]})

# Stack the summary row under the per-sample rows and name the index
line_A_df = pd.concat([line_A_df, new_row]).rename_axis('Sample', axis='index')

# Display the table
line_A_df

Unnamed: 0_level_0,Value,mR
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.68,
2,2.32,0.36
3,2.53,0.21
4,2.94,0.41
5,2.78,0.16
6,3.32,0.54
7,2.41,0.91
8,1.42,0.99
9,3.61,2.19
10,1.61,2.0


## Table 7.6: Average moving range of each line

In [135]:
# Moving range per line: absolute difference between consecutive samples.
# Built as a separate frame so df_pivot keeps only the raw line columns and
# this cell produces the same result no matter how many times it is run.
moving_ranges = df_pivot.diff().abs()

# Same values under the '<line>-mR' column labels
df_moving_range = moving_ranges.add_suffix('-mR')

# Calculate the average moving ranges; the first sample has no predecessor,
# so its NaN moving range is skipped by the mean
df_ave_moving_range = (
    moving_ranges
    .agg(['mean'])
    .round(2)
    .rename(index={'mean': r'$\overline{mR}$'})
)

# Show results
df_ave_moving_range

Line,A,B,C,D
$\overline{mR}$,0.8,0.62,0.91,1.07


## Table 7.9: Measures of dispersion

In [136]:
# Stack the range, standard deviation, and average moving range tables
dispersion_tables = [range_df, stdev_df, df_ave_moving_range]
df_dispersion = pd.concat(dispersion_tables)

# Display the combined table
df_dispersion

Line,A,B,C,D
Min,1.4,1.0,1.3,1.2
Max,3.6,3.1,4.2,4.7
Range,2.2,2.1,3.0,3.5
s,0.7,0.7,0.8,1.2
$\overline{mR}$,0.8,0.62,0.91,1.07
