# Stat 593 : Robust Statistics
## Introduction to robustness
### *Joseph Salmon*

This notebook reproduces the pictures for the course "IntroRobustness"

- REM:
  - you need TeX installed on your machine (otherwise errors may occur)
  - you need the share_code package to be available somewhere (in my setup it sits a few levels up)

In [None]:
# import packages
import sys

import numpy as np
import scipy as sp
from joblib import Memory
from matplotlib.ticker import MaxNLocator
from scipy import stats
from statsmodels.nonparametric.kde import KDEUnivariate

# dirty local imports:
sys.path.append("./../../../")
from share_code.utils import plt, sns, my_saving_display

# cachedir = './joblib_cache/'
# memory = Memory(cachedir=cachedir, verbose=1)

# Select the "colorblind" palette for all figures (sns is re-exported by
# share_code.utils; presumably seaborn -- confirm).
sns.set_palette("colorblind")

In [None]:
# to get images in the notebook
%matplotlib inline 

# Plot initialization

In [None]:
# Start from a clean slate: close every figure that may still be open
plt.close("all")

# Destination folder and file extension handed to my_saving_display
dirname = "../prebuiltimages/"
imageformat = ".pdf"  # either ".pdf" or ".png"

# Custom RGB colors (components in [0, 1]) reused by the figures below
brown = (0.64, 0.16, 0.16)
purple = (148 / 255, 0, 211 / 255)

# Saving display function:

In [None]:
# Set to True to also pass every figure to my_saving_display (presumably
# writing it under dirname with the imageformat extension -- confirm).
saving = False

# Popular statistics:

In [None]:
# Simulation settings. NOTE: despite their names, mu and sigma are handed to
# the Gamma sampler as its shape and scale parameters.
mu, sigma, nb_samples = 1.5, 4, 15

# Reproducible draw from a Gamma distribution (legacy global NumPy seeding)
np.random.seed(2)
rgamma = np.random.gamma
X = rgamma(shape=mu, scale=sigma, size=nb_samples)
y = np.ones(nb_samples)  # constant ordinate: the sample is drawn on one line

# Plant two gross outliers in place of the smallest and the largest draws
X[X.argmin()] = -3
X[X.argmax()] = 30

# Classical summaries
meanX = X.mean()
minX, maxX = X.min(), X.max()
s = X.std(ddof=1)  # sample standard deviation (n - 1 in the denominator)

# Robust summaries
medX = np.median(X)
MADX = np.median(np.abs(X - medX))  # median absolute deviation (MAD)
alpha_trim = 0.25
# mean of what is left after cutting a fraction alpha_trim off each tail
tmeanX = stats.trim_mean(X, alpha_trim)

# Horizontal range shared by the figures of this section
xmin = minX - 20
xmax = 35

## Empirical mean display:

REM: meaning of shortcuts such as lw (linewidth), ls (linestyle) etc., are available at: https://matplotlib.org/api/lines_api.html

In [None]:
# Strip plot of the sample X with its empirical mean shown as a dashed line.
fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_xlim(xmin, xmax)
ax.set_ylim(0, 1.5)
ax.xaxis.tick_bottom()
ax.yaxis.set_visible(False)

# Only the bottom spine is kept; it is lifted to y = 0.5 so that it sits
# below the points, which are drawn at the ordinates stored in y.
for side in ('right', 'top', 'left'):
    ax.spines[side].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))

# The edge width is given as a number (scatter expects a float, not a string)
ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax.plot([meanX, meanX], [0, 1.5], color=brown, lw=1.5, ls="--")

ax.set_xlabel(r'$x$', fontsize=18)
ax.annotate(r'$\overline{x}_n : \mbox{empirical mean}$',
            xy=(meanX, 0.4), xycoords='data', xytext=(+10, +30),
            textcoords='offset points', fontsize=18, color=brown)

fig1.tight_layout()
plt.show()

if saving:
    my_saving_display(fig1, dirname, "GammaSampleMean", imageformat)

# Empirical median display:

In [None]:
# Strip plot of the sample X with its empirical median shown as a dashed line.
fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_xlim(xmin, xmax)
ax.set_ylim(0, 1.5)
ax.xaxis.tick_bottom()
ax.yaxis.set_visible(False)

# Only the bottom spine is kept; it is lifted to y = 0.5 so that it sits
# below the points, which are drawn at the ordinates stored in y.
for side in ('right', 'top', 'left'):
    ax.spines[side].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))

# The edge width is given as a number (scatter expects a float, not a string)
ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax.plot([medX, medX], [0, 1.5], color=purple, lw=1.5, ls="--")

ax.set_xlabel(r'$x$', fontsize=18)
# The label is shifted to the left of the median line (negative x offset)
ax.annotate(r'$\rm{Med}_n(\mathbb{x}): \mbox{empirical median}$',
            xy=(medX, 0.4), xycoords='data', xytext=(-210, +30),
            textcoords='offset points', fontsize=18, color=purple)

fig1.tight_layout()
plt.show()

if saving:
    my_saving_display(fig1, dirname, "GammaSampleMediane", imageformat)

#  Trimmed mean (level: alpha_trim) display:


In [None]:
# Strip plot of the sample X with its trimmed mean shown as a dotted line.
fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_xlim(xmin, xmax)
ax.set_ylim(0, 1.5)
ax.xaxis.tick_bottom()
ax.yaxis.set_visible(False)

# Only the bottom spine is kept; it is lifted to y = 0.5 so that it sits
# below the points, which are drawn at the ordinates stored in y.
for side in ('right', 'top', 'left'):
    ax.spines[side].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))

# The edge width is given as a number (scatter expects a float, not a string)
ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax.plot([tmeanX, tmeanX], [0, 1.5], color='blue', lw=1.5, ls=":")

ax.set_xlabel(r'$x$', fontsize=18)
# Raw string: "\o" and "\m" are not valid Python escape sequences, so the
# non-raw form only worked by accident (and warns on recent Python versions).
# The resulting text is unchanged. tt is reused by the following cells.
tt = r"$\overline{x}_{n,%s} : \mbox{trimmed mean}$" % alpha_trim
ax.annotate(tt, xy=(tmeanX - 18, 0.4), xycoords='data', xytext=(+5, +30),
            textcoords='offset points', fontsize=18, color='blue')

fig1.tight_layout()
plt.show()

if saving:
    my_saving_display(fig1, dirname, "GammaSampleTrimmed", imageformat)

#  Empirical mean / median / trimmed mean :

In [None]:
# Strip plot of the sample X with the empirical mean, the median and the
# trimmed mean displayed together. The label tt of the trimmed mean comes
# from the trimmed-mean cell, which must have been run before.
fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_xlim(xmin, xmax)
ax.set_ylim(0, 1.5)
ax.xaxis.tick_bottom()
ax.yaxis.set_visible(False)

# Only the bottom spine is kept; it is lifted to y = 0.5 so that it sits
# below the points, which are drawn at the ordinates stored in y.
for side in ('right', 'top', 'left'):
    ax.spines[side].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))

# The edge width is given as a number (scatter expects a float, not a string)
ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)

# One vertical line per location estimate
ax.plot([meanX, meanX], [0, 1.5], color=brown, lw=1.5, ls="--")
ax.plot([medX, medX], [0, 1.5], color=purple, lw=1.5, ls="--")
ax.plot([tmeanX, tmeanX], [0, 1.5], color='blue', lw=1.5, ls=":")

ax.set_xlabel(r'$x$', fontsize=18)
ax.annotate(r'$\rm{Med}_n(\mathbb{x}): \mbox{empirical median}$',
            xy=(medX, 1), xycoords='data', xytext=(-210, +30),
            textcoords='offset points', fontsize=18, color=purple)
ax.annotate(r'$\bar{x}_n : \mbox{empirical mean}$', xy=(meanX, 0.4),
            xycoords='data', xytext=(+10, +30), textcoords='offset points',
            fontsize=18, color=brown)
ax.annotate(tt, xy=(tmeanX - 18, 0.4), xycoords='data', xytext=(+5, +30),
            textcoords='offset points', fontsize=18, color='blue')

fig1.tight_layout()
plt.show()

if saving:
    my_saving_display(fig1, dirname, "GammaSampleMedianeMean", imageformat)

In [None]:
# Sensitivity check: the largest observation is moved to 10 and the three
# location estimates are displayed again. The change is made on a copy, so X
# and the statistics computed from it stay untouched and this cell can be
# re-run safely. The estimates are recomputed on the modified sample so that
# every line and label matches the points actually drawn.
X_big10 = X.copy()
X_big10[np.argmax(X_big10)] = 10
mean_big10 = np.mean(X_big10)
med_big10 = np.median(X_big10)
tmean_big10 = stats.trim_mean(X_big10, alpha_trim)

fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_xlim(xmin, xmax)
ax.set_ylim(0, 1.5)
ax.xaxis.tick_bottom()
ax.yaxis.set_visible(False)

# Only the bottom spine is kept; it is lifted to y = 0.5 so that it sits
# below the points, which are drawn at the ordinates stored in y.
for side in ('right', 'top', 'left'):
    ax.spines[side].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))

# The edge width is given as a number (scatter expects a float, not a string)
ax.scatter(X_big10, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)

# One vertical line per location estimate
ax.plot([mean_big10, mean_big10], [0, 1.5], color=brown, lw=1.5, ls="--")
ax.plot([med_big10, med_big10], [0, 1.5], color=purple, lw=1.5, ls="--")
ax.plot([tmean_big10, tmean_big10], [0, 1.5], color='blue', lw=1.5, ls=":")

ax.set_xlabel(r'$x$', fontsize=18)
ax.annotate(r'$\rm{Med}_n(\mathbb{x}): \mbox{empirical median}$',
            xy=(med_big10, 1), xycoords='data', xytext=(-210, +30),
            textcoords='offset points', fontsize=18, color=purple)
# The label follows the recomputed mean instead of the stale meanX
ax.annotate(r'$\bar{x}_n : \mbox{empirical mean}$', xy=(mean_big10, 0.4),
            xycoords='data', xytext=(+10, +30), textcoords='offset points',
            fontsize=18, color=brown)
# tt (label of the trimmed mean) is defined in the trimmed-mean cell
ax.annotate(tt, xy=(tmean_big10 - 18, 0.4), xycoords='data',
            xytext=(+5, +30), textcoords='offset points', fontsize=18,
            color='blue')

fig1.tight_layout()
plt.show()

if saving:
    my_saving_display(fig1, dirname, "GammaSampleMedianeMeanBig10", imageformat)


In [None]:
# Sensitivity check: the largest observation is moved to 20 and the three
# location estimates are displayed again. The change is made on a copy, so
# this cell does not alter X while plotting. The estimates are recomputed on
# the modified sample so that every line and label matches the points drawn.
X_big20 = X.copy()
X_big20[np.argmax(X_big20)] = 20
mean_big20 = np.mean(X_big20)
med_big20 = np.median(X_big20)
tmean_big20 = stats.trim_mean(X_big20, alpha_trim)

fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_xlim(xmin, xmax)
ax.set_ylim(0, 1.5)
ax.xaxis.tick_bottom()
ax.yaxis.set_visible(False)

# Only the bottom spine is kept; it is lifted to y = 0.5 so that it sits
# below the points, which are drawn at the ordinates stored in y.
for side in ('right', 'top', 'left'):
    ax.spines[side].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))

# The edge width is given as a number (scatter expects a float, not a string)
ax.scatter(X_big20, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)

# One vertical line per location estimate
ax.plot([mean_big20, mean_big20], [0, 1.5], color=brown, lw=1.5, ls="--")
ax.plot([med_big20, med_big20], [0, 1.5], color=purple, lw=1.5, ls="--")
ax.plot([tmean_big20, tmean_big20], [0, 1.5], color='blue', lw=1.5, ls=":")

ax.set_xlabel(r'$x$', fontsize=18)
ax.annotate(r'$\rm{Med}_n(\mathbb{x}): \mbox{empirical median}$',
            xy=(med_big20, 1), xycoords='data', xytext=(-210, +30),
            textcoords='offset points', fontsize=18, color=purple)
# The label follows the recomputed mean instead of the stale meanX
ax.annotate(r'$\bar{x}_n : \mbox{empirical mean}$', xy=(mean_big20, 0.4),
            xycoords='data', xytext=(+10, +30), textcoords='offset points',
            fontsize=18, color=brown)
# tt (label of the trimmed mean) is defined in the trimmed-mean cell
ax.annotate(tt, xy=(tmean_big20 - 18, 0.4), xycoords='data',
            xytext=(+5, +30), textcoords='offset points', fontsize=18,
            color='blue')

fig1.tight_layout()
plt.show()

if saving:
    my_saving_display(fig1, dirname, "GammaSampleMedianeMeanBig20", imageformat)

# Reset the largest observation of X to 30, as this cell has always done:
# a no-op when X still holds its outlier at 30, and otherwise it hands the
# following cells the same sample as before.
X[np.argmax(X)] = 30

# Standard deviation:

In [None]:
# Strip plot of the sample X with its empirical mean and, above the points,
# two double-headed arrows of length s (standard deviation) on each side.
fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_xlim(xmin, xmax)
ax.set_ylim(0, 1.5)
ax.xaxis.tick_bottom()
ax.yaxis.set_visible(False)

# Only the bottom spine is kept; it is lifted to y = 0.5 so that it sits
# below the points, which are drawn at the ordinates stored in y.
for side in ('right', 'top', 'left'):
    ax.spines[side].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))

# The edge width is given as a number (scatter expects a float, not a string)
ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax.plot([meanX, meanX], [0, 1.5], color=brown, lw=1.5, ls="--")

# Each double-headed arrow is made of two opposite single-headed arrows:
# one going from the mean outwards, one coming back from mean +/- s.
arrow_style = dict(fc=brown, ec=brown, head_width=0.05, head_length=0.1,
                   length_includes_head=True)
for sign in (-1, +1):
    ax.arrow(meanX, 1.2, sign * s, 0, **arrow_style)
    ax.arrow(meanX + sign * s, 1.2, -sign * s, 0, **arrow_style)

ax.set_xlabel(r'$x$', fontsize=18)

ax.annotate(r'$\bar{x}_n : \mbox{empirical mean}$',
            xy=(meanX, 0.4), xycoords='data', xytext=(+10, +30),
            textcoords='offset points', fontsize=18, color=brown)
# One "s_n" label above each of the two arrows
ax.annotate(r'$s_n$', xy=(meanX + s * (0.4), 1), xycoords='data',
            xytext=(+10, +30), textcoords='offset points', fontsize=18,
            color=brown)
ax.annotate(r'$s_n$', xy=(meanX - s * (0.6), 1), xycoords='data',
            xytext=(+10, +30), textcoords='offset points', fontsize=18,
            color=brown)

fig1.tight_layout()
plt.show()

if saving:
    my_saving_display(fig1, dirname, "GammaSD", imageformat)

# Median Absolute Deviation (MAD):


In [None]:
# Strip plot of the sample X with its empirical median and, above the points,
# two double-headed arrows of length MADX (median absolute deviation).
fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_xlim(xmin, xmax)
ax.set_ylim(0, 1.5)
ax.xaxis.tick_bottom()
ax.yaxis.set_visible(False)

# Only the bottom spine is kept; it is lifted to y = 0.5 so that it sits
# below the points, which are drawn at the ordinates stored in y.
for side in ('right', 'top', 'left'):
    ax.spines[side].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))

# The edge width is given as a number (scatter expects a float, not a string)
ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax.plot([medX, medX], [0, 1.5], color=purple, lw=1.5, ls="--")

# Each double-headed arrow is made of two opposite single-headed arrows:
# one going from the median outwards, one coming back from median +/- MADX.
arrow_style = dict(fc=purple, ec=purple, head_width=0.05, head_length=0.1,
                   length_includes_head=True)
for sign in (-1, +1):
    ax.arrow(medX, 1.2, sign * MADX, 0, **arrow_style)
    ax.arrow(medX + sign * MADX, 1.2, -sign * MADX, 0, **arrow_style)

ax.set_xlabel(r'$x$', fontsize=18)

ax.annotate(r'$\rm{Med}_n(\mathbb{x}): \mbox{empirical median}$',
            xy=(medX, 0.4), xycoords='data', xytext=(+10, +30),
            textcoords='offset points', fontsize=18, color=purple)
# One MAD label on each side of the median
ax.annotate(r'$\rm{MAD}_n(\mathbb{x})$', xy=(medX + MADX * (0.1), 1),
            xycoords='data', xytext=(+10, +30), textcoords='offset points',
            fontsize=14, color=purple)
ax.annotate(r'$\rm{MAD}_n(\mathbb{x})$', xy=(medX - MADX * (1.2), 1),
            xycoords='data', xytext=(-55, +30), textcoords='offset points',
            fontsize=14, color=purple)

fig1.tight_layout()
plt.show()

if saving:
    my_saving_display(fig1, dirname, "GammaMAD", imageformat)

# Newcomb's measurements for evaluating the speed of light

This is extracted from Maronna et al. 2006 (see also Stigler 1977)

In [None]:
# Newcomb's measurements: the coded values below are scaled by 0.001 and
# shifted by 24.8. From here on, X holds this sample.
X = 24.8 + 0.001 * np.array([
    28, 26, 33, 24, 34, 29, 22, 24, 21, 25,
    -44, 27, 16, 40, -2, 30, 23, 29, 31, 19,
])
nb_samples = X.size

In [None]:
# Summary statistics of the sample currently held in X.
y = np.ones(nb_samples)  # constant ordinate: the sample is drawn on one line
meanX = np.mean(X)  # empirical mean
minX = np.min(X)  # smallest observation
maxX = np.max(X)  # largest observation
medX = np.median(X)  # empirical median
MADX = np.median(np.abs(X - medX))  # median absolute deviation (MAD)
# Normalized MAD: dividing by the 0.75 quantile of the standard normal
# distribution (about 0.6745) puts the MAD on the scale of a standard
# deviation for Gaussian data. The directly imported `stats` module is used,
# as for trim_mean, rather than relying on `sp.stats` being loaded.
MADNX = MADX / stats.norm.ppf(0.75)
s = np.std(X, ddof=1)  # sample standard deviation (n - 1 in the denominator)

In [None]:
# Strip plot of the raw observations held in X.
fig1, ax = plt.subplots(figsize=(10, 3))
# Horizontal range: the data range widened by 0.01 on each side
ax.set_xlim(minX - 0.01, maxX + 0.01)
ax.set_ylim(0, 1.5)
ax.xaxis.tick_bottom()
ax.yaxis.set_visible(False)

# Only the bottom spine is kept; it is lifted to y = 0.5 so that it sits
# below the points, which are drawn at the ordinates stored in y.
for side in ('right', 'top', 'left'):
    ax.spines[side].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))
ax.set_xlabel(r'Raw observations', fontsize=18)

# The edge width is given as a number (scatter expects a float, not a string)
ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
fig1.tight_layout()
plt.show()

if saving:
    my_saving_display(fig1, dirname, "Newcombe_raw", imageformat)

## Investigation of outliers with t-statistics 

In [None]:
# Classical standardized scores: deviation of each observation from the
# empirical mean, in units of the sample standard deviation s.
t = (X - meanX) / s

In [None]:
# Standardized scores t plotted against the observation index, with the
# +/-3 thresholds of the classical outlier rule drawn as dashed lines.
# MaxNLocator is imported with the other packages at the top of the notebook.
fig1, ax = plt.subplots(figsize=(10, 3))
n_points = t.shape[0]
ax.set_xlim(-1, n_points)
ax.set_ylim(-15, 15)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # integer index ticks

ax.set_xlabel(r'index', fontsize=18)
ax.set_ylabel(r't-statistics', fontsize=18)
for threshold in (3, -3):
    ax.axhline(y=threshold, color='k', lw=1.5, ls="--")

# The edge width is given as a number (scatter expects a float, not a string)
ax.scatter(np.arange(n_points), t, c='black', s=100, marker='o',
           edgecolors=brown, lw=1)
fig1.tight_layout()
plt.show()

if saving:
    my_saving_display(fig1, dirname, "Newcombe_t", imageformat)

**Conclusion**: with the classical rule that "outliers" correspond to values with $|t_i|>3$, only one value, t[10] = -3.72, qualifies as an outlier.

## Investigation of outliers with "medianized" t-statistics

In [None]:
# Robust counterpart of the t-statistics: the median replaces the mean and
# the normalized MAD (MADNX) replaces the standard deviation.
t_2 = (X - medX) / MADNX

In [None]:
# Robust scores t_2 plotted against the observation index, with the +/-3
# thresholds drawn as dashed lines.
# MaxNLocator is imported with the other packages at the top of the notebook.
fig1, ax = plt.subplots(figsize=(10, 3))
# The x-range is taken from t_2 itself, the array being plotted, so the cell
# does not depend on the classical scores t.
n_points = t_2.shape[0]
ax.set_xlim(-1, n_points)
ax.set_ylim(-15, 15)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # integer index ticks

ax.set_xlabel(r'index', fontsize=18)
ax.set_ylabel(r't-statistics', fontsize=18)
for threshold in (3, -3):
    ax.axhline(y=threshold, color='k', lw=1.5, ls="--")

# The edge width is given as a number (scatter expects a float, not a string)
ax.scatter(np.arange(n_points), t_2, c='black', s=100, marker='o',
           edgecolors=brown, lw=1)
fig1.tight_layout()
plt.show()

if saving:
    my_saving_display(fig1, dirname, "Newcombe_t_2", imageformat)

**Conclusion**: with the robust variant of the rule, "outliers" correspond to values with $|t'_i|>3$, and now two values, t_2[10] = -11.72 and t_2[14] = -4.64, qualify as outliers.