Skip to content

Commit

Permalink
Outlier detection and outlier exclusion
Browse files Browse the repository at this point in the history
  • Loading branch information
NickNeck committed Nov 10, 2023
1 parent b74e32a commit 3950de0
Show file tree
Hide file tree
Showing 4 changed files with 279 additions and 53 deletions.
197 changes: 163 additions & 34 deletions lib/statistex.ex
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ defmodule Statistex do
:mode,
:minimum,
:maximum,
:outliers_bounds,
:outliers,
sample_size: 0
]

Expand All @@ -47,6 +49,8 @@ defmodule Statistex do
mode: mode,
minimum: number,
maximum: number,
outliers_bounds: {number, number},
outliers: [number],
sample_size: non_neg_integer
}

Expand Down Expand Up @@ -81,6 +85,8 @@ defmodule Statistex do

@empty_list_error_message "Passed an empty list ([]) to calculate statistics from, please pass a list containing at least on number."

@iqr_factor 1.5

@doc """
Calculate all statistics Statistex offers for a given list of numbers.
Expand All @@ -89,7 +95,15 @@ defmodule Statistex do
`ArgumentError` is raised if the given list is empty.
## Options
In a `percentiles` options arguments for the calculation of percentiles (see `percentiles/2`) can be given. The 50th percentile is always calculated as it is the median.
In a `percentiles` options arguments for the calculation of percentiles (see `percentiles/2`) can
be given. The percentiles 25th, 50th (median) and 75th are always calculated.
The option `exclude_outliers` can be set to `:once`, `:repeatedly` or `nil`,
`nil` is the default. If this option is set to `:once` the outliers are excluded
and the statistics are calculated with the rest of the samples. The value
`:repeatedly` repeats the outlier exclusion until the samples no longer
contain outliers.
## Examples
Expand All @@ -100,7 +114,7 @@ defmodule Statistex do
standard_deviation: 200.0,
standard_deviation_ratio: 0.4,
median: 500.0,
percentiles: %{50 => 500.0},
percentiles: %{25 => 400.0, 50 => 500.0, 75 => 600.0},
frequency_distribution: %{
200 => 1,
400 => 3,
Expand All @@ -112,7 +126,9 @@ defmodule Statistex do
minimum: 200,
maximum: 900,
sample_size: 9,
total: 4500
total: 4500,
outliers: [],
outliers_bounds: {200, 900.0}
}
iex> Statistex.statistics([])
Expand All @@ -125,13 +141,15 @@ defmodule Statistex do
standard_deviation: 0.0,
standard_deviation_ratio: 0.0,
median: 0.0,
percentiles: %{50 => 0.0},
percentiles: %{25 => 0.0, 50 => 0.0, 75 => 0.0},
frequency_distribution: %{0 => 4},
mode: 0,
minimum: 0,
maximum: 0,
sample_size: 4,
total: 0
total: 0,
outliers: [],
outliers_bounds: {0.0, 0.0}
}
"""
Expand All @@ -143,33 +161,65 @@ defmodule Statistex do
end

# Calculates all statistics for a non-empty list of samples and returns them
# in a `%Statistex{}` struct. When the `:exclude_outliers` option is `:once` or
# `:repeatedly` and outliers are found, the function calls itself with the
# remaining samples before building the result.
#
# NOTE(review): this listing looks like a rendered diff. The statements
# directly below that are repeated in the `else` branch further down, and the
# first `%__MODULE__{}` literal, appear to be the pre-change lines. Confirm
# against the actual file.
def statistics(samples, configuration) do
total = total(samples)
sample_size = length(samples)
average = average(samples, total: total, sample_size: sample_size)
variance = variance(samples, average: average, sample_size: sample_size)
standard_deviation = standard_deviation(samples, variance: variance)
# Sort once up front; minimum and maximum are then simply the two ends of the
# sorted list.
samples = Enum.sort(samples)

standard_deviation_ratio =
standard_deviation_ratio(samples, standard_deviation: standard_deviation)
minimum = hd(samples)
maximum = List.last(samples)

percentiles = calculate_percentiles(samples, configuration)

frequency_distribution = frequency_distribution(samples)

%__MODULE__{
total: total,
average: average,
variance: variance,
standard_deviation: standard_deviation,
standard_deviation_ratio: standard_deviation_ratio,
median: median(samples, percentiles: percentiles),
percentiles: percentiles,
frequency_distribution: frequency_distribution,
mode: mode(samples, frequency_distribution: frequency_distribution),
minimum: minimum(samples),
maximum: maximum(samples),
sample_size: sample_size
}
# The bounds reuse the percentiles, minimum and maximum calculated above
# instead of deriving them from the samples again.
outliers_bounds =
do_outliers_bounds(samples, percentiles: percentiles, minimum: minimum, maximum: maximum)

# `rest` holds the samples that lie within the bounds.
{outliers, rest} = do_outliers(samples, outliers_bounds: outliers_bounds)

# Exclusion was requested and this pass found outliers: recurse on the
# remaining samples.
if exclude_outliers?(configuration) and not Enum.empty?(outliers) do
configuration =
configuration
|> Keyword.put(:outliers_excluded, true)
# `:once` is replaced by `:stop`, which `exclude_outliers?/1` does not accept,
# so the next pass does not exclude again. `:repeatedly` stays as it is.
|> Keyword.update!(:exclude_outliers, fn
:once -> :stop
:repeatedly -> :repeatedly
end)
# Remember the outliers removed so far so that the final result can report
# them.
|> Keyword.update(:acc_outliers, outliers, fn list -> list ++ outliers end)

statistics(rest, configuration)
else
# Outliers found in this pass are combined with those accumulated by earlier
# passes (if any).
outliers = outliers ++ Keyword.get(configuration, :acc_outliers, [])

total = total(samples)
sample_size = length(samples)
average = average(samples, total: total, sample_size: sample_size)
variance = variance(samples, average: average, sample_size: sample_size)

frequency_distribution = frequency_distribution(samples)

standard_deviation = standard_deviation(samples, variance: variance)

standard_deviation_ratio =
standard_deviation_ratio(samples, standard_deviation: standard_deviation)

%__MODULE__{
total: total,
average: average,
variance: variance,
standard_deviation: standard_deviation,
standard_deviation_ratio: standard_deviation_ratio,
median: median(samples, percentiles: percentiles),
percentiles: percentiles,
frequency_distribution: frequency_distribution,
mode: mode(samples, frequency_distribution: frequency_distribution),
minimum: minimum,
maximum: maximum,
outliers_bounds: outliers_bounds,
outliers: outliers,
sample_size: sample_size
}
end
end

# Tells whether the configuration asks for outlier exclusion, i.e. whether
# `:exclude_outliers` is `:once` or `:repeatedly`. Any other value (including
# a missing key) means no exclusion.
defp exclude_outliers?(configuration) do
  case Keyword.get(configuration, :exclude_outliers) do
    :once -> true
    :repeatedly -> true
    _other -> false
  end
end

@doc """
Expand Down Expand Up @@ -396,8 +446,10 @@ defmodule Statistex do
percentiles_configuration = Keyword.get(configuration, :percentiles, [])

# the 25th, median and 75th percentiles are always added so that they can be used directly by median and the outlier bounds
percentiles_configuration = Enum.uniq([@median_percentile | percentiles_configuration])
percentiles(samples, percentiles_configuration)
percentiles_configuration =
Enum.uniq([25, @median_percentile, 75 | percentiles_configuration])

Percentile.percentiles(samples, percentiles_configuration)
end

@doc """
Expand Down Expand Up @@ -447,7 +499,9 @@ defmodule Statistex do
"""
@spec percentiles(samples, number | [number(), ...]) ::
percentiles()
defdelegate(percentiles(samples, percentiles), to: Percentile)
def percentiles(samples, percentiles) do
  # The samples are sorted here before they are handed to
  # `Percentile.percentiles/2`.
  sorted_samples = Enum.sort(samples)
  Percentile.percentiles(sorted_samples, percentiles)
end

@doc """
A map showing which sample occurs how often in the samples.
Expand Down Expand Up @@ -541,10 +595,85 @@ defmodule Statistex do

# Resolves the median (the `@median_percentile` entry). A precalculated map
# passed via the `:percentiles` option is used when it contains the median;
# otherwise the median is calculated from `samples`.
def median(samples, options) do
  percentiles = Keyword.get(options, :percentiles, %{})

  Map.get_lazy(percentiles, @median_percentile, fn ->
    # `Percentile.percentiles/2` takes the samples as they are, so input in
    # caller order has to be sorted before the fallback calculation. The sort
    # only happens when no precalculated median is available.
    samples
    |> Enum.sort()
    |> Percentile.percentiles(@median_percentile)
    |> Map.fetch!(@median_percentile)
  end)
end

@doc """
Calculates the lower and upper bound for outliers.
Any sample that is `<` the lower bound and any sample that is `>` the upper
bound is an outlier of the given `samples`.
## Examples
iex> Statistex.outliers_bounds([3, 4, 5])
{3, 5}
iex> Statistex.outliers_bounds([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50])
{22.5, 50}
iex> Statistex.outliers_bounds([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99])
{50, 80.625}
"""
@spec outliers_bounds(samples, keyword) :: {lower :: number, upper :: number}
def outliers_bounds(samples, options \\ [])

# An empty list has no bounds; raise like the other statistics do.
def outliers_bounds([], _options) do
  raise(ArgumentError, @empty_list_error_message)
end

# The bounds calculation works on sorted samples, so sort before delegating.
def outliers_bounds(samples, options) do
  sorted_samples = Enum.sort(samples)
  do_outliers_bounds(sorted_samples, options)
end

# Calculates the `{lower, upper}` outlier bounds for sorted `samples`.
#
# Precalculated `:percentiles`, `:minimum` and `:maximum` may be passed in
# `options`; whatever is missing is derived from the samples. The bounds are
# the quartiles widened by `@iqr_factor` times the interquartile range, clamped
# to the minimum and maximum so they never lie outside the observed samples.
defp do_outliers_bounds(samples, options) do
  quartiles =
    Keyword.get_lazy(options, :percentiles, fn -> Percentile.percentiles(samples, [25, 75]) end)

  smallest = Keyword.get_lazy(options, :minimum, fn -> hd(samples) end)
  largest = Keyword.get_lazy(options, :maximum, fn -> List.last(samples) end)

  lower_quartile = get_percentile(samples, 25, quartiles)
  upper_quartile = get_percentile(samples, 75, quartiles)
  interquartile_range = upper_quartile - lower_quartile
  margin = interquartile_range * @iqr_factor

  lower_bound = max(lower_quartile - margin, smallest)
  upper_bound = min(upper_quartile + margin, largest)

  {lower_bound, upper_bound}
end

@doc """
Returns all outliers for the given `samples`.
## Examples
iex> Statistex.outliers([3, 4, 5])
[]
iex> Statistex.outliers([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50])
[1, 2, 6]
iex> Statistex.outliers([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99])
[99, 99, 99]
"""
@spec outliers(samples, keyword) :: samples | []
def outliers(samples, options \\ []) do
  # `do_outliers/2` returns `{outliers, rest}`; only the outliers are of
  # interest here.
  samples
  |> Enum.sort()
  |> do_outliers(options)
  |> elem(0)
end

# Splits sorted `samples` into `{outliers, rest}`.
#
# The bounds are taken from the `:outliers_bounds` option or calculated from
# the samples. Samples below the lower bound are cut off the front; the
# remainder is reversed so that samples above the upper bound can be cut off
# the (former) end. `rest` is therefore returned in reversed order and the
# upper outliers in descending order.
defp do_outliers(samples, options) do
  {lower_bound, upper_bound} =
    case Keyword.fetch(options, :outliers_bounds) do
      {:ok, bounds} -> bounds
      :error -> do_outliers_bounds(samples, options)
    end

  {below, not_below} = Enum.split_while(samples, &(&1 < lower_bound))

  {above, within} =
    not_below
    |> Enum.reverse()
    |> Enum.split_while(&(&1 > upper_bound))

  {below ++ above, within}
end

Map.get_lazy(percentiles, @median_percentile, fn ->
samples |> percentiles(@median_percentile) |> Map.fetch!(@median_percentile)
# Looks up `percentile` in the precalculated `percentiles` map and only
# calculates it from `samples` when the map has no entry for it.
defp get_percentile(samples, percentile, percentiles) do
  Map.get_lazy(percentiles, percentile, fn ->
    # `Percentile.percentiles/2` takes the samples as they are, and this
    # fallback can be reached with samples in caller order (e.g. via
    # `median/2`), so sort before calculating.
    samples
    |> Enum.sort()
    |> Percentile.percentiles(percentile)
    |> Map.fetch!(percentile)
  end)
end

Expand Down
3 changes: 1 addition & 2 deletions lib/statistex/percentile.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@ defmodule Statistex.Percentile do

# Builds a map of `percentile_rank => value` for one rank or a list of ranks.
#
# The samples are handed to `percentile/3` exactly as given, without sorting
# them here; the callers in `Statistex` sort beforehand.
def percentiles(samples, percentile_ranks) do
  # The sample count is the same for every rank, so determine it only once.
  number_of_samples = length(samples)

  percentile_ranks
  |> List.wrap()
  |> Map.new(fn percentile_rank ->
    {percentile_rank, percentile(samples, number_of_samples, percentile_rank)}
  end)
end
Expand Down
34 changes: 17 additions & 17 deletions test/statistex/percentile_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,20 @@ defmodule Statistex.PercentileTest do

doctest Statistex.Percentile

@nist_sample_data [
95.1772,
95.1567,
95.1937,
95.1959,
95.1442,
95.0610,
95.1591,
95.1195,
95.1065,
95.0925,
95.1990,
95.1682
]
@nist_sample_data Enum.sort([
95.1772,
95.1567,
95.1937,
95.1959,
95.1442,
95.0610,
95.1591,
95.1195,
95.1065,
95.0925,
95.1990,
95.1682
])

# Test data from:
# http://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
Expand Down Expand Up @@ -49,7 +49,7 @@ defmodule Statistex.PercentileTest do
end

describe "a list of two elements" do
@samples [300, 200]
@samples [200, 300]
test "1st percentile (small sample size simply picks first element)" do
%{1 => result} = percentiles(@samples, [1])
assert result == 200.0
Expand All @@ -67,7 +67,7 @@ defmodule Statistex.PercentileTest do
end

describe "seemingly problematic 2 element list [9, 1]" do
@samples [9, 1]
@samples [1, 9]

percentiles = %{
25 => 1,
Expand All @@ -88,7 +88,7 @@ defmodule Statistex.PercentileTest do
end

describe "a list of three elements" do
@samples [100, 300, 200]
@samples [100, 200, 300]
test "1st percentile (small sample size simply picks first element)" do
%{1 => result} = percentiles(@samples, [1])
assert result == 100.0
Expand Down

0 comments on commit 3950de0

Please sign in to comment.