In [11]:
using CorrectMatch: Copula, Uniqueness, Individual
using StatsBase
using CSVFiles, DataFrames
using Distributions

┌ Info: Precompiling CSVFiles [5d742f6a-9f54-50ce-8119-2520741973ca]
└ @ Base loading.jl:1260


In [13]:
df = DataFrame(load(File(format"CSV", "adult.csv.gz")))

Unnamed: 0_level_0,age,workclass,education-num,marital-status,occupation,relationship,race
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,39,7,13,4,1,1,4
2,50,6,13,2,4,0,4
3,38,4,9,0,6,1,4
4,53,4,7,2,6,0,2
5,28,4,13,2,10,5,2
6,37,4,14,2,4,5,4
7,49,4,5,3,8,1,2
8,52,6,9,2,4,0,4
9,31,4,14,4,10,1,4
10,42,4,13,2,4,0,4


In [15]:
df_sub = df[!, [:age, :sex, :workclass, :relationship, Symbol("marital-status"), :race]];
data = Array{Int}(df_sub)
N, M = size(data)

(32561, 6)

In [16]:
N, M = size(data)

(32561, 6)

## Estimating population uniqueness

In [18]:
# True population uniqueness
u = Uniqueness.uniqueness(data)
println("True population uniqueness: $u")

True population uniqueness: 0.10853475016123583


In [21]:
# Fit model and estimate uniqueness
G = fit_mle(Copula.GaussianCopula, data; exact_marginal=true)
u = Uniqueness.uniqueness(rand(G, N))
println("Estimated population uniqueness: $u")

Estimated population uniqueness: 0.12514971898897453


In [23]:
# Fit model on 325 records (1% of the original data) and estimate uniqueness
ix = sample(1:N, 325; replace=false);
G = fit_mle(Copula.GaussianCopula, data[ix, :]; exact_marginal=false)
u = Uniqueness.uniqueness(rand(G, N))
println("Estimated population uniqueness (1% sample): $u")

Estimated population uniqueness (1% sample): 0.1440066337028961


## Estimating individual uniqueness

In [24]:
function extract_marginal_ordered(row::AbstractVector)
  cm = collect(values(countmap(row; alg=:dict)))
  Categorical(cm / sum(cm))
end

marginals = [extract_marginal_ordered(data[:, i]) for i=1:M];

In [26]:
G = fit_mle(Copula.GaussianCopula, marginals, data);

### Likely unique individual

In [27]:
indiv = data[1, :] # 39 years old male with non Asian/Black/White race

6-element Array{Int64,1}:
 39
  1
  7
  1
  4
  4

In [42]:
shifted_indiv = indiv - minimum(data, dims=1)[:] .+ 1
Individual.individual_uniqueness(G, shifted_indiv, N)

0.9954610615400545

### Unlikely unique individual

In [43]:
indiv = data[12, :] # 30 years old white male

6-element Array{Int64,1}:
 30
  1
  7
  0
  2
  1

In [44]:
shifted_indiv = indiv - minimum(data, dims=1)[:] .+ 1
Individual.individual_uniqueness(G, shifted_indiv, N)

0.0008347430829221553