In [9]:
# Enable autoreload (mode 2) so edits to imported project modules are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame, display
# Display the value of every top-level expression in a cell, not just the last
# one, so cells that end in several bare expressions show each result.
InteractiveShell.ast_node_interactivity = "all"

# Probabilities to cluster algorithm

A notebook to hash out this algorithm and check it works.

Will hopefully turn into a unit test too, hence the CSVs are committed to version control.

In [5]:
from src import locations as loc

import pandas as pd
import duckdb
from pathlib import Path

In [104]:
# Both fixtures live in <project>/test: `clus` is the expected clusters table
# the checks compare against, `prob` is the probabilities input.
fixture_dir = Path(loc.PROJECT_DIR, "test")
clus = pd.read_csv(fixture_dir / "clusters.csv")
prob = pd.read_csv(fixture_dir / "probabilities.csv")

In [140]:
# Inspect the raw probabilities fixture. The SQL refers to `prob` by name,
# i.e. the pandas DataFrame loaded from probabilities.csv.
duckdb.sql("""
    select
        *
    from
        prob;
""")

┌───────┬───────────┬─────────┬──────────────┬────────┬─────────────┐
│ uuid  │ link_type │ cluster │      id      │ source │ probability │
│ int64 │  varchar  │  int64  │   varchar    │ int64  │   double    │
├───────┼───────────┼─────────┼──────────────┼────────┼─────────────┤
│     1 │ link      │       0 │ will_inc_t1  │      1 │         1.0 │
│     2 │ link      │       0 │ leo_inc_t1   │      1 │         1.0 │
│     3 │ link      │       0 │ sarah_inc_t1 │      1 │         1.0 │
│     4 │ link      │       1 │ will_inc_t2  │      2 │         0.9 │
│     5 │ link      │       2 │ will_inc_t2  │      2 │         0.7 │
│     6 │ link      │       1 │ woll_inc_t2  │      2 │         0.7 │
│     7 │ link      │       1 │ wall_inc_t2  │      2 │         0.7 │
│     8 │ link      │       2 │ lao_inc_t2   │      2 │         0.8 │
│     9 │ link      │       2 │ lio_inc_t2   │      2 │        0.85 │
│    10 │ link      │       2 │ leo_inc_t2   │      2 │         0.9 │
│    11 │ link      

In [139]:
# Inspect the expected clusters fixture. Its `n_seq` and `n_par` columns are
# what the sequential and parallel checks further down compare against.
duckdb.sql("""
    select
        *
    from
        clus;
""")

┌───────┬─────────┬──────────────┬────────┬───────┬───────┐
│ uuid  │ cluster │      id      │ source │ n_seq │ n_par │
│ int64 │  int64  │   varchar    │ int64  │ int64 │ int64 │
├───────┼─────────┼──────────────┼────────┼───────┼───────┤
│     1 │       1 │ will_inc_t1  │      1 │     0 │     0 │
│     2 │       1 │ will_inc_t2  │      2 │     1 │     1 │
│     3 │       1 │ will_inc_t3  │      3 │     2 │     1 │
│     4 │       2 │ leo_inc_t1   │      1 │     0 │     0 │
│     5 │       2 │ leo_inc_t2   │      2 │     1 │     1 │
│     6 │       3 │ sarah_inc_t1 │      1 │     0 │     0 │
│     7 │       4 │ pedro_inc_t2 │      2 │     1 │     1 │
└───────┴─────────┴──────────────┴────────┴───────┴───────┘

Having done some fiddling below, here's the core SQL.

Step 1: instantiate clusters (or already have a cluster table)
Step 2: run this to add any new clusters the probabilities table now holds

Note this notebook DOESN'T handle adding unmatched dimensions to the clusters table as new clusters.

Params:

* n: your current stage in the 🐙blocktopus
* threshold: the point where we consider a probability a valid match. For parallel, note this means all tables will use the same value, which might not be the optimal value across all tables. In this instance, consider making it an additive table

In [None]:
# Core query, kept as a string for reference (it is not executed in this cell).
# It returns every existing `clus_init` row plus, for each (cluster, source)
# pair in `prob`, the single highest-probability row (ties broken by highest
# id) whose (id, source) is not already present in `clus_init`.
# Only rows with probability strictly greater than 0.7 and link_type = 'link'
# are considered.
# NOTE(review): the threshold (0.7) and the stage number (`1 as n`) are
# hard-coded here although the notes above describe them as parameters.
# NOTE(review): the executed cells below omit the `link_type = 'link'` filter;
# confirm which form is intended.
sql = """
    select
        nextval('uuid') as uuid, -- Create UUID in an appropriate way for Postgres
        cluster,
        id,
        source,
        1 as n,
    from (
        select
            distinct on (prob.cluster, prob.source)
            prob.*
        from
            prob
        anti join clus_init on
            clus_init.id = prob.id
            and clus_init.source = prob.source
        where 
            probability > 0.7 -- Should be set by calling function
            and link_type = 'link'
        order by
            probability desc,
            id desc
    )
    union
    select
        *
    from
        clus_init
"""

Something doesn't add up here. I previously thought this needed recursion. Why? Let's try and construct a situation:

* "will_inc" is 0.9 for cluster 1 and 0.8 for cluster 2
* "wedro_inc" is 0.8 for cluster 1 and 0.75 for cluster 2

In the current setup, "will_inc" goes to cluster 1 and cluster 2 is unmatched.

What we want is "will_inc" to go to cluster 1, and the second best option, "wedro_inc", to go to cluster 2.

## Parallel

Sometimes we might join several tables to `probabilities` at once, then add them to `clusters` together.

In [120]:
# Seed the clusters table for the parallel run: recreate the `uuid` and
# `cluster` sequences (both starting at 1), then give every `prob` row with
# cluster = 0 a fresh uuid and a fresh cluster id, tagged as stage n = 0.
clus_init = duckdb.sql("""
    drop sequence if exists uuid;
    drop sequence if exists cluster;
    create sequence uuid start 1;
    create sequence cluster start 1;
    select
        nextval('uuid') as uuid,
        nextval('cluster') as cluster,
        id,
        source,
        0 as n,
    from
        prob
    where
        cluster = 0
""")
# Bare expression: display the seeded table.
clus_init

┌───────┬─────────┬──────────────┬────────┬───────┐
│ uuid  │ cluster │      id      │ source │   n   │
│ int64 │  int64  │   varchar    │ int64  │ int32 │
├───────┼─────────┼──────────────┼────────┼───────┤
│     1 │       1 │ will_inc_t1  │      1 │     0 │
│     2 │       2 │ leo_inc_t1   │      1 │     0 │
│     3 │       3 │ sarah_inc_t1 │      1 │     0 │
└───────┴─────────┴──────────────┴────────┴───────┘

In [121]:
# Parallel resolution: resolve every source in `prob` against `clus_init` in a
# single pass. For each (cluster, source) pair, keep the highest-probability
# row (ties broken by highest id) that is not already in `clus_init` and whose
# probability is strictly above 0.7. Tag those rows n = 1 and append the
# existing `clus_init` rows.
# NOTE(review): unlike the core `sql` string above, there is no
# `link_type = 'link'` filter here; confirm whether that is intentional.
clus_complete = duckdb.sql("""
    select
        nextval('uuid') as uuid,
        cluster,
        id,
        source,
        1 as n,
    from (
        select
            distinct on (prob.cluster, prob.source)
            prob.*
        from
            prob
        anti join clus_init cl on
            cl.id = prob.id
            and cl.source = prob.source
        where 
            probability > 0.7
        order by
            probability desc,
            id desc
    )
    union
    select
        *
    from
        clus_init
""")

In [122]:
# Check the parallel result against the fixture. Both sides are projected to
# (cluster, id, source, n) and sorted on the same keys so the frames can be
# compared row by row; uuid is deliberately left out of the comparison.
# Left side: what the algorithm produced.
clus_check_l = duckdb.sql("""
    select
        cluster,
        id,
        source,
        n::int as n
    from
        clus_complete
    order by
        cluster,
        source,
        id,
        n
""")
# Right side: the expected clusters, using the fixture's `n_par` column as n.
clus_check_r = duckdb.sql("""
    select
        cluster,
        id,
        source,
        n_par::int as n
    from
        clus
    order by
        cluster,
        source,
        id,
        n_par
""")
# `.equals` gives the pass/fail boolean; `.compare` shows any differing cells.
clus_check_l.df().equals(clus_check_r.df())
clus_check_l.df().compare(clus_check_r.df())

True

## Sequential

Sometimes we'll add one table to `probabilities`, then resolve to `clusters`, then do that over and over.

In [132]:
# Seed the clusters table again for the sequential run (same query as the
# parallel section). Recreating the sequences restarts `uuid` and `cluster`
# at 1; every `prob` row with cluster = 0 becomes a stage n = 0 cluster.
clus_init = duckdb.sql("""
    drop sequence if exists uuid;
    drop sequence if exists cluster;
    create sequence uuid start 1;
    create sequence cluster start 1;
    select
        nextval('uuid') as uuid,
        nextval('cluster') as cluster,
        id,
        source,
        0 as n,
    from
        prob
    where
        cluster = 0
""")
# Bare expression: display the seeded table.
clus_init

┌───────┬─────────┬──────────────┬────────┬───────┐
│ uuid  │ cluster │      id      │ source │   n   │
│ int64 │  int64  │   varchar    │ int64  │ int32 │
├───────┼─────────┼──────────────┼────────┼───────┤
│     1 │       1 │ will_inc_t1  │      1 │     0 │
│     2 │       2 │ leo_inc_t1   │      1 │     0 │
│     3 │       3 │ sarah_inc_t1 │      1 │     0 │
└───────┴─────────┴──────────────┴────────┴───────┘

In [133]:
# Split `prob` into one slice per source so each can be resolved in its own
# stage: stage 1 uses source 2, stage 2 uses source 3, stage 3 uses source 4.
prob_n1 = duckdb.sql("""
    select
        *
    from
        prob
    where
        source = 2
""")
prob_n2 = duckdb.sql("""
    select
        *
    from
        prob
    where
        source = 3
""")
# NOTE(review): the recorded `clus_n3` output gains no rows from this slice;
# confirm whether the fixture actually contains qualifying source = 4 rows.
prob_n3 = duckdb.sql("""
    select
        *
    from
        prob
    where
        source = 4
""")

In [134]:
# Sequential stage 1: resolve the source-2 slice (`prob_n1`) against
# `clus_init`. For each (cluster, source) pair, keep the highest-probability
# row (ties broken by highest id) that is not already clustered and whose
# probability is strictly above 0.7. Tag it n = 1, then append the existing
# clusters.
clus_n1 = duckdb.sql("""
    select
        nextval('uuid') as uuid,
        cluster,
        id,
        source,
        1 as n,
    from (
        select
            distinct on (prob.cluster, prob.source)
            prob.*
        from
            prob_n1 prob
        anti join clus_init cl on
            cl.id = prob.id
            and cl.source = prob.source
        where
            probability > 0.7
        order by
            probability desc,
            id desc
    )
    union
    select
        *
    from
        clus_init
""")
# Bare expression: display the stage-1 clusters.
clus_n1

┌───────┬─────────┬──────────────┬────────┬───────┐
│ uuid  │ cluster │      id      │ source │   n   │
│ int64 │  int64  │   varchar    │ int64  │ int32 │
├───────┼─────────┼──────────────┼────────┼───────┤
│     1 │       1 │ will_inc_t1  │      1 │     0 │
│     2 │       2 │ leo_inc_t1   │      1 │     0 │
│     3 │       3 │ sarah_inc_t1 │      1 │     0 │
│     4 │       1 │ will_inc_t2  │      2 │     1 │
│     5 │       4 │ pedro_inc_t2 │      2 │     1 │
│     6 │       2 │ leo_inc_t2   │      2 │     1 │
└───────┴─────────┴──────────────┴────────┴───────┘

In [135]:
# Sequential stage 2: same query shape as stage 1, but resolving the source-3
# slice (`prob_n2`) against the stage-1 result `clus_n1` and tagging new rows
# n = 2.
# NOTE(review): in the recorded output the source-2 rows carry uuids 10-12
# here but 4-6 in the `clus_n1` output. This suggests `clus_n1` is
# re-evaluated (and `nextval` called again) when referenced. Confirm, and
# materialise each stage if stable uuids are required.
clus_n2 = duckdb.sql("""
    select
        nextval('uuid') as uuid,
        cluster,
        id,
        source,
        2 as n,
    from (
        select
            distinct on (prob.cluster, prob.source)
            prob.*
        from
            prob_n2 prob
        anti join clus_n1 cl on
            cl.id = prob.id
            and cl.source = prob.source
        where
            probability > 0.7
        order by
            probability desc,
            id desc
    )
    union
    select
        *
    from
        clus_n1
""")
# Bare expression: display the stage-2 clusters.
clus_n2

┌───────┬─────────┬──────────────┬────────┬───────┐
│ uuid  │ cluster │      id      │ source │   n   │
│ int64 │  int64  │   varchar    │ int64  │ int32 │
├───────┼─────────┼──────────────┼────────┼───────┤
│     1 │       1 │ will_inc_t1  │      1 │     0 │
│     2 │       2 │ leo_inc_t1   │      1 │     0 │
│     3 │       3 │ sarah_inc_t1 │      1 │     0 │
│    10 │       1 │ will_inc_t2  │      2 │     1 │
│    11 │       4 │ pedro_inc_t2 │      2 │     1 │
│    12 │       2 │ leo_inc_t2   │      2 │     1 │
│    13 │       1 │ will_inc_t3  │      3 │     2 │
└───────┴─────────┴──────────────┴────────┴───────┘

In [136]:
# Sequential stage 3: same query shape again, resolving the source-4 slice
# (`prob_n3`) against the stage-2 result `clus_n2` and tagging new rows n = 3.
# This cell references `clus_n2` by its full name in the anti join, where the
# earlier stages use the alias `cl`.
# NOTE(review): the recorded output has the same rows as `clus_n2` but with
# different uuids for the source-2 and source-3 rows (14-16 and 27). This
# suggests upstream stages are re-evaluated when referenced. Confirm.
clus_n3 = duckdb.sql("""
    select
        nextval('uuid') as uuid,
        cluster,
        id,
        source,
        3 as n,
    from (
        select
            distinct on (prob.cluster, prob.source)
            prob.*
        from
            prob_n3 prob
        anti join clus_n2 on
            clus_n2.id = prob.id
            and clus_n2.source = prob.source
        where
            probability > 0.7
        order by
            probability desc,
            id desc
    )
    union
    select
        *
    from
        clus_n2
""")
# Bare expression: display the stage-3 clusters.
clus_n3

┌───────┬─────────┬──────────────┬────────┬───────┐
│ uuid  │ cluster │      id      │ source │   n   │
│ int64 │  int64  │   varchar    │ int64  │ int32 │
├───────┼─────────┼──────────────┼────────┼───────┤
│     1 │       1 │ will_inc_t1  │      1 │     0 │
│     2 │       2 │ leo_inc_t1   │      1 │     0 │
│     3 │       3 │ sarah_inc_t1 │      1 │     0 │
│    14 │       1 │ will_inc_t2  │      2 │     1 │
│    15 │       4 │ pedro_inc_t2 │      2 │     1 │
│    16 │       2 │ leo_inc_t2   │      2 │     1 │
│    27 │       1 │ will_inc_t3  │      3 │     2 │
└───────┴─────────┴──────────────┴────────┴───────┘

In [137]:
# Check the sequential results against the fixture. Each side is projected to
# (cluster, id, source, n) and sorted on the same keys so the frames can be
# compared row by row; uuid is deliberately left out of the comparison.
# Left side 1: clusters after stage 2.
clus_check_l1 = duckdb.sql("""
    select
        cluster,
        id,
        source,
        n::int as n
    from
        clus_n2
    order by
        cluster,
        source,
        id,
        n
""")
# Left side 2: clusters after stage 3.
clus_check_l2 = duckdb.sql("""
    select
        cluster,
        id,
        source,
        n::int as n
    from
        clus_n3
    order by
        cluster,
        source,
        id,
        n
""")
# Right side: the expected clusters, using the fixture's `n_seq` column as n.
# The final sort key is `n_seq`, the column actually selected, so the ordering
# matches the left-hand sides. It was previously `n_par`, a leftover from the
# parallel check.
clus_check_r = duckdb.sql("""
    select
        cluster,
        id,
        source,
        n_seq::int as n
    from
        clus
    order by
        cluster,
        source,
        id,
        n_seq
""")
# `.equals` gives the pass/fail boolean; `.compare` shows any differing cells.
clus_check_l1.df().equals(clus_check_r.df())
clus_check_l1.df().compare(clus_check_r.df())
clus_check_l2.df().equals(clus_check_r.df())
clus_check_l2.df().compare(clus_check_r.df())

True

True