In [1]:
# Notebook-wide IPython configuration.
# autoreload mode 2: re-import edited modules before each cell runs, so
# changes to imported source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. This is why later cells show several outputs each.
InteractiveShell.ast_node_interactivity = "all"

In [11]:
from cmf.clean import cleaning_function
from cmf.clean import steps
from cmf.data import utils as du
from cmf.data import DB, Clusters, Table

from dotenv import load_dotenv, find_dotenv
import os

# Load variables from the nearest .env file into the process environment.
# With ast_node_interactivity = "all" the call's return value (True in the
# recorded output) is displayed as well.
load_dotenv(find_dotenv())

# Small example frame; the recorded output shows columns `irrational` and
# `rounded` with three rows.
df = du.generate_dummy_df()

df

True

Unnamed: 0,irrational,rounded
0,pi,3.14
1,e,2.72
2,phi,1.62


In [9]:
def to_upper(column: str) -> str:
    """
    Wrap a column reference in an ``upper(...)`` call expression.

    Nothing is uppercased in Python: the function only builds and returns
    the expression text, e.g. ``to_upper("name")`` gives ``"upper(name)"``.

    Args:
        column: the column name or expression to wrap.

    Returns:
        The string ``upper(<column>)``.
    """
    return "upper({})".format(column)

# `logging` is not imported by any earlier cell, so this cell raised NameError
# on a fresh kernel. Import it here so the cell is self-contained.
import logging

# Module-level logger named after the current module/notebook namespace.
logger = logging.getLogger(__name__)

# NOTE: the root logger defaults to WARNING, so this INFO record is only shown
# if logging has been configured at INFO level or lower elsewhere.
logger.info(
    "Creating probabilities table "
    "test"
)

In [31]:
# NOTE(review): `clusters` is not assigned in any cell visible in this
# notebook (only the `Clusters` class is imported), so this cell appears to
# rely on leftover kernel state — confirm and add the constructing cell.
clusters._db_expected_fields

['uuid', 'id', 'cluster', 'source', 'n']

In [7]:
# Scratch check of set-subset semantics: `x` is the reference field set and
# each of the others is tested for being contained in it.
x = {"id", "uuid", "name"}
y = {"id", "uuid", "name"}  # same members as x -> subset
z = {"id"}                  # strict subset
a = {"id", "foo"}           # "foo" is not in x -> not a subset

y.issubset(x)
z.issubset(x)
a.issubset(x)

True

True

False

In [5]:
# Scratch check of dict length and key/value iteration order.
# Note: `x` and `y` are rebound here from sets to dicts.
x = dict(foo="bar", baz="quz")
y = dict(foo="bar")
len(x)
len(y)
for key in x:
    # Key on one line, its value on the next.
    print(key, x[key], sep="\n")

2

1

foo
bar
baz
quz


In [8]:
from pydantic import BaseModel

# Minimal pydantic model used to try out an alternative-constructor
# classmethod. Documented with comments rather than a class docstring so the
# model's generated schema is unchanged.
class Test(BaseModel):
    x: str
    y: str

    @classmethod
    def make_test(cls, foo: str) -> "Test":
        """
        Build a ``Test`` from the first and last characters of ``foo``.

        Raises:
            IndexError: if ``foo`` is empty.
        """
        first_char, last_char = foo[0], foo[-1]
        return cls(x=first_char, y=last_char)


test = Test.make_test(foo="lorem")
test.x
test.y

'l'

'm'

In [15]:
# Unpack the first (key, value) pair of `y`. This relies on `y` being the
# dict bound in the previous cell, not the set bound earlier under the same name.
p, q = list(y.items())[0]
p
q

'foo'

'bar'

In [26]:
from cmf.helpers import comparison, comparisons
from icecream import ic

# Build two single-condition comparisons and bundle them together.
# The recorded output shows the bundle is a dict with a 'comparisons' list
# holding one {'comparison': <sql_condition>} entry per input.
# NOTE(review): `ic` is imported above but not used in this cell.
comparison_name = comparison(sql_condition="company_name = company_name")
comparison_id = comparison(sql_condition="data_hub_id = data_hub_id")
comparison_name_id = comparisons(comparison_name, comparison_id)

comparison_name_id

{'comparisons': [{'comparison': 'company_name = company_name'},
  {'comparison': 'data_hub_id = data_hub_id'}]}

In [19]:
from cmf import query, process
from cmf.helpers import selector, cleaner, cleaners
from cmf.clean import company_name, company_number

# Select: two columns from companieshouse.companies.
# `sample=0.05` is presumably a 5% sample — TODO confirm against `query`.
select_ch = selector(
    table="companieshouse.companies", fields=["company_number", "company_name"]
)
ch_sample = query(select=select_ch, sample=0.05)

# Clean: one cleaner per column, combined into a single pipeline.
cleaner_name = cleaner(function=company_name, arguments={"column": "company_name"})
cleaner_number = cleaner(
    function=company_number, arguments={"column": "company_number"}
)
cleaner_name_number = cleaners(cleaner_name, cleaner_number)

# Run the combined pipeline over the sampled data.
ch_sample_cleaned = process(data=ch_sample, pipeline=cleaner_name_number)

In [1]:
from cmf import query, process, make_deduper
from cmf.dedupers import Naive
from cmf.helpers import selector, cleaner, cleaners
from cmf.clean import company_name, postcode_to_area

# Select
# Three columns from hmrc.trade__exporters; `sample=0.05` is presumably a 5%
# sample — TODO confirm against `query`.
select_exp = selector(
    table="hmrc.trade__exporters", 
    fields=["id", "company_name", "postcode"]
)
exp_sample = query(select=select_exp, sample=0.05)

# Clean
# One cleaner per column, combined into a single pipeline.
# Note: `cleaner_name` rebinds the same name used in the Companies House cell.
cleaner_name = cleaner(function=company_name, arguments={"column": "company_name"})
cleaner_pc = cleaner(
    function=postcode_to_area, arguments={"column": "postcode"}
)
cleaner_name_pc = cleaners(cleaner_name, cleaner_pc)

exp_sample_cleaned = process(data=exp_sample, pipeline=cleaner_name_pc)

# Dedupe
# `make_deduper` returns a callable; invoking it below produces the result.
# `unique_fields` presumably lists the columns that must all match for two
# rows to count as duplicates — verify against the Naive deduper.
exp_naive_deduper = make_deduper(
    dedupe_run_name="basic_hmrc_exp",
    description="""
        Clean company name, extract postcode area
    """,
    deduper=Naive,
    data=exp_sample_cleaned,
    dedupe_settings={
        "id": "id",
        "unique_fields": [
            "company_name",
            "postcode"
        ]
    }
)

exp_deduped = exp_naive_deduper()

In [2]:
# Display the result of the deduper call from the previous cell; the recorded
# output has columns id, company_name and postcode (postcode reduced to its area).
exp_deduped

Unnamed: 0,id,company_name,postcode
0,2635576,2gether support solutions,CT
1,3523148,3dprintuk,E
2,1934134,3style scooters,CO
3,3094466,a b c macintosh,NN
4,825933,a bratt son,NG
...,...,...,...
1326,1977117,ycs,SL
1327,2829967,yoshioka akira,AL
1328,126983,yumi,NW
1329,1897407,zebra a s,AB


In [37]:
from cmf.data.probabilities import ProbabilityResults
import pandas as pd
import uuid

# Dummy probabilities frame: three probabilities, each with a fresh random
# UUID for `id` and `cluster`, and a constant `source` of 1.
# Built in one chain so re-running the cell never mutates an existing frame.
df_prob = pd.DataFrame({"probability": [0.1, 0.8, 0.99]}).assign(
    id=lambda frame: [uuid.uuid4() for _ in frame.index],
    cluster=lambda frame: [uuid.uuid4() for _ in frame.index],
    source=1,
)

# Wrap the dummy frame in a ProbabilityResults with a run name and description.
results = ProbabilityResults(
    dataframe=df_prob,
    run_name="foo",
    run_description="bar"
)

# Show the wrapped data; the recorded output has the same four columns as df_prob.
results.to_df()

Unnamed: 0,probability,id,cluster,source
0,0.1,aec1148c-39ee-4d6e-a7a5-ebfe36b7104f,8229d1b6-5486-4dd5-ae92-d9ae3be9d7fc,1
1,0.8,f93f530b-cdee-47d7-bb6b-a906aef18c6b,3e5a0820-6736-4da5-afa7-488f86500759,1
2,0.99,3eda53ac-f501-4f47-afa2-79b7762effff,b7b72197-c2ec-43e3-aa15-232eafb45d83,1


In [30]:
from cmf.data import Probabilities, Table

# Handle on the probabilities table, located via environment variables
# (loaded from .env by `load_dotenv` in the first import cell, which also
# imports `os`).
# NOTE(review): os.getenv returns None when a variable is unset, so a missing
# SCHEMA or PROBABILITIES_TABLE is passed through silently — confirm that
# `Table` rejects None.
prob = Probabilities(
    db_table=Table(
        db_schema=os.getenv("SCHEMA"), 
        db_table=os.getenv("PROBABILITIES_TABLE")
    )
)

In [38]:
# Presumably persists `results` to the probabilities table: the read in the
# next cell shows rows with model 'foo' and the same ids — TODO confirm.
results.to_cmf()

In [39]:
# Read the probabilities table back to check what was written; the recorded
# output includes uuid, link_type and model columns alongside the original four.
prob.db_table.read()

Unnamed: 0,uuid,link_type,model,source,cluster,id,probability
0,623adca9-1dd2-4db7-8b99-26d842b8a27d,link,foo,1,8229d1b6-5486-4dd5-ae92-d9ae3be9d7fc,aec1148c-39ee-4d6e-a7a5-ebfe36b7104f,0.1
1,93b7ffe9-01a8-4321-b0f0-e25fcd46ad37,link,foo,1,3e5a0820-6736-4da5-afa7-488f86500759,f93f530b-cdee-47d7-bb6b-a906aef18c6b,0.8
2,6e5ba1b4-d44b-483e-a433-a14aa2e06482,link,foo,1,b7b72197-c2ec-43e3-aa15-232eafb45d83,3eda53ac-f501-4f47-afa2-79b7762effff,0.99
