## Inclusion Dependency Discovery and Validation

In [3]:
from typing import Tuple, List, Dict, Any
from dataclasses import dataclass, field
import re
import textwrap

import pandas as pd
import desbordante
import desbordante.ind.algorithms as ind_algorithms

In [4]:
# Alternative input file, currently disabled:
# file_path = "../../data/Food_Inspections_20250216_preprocessed.parquet"
# Relative path of the parquet file that is actually loaded below.
file_path = "../../data/Food_Inspections_Violations_Expanded_with_cleandata.parquet"

# Read the parquet file into a DataFrame and preview its first rows.
df = pd.read_parquet(file_path)
df.head()

Unnamed: 0,dba_name,license_#,risk,address,zip,inspection_date,results,latitude,longitude,facility_type_cleaned,city_cleaned,inspection_type_cleaned,violation_number
0,BLOOMING BUD DAYCARE,2215789,Risk 1 (High),5715 N LINCOLN AVE,60659,3/7/2013,Pass,41.98539,-87.698734,DAYCARE COMBO 1586,CHICAGO,LICENSE,32
1,BLOOMING BUD DAYCARE,2215789,Risk 1 (High),5715 N LINCOLN AVE,60659,3/7/2013,Pass,41.98539,-87.698734,DAYCARE COMBO 1586,CHICAGO,LICENSE,32
2,BLOOMING BUD DAYCARE,2215789,Risk 1 (High),5715 N LINCOLN AVE,60659,3/7/2013,Pass,41.98539,-87.698734,DAYCARE COMBO 1586,CHICAGO,LICENSE,34
3,BLOOMING BUD DAYCARE,2215789,Risk 1 (High),5715 N LINCOLN AVE,60659,3/7/2013,Pass,41.98539,-87.698734,DAYCARE COMBO 1586,CHICAGO,LICENSE,35
4,Babas Halal,2684170,Risk 1 (High),7901 S DAMEN AVE,60620,12/3/2024,Fail,41.750189,-87.672986,RESTAURANT,CHICAGO,COMPLAINT,1


In [5]:
# Drop every row that contains at least one missing value (dropna defaults)
# and rebind `df` to the result; later cells work on this filtered frame.
df = df.dropna()

In [None]:

def find_inds(df:list [pd.DataFrame] | pd.DataFrame, algorithm_name:str='Default'):
    """
    Finds inclusion dependencies in a given DataFrame using a specified algorithm.
    
    Parameters:
        df (pd.DataFrame): The input DataFrame.
        algorithm_name (str): The name of the FD algorithm to use. Defaults to 'Default'.
    
    Returns:
        list: A list of discovered approximate functional dependencies.
    """
    try:

        # Get the algorithm class dynamically from desbordante.fd.algorithms
        algo_class = getattr(ind_algorithms, algorithm_name, ind_algorithms.Default)

        print(f"Algorthm: {algo_class}")
        
        algo = algo_class()
        algo.load_data(tables=df)
        algo.execute(
            allow_duplicates=False,  # Ignore duplicate INDs
        )
        
        # Filter out self-dependencies
        return [
            ind for ind in algo.get_inds()
            if ind.get_lhs().column_indices != ind.get_rhs().column_indices
        ]
    except AttributeError:
        raise ValueError(f"Algorithm '{algorithm_name}' not found. Available algorithms: {dir(ind_algorithms)}")
    

@dataclass
class InclusionDependency:
    """A single inclusion dependency held as two lists of attribute names."""

    lhs: List[str]  # attribute names on the left-hand side
    rhs: List[str]  # attribute names on the right-hand side

    def __str__(self):
        """Render both sides, with the left-hand side size in parentheses."""
        return f"LHS={self.lhs} ({len(self.lhs)}), RHS={self.rhs}"
    
@dataclass
class InclusionDependencySet:
    """Ordered collection of inclusion dependencies and their verification results."""

    # Dependencies in insertion order.
    dependencies: List["InclusionDependency"] = field(default_factory=list)
    # Filled by validate_ind. Key: (tuple of LHS column names, RHS column
    # names joined with ", "). Value: {"error": <verifier error>, "holds": bool}.
    validation_results: Dict[Tuple[Tuple[str, ...], str], Dict[str, Any]] = field(default_factory=dict)

    def add_dependency(self, lhs: List[str], rhs: List[str]):
        """Adds a new inclusion dependency to the set."""
        self.dependencies.append(InclusionDependency(lhs, rhs))

    def __len__(self):
        """Returns the number of inclusion dependencies."""
        return len(self.dependencies)

    def __iter__(self):
        """Allows iteration over inclusion dependencies."""
        return iter(self.dependencies)

    def validate_ind(self, df):
        """
        Verify every stored dependency against ``df`` and print the outcome.

        Each dependency's column names are resolved to positions in
        ``df.columns``; the dependency is then checked with Desbordante's
        default IND verifier, using ``df`` as both the left and right table.

        A dependency is skipped (with a printed notice) when either side is
        empty, the two sides differ in length, or any column name is not
        present in ``df``.

        Parameters:
            df (pd.DataFrame): Table whose columns the dependencies refer to.

        Returns:
            None. Outcomes are printed and also recorded in
            ``self.validation_results``.
        """
        GREEN_CODE = "\033[1;42m"
        RED_CODE = "\033[1;41m"
        RESET_CODE = "\033[0m"

        def column_names(indices):
            # Human-readable list of the columns at the given positions.
            return ", ".join(f"{df.columns[idx]}" for idx in indices)

        for dep in self.dependencies:
            if not dep.lhs or not dep.rhs or len(dep.lhs) != len(dep.rhs):
                print(f"Skipping {dep}: both sides must be non-empty and of equal length")
                continue

            # get_indexer yields -1 for unknown labels; convert to plain ints
            # so the verifier receives ordinary Python integers.
            lhs_idx = [int(i) for i in df.columns.get_indexer(dep.lhs)]
            rhs_idx = [int(i) for i in df.columns.get_indexer(dep.rhs)]

            if -1 in lhs_idx or -1 in rhs_idx:
                print(f"Skipping {dep}: column not found in the DataFrame")
                continue

            print(f"Checking the IND [{column_names(lhs_idx)}] -> [{column_names(rhs_idx)}]")

            verifier = desbordante.ind_verification.algorithms.Default()
            verifier.load_data(tables=[df, df])
            verifier.execute(lhs_indices=lhs_idx, rhs_indices=rhs_idx)

            error = verifier.get_error()
            holds = error == 0
            if holds:
                print(GREEN_CODE, "IND holds", RESET_CODE)
            else:
                print(RED_CODE, f"AIND holds with error = {error:.2}",
                    RESET_CODE)

            self.validation_results[(tuple(dep.lhs), ", ".join(dep.rhs))] = {
                "error": error,
                "holds": holds,
            }

In [7]:
def convert_ind(ind:desbordante.ind.IND) -> Tuple[list, list]:
    """
    Extract the bracketed parts from the string form of an IND.

    ``str(ind)`` is cut at every "->"; the first piece is taken as the
    left-hand side and the last piece as the right-hand side. From each side,
    every non-empty run of characters enclosed in one ``[...]`` pair (with no
    bracket characters inside) is collected.

    Returns:
        A ``(lhs, rhs)`` pair of string lists. Each bracket pair contributes
        exactly one string, so several names written inside a single pair are
        returned together as one unsplit string.
    """
    bracket_body = re.compile(r"\[([^\[\]]+)\]")

    pieces = str(ind).split("->")
    left_text = pieces[0].strip()
    right_text = pieces[-1].strip()

    return bracket_body.findall(left_text), bracket_body.findall(right_text)

In [8]:
# Mine INDs between two copies of the same DataFrame with the default algorithm.
results = find_inds([df, df])

# Report how many dependencies came back, then list them one per line.
print(f"There are {len(results)} inclusion dependencies using Default algorithm.")

for ind in results:
    print(ind)

Algorthm: <class 'desbordante.ind.algorithms.Spider'>
There are 0 inclusion dependencies using Default algorithm.


In [9]:
# Parse the text form of each discovered IND and collect the resulting
# (lhs, rhs) pairs in a dependency set.
ind_set = InclusionDependencySet()
for result in results:
    lhs, rhs =  convert_ind(result)
    ind_set.add_dependency(lhs, rhs)

In [10]:
# Verify every dependency in ind_set against df; validate_ind prints the outcome of each check.
ind_set.validate_ind(df)