## Approximate Inclusion Dependency Discovery and Validation

In [22]:
from typing import Tuple, List, Dict, Any
from dataclasses import dataclass, field
import re
import textwrap

import pandas as pd
import desbordante
import desbordante.ind.algorithms as ind_algorithms

In [11]:
# Input: the food-inspection CSV export. The commented-out lines are an
# alternative parquet input kept for reference.
# file_path = "../../data/Food_Inspections_Violations_Expanded_with_cleandata.parquet"
file_path = "../../data/Food_Inspections_20250216.csv"

# df = pd.read_parquet(file_path)
df = pd.read_csv(file_path)
# Preview the first rows (rendered as the cell output).
df.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,1106427,BLOOMING BUD DAYCARE,BLOOMING BUD DAYCARE,2215789.0,Daycare Combo 1586,Risk 1 (High),5715 N LINCOLN AVE,CHICAGO,IL,60659.0,03/07/2013,License,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.98539,-87.698734,"(41.98538950526786, -87.69873407149943)"
1,2608378,Babas Halal,Babas Halal,2684170.0,Restaurant,Risk 1 (High),7901 S DAMEN AVE,CHICAGO,IL,60620.0,12/03/2024,Complaint,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.750189,-87.672986,"(41.750189342293375, -87.67298583977204)"
2,1106406,FIRST ZABIHA MEAT BAZAAR,FIRST ZABIHA MEAT BAZAAR,2232559.0,Grocery Store,Risk 2 (Medium),2907 W DEVON AVE,CHICAGO,IL,60659.0,02/20/2013,License,Fail,"11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, D...",41.997401,-87.702385,"(41.99740137039031, -87.70238538227812)"
3,2609909,HAPPY MARKET,HAPPY MARKET,2912802.0,Grocery Store,Risk 2 (Medium),2334 S WENTWORTH AVE,CHICAGO,IL,60616.0,01/02/2025,Canvass,Pass w/ Conditions,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,41.849954,-87.632094,"(41.84995400192252, -87.63209419559098)"
4,2609927,SAT KAIVAL FOOD INC/SUBWAY,SAT KAIVAL FOOD INC/SUBWAY,2728400.0,Restaurant,Risk 1 (High),1916 S STATE ST,CHICAGO,IL,60616.0,01/02/2025,Canvass,Pass,36. THERMOMETERS PROVIDED & ACCURATE - Comment...,41.856053,-87.627311,"(41.85605269621059, -87.62731125804903)"


In [12]:
def find_ainds(df: list[pd.DataFrame] | pd.DataFrame, algorithm_name: str = 'Default', error: float = 0.3):
    """
    Finds approximate inclusion dependencies (AINDs) in the given table(s) using
    an IND discovery algorithm from desbordante.

    Parameters:
        df (list[pd.DataFrame] | pd.DataFrame): Table(s) to search. A single
            DataFrame is wrapped in a one-element list before being loaded.
        algorithm_name (str): Name of a class in `desbordante.ind.algorithms`.
            Defaults to 'Default'.
        error (float): Value forwarded to the algorithm as `error_threshold`.
            Defaults to 0.3.

    Returns:
        list: The discovered dependencies whose left- and right-hand sides
        refer to different column indices.

    Raises:
        ValueError: If `algorithm_name` is not an attribute of
            `desbordante.ind.algorithms`.
    """
    # Look the class up explicitly: falling back to Default inside getattr()
    # would hide a misspelled name and make the "not found" error unreachable.
    algo_class = getattr(ind_algorithms, algorithm_name, None)
    if algo_class is None:
        raise ValueError(f"Algorithm '{algorithm_name}' not found. Available algorithms: {dir(ind_algorithms)}")

    print(f"Algorthm: {algo_class}")

    # The signature allows a bare DataFrame, but the data is loaded through the
    # `tables` option, so normalise to a list (presumably a list of tables is
    # expected there -- confirm against the desbordante docs).
    tables = [df] if isinstance(df, pd.DataFrame) else df

    algo = algo_class()
    algo.load_data(tables=tables)
    # NOTE(review): the option names below are passed through unchanged;
    # confirm they are the ones the selected algorithm actually accepts.
    algo.execute(
        max_lhs_size=2,  # Look for multi-column INDs
        allow_approximate=True,  # Enable approximate matches
        error_threshold=error,  # Caller-supplied error value
    )

    # Filter out self-dependencies (same column indices on both sides)
    return [
        ind for ind in algo.get_inds()
        if ind.get_lhs().column_indices != ind.get_rhs().column_indices
    ]
    

In [17]:

def find_ainds(df: list[pd.DataFrame] | pd.DataFrame, algorithm_name: str = 'Default', error: float = 0.3):
    """
    Finds inclusion dependencies (exact or approximate) in the given table(s)
    using an IND discovery algorithm from desbordante.

    Parameters:
        df (list[pd.DataFrame] | pd.DataFrame): Table(s) to search. A single
            DataFrame is wrapped in a one-element list before being loaded.
        algorithm_name (str): Name of a class in `desbordante.ind.algorithms`.
            Defaults to 'Default'.
        error (float): Value forwarded to the algorithm as `error_threshold`.
            Defaults to 0.3.

    Returns:
        list: The discovered dependencies whose left- and right-hand sides
        refer to different column indices.

    Raises:
        ValueError: If `algorithm_name` is not an attribute of
            `desbordante.ind.algorithms`.
    """
    # Resolve the class without a silent fallback so that an unknown name is
    # reported instead of quietly running the Default algorithm.
    algo_class = getattr(ind_algorithms, algorithm_name, None)
    if algo_class is None:
        raise ValueError(f"Algorithm '{algorithm_name}' not found. Available algorithms: {dir(ind_algorithms)}")

    print(f"Algorthm: {algo_class}")

    # A bare DataFrame is permitted by the signature; the data goes in through
    # the `tables` option, so wrap it (presumably a list of tables is expected
    # there -- confirm against the desbordante docs).
    tables = [df] if isinstance(df, pd.DataFrame) else df

    algo = algo_class()
    algo.load_data(tables=tables)
    # NOTE(review): the option names below are passed through unchanged;
    # confirm they are the ones the selected algorithm actually accepts.
    algo.execute(
        max_lhs_size=2,  # Look for multi-column INDs
        allow_approximate=True,  # Enable approximate matches
        error_threshold=error,  # Caller-supplied error value
    )

    # Filter out self-dependencies (same column indices on both sides)
    return [
        ind for ind in algo.get_inds()
        if ind.get_lhs().column_indices != ind.get_rhs().column_indices
    ]
    

@dataclass
class InclusionDependency:
    """One inclusion dependency, stored as two lists of attribute names."""

    lhs: List[str]  # Attributes on the left-hand side
    rhs: List[str]  # Attributes on the right-hand side

    def __str__(self):
        """Render both sides, with the number of LHS attributes in parentheses."""
        return f"LHS={self.lhs} ({len(self.lhs)}), RHS={self.rhs}"
    
@dataclass
class InclusionDependencySet:
    """A collection of inclusion dependencies that can be verified against a DataFrame."""

    dependencies: List[InclusionDependency] = field(default_factory=list)
    # Not written by any method of this class; validate_ind only prints.
    validation_results: Dict[Tuple[Tuple[str, ...], str], Dict[str, Any]] = field(default_factory=dict)

    def add_dependency(self, lhs: List[str], rhs: List[str]):
        """Adds a new inclusion dependency to the set."""
        self.dependencies.append(InclusionDependency(lhs, rhs))

    def __len__(self):
        """Returns the number of inclusion dependencies."""
        return len(self.dependencies)

    def __iter__(self):
        """Allows iteration over inclusion dependencies."""
        return iter(self.dependencies)

    def validate_ind(self, df):
        """
        Verifies every stored dependency against `df` and prints the outcome.

        For each dependency the LHS and RHS attribute names are resolved to
        column positions in `df`. The pair is then checked with
        `desbordante.aind_verification.algorithms.Default`, using `df` as both
        the LHS and the RHS table. A dependency with an empty side, or one that
        names a column missing from `df`, is skipped.

        Parameters:
            df (pd.DataFrame): Table whose columns the dependencies refer to.

        Returns:
            None. Results are printed; nothing is stored.
        """
        GREEN_CODE = "\033[1;42m"
        RED_CODE = "\033[1;41m"
        RESET_CODE = "\033[0m"

        def column_positions(names):
            # Accept a bare column name as well as a list of names.
            labels = [names] if isinstance(names, str) else list(names)
            if not labels:
                return []
            # get_indexer marks unknown labels with -1; tolist() turns the
            # NumPy result into a flat list of plain ints.
            return df.columns.get_indexer(labels).tolist()

        def ind_str(lhs, rhs):
            def cc_str(cc):
                (table, indices) = cc
                return ", ".join(f"{table.columns[idx]}" for idx in indices)

            return f"[{cc_str(lhs)}] -> [{cc_str(rhs)}]"

        def aind_verify(lhs, rhs):
            (lhs_table, lhs_indices) = lhs
            (rhs_table, rhs_indices) = rhs

            print(f"Checking the IND {ind_str((lhs_table, lhs_indices), (rhs_table, rhs_indices))}")

            algo = desbordante.aind_verification.algorithms.Default()
            algo.load_data(tables=[lhs_table, rhs_table])
            algo.execute(lhs_indices=lhs_indices, rhs_indices=rhs_indices)

            return algo

        def print_results_for_ind(verifier):
            if verifier.get_error() == 0:
                print(GREEN_CODE, "IND holds", RESET_CODE)
            else:
                print(RED_CODE, f"AIND holds with error = {verifier.get_error():.2}",
                    RESET_CODE)

        for dependency in self.dependencies:
            lhs_idx = column_positions(dependency.lhs)
            rhs_idx = column_positions(dependency.rhs)

            # Skip dependencies that cannot be mapped onto df's columns.
            if not lhs_idx or not rhs_idx or -1 in lhs_idx or -1 in rhs_idx:
                continue

            algo = aind_verify((df, lhs_idx), (df, rhs_idx))
            print_results_for_ind(algo)

In [23]:
def convert_ind(ind:desbordante.ind.IND) -> Tuple[list, list]:
    """
    Extracts the bracketed parts of a dependency's string form.

    The text of `str(ind)` is split on "->"; the first piece is treated as the
    left-hand side and the last piece as the right-hand side. For each side,
    every run of text enclosed in square brackets is collected.

    Returns:
        Tuple[list, list]: (LHS matches, RHS matches), each a list of the
        strings found between "[" and "]".
    """
    # Matches the content of one pair of square brackets (no nesting)
    bracketed = re.compile(r"\[([^\[\]]+)\]")

    pieces = str(ind).split("->")
    left_text, right_text = pieces[0].strip(), pieces[-1].strip()

    return bracketed.findall(left_text), bracketed.findall(right_text)

In [19]:
# Run discovery with the same table passed twice, so dependencies are searched
# between columns of `df` itself; 0.8 is handed to find_ainds as `error`.
results = find_ainds([df,df], error = 0.8)

print(f"There are {len(results)} inclusion dependencies.")

# Print each discovered dependency on its own line.
for ind in results:
    print(ind)

Algorthm: <class 'desbordante.ind.algorithms.Spider'>
There are 0 inclusion dependencies.


In [24]:
# Turn each discovered dependency into LHS/RHS lists with convert_ind and
# collect them in an InclusionDependencySet.
aind_set = InclusionDependencySet()
for result in results:
    lhs, rhs =  convert_ind(result)
    aind_set.add_dependency(lhs, rhs)

In [25]:
# Verify every collected dependency against df; validate_ind prints the
# outcome of each check.
aind_set.validate_ind(df)