## Functional Dependency Discovery and Validation

In [1]:
from typing import Tuple, List, Dict, Any
from dataclasses import dataclass, field

import pandas as pd
import desbordante
import desbordante.fd.algorithms as fd_algorithms

from pprint import pprint

In [2]:
# Relative path to the preprocessed food-inspections table (parquet).
file_path = "../../data/Food_Inspections_20250216_preprocessed.parquet"
df = pd.read_parquet(file_path)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,dba_name,license_,facility_type,risk,address,city,zip,inspection_date,results,violations,latitude,longitude
0,BLOOMING BUD DAYCARE,2215789,Daycare Combo 1586,Risk 1 (High),5715 N LINCOLN AVE,CHICAGO,60659,03/07/2013,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.98539,-87.698734
1,Babas Halal,2684170,Restaurant,Risk 1 (High),7901 S DAMEN AVE,CHICAGO,60620,12/03/2024,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.750189,-87.672986
2,FIRST ZABIHA MEAT BAZAAR,2232559,Grocery Store,Risk 2 (Medium),2907 W DEVON AVE,CHICAGO,60659,02/20/2013,Fail,"11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, D...",41.997401,-87.702385
3,HAPPY MARKET,2912802,Grocery Store,Risk 2 (Medium),2334 S WENTWORTH AVE,CHICAGO,60616,01/02/2025,Pass w/ Conditions,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,41.849954,-87.632094
4,SAT KAIVAL FOOD INC/SUBWAY,2728400,Restaurant,Risk 1 (High),1916 S STATE ST,CHICAGO,60616,01/02/2025,Pass,36. THERMOMETERS PROVIDED & ACCURATE - Comment...,41.856053,-87.627311


In [3]:

def find_fds(df, algorithm_name='Default'):
    """
    Finds functional dependencies in a given DataFrame using a specified algorithm.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        algorithm_name (str): Name of an algorithm class exposed by
            desbordante.fd.algorithms. Defaults to 'Default'.

    Returns:
        list: A list of discovered functional dependencies.

    Raises:
        ValueError: If `algorithm_name` is not an attribute of
            desbordante.fd.algorithms.
    """
    # Look the name up WITHOUT a fallback: a misspelled algorithm name must be
    # reported, not silently replaced by the default algorithm.
    algo_class = getattr(fd_algorithms, algorithm_name, None)
    if algo_class is None:
        raise ValueError(f"Algorithm '{algorithm_name}' not found. Available algorithms: {[name for name in dir(fd_algorithms) if not name.startswith('_')]}")

    print(f"Algorthm: {algo_class}")

    # Run the algorithm outside any try/except so that errors raised while
    # loading data or executing surface as themselves instead of being
    # relabelled as "algorithm not found".
    algo = algo_class()
    algo.load_data(table=df)
    algo.execute()
    return algo.get_fds()

In [4]:
@dataclass
class FunctionalDependency:
    """A single functional dependency: the `lhs` attributes determine `rhs`."""
    lhs: List[str]  # Determinant (left-hand side) attribute names
    rhs: str        # Dependent (right-hand side) attribute name

    def __str__(self):
        """Render as ``LHS=<list> (<number of LHS attributes>), RHS=<rhs>``."""
        return f"LHS={self.lhs} ({len(self.lhs)}), RHS={self.rhs}"
    
@dataclass
class FunctionalDependencySet:
    """
    A collection of functional dependencies together with the outcome of
    validating them against a DataFrame.
    """
    # Dependencies in insertion order. The element type is a forward reference
    # so this class does not depend on where FunctionalDependency is defined.
    dependencies: List["FunctionalDependency"] = field(default_factory=list)
    # Maps (tuple(lhs), rhs) to {"holds", "num_violations", "highlights"};
    # filled in by validate_fd_dependencies.
    validation_results: Dict[Tuple[Tuple[str, ...], str], Dict[str, Any]] = field(default_factory=dict)

    def add_dependency(self, lhs: List[str], rhs: str):
        """Adds a new functional dependency to the set."""
        self.dependencies.append(FunctionalDependency(lhs, rhs))

    def __len__(self):
        """Returns the number of functional dependencies."""
        return len(self.dependencies)

    def __iter__(self):
        """Allows iteration over functional dependencies."""
        return iter(self.dependencies)

    def validate_fd_dependencies(self, df):
        """
        Validates all functional dependencies in the dataset and stores the results.

        For every dependency whose attributes all exist in `df.columns`, the FD
        verifier is run and an entry is written to `self.validation_results`
        under the key ``(tuple(lhs), rhs)``. A coloured status line is printed
        for each verified FD.

        A dependency is skipped (with a printed notice, and without touching
        `self.validation_results`) when its LHS is empty or when any LHS/RHS
        attribute is not a column of `df`.

        Parameters:
            df (pd.DataFrame): The table to validate the dependencies against.
        """
        GREEN_CODE = "\033[1;42m"
        RED_CODE = "\033[1;41m"
        DEFAULT_COLOR_CODE = "\033[1;49m"

        verifier = desbordante.fd_verification.algorithms.Default()

        verifier.load_data(table=df)

        for fd in self.dependencies:
            if not fd.lhs:
                # NOTE(review): whether the verifier accepts an empty LHS is not
                # established here, so such FDs are skipped rather than guessed at.
                print(f"Skipping FD with empty LHS: {fd.lhs} -> {fd.rhs}")
                continue

            # Resolve every attribute (LHS and RHS) in one pass; get_indexer
            # reports an unknown label as -1 instead of raising.
            names = [*fd.lhs, fd.rhs]
            positions = df.columns.get_indexer(names)
            missing = [name for name, pos in zip(names, positions) if pos == -1]
            if missing:
                # Check ALL attributes, not just the first LHS one: a -1 passed
                # on as a column index would not refer to the intended column.
                print(f"Skipping FD with unknown column(s) {missing}: {fd.lhs} -> {fd.rhs}")
                continue

            # Hand plain Python ints to the verifier.
            lhs_idx = [int(pos) for pos in positions[:-1]]
            rhs_idx = int(positions[-1])

            verifier.execute(lhs_indices=lhs_idx, rhs_indices=[rhs_idx])
            highlights = verifier.get_highlights()

            fd_key = (tuple(fd.lhs), fd.rhs)
            self.validation_results[fd_key] = {
                "holds": verifier.fd_holds(),
                "num_violations": verifier.get_num_error_clusters(),
                "highlights": highlights
            }

            if self.validation_results[fd_key]["holds"]:
                print(GREEN_CODE, f"FD holds: {fd.lhs} -> {fd.rhs}", DEFAULT_COLOR_CODE)
            else:
                print(RED_CODE, f"FD does not hold: {fd.lhs} -> {fd.rhs}", DEFAULT_COLOR_CODE)
                print(f"Number of clusters violating FD: {self.validation_results[fd_key]['num_violations']}")

    def get_validation_result(self, lhs: List[str], rhs: str) -> Dict[str, Any]:
        """
        Retrieves stored validation results for a specific FD.

        Returns an empty dict when no result is stored for ``(tuple(lhs), rhs)``.
        """
        fd_key = (tuple(lhs), rhs)
        return self.validation_results.get(fd_key, {})

    def get_all_validation_results(self) -> Dict[Tuple[Tuple[str, ...], str], Dict[str, Any]]:
        """Returns all stored validation results, keyed by ``(tuple(lhs), rhs)``."""
        return self.validation_results


In [5]:
def convert_fd(fd:desbordante.fd.FD) -> Tuple[list, str]:
    """
    Split an FD's string form into its LHS attribute list and RHS attribute.

    The text of ``str(fd)`` is cut at "->"; the first piece is taken as the
    bracketed LHS and the last piece as the RHS. The LHS brackets are dropped
    and the remainder is split on single spaces, so an empty LHS ("[]")
    yields ``['']``.

    Returns:
        (lhs_attributes, rhs_attribute)
    """
    pieces = [piece.strip() for piece in str(fd).split("->")]
    lhs_text, rhs_name = pieces[0], pieces[-1]

    # Drop the first and last character (the brackets), then split on spaces.
    lhs_attributes = lhs_text[1:-1].split(' ')

    return lhs_attributes, rhs_name

In [6]:
# Mine FDs from the loaded table with find_fds' default algorithm_name.
results = find_fds(df)

print(f"There are {len(results)} functional dependencies using Default algorithm.")

# Print every discovered FD, one per line.
for fd in results:
    print(fd)

Algorthm: <class 'desbordante.fd.algorithms.HyFD'>
There are 35 functional dependencies using Default algorithm.
[dba_name license_ address violations] -> zip
[dba_name license_ violations latitude] -> zip
[dba_name license_ violations longitude] -> zip
[dba_name license_ facility_type inspection_date results violations] -> zip
[license_ address inspection_date violations] -> city
[license_ facility_type address violations] -> zip
[license_ risk address violations] -> zip
[license_ inspection_date violations latitude] -> zip
[license_ facility_type violations latitude] -> zip
[license_ risk results violations latitude] -> zip
[license_ inspection_date violations longitude] -> zip
[license_ facility_type violations longitude] -> zip
[license_ risk results violations longitude] -> zip
[dba_name facility_type address violations] -> zip
[dba_name address results violations] -> zip
[dba_name license_ inspection_date latitude] -> zip
[dba_name license_ inspection_date longitude] -> zip
[dba_

Collect the discovered FDs as attribute names so that their column indices can be retrieved when validating each FD.

In [7]:
# Build a fresh FunctionalDependencySet from the mined FDs, parsing each one
# into its LHS attribute names and RHS attribute name.
fd_set = FunctionalDependencySet()
for result in results:
    lhs, rhs =  convert_fd(fd=result)
    fd_set.add_dependency(lhs, rhs)

In [10]:
# Validate all dependencies against df and store the results on fd_set;
# a coloured status line is printed for each FD that gets verified.
fd_set.validate_fd_dependencies(df)

[1;42m FD holds: ['dba_name', 'license_', 'address', 'violations'] -> zip [1;49m
[1;42m FD holds: ['dba_name', 'license_', 'violations', 'latitude'] -> zip [1;49m
[1;42m FD holds: ['dba_name', 'license_', 'violations', 'longitude'] -> zip [1;49m
[1;42m FD holds: ['dba_name', 'license_', 'facility_type', 'inspection_date', 'results', 'violations'] -> zip [1;49m
[1;42m FD holds: ['license_', 'address', 'inspection_date', 'violations'] -> city [1;49m
[1;42m FD holds: ['license_', 'facility_type', 'address', 'violations'] -> zip [1;49m
[1;42m FD holds: ['license_', 'risk', 'address', 'violations'] -> zip [1;49m
[1;42m FD holds: ['license_', 'inspection_date', 'violations', 'latitude'] -> zip [1;49m
[1;42m FD holds: ['license_', 'facility_type', 'violations', 'latitude'] -> zip [1;49m
[1;42m FD holds: ['license_', 'risk', 'results', 'violations', 'latitude'] -> zip [1;49m
[1;42m FD holds: ['license_', 'inspection_date', 'violations', 'longitude'] -> zip [1;49m
[1;42m 

In [11]:
# len(fd_set) counts the stored dependencies, whether or not they were validated.
print(f"There are {len(fd_set)} fds in the dataset.")

There are 35 fds in the dataset.


In [12]:
# Retrieve validation result for a specific FD
# (an empty dict comes back if nothing is stored for this lhs/rhs pair).
result = fd_set.get_validation_result(lhs=['dba_name', 'license_', 'address', 'violations'],rhs="zip")
print(result)

{'holds': True, 'num_violations': 0, 'highlights': []}


In [13]:
# Summarise every stored validation result, one line per FD.
all_results = fd_set.get_all_validation_results()

# Keys are (lhs_tuple, rhs) pairs; values are the per-FD result dicts.
for (lhs, rhs), result in all_results.items():
    # Create a copy of result without the 'highlights' key
    filtered_result = {key: value for key, value in result.items() if key != "highlights"}
    
    print(f"FD: {lhs} -> {rhs}, Results: {filtered_result}")


FD: ('dba_name', 'license_', 'address', 'violations') -> zip, Results: {'holds': True, 'num_violations': 0}
FD: ('dba_name', 'license_', 'violations', 'latitude') -> zip, Results: {'holds': True, 'num_violations': 0}
FD: ('dba_name', 'license_', 'violations', 'longitude') -> zip, Results: {'holds': True, 'num_violations': 0}
FD: ('dba_name', 'license_', 'facility_type', 'inspection_date', 'results', 'violations') -> zip, Results: {'holds': True, 'num_violations': 0}
FD: ('license_', 'address', 'inspection_date', 'violations') -> city, Results: {'holds': True, 'num_violations': 0}
FD: ('license_', 'facility_type', 'address', 'violations') -> zip, Results: {'holds': True, 'num_violations': 0}
FD: ('license_', 'risk', 'address', 'violations') -> zip, Results: {'holds': True, 'num_violations': 0}
FD: ('license_', 'inspection_date', 'violations', 'latitude') -> zip, Results: {'holds': True, 'num_violations': 0}
FD: ('license_', 'facility_type', 'violations', 'latitude') -> zip, Results: {'h