## Functional Dependencies

Discover functional dependencies among the columns of the dataset.

- Functional Dependencies
- Approximate Functional Dependencies
- Pruning

In [15]:
# `inspect` is Rich's object inspector and is exported from the top-level `rich`
# package; `rich.console` only provides `Console`.
from rich import inspect
from rich.console import Console

In [12]:
# Shared Rich console instance; none of the cells visible in this section use it.
console = Console()

In [1]:
import pandas as pd
import desbordante


In [2]:
# Relative path: resolved against the notebook's current working directory.
file_path = "../data/Food_Inspections_20250216.csv"
# header=[0] takes the column names from the first row of the CSV.
df = pd.read_csv(file_path, header=[0])
# Preview the first rows as the cell's output.
df.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,1106427,BLOOMING BUD DAYCARE,BLOOMING BUD DAYCARE,2215789.0,Daycare Combo 1586,Risk 1 (High),5715 N LINCOLN AVE,CHICAGO,IL,60659.0,03/07/2013,License,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.98539,-87.698734,"(41.98538950526786, -87.69873407149943)"
1,2608378,Babas Halal,Babas Halal,2684170.0,Restaurant,Risk 1 (High),7901 S DAMEN AVE,CHICAGO,IL,60620.0,12/03/2024,Complaint,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.750189,-87.672986,"(41.750189342293375, -87.67298583977204)"
2,1106406,FIRST ZABIHA MEAT BAZAAR,FIRST ZABIHA MEAT BAZAAR,2232559.0,Grocery Store,Risk 2 (Medium),2907 W DEVON AVE,CHICAGO,IL,60659.0,02/20/2013,License,Fail,"11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, D...",41.997401,-87.702385,"(41.99740137039031, -87.70238538227812)"
3,2609909,HAPPY MARKET,HAPPY MARKET,2912802.0,Grocery Store,Risk 2 (Medium),2334 S WENTWORTH AVE,CHICAGO,IL,60616.0,01/02/2025,Canvass,Pass w/ Conditions,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,41.849954,-87.632094,"(41.84995400192252, -87.63209419559098)"
4,2609927,SAT KAIVAL FOOD INC/SUBWAY,SAT KAIVAL FOOD INC/SUBWAY,2728400.0,Restaurant,Risk 1 (High),1916 S STATE ST,CHICAGO,IL,60616.0,01/02/2025,Canvass,Pass,36. THERMOMETERS PROVIDED & ACCURATE - Comment...,41.856053,-87.627311,"(41.85605269621059, -87.62731125804903)"


### Functional Dependencies

In [None]:
import desbordante.fd.algorithms as fd_algorithms

In [27]:

def find_fds(df, algorithm_name='Default'):
    """
    Finds functional dependencies in a given DataFrame using a specified algorithm.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        algorithm_name (str): The name of the FD algorithm class to look up in
            ``desbordante.fd.algorithms``. Defaults to 'Default'. Valid options are
            the public (non-underscore) names reported by ``dir(fd_algorithms)``.

    Returns:
        list: A list of discovered functional dependencies.

    Raises:
        ValueError: If ``algorithm_name`` is not a public name of ``fd_algorithms``.
    """
    # Underscore-prefixed entries of dir() are module metadata, not algorithms.
    available = [name for name in dir(fd_algorithms) if not name.startswith("_")]

    # Reject unknown names explicitly instead of silently falling back to Default,
    # so a typo cannot produce results from a different algorithm than requested.
    if algorithm_name not in available:
        raise ValueError(f"Algorithm '{algorithm_name}' not found. Available algorithms: {available}")

    algo_class = getattr(fd_algorithms, algorithm_name)

    print(f"Algorthm: {algo_class}")

    # Errors raised while loading or executing are deliberately not caught here:
    # reporting them as "algorithm not found" would hide the real failure.
    algo = algo_class()
    algo.load_data(table=df)
    algo.execute()
    return algo.get_fds()

In [15]:
# List every attribute of the FD algorithms module; find_fds resolves its
# `algorithm_name` argument against these names.
dir(fd_algorithms)

['Aid',
 'DFD',
 'Default',
 'Depminer',
 'EulerFD',
 'FDep',
 'FUN',
 'FastFDs',
 'FdMine',
 'HyFD',
 'PFDTane',
 'Pyro',
 'Tane',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__']

**Using the default FD algorithm**

In [35]:
# Run FD discovery with the module's Default algorithm (no algorithm_name given).
default_results = find_fds(df)

# Report how many dependencies were returned before listing them.
print(f"There are {len(default_results)} functional dependencies using Default algorithm.")

# Print each dependency on its own line.
for fd in default_results:
    print(fd)

Algorthm: <class 'desbordante.fd.algorithms.HyFD'>
There are 235 functional dependencies using Default algorithm.
[Inspection ID] -> Violations
[Inspection ID] -> License #
[Inspection ID] -> DBA Name
[Inspection ID] -> AKA Name
[Inspection ID] -> Address
[Inspection ID] -> Latitude
[Inspection ID] -> Longitude
[Inspection ID] -> Location
[Inspection ID] -> Inspection Date
[Inspection ID] -> Facility Type
[Inspection ID] -> Zip
[Inspection ID] -> Inspection Type
[Inspection ID] -> City
[Inspection ID] -> State
[Inspection ID] -> Results
[Inspection ID] -> Risk
[DBA Name AKA Name License # Violations Latitude] -> Zip
[DBA Name AKA Name License # Violations Longitude] -> Zip
[DBA Name AKA Name License # Violations Location] -> Zip
[DBA Name AKA Name License # Zip Violations] -> City
[DBA Name License # Address City Violations] -> Zip
[DBA Name License # City Violations Latitude] -> Zip
[DBA Name License # City Violations Longitude] -> Zip
[DBA Name License # City Violations Location] -> 

**Using TANE algorithm**

In [36]:
# Run FD discovery again, this time explicitly selecting the Tane algorithm class.
tane_results = find_fds(df, "Tane")

# Report how many dependencies were returned, then print each on its own line.
print(f"There are {len(tane_results)} functional dependencies using TANE algorithm.")
for fd in tane_results:
    print(fd)

Algorthm: <class 'desbordante.fd.algorithms.Tane'>
There are 235 functional dependencies using TANE algorithm.
[Inspection ID] -> DBA Name
[Inspection ID] -> AKA Name
[Inspection ID] -> License #
[Inspection ID] -> Facility Type
[Inspection ID] -> Risk
[Inspection ID] -> Address
[Inspection ID] -> City
[Inspection ID] -> State
[Inspection ID] -> Zip
[Inspection ID] -> Inspection Date
[Inspection ID] -> Inspection Type
[Inspection ID] -> Results
[Inspection ID] -> Violations
[Inspection ID] -> Latitude
[Inspection ID] -> Longitude
[Inspection ID] -> Location
[License #] -> State
[Address] -> Latitude
[Address] -> Longitude
[Latitude] -> Longitude
[Longitude] -> Latitude
[Address] -> Location
[Latitude] -> Location
[Location] -> Latitude
[Longitude] -> Location
[Location] -> Longitude
[DBA Name Address] -> State
[AKA Name Address] -> State
[Address City] -> State
[Address Inspection Date] -> State
[AKA Name License # Address] -> City
[DBA Name Address Zip] -> City
[AKA Name Address Zip] 

### Approximate Functional Dependencies

In [37]:
import desbordante.afd.algorithms as afd_algorithms

In [39]:
def find_afds(df:pd.DataFrame, algorithm_name:str='Default', error:float=0.1) -> list:
    """
    Finds approximate functional dependencies in a given DataFrame using a specified algorithm.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        algorithm_name (str): The name of the AFD algorithm class to look up in
            ``desbordante.afd.algorithms``. Defaults to 'Default'. Valid options are
            the public (non-underscore) names reported by ``dir(afd_algorithms)``.
        error (float): Error threshold forwarded unchanged to the algorithm's
            ``execute(error=...)`` call. Defaults to 0.1.

    Returns:
        list: A list of discovered approximate functional dependencies.

    Raises:
        ValueError: If ``algorithm_name`` is not a public name of ``afd_algorithms``.
    """
    # Underscore-prefixed entries of dir() are module metadata, not algorithms.
    available = [name for name in dir(afd_algorithms) if not name.startswith("_")]

    # Reject unknown names explicitly instead of silently falling back to Default,
    # so a typo cannot produce results from a different algorithm than requested.
    if algorithm_name not in available:
        raise ValueError(f"Algorithm '{algorithm_name}' not found. Available algorithms: {available}")

    algo_class = getattr(afd_algorithms, algorithm_name)

    print(f"Algorthm: {algo_class}")

    # Errors raised while loading or executing are deliberately not caught here:
    # reporting them as "algorithm not found" would hide the real failure.
    algo = algo_class()
    algo.load_data(table=df)
    algo.execute(error=error)
    return algo.get_fds()

**Using the default AFD algorithm**

In [40]:
# Run AFD discovery with find_afds' defaults: the Default algorithm and error=0.1.
default_afd_results = find_afds(df)

# Report how many dependencies were returned, then print each on its own line.
print(f"There are {len(default_afd_results)} functional dependencies using default algorithm.")
for fd in default_afd_results:
    print(fd)

Algorthm: <class 'desbordante.fd.algorithms.Pyro'>
There are 175 functional dependencies using default algorithm.
[] -> State
[] -> City
[Inspection ID] -> License #
[Inspection ID] -> DBA Name
[Inspection ID] -> AKA Name
[Inspection ID] -> Inspection Type
[Inspection ID] -> Risk
[Inspection ID] -> Address
[Inspection ID] -> Facility Type
[Inspection ID] -> Zip
[Inspection ID] -> Violations
[Inspection ID] -> Inspection Date
[Inspection ID] -> Results
[License #] -> Inspection ID
[Address] -> Inspection ID
[Inspection Date] -> Inspection ID
[DBA Name] -> Risk
[Latitude] -> Inspection ID
[License #] -> Zip
[Longitude] -> Inspection ID
[Location] -> Inspection ID
[License #] -> Risk
[DBA Name] -> Inspection ID
[Address] -> Zip
[AKA Name] -> Inspection ID
[Zip] -> Inspection ID
[Violations] -> Inspection ID
[AKA Name] -> Risk
[License #] -> Violations
[Latitude] -> Zip
[DBA Name] -> Facility Type
[License #] -> Address
[License #] -> DBA Name
[DBA Name] -> AKA Name
[License #] -> Results


**Using the TANE algorithm (error threshold 0.3)**

In [41]:
# Run AFD discovery with the Tane class and error=0.3 instead of find_afds' default 0.1.
tane_afd_results = find_afds(df, algorithm_name="Tane", error=0.3)

# The summary names the algorithm actually used in this cell (TANE, not the default).
print(f"There are {len(tane_afd_results)} approximate functional dependencies using TANE algorithm.")
for fd in tane_afd_results:
    print(fd)

Algorthm: <class 'desbordante.fd.algorithms.Tane'>
There are 219 functional dependencies using default algorithm.
[] -> City
[] -> State
[Inspection ID] -> DBA Name
[Inspection ID] -> AKA Name
[Inspection ID] -> License #
[Inspection ID] -> Facility Type
[Inspection ID] -> Risk
[Inspection ID] -> Address
[Inspection ID] -> Zip
[Inspection ID] -> Inspection Date
[Inspection ID] -> Inspection Type
[Inspection ID] -> Results
[Inspection ID] -> Violations
[Inspection ID] -> Latitude
[Inspection ID] -> Longitude
[Inspection ID] -> Location
[DBA Name] -> Inspection ID
[AKA Name] -> Inspection ID
[DBA Name] -> AKA Name
[AKA Name] -> DBA Name
[License #] -> Inspection ID
[DBA Name] -> License #
[License #] -> DBA Name
[AKA Name] -> License #
[License #] -> AKA Name
[DBA Name] -> Facility Type
[AKA Name] -> Facility Type
[License #] -> Facility Type
[DBA Name] -> Risk
[AKA Name] -> Risk
[License #] -> Risk
[Facility Type] -> Risk
[Risk] -> Facility Type
[Address] -> Inspection ID
[DBA Name] -> 