# Relational Algebra Operators

In [21]:
import pandas as pd

def sigma(df: pd.DataFrame, column: str, operator: str, value: object) -> pd.DataFrame:
    """
    Selection operator : σ, to filter rows
    Arguments:
        df: the dataframe to filter
        column: the column name (ex. RACE)
        operator: a comparison operator, one of ==, !=, <, <=, >, >=
        value: the value to compare column with (ex. "Cat"); it is compared
            as given, so pass a number to filter a numeric column
    Returns:
        The rows of df for which "df[column] <operator> value" holds,
        with their original index labels.
    Raises:
        ValueError: if operator is not one of the supported comparisons
    """
    # The parameter is called `operator`, so the stdlib module needs an alias.
    import operator as _op

    # Explicit dispatch instead of eval(): no code is built from the
    # arguments, so quotes in `value` or a crafted `operator` are harmless.
    comparisons = {
        "==": _op.eq,
        "!=": _op.ne,
        "<": _op.lt,
        "<=": _op.le,
        ">": _op.gt,
        ">=": _op.ge,
    }
    try:
        compare = comparisons[operator.strip()]
    except KeyError:
        raise ValueError(
            f"Unsupported operator {operator!r}; expected one of {sorted(comparisons)}"
        ) from None

    return df[compare(df[column], value)]

def pi(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Projection operator : ∏, to keep only some columns
    Arguments:
        df: the dataframe to take the columns from
        columns: the names of the columns to keep (ex. ["RACE", "AGE"])
    Returns:
        df restricted to the requested columns; all rows are kept,
        duplicates included.
    """
    projected = df[columns]
    return projected

def union(dfA: pd.DataFrame, dfB: pd.DataFrame) -> pd.DataFrame:
    """
    Union operator : ∪, rows of A followed by rows of B, each distinct row once
    Arguments:
        dfA: the first dataframe
        dfB: the second dataframe
    Returns:
        The stacked rows with duplicates removed and a fresh 0..n-1 index.
    """
    stacked = pd.concat([dfA, dfB])
    distinct = stacked.drop_duplicates()
    # Both inputs bring their own index labels; renumber the result.
    return distinct.reset_index(drop=True)

def intersection(dfA: pd.DataFrame, dfB: pd.DataFrame) -> pd.DataFrame:
    """
    Intersection operator : ∩, rows present in both A and B
    Arguments:
        dfA: the first dataframe
        dfB: the second dataframe
    Returns:
        The inner merge of A and B on the columns they have in common.
    """
    # With no explicit key, merge matches on every shared column name.
    common_rows = dfA.merge(dfB, how='inner')
    return common_rows

def substraction(dfA: pd.DataFrame, dfB: pd.DataFrame) -> pd.DataFrame:
    """
    Substraction operator : -, rows that are in A but not in B
    Arguments:
        dfA: the first dataframe
        dfB: the second dataframe
    Returns:
        The rows of the left merge of A with B that found no match in B.
    """
    # indicator=True appends a "_merge" column telling where each row came from.
    merged = pd.merge(dfA, dfB, how='left', indicator=True)
    only_in_a = merged["_merge"] == "left_only"
    # "_merge" is the last column; slice it off so only data columns remain.
    return merged[only_in_a].iloc[:, :-1]

def cartesian_product(dfA: pd.DataFrame, dfB: pd.DataFrame) -> pd.DataFrame:
    """
    Cartesian product operator : X, every row of A paired with every row of B
    Arguments:
        dfA: the first dataframe
        dfB: the second dataframe
    Returns:
        The cross merge of A and B.
    """
    all_pairs = pd.merge(dfA, dfB, how='cross')
    return all_pairs

def division(dfA: pd.DataFrame, dfB: pd.DataFrame, keyA: str, keyB: str) -> pd.DataFrame:
    """
    Division operator : ÷, to get the keyA values of A paired with all keyB values of B
    Arguments:
        dfA: the dividend dataframe, holding both the keyA and keyB columns
        dfB: the divisor dataframe, holding the keyB column
        keyA: the column of A whose values are returned (ex. RACE)
        keyB: the column present in both A and B (ex. OWNER_NAME)
    Returns:
        A one-column dataframe named keyA, listing each qualifying value once,
        in the order it first appears in A.
    """
    # Every one of these values must be paired with a keyA value in A.
    required = set(dfB[keyB])

    quotient = []
    # Walk the distinct values in order of first appearance so the result
    # is deterministic (iterating a set of strings is not stable across runs).
    for candidate in dfA[keyA].drop_duplicates():
        paired = set(dfA.loc[dfA[keyA] == candidate, keyB])
        if paired >= required:
            quotient.append(candidate)

    return pd.DataFrame({keyA: quotient})

def join(dfA: pd.DataFrame, dfB: pd.DataFrame, key: str) -> pd.DataFrame:
    """
    Join : ⋈, to extend rows of A with the columns of matching rows of B
    Arguments:
        dfA: the first dataframe
        dfB: the second dataframe
        key: the name of the column, present in both, to match rows on
    Returns:
        The merge of A and B on key (pandas' default merge type).
    """
    joined = dfA.merge(dfB, on=key)
    return joined

def theta_join(dfA: pd.DataFrame, dfB: pd.DataFrame, func) -> pd.DataFrame:
    """
    Theta Join : ⋈θ, to filter a cartesian product on a condition
    Arguments:
        dfA: the first dataframe
        dfB: the second dataframe
        func: a function called once per row of the cartesian product; it can
            read the columns of dfA and dfB from that row and returns a boolean
    Returns:
        The rows of the cartesian product of A and B for which func is true.
    """
    temp = cartesian_product(dfA, dfB)

    # Nothing to test: an empty product has no rows for func to judge.
    if temp.empty:
        return temp

    # Apply the caller's predicate (not a global named `f`) row by row.
    return temp[temp.apply(func, axis=1)]

def semi_join(dfA: pd.DataFrame, dfB: pd.DataFrame, key: str) -> pd.DataFrame:
    """
    Semi-join : ⋉, to keep the rows of A whose key value also occurs in B
    Arguments:
        dfA: the dataframe to filter
        dfB: the dataframe providing the accepted key values
        key: the name of the column, present in both, to filter on
    Returns:
        The matching rows of A, with A's columns and index labels only.
    """
    accepted_values = set(dfB[key])
    keep_row = dfA[key].isin(accepted_values)
    return dfA[keep_row]

# Data to test operators

In [5]:
# Table 1: seven animals, one row per animal
L1 = ["Cat", "Dog", "Cow", "Cow", "Dog", "Cow", "Cat"]  # race
L2 = [6, 4, 2, 9, 7, 3, 15]  # age
L3 = ["Tom", "Cindy", "Amy", "Bob", "Peter", "Jacob", "Lisa"]  # owner name

Animals1 = pd.DataFrame({
    "RACE": L1,
    "AGE": L2,
    "OWNER_NAME": L3,
})

# Table 2: three animals; the first two rows also appear in Animals1
L1 = ["Cat", "Dog", "Horse"]  # race
L2 = [6, 4, 15]  # age
L3 = ["Tom", "Cindy", "Mickael"]  # owner name

Animals2 = pd.DataFrame({
    "RACE": L1,
    "AGE": L2,
    "OWNER_NAME": L3,
})

# Table 3: one row of features per race
L4 = ["Cat", "Dog", "Horse", "Cow"]  # race
L5 = ["Kittyish", "Dogish", "Horsish", "Cowish"]  # feature 1
L6 = [2, 6, 8, 10]  # feature 2

AnimalFeature = pd.DataFrame({
    "RACE": L4,
    "ANIMAL_FEATURE": L5,
    "SIZE": L6,
})

# Use cases

In [4]:
# Selection: keep only the rows of Animals1 whose RACE is "Cat"
sigma(Animals1, "RACE", "==", "Cat")

Unnamed: 0,RACE,AGE,OWNER_NAME
0,Cat,6,Tom
6,Cat,15,Lisa


In [5]:
# Projection: keep only the RACE and AGE columns of Animals1
pi(Animals1, ["RACE", "AGE"])

Unnamed: 0,RACE,AGE
0,Cat,6
1,Dog,4
2,Cow,2
3,Cow,9
4,Dog,7
5,Cow,3
6,Cat,15


In [6]:
# Union: rows of Animals1 and Animals2, each distinct row listed once
union(Animals1, Animals2)

Unnamed: 0,RACE,AGE,OWNER_NAME
0,Cat,6,Tom
1,Dog,4,Cindy
2,Cow,2,Amy
3,Cow,9,Bob
4,Dog,7,Peter
5,Cow,3,Jacob
6,Cat,15,Lisa
7,Horse,15,Mickael


In [7]:
# Intersection: rows found in both Animals1 and Animals2
intersection(Animals1, Animals2)

Unnamed: 0,RACE,AGE,OWNER_NAME
0,Cat,6,Tom
1,Dog,4,Cindy


In [8]:
# Substraction: rows of Animals1 that are not in Animals2
substraction(Animals1, Animals2)

Unnamed: 0,RACE,AGE,OWNER_NAME
2,Cow,2,Amy
3,Cow,9,Bob
4,Dog,7,Peter
5,Cow,3,Jacob
6,Cat,15,Lisa


In [9]:
# Cartesian product: every row of Animals1 paired with every row of Animals2
cartesian_product(Animals1, Animals2)

Unnamed: 0,RACE_x,AGE_x,OWNER_NAME_x,RACE_y,AGE_y,OWNER_NAME_y
0,Cat,6,Tom,Cat,6,Tom
1,Cat,6,Tom,Dog,4,Cindy
2,Cat,6,Tom,Horse,15,Mickael
3,Dog,4,Cindy,Cat,6,Tom
4,Dog,4,Cindy,Dog,4,Cindy
5,Dog,4,Cindy,Horse,15,Mickael
6,Cow,2,Amy,Cat,6,Tom
7,Cow,2,Amy,Dog,4,Cindy
8,Cow,2,Amy,Horse,15,Mickael
9,Cow,9,Bob,Cat,6,Tom


In [25]:
# Table for division / semi-join: seven animals, several sharing an owner
L1 = ["Cat", "Cat", "Cow", "Cow", "Dog", "Dog", "Chicken"]  # race
L2 = [6, 4, 2, 9, 7, 3, 5]  # age
L3 = ["Tom", "Cindy", "Tom", "Tom", "Cindy", "Tom", "Gerard"]  # owner name

Animals3 = pd.DataFrame({
    "RACE": L1,
    "AGE": L2,
    "OWNER_NAME": L3,
})

# The owner names to divide by / filter on
L3 = ["Tom", "Cindy"]

OwnerNames = pd.DataFrame({
    "OWNER_NAME": L3,
})

In [26]:
# Division: the races that have an animal owned by every owner in OwnerNames
division(Animals3, OwnerNames, "RACE", "OWNER_NAME")

Unnamed: 0,RACE
0,Cat
1,Dog


In [27]:
# Semi-join: rows of Animals3 whose OWNER_NAME appears in OwnerNames
semi_join(Animals3, OwnerNames, "OWNER_NAME")

Unnamed: 0,RACE,AGE,OWNER_NAME
0,Cat,6,Tom
1,Cat,4,Cindy
2,Cow,2,Tom
3,Cow,9,Tom
4,Dog,7,Cindy
5,Dog,3,Tom


In [10]:
# Join: add the AnimalFeature columns to each Animals1 row with the same RACE
join(Animals1, AnimalFeature, "RACE")

Unnamed: 0,RACE,AGE,OWNER_NAME,ANIMAL_FEATURE,SIZE
0,Cat,6,Tom,Kittyish,2
1,Cat,15,Lisa,Kittyish,2
2,Dog,4,Cindy,Dogish,6
3,Dog,7,Peter,Dogish,6
4,Cow,2,Amy,Cowish,10
5,Cow,9,Bob,Cowish,10
6,Cow,3,Jacob,Cowish,10


In [11]:
def f(df):
    """
    Predicate for the theta join example
    Arguments:
        df: one row of the joined data, exposing "SIZE" and "AGE"
    Returns:
        True when the row's AGE is strictly greater than its SIZE.
    """
    return df["AGE"] > df["SIZE"]

# Theta join: pairs of Animals1 and AnimalFeature rows where SIZE < AGE
theta_join(Animals1, AnimalFeature, f)

Unnamed: 0,RACE_x,AGE,OWNER_NAME,RACE_y,ANIMAL_FEATURE,SIZE
0,Cat,6,Tom,Cat,Kittyish,2
4,Dog,4,Cindy,Cat,Kittyish,2
12,Cow,9,Bob,Cat,Kittyish,2
13,Cow,9,Bob,Dog,Dogish,6
14,Cow,9,Bob,Horse,Horsish,8
16,Dog,7,Peter,Cat,Kittyish,2
17,Dog,7,Peter,Dog,Dogish,6
20,Cow,3,Jacob,Cat,Kittyish,2
24,Cat,15,Lisa,Cat,Kittyish,2
25,Cat,15,Lisa,Dog,Dogish,6
