# Import Utility Functions Notebook

In [1]:
# Execute the companion notebook in this kernel. Presumably it defines the check_* helpers
# (check_ip, check_date_fmt1, check_gender, check_mail) that RuleEngine calls - TODO confirm.
%run ./UtilityFunctions.ipynb 

# Import Dependencies

In [2]:
import findspark
findspark.init()
import pyspark # only run after findspark.init()
from pyspark.sql.functions import *
from pyspark.sql.functions import col, explode,coalesce,udf,pandas_udf
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType,StructType,StringType,ArrayType
# Reuse the already-running SparkSession if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
import datetime
from datetime import timedelta 
from pyspark.sql.functions import split
from pyspark.sql import functions
from pyspark.sql.window import *
import pandas as pd
import json

# Schemas for Rule UDF and Transformation UDF Return Types


In [3]:
# Transformation UDFs return a single string: the transformed / output column value.
trans_schema = StringType()
# Rule UDFs return an array of two strings: [output column value, rule status value].
rule_schema = ArrayType(elementType=StringType())

In [4]:
# Class containing all DQ rules and transformations. Every rule function takes its input
# column value(s) first, followed by two trailing arguments supplied by udf_caller.
class RuleEngine(object):
    """Collection of data-quality rules (R001-R004) and a transformation (R005).

    Rule functions (R001-R004) return a two-element list
    ``[output_value, status]``: ``status`` is ``""`` when the value passes and
    ``f"{rid}_{categ}"`` when it fails. Transformation functions (R005) return
    a single string. ``udf_caller`` wraps the requested function in a Spark UDF
    with the matching return schema and binds it to its input column(s).

    NOTE(review): ``udf_caller`` passes the rule *category* first and the rule
    *ID* second, so inside the rule functions the parameter named ``rid``
    actually holds the category and ``categ`` holds the rule ID. The emitted
    status therefore reads ``<category>_<ruleID>``. The parameter names are
    kept as-is for backward compatibility.

    The rule functions are deliberately self-contained (they do not reference
    the class) so each one can be handed to ``udf`` on its own.
    """

    # Input column name(s) for each rule, in the order the rule function expects them.
    _RULE_INPUT_COLUMNS = {
        "R001": ("ip_address",),
        "R002": ("emp_join_date",),
        "R003": ("gender",),
        "R004": ("email",),
        "R005": ("id",),
    }

    @staticmethod
    def R001(ipaddr, rid, categ):
        """Validate an IP address.

        Args:
            ipaddr: Raw IP address value; may be None for a null column.
            rid: First trailing argument (see class NOTE on naming).
            categ: Second trailing argument (see class NOTE on naming).

        Returns:
            ``[trimmed_value, ""]`` when ``check_ip`` accepts the trimmed value,
            otherwise ``[original_value, f"{rid}_{categ}"]``.
        """
        # A null column reaches the UDF as None; flag it instead of crashing on .strip().
        if ipaddr is None:
            return [None, f"{rid}_{categ}"]

        # Trim leading and trailing whitespaces
        ipaddr_trimmed = ipaddr.strip()
        if check_ip(ipaddr_trimmed):
            return [ipaddr_trimmed, ""]

        return [ipaddr, f"{rid}_{categ}"]

    @staticmethod
    def R002(join_date, rid, categ):
        """Validate an employee join date.

        Args:
            join_date: Raw join-date value; may be None for a null column.
            rid: First trailing argument (see class NOTE on naming).
            categ: Second trailing argument (see class NOTE on naming).

        Returns:
            ``[trimmed_value, ""]`` when ``check_date_fmt1`` accepts the trimmed
            value, otherwise ``[original_value, f"{rid}_{categ}"]``.
        """
        # Null input fails the rule rather than raising AttributeError.
        if join_date is None:
            return [None, f"{rid}_{categ}"]

        # Trim leading and trailing whitespaces
        join_date_trimmed = join_date.strip()
        if check_date_fmt1(join_date_trimmed):
            return [join_date_trimmed, ""]

        return [join_date, f"{rid}_{categ}"]

    @staticmethod
    def R003(gender, rid, categ):
        """Validate and cleanse an employee gender value.

        Args:
            gender: Raw gender value; may be None for a null column.
            rid: First trailing argument (see class NOTE on naming).
            categ: Second trailing argument (see class NOTE on naming).

        Returns:
            ``[cleansed_value, ""]`` when ``check_gender`` returns a truthy
            cleansed value, otherwise ``[original_value, f"{rid}_{categ}"]``.
        """
        # Null input fails the rule rather than raising AttributeError.
        if gender is None:
            return [None, f"{rid}_{categ}"]

        # Trim leading and trailing whitespaces
        gender_trimmed = gender.strip()
        # Unlike the other rules, the helper's (truthy) return value is the output value.
        clnsed_gender = check_gender(gender_trimmed)
        if clnsed_gender:
            return [clnsed_gender, ""]

        return [gender, f"{rid}_{categ}"]

    @staticmethod
    def R004(empmail, rid, categ):
        """Validate an employee e-mail address.

        Per the original comment, ``check_mail`` checks for ``msn.net``; the
        helper is defined outside this file, so confirm the exact criterion there.

        Args:
            empmail: Raw e-mail value; may be None for a null column.
            rid: First trailing argument (see class NOTE on naming).
            categ: Second trailing argument (see class NOTE on naming).

        Returns:
            ``[trimmed_value, ""]`` when ``check_mail`` accepts the trimmed
            value, otherwise ``[original_value, f"{rid}_{categ}"]``.
        """
        # Null input fails the rule rather than raising AttributeError.
        if empmail is None:
            return [None, f"{rid}_{categ}"]

        # Trim leading and trailing whitespaces
        empmail_trimmed = empmail.strip()
        if check_mail(empmail_trimmed):
            return [empmail_trimmed, ""]

        return [empmail, f"{rid}_{categ}"]

    @staticmethod
    def R005(empid, rid, categ):
        """Transformation: prefix the employee ID with ``EMP_``.

        Returns a plain string (not a two-element list), so it must be invoked
        through ``udf_caller`` with ``rule_categ == "TRANS"`` to get the string
        return schema.

        Args:
            empid: Employee ID of any type; converted with ``str``. May be None.
            rid: Unused; accepted so every rule shares the same call shape.
            categ: Unused; accepted so every rule shares the same call shape.

        Returns:
            ``"EMP_" + trimmed id``, or None when ``empid`` is None.
        """
        # Propagate nulls: str(None) would otherwise fabricate the ID "EMP_None".
        if empid is None:
            return None

        # Trim leading and trailing whitespaces
        empid_trimmed = str(empid).strip()
        return "EMP_" + empid_trimmed

    # Function that calls ruleUDF dynamically
    @classmethod
    def udf_caller(cls, ruleID, rule_categ):
        """Build the UDF column expression for one rule.

        Args:
            ruleID: Name of the rule function on this class, e.g. ``"R001"``.
            rule_categ: Rule category; ``"TRANS"`` selects the string return
                schema (``trans_schema``), anything else selects the
                array-of-strings schema (``rule_schema``).

        Returns:
            The column expression produced by applying the rule's UDF to its
            input column(s), followed by ``lit(rule_categ)`` and ``lit(ruleID)``.

        Raises:
            AttributeError: if ``ruleID`` is not an attribute of this class.
            Exception: if ``ruleID`` is an attribute but has no input-column mapping.
        """
        # Get a handle to the rule function by name (plain function, not bound to cls).
        rule_func = getattr(cls, ruleID)

        input_columns = cls._RULE_INPUT_COLUMNS.get(ruleID)
        if input_columns is None:
            raise Exception("Rule doesnt exist in Dictionary")

        # Transformations return a single string; rules return [value, status].
        return_schema = trans_schema if rule_categ == "TRANS" else rule_schema
        rule_udf = udf(rule_func, return_schema)

        # Only the requested expression is built. The trailing literals are passed as
        # (category, rule ID), matching the original call order.
        return rule_udf(*input_columns, lit(rule_categ), lit(ruleID))