In [17]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import yaml

from pandas import DataFrame
from typing import Dict
from typing import List
from typing import Any
from typing import Tuple

with open('config.yaml') as f:
    config = yaml.safe_load(f)

class Ingest:
    
    """
    Class to ingest dataframe input.
    """
    
    def __init__(
        self,
        config: Dict[Any, Any]
    ) -> None:
        
        """
        Inits class with the config file
        and unpacks the config file.
        """
        
        self.config = config
        
        self.unpack_config()

        
    def run(
        self
    ) -> DataFrame:
        
        """
        Run function for the class.
        
        :param None:
        :return df:
            DataFrame, ingested df
        """
        
        df = self.run_load()
        
        df = self.run_harmonize(df)
        
        return df
        
        
    def unpack_config(
        self
    ) -> None:
        
        """
        Function to unpack config vars.
        
        :var filepath:
            str, the relative filepath 
        :var group_variable:
            str, the column name for the 
            group variable of interest e.g.
            gender, which contains the target 
            class and non-target class e.g.
            females and males.
        :var group_target_val:
            str, within the group_variable column,
            contains the contains the target 
            class value e.g.
            females.
        :var group_other_val:
            str, within the group_variable column,
            contains the contains the non-target 
            class value e.g. males.
        :var outcome_variable:
            str, the column name for the 
            outcome variable of interest e.g.
            hired, which contains the target 
            class and non-target class e.g.
            hired and not-hired.
        :var outcome_target_val:
            str, within the outcome_variable column,
            contains the contains the target 
            class value e.g.
            hired.
        :var outcome_other_val:
            str, within the outcome_variable column,
            contains the contains the non-target 
            class value e.g. not-hired.
        :var grpers:
            Dict[str,str], can be any set of filterable
            columns to slice into particular groups within
            the broader employee roster. The key is the column,
            the value is the desired class within the column
            e.g. job_title: analyst.
        """
        
        config = self.config
        
        try:
            self.filepath: str = config["Ingest"]["filepath"]
            self.group_variable: str = config["Ingest"]["group_variable"]
            self.group_target_val: str = config["Ingest"]["group_target_val"]
            self.group_other_val: str = config["Ingest"]["group_other_val"]
            self.outcome_variable: str = config["Ingest"]["outcome_variable"]
            self.outcome_target_val: str = config["Ingest"]["outcome_target_val"]
            self.outcome_other_val: str = config["Ingest"]["outcome_other_val"]
            self.grpers: Dict[str, str] = config["Ingest"]["grpers"]

            # Type validation
            if not isinstance(self.filepath, str):
                raise TypeError("Expected 'filepath' to be of type 'str'.")
            if not isinstance(self.group_variable, str):
                raise TypeError("Expected 'group_variable' to be of type 'str'.")
            if not isinstance(self.group_target_val, str):
                raise TypeError("Expected 'group_target_val' to be of type 'str'.")
            if not isinstance(self.group_other_val, str):
                raise TypeError("Expected 'group_other_val' to be of type 'str'.")
            if not isinstance(self.outcome_variable, str):
                raise TypeError("Expected 'outcome_variable' to be of type 'str'.")
            if not isinstance(self.outcome_target_val, str):
                raise TypeError("Expected 'outcome_target_val' to be of type 'str'.")
            if not isinstance(self.outcome_other_val, str):
                raise TypeError("Expected 'outcome_other_val' to be of type 'str'.")
            if not isinstance(self.grpers, dict):
                raise TypeError("Expected 'grpers' to be of type 'dict'.")

        except KeyError as e:
            raise KeyError(f"Missing key '{e.args[0]}' in the config file. "
                           "Please ensure the config file contains all required keys under the 'Ingest' section: "
                           "'filepath', 'group_variable', 'group_target_val', 'group_other_val', "
                           "'outcome_variable', 'outcome_target_val', 'outcome_other_val', and 'grpers'.")

        except TypeError as e:
            raise TypeError(f"Config file error: {e}")
        
    def run_load(
        self
    ) -> DataFrame:
        
        """
        Loads csv file. Assumes headers are row 0.
        
        :param None:
        :return DataFrame:
        """
        
        filepath = self.filepath
        
        try:
            return pd.read_csv(filepath, skiprows=0)

        except FileNotFoundError:
            raise FileNotFoundError(
                f"The file at {filepath} was not found. Please check the file path."
            )

        except pd.errors.EmptyDataError:
            raise ValueError(
                f"The file at {filepath} is empty and cannot be loaded."
            )

        except pd.errors.ParserError:
            raise ValueError(
                f"The file at {filepath} contains malformed data and could not be parsed as a valid CSV."
            )

        except PermissionError:
            raise PermissionError(
                f"Permission denied when attempting to read the file at {filepath}."
                f"Please check the file permissions."
            )

        except Exception as e:
            raise Exception(
                f"An unexpected error occurred while loading the file: {str(e)}"
            )
        
    def run_harmonize(
        self,
        df: DataFrame
    ) -> DataFrame:        

        """
        Function to harmonize the dataset.
        
        :param df: 
            DataFrame, loaded df
        :return df:
            DataFrame, filtered down to target and other group and
            harmonize the fields
        """
    
        group_variable = self.group_variable
        group_target_val = self.group_target_val
        group_other_val = self.group_other_val
        outcome_variable = self.outcome_variable
        outcome_target_val = self.outcome_target_val
        outcome_other_val = self.outcome_other_val
        grpers = self.grpers

        df = self._apply_filters(
            df=df,
            group_variable=group_variable,
            group_target_val=group_target_val,
            group_other_val=group_other_val,
            grpers=grpers
        )
        
        df = self._apply_harmonize(
            df=df,
            group_variable=group_variable,
            group_target_val=group_target_val,
            group_other_val=group_other_val,
            outcome_variable=outcome_variable,
            outcome_target_val=outcome_target_val,
            outcome_other_val=outcome_other_val
        )
      
        return df
    
    def _apply_filters(
        self,
        df: DataFrame,
        group_variable: str,
        group_target_val: str,
        group_other_val: str,
        grpers: Dict[str,str],
    ) -> DataFrame:
        
        """
        Method to apply filters
        
        :param df:
            DataFrame, target df
        :param group_variable:
            str, column name of the
            target variable
        :param group_target_val:
            str, class target value of the group_variable
            aka the protected class value
        :param group_other_val:
            str, class nontarget value of the group_variable
            aka the nonprotected class value
        :return df:
            DataFrame, filtered df
        """
                
        df = df.loc[
            df[group_variable].isin(
                [
                    group_target_val, 
                    group_other_val
                ]
            )
        ]
        
        for k, v in grpers.items():
        
            df = df.loc[
                df[k].isin([v])
            ]  
            
        return df
    
    def _apply_harmonize(
        self,
        df: DataFrame,
        group_variable: str,
        group_target_val: str,
        group_other_val: str,
        outcome_variable: str,
        outcome_target_val: str,
        outcome_other_val: str
    ) -> DataFrame:
        
        """
        Method to harmonize targets.
        
        :param df:
            DataFrame, target df
        :param group_variable:
            str, column name of the
            target variable
        :param group_target_val:
            str, class target value of the group_variable
            aka the protected class value
        :param group_other_val:
            str, class nontarget value of the group_variable
            aka the nonprotected class value     
        :param outcome_variable:
            str, the column name of the outcome 
        :param outcome_target_val:
            str, class target value of the outcome_variable
            aka success
        :param outcome_other_val:
            str, class nontarget value of the outcome_variable
        :return df:
            DataFrame, target df
        """
        
        # harmonize the group target
        df['group_var_clean'] = np.where(
            df[group_variable]==group_target_val, 
            1,
            np.where(
                df[group_variable]==group_other_val, 
                0, 
                -1
            )
        )
        
        # harmonize the outcome target
        df['outcome_var_clean'] = np.where(
            df[outcome_variable]==outcome_target_val, 
            1,  
            np.where(
                df[self.outcome_variable]==outcome_other_val,
                0, 
                -1
            )
        )  
        
        return df
            
class Transform:
    
    """
    Class to transform dataframe inputs into 
    2x2 contingency table.
    """
    
    def __init__(
        self, 
        df: DataFrame
    ) -> None:
        
        """
        :param df:
            DataFrame, input df
        """
    
        self.df = df
    
    def run_build_cont_table(
        self
    ) -> List[int]:
        
        """
        Function to generate contingency table format.
        
        Places the target group val in the top row and the
        target group other to the bottom row.
        
        Places no-success outcome on the first column and success
        on the second column.
        
        :return tbl:
            List[int], filtered down to target and other group.
        """
        
        df = self.df
        
        cols = [
            'group_var_clean', 
            'outcome_var_clean'
        ]
        
        df = df[cols]
        
        tbl = (
            df.pivot_table(
                index='group_var_clean',
                columns='outcome_var_clean', 
                aggfunc=len
            ).
            sort_index(
                axis=1, 
                ascending=True
            ).
            sort_index(ascending=False). # ensure always [1,0]
            values.tolist()
        ) 
                    
        return tbl
        
class StatsTesting2x2Cont:
    
    """
    Class to perform 2x2 Contigency Table analysis
    with Chi2 and Phi Correlation Coefficent Testing.

    Provides context into potential association between
    variables and the strength of the association.
    """
    
    def __init__(
        self,
        config: Dict[Any, Any],
        tbl: List[int],
        df: DataFrame
    ) -> None:
        
        """
        Inits the class variables and unpacks the
        config variables.
        
        :param config:
            Dict[str,Any], loaded config file.
        :param tbl:
            List[int], 2x2 cont table.
        :param df:
            DataFrame, original input DataFrame.
        """
        
        self.config = config
        self.tbl = tbl
        self.df = df

        self.unpack_config()

    def run_testing(
        self
    ) -> DataFrame:
        
        """
        Run function for the class.
        
        Runs hypothesis evaluation and builds
        the output report DataFrame.
        
        :param None:
        :return df_results:
            DataFrame, with testing results.
        """
        
        alpha = self.alpha
        tbl = self.tbl
        process = self.process
        group_variable = self.group_variable
        group_target_val = self.group_target_val
        group_other_val = self.group_other_val
        bin_edges = self.bin_edges
        bin_labels = self.bin_labels
                
        res = self.gen_hypothesis_eval(tbl)

        df_results = self.run_report_bld(
            alpha=alpha,
            res=res,
            tbl=tbl,
            process=process,
            group_variable=group_variable,
            group_target_val=group_target_val,
            group_other_val=group_other_val,
            bin_edges=bin_edges,
            bin_labels=bin_labels
        )
        
        return df_results
    
    def unpack_config(
        self
    ) -> None:
        
        """
        Function to unpack config variables.
        
        :param None:
        :return None:
        """
        
        config = self.config

        try:
            self.alpha: float = config["StatsTesting2x2Cont"]["alpha"]
            self.group_variable: str = config["Ingest"]["group_variable"]
            self.group_target_val: str = config["Ingest"]["group_target_val"]
            self.group_other_val: str = config["Ingest"]["group_other_val"]
            self.outcome_variable: str = config["Ingest"]["outcome_variable"]
            self.outcome_target_val: str = config["Ingest"]["outcome_target_val"]
            self.outcome_other_val: str = config["Ingest"]["outcome_other_val"]
            self.grpers: Dict[str, str] = config["Ingest"]["grpers"]
            self.testing: str = config["StatsTesting2x2Cont"]["testing"]
            self.process: str = config["StatsTesting2x2Cont"]["process"]
            self.bin_edges: List[float] = config["StatsTesting2x2Cont"]["phi_bin_edges"]
            self.bin_labels: List[str] = config["StatsTesting2x2Cont"]["phi_bin_labels"]

            if not isinstance(self.alpha, float):
                raise TypeError("Expected 'alpha' to be of type 'float'.")
            if not isinstance(self.group_variable, str):
                raise TypeError("Expected 'group_variable' to be of type 'str'.")
            if not isinstance(self.group_target_val, str):
                raise TypeError("Expected 'group_target_val' to be of type 'str'.")
            if not isinstance(self.group_other_val, str):
                raise TypeError("Expected 'group_other_val' to be of type 'str'.")
            if not isinstance(self.outcome_variable, str):
                raise TypeError("Expected 'outcome_variable' to be of type 'str'.")
            if not isinstance(self.outcome_target_val, str):
                raise TypeError("Expected 'outcome_target_val' to be of type 'str'.")
            if not isinstance(self.outcome_other_val, str):
                raise TypeError("Expected 'outcome_other_val' to be of type 'str'.")
            if not isinstance(self.grpers, dict):
                raise TypeError("Expected 'grpers' to be of type 'dict'.")
            if not isinstance(self.testing, str):
                raise TypeError("Expected 'testing' to be of type 'str'.")
            if not isinstance(self.process, str):
                raise TypeError("Expected 'process' to be of type 'str'.")
            if not isinstance(
                self.bin_edges, list
            ) or not all(
                isinstance(
                    i, (int, float)
                ) for i in self.bin_edges
            ):
                raise TypeError("Expected 'bin_edges' to be a list of floats.")
            if not isinstance(
                self.bin_labels, list
            ) or not all(
                isinstance(i, str) for i in self.bin_labels
            ):
                raise TypeError("Expected 'bin_labels' to be a list of strings.")
        
        except KeyError as e:
            raise KeyError(
                f"Missing key '{e.args[0]}' in the config file. "
                f"Ensure all required keys are present in the 'Ingest' and 'StatsTesting2x2Cont' sections."
            )

        except TypeError as e:
            raise TypeError(f"Config file error: {e}")

        
    def gen_hypothesis_eval(
        self,
        tbl: List[int]
    ) -> chi2_contingency:
        
        """
        Function to generate the chi2_contigency
        statistic and result.
        """
        
        #size = np.shape(tbl)
        #tbl_len = len(tbl)
        
        res = chi2_contingency(
            tbl
        )
            
        return res
        
    def run_report_bld(
        self,
        alpha: float,
        res: chi2_contingency,
        tbl: List[int],
        process: str,
        group_variable: str,
        group_target_val: str,
        group_other_val: str,
        bin_edges: List[float],
        bin_labels: List[str]
    ) -> DataFrame:
        
        """
        Runs report for statistical testing
        chi2_contingency results
        
        :param alpha:
            float, alpha value for significance evaluation.
        :param res:
            chi2_contingency, result of the chi2_contingency.
        :param tbl:
            List[int], the contingency table.
        :param process: 
            str, the name of the business process
            being tested, e.g. 'hiring'.
        :param group_variable:
            str, column name of the
            target variable.
        :param group_target_val:
            str, class target value of the group_variable
            aka the protected class value.
        :param group_other_val:
            str, class nontarget value of the group_variable
            aka the nonprotected class value.  
        :param bin_edges:
            List[float], edges for phi
            bins.
        :param bin_labels:
            List[str], labels for the phi
            bins.
        :return df:
            DataFrame, target
        """
        
        pvalue = res[1]
        
        df = pd.DataFrame()

        df = self._gen_significance_test(
            df=df,
            pvalue=pvalue,
            alpha=alpha
        )
        
        (
            df,
            A,
            B,
            C,
            D,
            total_target_grp,
            total_non_target_grp,
            diagonals,
            percent_target_succ,
            percent_non_target_succ,
            phi_numerator,
            phi_denominator
        ) = self._gen_table_calcs(
                df=df,
                tbl=tbl,
        )
        
        if res[1] <= alpha:
            df, phi_result = self._gen_phi_coefficient(
                df=df,
                tbl=tbl,
                bin_edges=bin_edges,
                bin_labels=bin_labels,
                process=process,
                group_variable=group_variable,
                group_target_val=group_target_val,
                group_other_val=group_other_val,
                diagonals=diagonals,
                numerator=phi_numerator,
                denominator=phi_denominator,
                percent_target_succ=percent_non_target_succ,
                percent_non_target_succ=percent_non_target_succ,
            )
            
        else:
            df['phi_corr_coeff'] = np.nan
            df['phi_bins'] = np.nan
            
            phi_result = ""
        
        df = self._gen_four_fifths_test(
            df,
            percent_target_succ=percent_non_target_succ,
            percent_non_target_succ=percent_non_target_succ
        )
                
        df = self._gen_outcome_meta(
            df,
            round(res[1],3),
            phi_result
        )
        
        df = self._gen_unpack_stats(
            df,
            res
        )
        
        return df
        
    def _gen_unpack_stats(
        self,
        df: DataFrame,
        res: chi2_contingency
    ) -> DataFrame:
        
        """
        Method to unpack test stats from
        chi2_contingency results.
        
        :param df:
            DataFrame, output df.
        :param res:
            chi2_contingency, results array.
        :return df:
            DataFrame, output df.
        """
        
        group_target_val = self.group_target_val
        group_other_val = self.group_other_val
        rows = [group_target_val] + [group_other_val]
    
        df['statistic'] = res[0]
        df['pvalue'] = res[1]
        df['dof'] = res[2]
        df['tbl_row'] = [rows]
        df['tbl'] = [tbl]
        df['expected_freq'] = [res[3]]
        df['tbl_expected_diff'] = [tbl - res[3]]
        
        return df
    
    def _gen_significance_test(
        self,
        df: DataFrame,
        pvalue: float,
        alpha: float
    ):
        """
        Method to report on test significance.
        
        :param df:
            DataFrame, results df.
        :param pval:
            int, pvalue.
        :param alpha:
            float, the alpha value for testing eval.
        :return df:
            DataFrame with metadata added.     
        """
            
        if pvalue <= alpha:
            val = 'statistically significant result'
            
        else:
            val = 'no statistically significant result'
                    
        df['test_result'] = [val]
        
        return df
    
    def _gen_phi_coefficient(
        self,
        df: DataFrame,
        tbl: List[int],
        process: str,
        group_variable: str,
        group_target_val: str,
        group_other_val: str,
        bin_edges: List[float],
        bin_labels: List[str],
        diagonals: List[float],
        numerator: float,
        denominator: float,
        percent_target_succ: float,
        percent_non_target_succ: float,
    ) -> DataFrame:
        
        """
        Method to generate the phi coefficient.
        
        :param df:
            DataFrame, the results df.
        :param tbl:
            List[int], the 2x2 cont table.
        :param process: 
            str, the name of the business process
            being tested, e.g. 'hiring'.
        :param group_variable:
            str, column name of the
            target variable.
        :param group_target_val:
            str, class target value of the group_variable
            aka the protected class value.
        :param group_other_val:
            str, class nontarget value of the group_variable
            aka the nonprotected class value.  
        :param bin_edges:
            List[float], edges for phi
            bins.
        :param bin_labels:
            List[str], lab
        :return df:
            DataFrame, output df.
        """
        phi = numerator / denominator if denominator != 0 else 0

        df['phi_corr_coeff'] = phi
                
        df = self._gen_prep_phi_bins(
            df=df,
            bin_edges=bin_edges,
            bin_labels=bin_labels
        )

        df, phi_result = self._gen_prep_diagonals(
            df=df,
            diagonals=diagonals,
            process=process,
            group_variable=group_variable,
            group_other_val=group_other_val,
            group_target_val=group_target_val,
            percent_non_target_succ=percent_non_target_succ,
            percent_target_succ=percent_target_succ,
        )
        
        return df, phi_result
    
    def _gen_table_calcs(
        self,
        df: DataFrame,
        tbl: List[int]
    ) -> Tuple[
        DataFrame, float, float, float, float,
        float, float, float, float, 
        float, float, float
    ]:
        
        """
        Method to generate phi bins. Provides additional
        explainability on the magnitude of association, when 
        an association is found.
        
        :param df:
            DataFrame, output df.
        :param tbl:
            List[int], 2x2 contingency.
        :return [
            df, A, B, C, D, total_target_grp,
            total_non_target_grp, diagonals,
            percent_target_succ, percent_non_target_succ,
            phi_numerator, phi_denominator
        ]:
            Tuple[DataFrame, float, float, float, float,
        float, float, float, float, 
        float, float, float
        ]
        """
        
        # females, males; no succ, succ
        A, B = tbl[0] 
        C, D = tbl[1]
        
        total_target_grp = A + B
        total_non_target_grp = C + D
        diagonals = (A + D) > (B + C)
        percent_target_succ = (B / total_target_grp) * 100
        percent_non_target_succ = (D / total_non_target_grp) * 100
        phi_numerator = (A * D) - (B * C)
        phi_denominator = np.sqrt((A + B) * (C + D) * (A + C) * (B + D))      
           
        return (
            df,
            A,
            B,
            C,
            D,
            total_target_grp,
            total_non_target_grp,
            diagonals,
            percent_target_succ,
            percent_non_target_succ,
            phi_numerator,
            phi_denominator
        )
    
    def _gen_prep_phi_bins(
        self,
        df: DataFrame,
        bin_edges: List[float],
        bin_labels: List[str]
    ) -> DataFrame:
        
        """
        Method to generate pandas bins for 
        phi coeff.
        
        :param df:
            DataFrame, output df.
        :param bin_edges:
            List[float], edges for phi
            bins.
        :param bin_labels:
            List[str], labels for the phi
            bins.
        :return df:
            DataFrame, output df.
        """
    
        df['phi_bins'] = pd.cut(
            df['phi_corr_coeff'], 
            bins=bin_edges, 
            labels=bin_labels, 
            include_lowest=True
        )
        
        return df
    
    def _gen_four_fifths_test(
        self,
        df: DataFrame,
        percent_target_succ: float,
        percent_non_target_succ: float
    ) -> DataFrame:
        
        ratio = percent_target_succ / percent_non_target_succ
        
        if ratio < .8:
            ratio_desc = f'failed with 4/5 test at {round(ratio,3)}'
        elif ratio >= .8:
            ratio_desc = f'passed with 4/5 test at {round(ratio,3)}'
        else:
            ratio_desc = 'error calculating 4/5 test'
        
        df['four_fifths_test'] = ratio_desc
        return df
    
    def _gen_prep_diagonals(
        self,
        df: DataFrame,
        diagonals: bool,
        process: str,
        group_variable: str,
        group_other_val: str,
        group_target_val: str,
        percent_non_target_succ: float,
        percent_target_succ: float,
    ) -> Tuple[DataFrame, str]:
        
        """
        Method to generate the magnitude of the
        assocation using phi coefficient analysis.
        
        :param df:
            DataFrame, output df.
        :param diagonals:
            bool,
        :param process: 
            str, the name of the business process
            being tested, e.g. 'hiring'.
        :param group_variable:
            str, column name of the
            target variable.
        :param group_target_val:
            str, class target value of the group_variable
            aka the protected class value.
        :param group_other_val:
            str, class nontarget value of the group_variable
            aka the nonprotected class value.   
        :param percent_non_target_succ:
            float, the success percentage attained
            for the the non-target group.
        :param percent_target_succ:
            float, the success percentage attained for the
            target class.
        :return (df, phi_col):
            Tuple[df, phi_col]
        """
        
        phi_bin = df['phi_bins'].values[0]    
        phi_corr_coeff = df['phi_corr_coeff'].values[0]    

        if diagonals:
            diagonal_msg = (
                f"The values on the positive diagonal of the 'tbl' indicate the distribution of {process} success across {group_variable} categories."
                f" {group_other_val} had a higher proportion of successful outcomes compared to {group_target_val}."
                f" Specifically, {percent_non_target_succ:.1f}% of {group_other_val} had success while only {percent_target_succ:.1f}%"
                f" of {group_target_val} had success."
                f" This significant difference in {process} success rates suggests a potential {group_variable} bias, with {group_other_val} success in {process}"
                f" at a higher rate than {group_target_val}."
            )
            
        else:
            diagonal_msg = "the diagonal values are not substantially higher, suggesting the relationship might be more nuanced."
        
        phi_col = f"The phi correlation coefficient is {phi_corr_coeff:.3f}, indicating a {phi_bin} effect size. {diagonal_msg}"
            
        return df, phi_col
    
    def _gen_outcome_meta(
        self,
        df: DataFrame,
        pval: float,
        phi_result: str
    ) -> DataFrame:
        
        """
        Method to generate meta data for 
        reporting dataframe
        
        :param df:
            DataFrame, results df
        :param pval:
            int, pvalue
        :param phi_result:
            str, result of phi testing.
        :return df:
            DataFrame with metadata added
        """
        
        grpers = self.grpers
        result = df['test_result'].values[0]
        phi_col = df['phi_corr_coeff'].values[0]
        testing = self.testing
        process = self.process
        group_target_val = self.group_target_val
        alpha = self.alpha
        four_fifths = df['four_fifths_test'].values[0]
        
        col = f"Testing for {grpers}, {four_fifths}. Based on the results of the chi-square test of independence, there is {result} for {testing}-based {process} discrimination against {group_target_val} at the chosen significance level of {alpha}."

        if result == "statistically significant result":
            col = f"{col} {phi_result}"
                
        df['result_desc'] = col
        
        return df
        
# pipeline

ingestObj = Ingest(config)
df = ingestObj.run()

transObj = Transform(
    df.copy()
)
tbl = transObj.run_build_cont_table()

statsObj = StatsTesting2x2Cont(
    config,
    tbl,
    df.copy() # need to add some more context in plain text
)
df_result = statsObj.run_testing()

df_result['result_desc'].tolist()

["Testing for {'job_title': 'analyst'}, passed with 4/5 test at 1.0. Based on the results of the chi-square test of independence, there is statistically significant result for gender-based hiring discrimination against Female at the chosen significance level of 0.05. The phi correlation coefficient is 0.397, indicating a moderate effect size. The values on the positive diagonal of the 'tbl' indicate the distribution of hiring success across gen categories. Male had a higher proportion of successful outcomes compared to Female. Specifically, 54.5% of Male had success while only 54.5% of Female had success. This significant difference in hiring success rates suggests a potential gen bias, with Male success in hiring at a higher rate than Female."]

In [1]:
import yaml
import model

with open('config.yaml') as f:
    config = yaml.safe_load(f)
    
model = model.Model(config)

df_prep, tbl = model.prep()

df_result = model.analysis(df_prep.copy(), tbl)

In [3]:
df_result

Unnamed: 0,test_result,phi_corr_coeff,phi_bins,four_fifths_test,result_desc,statistic,pvalue,dof,tbl_rows,tbl_cols,tbl,expected_freq,tbl_expected_diff
0,statistically significant result,0.39736,moderate,passed with 4/5 test at 1.0,"Testing for {'job_title': 'analyst'}, passed w...",5.218246,0.022351,1,"[Female, Male]","[hired, not_hired]","[[10, 1], [15, 18]]","[[6.25, 4.75], [18.75, 14.25]]","[[3.75, -3.75], [-3.75, 3.75]]"


In [24]:
df_result

Unnamed: 0,test_result,phi_corr_coeff,phi_bins,four_fifths_test,result_desc,statistic,pvalue,dof,tbl_row,tbl,expected_freq,tbl_expected_diff
0,statistically significant result,0.39736,moderate,failed 4/5 test at 0.167,"Testing for {'job_title': 'analyst'}, based on...",5.218246,0.022351,1,"[Female, Male]","[[10, 1], [15, 18]]","[[6.25, 4.75], [18.75, 14.25]]","[[3.75, -3.75], [-3.75, 3.75]]"


to do:
    
implement these tests


https://en.wikipedia.org/wiki/Disparate_impact

Add handler for filtered size of group must be ...

# need to check this size
# https://online.stat.psu.edu/stat500/lesson/8/8.2#:~:text=That%20equates%20to%20the%20Chi,count%20of%20at%20least%205.

# make sure at least 5 in each slice, then at least 50