In [2]:
import dask.dataframe

In [3]:
def filter_nulls(data: "dask.dataframe.DataFrame", nulls_threshold: float):
    """Drop the columns whose share of null values exceeds a threshold.

    Args:
        data: Frame to filter. A dask DataFrame is expected; an eager pandas
            DataFrame is accepted as well.
        nulls_threshold: Maximum tolerated proportion of nulls. A column is
            removed only when its proportion is strictly greater than this.

    Returns:
        A tuple ``(filtered, summary_df)``. ``filtered`` is ``data`` without
        the removed columns. ``summary_df`` is indexed by column name, sorted
        by descending null count, and has the columns ``Volume`` (null
        count), ``Proportions`` (null count divided by the row count) and
        ``Removed`` (1.0 when the column was dropped, 0.0 otherwise).
    """
    null_counts = data.isnull().sum()
    # Lazy results expose a callable ``compute``; eager results are already
    # materialized. ``callable`` guards against a column named "compute".
    compute = getattr(null_counts, "compute", None)
    if callable(compute):
        null_counts = compute()

    summary_df = null_counts.to_frame(name="Volume")
    # len() gives the row count for both lazy and eager frames. With zero
    # rows the proportions are NaN, which never exceed the threshold.
    summary_df["Proportions"] = summary_df["Volume"] / len(data)
    summary_df = summary_df.sort_values(by="Volume", ascending=False)

    mask_nulls = summary_df["Proportions"] > nulls_threshold
    # Build the flag from the mask in one assignment: masked scalar
    # assignment into a new column fails when the summary has no rows.
    summary_df["Removed"] = mask_nulls.astype(float)

    removed_cols = summary_df.index[mask_nulls.to_numpy()].tolist()

    return data.drop(columns=removed_cols), summary_df

In [4]:
def filter_numerical_variance(data: "dask.dataframe.DataFrame", variance_thresholds: list):
    """Drop the numeric columns whose spread falls outside a given range.

    Despite the name, the thresholds are compared with the ``std`` statistic
    reported by ``describe()`` (the standard deviation), not the variance.

    Args:
        data: Frame to filter. A dask DataFrame is expected; an eager pandas
            DataFrame is accepted as well.
        variance_thresholds: Values convertible to ``float``. The smallest
            and the largest define the inclusive range of accepted ``std``
            values; their order in the list does not matter.

    Returns:
        A tuple ``(filtered, summary_df)``. ``filtered`` is ``data`` without
        the removed columns (non-numeric columns are never removed).
        ``summary_df`` holds one row per numeric column, sorted by
        ``column_name``, with the ``describe()`` statistics and a ``Removed``
        flag (1.0 when the column was dropped, 0.0 otherwise). It is empty
        when ``data`` has no numeric columns.

    Raises:
        ValueError: If ``variance_thresholds`` is empty.
    """
    import pandas as pd  # local import: needed only for the empty summary

    thresholds = [float(value) for value in variance_thresholds]
    if not thresholds:
        raise ValueError("variance_thresholds must contain at least one value")
    lower, upper = min(thresholds), max(thresholds)

    # The "number" selector avoids depending on a module-level numpy import.
    numeric = data.select_dtypes(include="number")
    if len(numeric.columns) == 0:
        # describe() cannot run on a frame without columns; nothing to filter.
        empty_summary = pd.DataFrame(
            columns=[
                "column_name", "count", "mean", "std", "min",
                "25%", "50%", "75%", "max", "Removed",
            ]
        )
        return data.drop(columns=[]), empty_summary

    summary_df = numeric.describe()
    # Lazy results expose a callable ``compute``; eager results are already
    # materialized. ``callable`` guards against a column named "compute".
    compute = getattr(summary_df, "compute", None)
    if callable(compute):
        summary_df = compute()

    # Name the index before turning it into a column, so the result does not
    # depend on whether the columns index already carried a name.
    summary_df = summary_df.T.rename_axis("column_name").reset_index()
    summary_df = summary_df.sort_values(by="column_name")

    # NaN std values (e.g. a single-row column) are outside any range.
    mask_variance = summary_df["std"].between(lower, upper)
    removed_cols = summary_df.loc[~mask_variance, "column_name"].tolist()

    # Flag by name so the summary agrees with what drop() removes below.
    mask_removed = summary_df["column_name"].isin(removed_cols)
    summary_df["Removed"] = mask_removed.astype(float)

    return data.drop(columns=removed_cols), summary_df

In [5]:
def filter_categorical_variance(data: "dask.dataframe.DataFrame", variance_thresholds: list):
    """Drop the object columns whose share of distinct values is out of range.

    For every column selected with ``include=["object"]`` the ratio
    ``unique / count`` from ``describe()`` is computed; columns whose ratio
    lies outside the inclusive threshold range are removed.

    NOTE(review): only columns with the ``object`` dtype are considered; text
    stored under a dedicated string dtype may not be selected - confirm this
    is intended.

    Args:
        data: Frame to filter. A dask DataFrame is expected; an eager pandas
            DataFrame is accepted as well.
        variance_thresholds: Values convertible to ``float``. The smallest
            and the largest define the inclusive range of accepted
            ``unique_proportion`` values; their order does not matter.

    Returns:
        A tuple ``(filtered, summary_df)``. ``filtered`` is ``data`` without
        the removed columns. ``summary_df`` holds one row per object column,
        sorted by descending ``unique_proportion`` (ties by ``column_name``),
        with the ``describe()`` statistics, ``unique_proportion`` and a
        ``Removed`` flag (1.0 when the column was dropped, 0.0 otherwise).
        It is empty when ``data`` has no object columns.

    Raises:
        ValueError: If ``variance_thresholds`` is empty.
    """
    import pandas as pd  # local import: needed only for the empty summary

    thresholds = [float(value) for value in variance_thresholds]
    if not thresholds:
        raise ValueError("variance_thresholds must contain at least one value")
    lower, upper = min(thresholds), max(thresholds)

    # Including only "object" already leaves numeric columns out, so no
    # explicit numeric exclusion is needed.
    categorical = data.select_dtypes(include=["object"])
    if len(categorical.columns) == 0:
        # describe() cannot run on a frame without columns; nothing to filter.
        empty_summary = pd.DataFrame(
            columns=[
                "column_name", "count", "unique", "top", "freq",
                "unique_proportion", "Removed",
            ]
        )
        return data.drop(columns=[]), empty_summary

    summary_df = categorical.describe()
    # Lazy results expose a callable ``compute``; eager results are already
    # materialized. ``callable`` guards against a column named "compute".
    compute = getattr(summary_df, "compute", None)
    if callable(compute):
        summary_df = compute()

    # Name the index before turning it into a column, so the result does not
    # depend on whether the columns index already carried a name.
    summary_df = summary_df.T.rename_axis("column_name").reset_index()

    # The statistics may come back as object dtype; casting to float makes a
    # zero count (all-null column) yield NaN instead of a division error.
    summary_df["unique_proportion"] = (
        summary_df["unique"].astype(float) / summary_df["count"].astype(float)
    )
    # Secondary key keeps the order of equal proportions deterministic.
    summary_df = summary_df.sort_values(
        by=["unique_proportion", "column_name"], ascending=[False, True]
    )

    # NaN proportions are outside any range, so such columns are removed.
    mask_variance = summary_df["unique_proportion"].between(lower, upper)
    removed_cols = summary_df.loc[~mask_variance, "column_name"].tolist()

    # Flag by name so the summary agrees with what drop() removes below.
    mask_removed = summary_df["column_name"].isin(removed_cols)
    summary_df["Removed"] = mask_removed.astype(float)

    return data.drop(columns=removed_cols), summary_df