In [36]:
import dask.dataframe
import dask.dataframe as dd
import pandas as pd
import numpy as np
from math import log, e
# `pandas.util.testing` is a deprecated helper module that recent pandas
# releases no longer provide; only adjust its module-level N / K settings when
# the module is actually available so this cell does not fail on import.
_pd_testing = getattr(pd.util, "testing", None)
if _pd_testing is not None:
    _pd_testing.N = 100
    _pd_testing.K = 50

# Generate Data

In [136]:

# Build the small mixed-dtype demo frame explicitly (two float columns, one
# string column, one business-day datetime column) instead of relying on the
# deprecated `pd.util.testing.makeMixedDataFrame()` helper.
mixed_pdf = pd.DataFrame(
    {
        "A": [0.0, 1.0, 2.0, 3.0, 4.0],
        "B": [0.0, 1.0, 0.0, 1.0, 0.0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": pd.bdate_range("1/1/2009", periods=5),
    }
)

# Self-merge on the index so every column appears twice (suffixes _x / _y).
merged_pdf = mixed_pdf.merge(mixed_pdf, left_index=True, right_index=True)

# Wrap the pandas frame in a single-partition dask DataFrame for the filters.
df = dd.from_pandas(merged_pdf, npartitions=1)

In [77]:
def entropy(labels, base=None):
    """Compute the Shannon entropy of a label distribution.

    Args:
        labels: Sized collection of labels (anything accepted by both ``len``
            and ``np.unique``).
        base: Logarithm base. ``None`` (the default) uses the natural
            logarithm.

    Returns:
        The entropy as a float. ``0.0`` is returned when there is at most one
        label or only a single distinct label.

    References:
        [1] https://stackoverflow.com/questions/15450192/fastest-way-to-compute-entropy-in-python
    """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0.0

    # Only the counts matter; every count is >= 1, so every probability is > 0.
    _, counts = np.unique(labels, return_counts=True)
    if len(counts) <= 1:
        return 0.0

    probs = counts / n_labels
    log_base = e if base is None else base

    # H = -sum(p * log_base(p)) over the distinct labels.
    return float(-sum(p * log(p, log_base) for p in probs))

# Filter Nulls

In [182]:
def filter_nulls(data: "dd.DataFrame", nulls_threshold: float) -> "tuple[dd.DataFrame, pd.DataFrame]":
    """Drop columns whose proportion of null values exceeds a threshold.

    Args:
        data: dask DataFrame to filter.
        nulls_threshold: Columns whose null proportion is strictly greater
            than this value are removed.

    Returns:
        A ``(filtered, summary)`` tuple: ``filtered`` is ``data`` without the
        removed columns; ``summary`` is a pandas DataFrame indexed by column
        name with ``nulls_count``, ``nulls_proportions`` and ``filtered_nulls``
        (1.0 if the column was removed, else 0.0), sorted by ``nulls_count``
        in descending order.
    """
    nulls_count = data.isnull().sum().compute()
    n_rows = data.shape[0].compute()

    summary_df = nulls_count.to_frame(name="nulls_count")
    summary_df["nulls_proportions"] = summary_df["nulls_count"] / n_rows
    summary_df = summary_df.sort_values(by="nulls_count", ascending=False)

    mask_nulls = summary_df["nulls_proportions"] > nulls_threshold
    # Float flag (1.0 / 0.0) so the column dtype does not depend on which
    # rows happen to be flagged.
    summary_df["filtered_nulls"] = mask_nulls.astype(float)

    removed_cols = summary_df.index[mask_nulls.to_numpy()].tolist()

    return data.drop(labels=removed_cols, axis=1), summary_df

In [183]:
# Start from `df`, the frame built above: `data` is first assigned by this very
# line, so passing it in only worked on a kernel that still held stale state.
data, summary = filter_nulls(df, 0.75)

In [184]:
# Display the per-column null summary (count, proportion, filtered flag).
summary

Unnamed: 0,nulls_count,nulls_proportions,filtered_nulls
A_x,0,0.0,0.0
B_x,0,0.0,0.0
C_x,0,0.0,0.0
D_x,0,0.0,0.0
A_y,0,0.0,0.0
B_y,0,0.0,0.0
C_y,0,0.0,0.0
D_y,0,0.0,0.0


# Numerical Variance

In [173]:
def filter_numerical_variance(
    data: "dd.DataFrame",
    variance_thresholds: list=[0, np.inf],
    inclusive: bool=False,
) -> "tuple[dd.DataFrame, pd.DataFrame]":
    """Drop numeric columns whose spread falls outside a threshold range.

    Despite the parameter name, the range is compared against the ``std``
    (standard deviation) row reported by ``describe()``.

    Args:
        data: dask DataFrame to filter.
        variance_thresholds: Two bounds (any order) for the standard
            deviation. The default list is never mutated.
        inclusive: ``True`` keeps columns sitting exactly on a bound, ``False``
            removes them. The pandas strings ``"both"``, ``"neither"``,
            ``"left"`` and ``"right"`` are accepted as well.

    Returns:
        A ``(filtered, summary)`` tuple: ``filtered`` is ``data`` without the
        removed columns; ``summary`` is the transposed ``describe()`` output for
        the numeric columns, indexed by ``column_name`` (sorted), plus a
        ``filtered_variance`` flag (1.0 if removed, else 0.0).

    Raises:
        ValueError: If ``inclusive`` is a string other than the four above.
    """
    summary_df = (
        data.select_dtypes(include=[np.number])
        .describe()
        .compute()
        .T.reset_index()
        .rename(columns={"index": "column_name"})
        .sort_values(by="column_name")
    )

    thresholds = [float(value) for value in variance_thresholds]
    lower, upper = min(thresholds), max(thresholds)

    # Compare explicitly rather than via Series.between(inclusive=<bool>):
    # boolean `inclusive` is deprecated/rejected by newer pandas releases.
    if isinstance(inclusive, str):
        if inclusive not in ("both", "neither", "left", "right"):
            raise ValueError(
                "inclusive must be a bool or one of 'both', 'neither', 'left', 'right'"
            )
        include_lower = inclusive in ("both", "left")
        include_upper = inclusive in ("both", "right")
    else:
        include_lower = include_upper = bool(inclusive)

    std = summary_df["std"]
    above_lower = (std >= lower) if include_lower else (std > lower)
    below_upper = (std <= upper) if include_upper else (std < upper)
    # NaN std compares False on both sides, so such columns are removed.
    mask_variance = above_lower & below_upper

    removed_cols = list(summary_df.loc[~mask_variance, "column_name"].values)
    summary_df["filtered_variance"] = (~mask_variance).astype(float)

    return data.drop(labels=removed_cols, axis=1), summary_df.set_index("column_name")

In [174]:
# Keep numeric columns whose standard deviation lies strictly between 0 and 100.
# NOTE(review): this runs on `df`, not on the `data` returned by the null
# filter, so the previous stage's result is not carried forward - confirm intent.
data, num_summary = filter_numerical_variance(df, variance_thresholds=[0, 100])

In [175]:
# Display the describe() statistics of the numeric columns plus the filter flag.
num_summary

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max,Removed due to Variance
column_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A_x,5.0,2.0,1.581139,0.0,1.0,2.0,3.0,4.0,0.0
A_y,5.0,2.0,1.581139,0.0,1.0,2.0,3.0,4.0,0.0
B_x,5.0,0.4,0.547723,0.0,0.0,0.0,1.0,1.0,0.0
B_y,5.0,0.4,0.547723,0.0,0.0,0.0,1.0,1.0,0.0


In [176]:
# Attach the numeric statistics to the null summary; the left join keeps every
# column from `summary` and leaves NaN where a column has no numeric statistics.
# NOTE: `summary` is overwritten, so re-running this cell merges the numeric
# columns a second time - run it once per kernel.
summary = pd.merge(
    summary,
    num_summary,
    how="left",
    left_index=True,
    right_index=True,
)

# Categorical Variance (Entropy)

In [177]:
def filter_categorical_variance(
    data: "dd.DataFrame",
    entropy_thresholds: list=[0, np.inf],
    inclusive: bool=False,
) -> "tuple[dd.DataFrame, pd.DataFrame]":
    """Drop object-dtype columns whose entropy falls outside a threshold range.

    Args:
        data: dask DataFrame to filter.
        entropy_thresholds: Two bounds (any order) for the per-column entropy
            as computed by ``entropy``. The default list is never mutated.
        inclusive: ``True`` keeps columns sitting exactly on a bound, ``False``
            removes them. The pandas strings ``"both"``, ``"neither"``,
            ``"left"`` and ``"right"`` are accepted as well.

    Returns:
        A ``(filtered, summary)`` tuple: ``filtered`` is ``data`` without the
        removed columns; ``summary`` is the transposed ``describe()`` output for
        the object columns with an added ``entropy`` column, indexed by
        ``column_name`` (sorted), plus a ``Removed`` flag (1.0 if removed,
        else 0.0).

    Raises:
        ValueError: If ``inclusive`` is a string other than the four above.
    """
    # Build the dtype selection once and reuse it for both computations.
    categorical = data.select_dtypes(exclude=[np.number], include=["object"])

    described = categorical.describe().compute().T

    # Entropy needs the actual values, so the selected columns are materialised.
    entropies = categorical.compute().apply(entropy, axis=0).to_frame(name="entropy")

    summary_df = (
        described.merge(entropies, left_index=True, right_index=True)
        .reset_index()
        .rename(columns={"index": "column_name"})
        .sort_values(by="column_name")
    )

    thresholds = [float(value) for value in entropy_thresholds]
    lower, upper = min(thresholds), max(thresholds)

    # Compare explicitly rather than via Series.between(inclusive=<bool>):
    # boolean `inclusive` is deprecated/rejected by newer pandas releases.
    if isinstance(inclusive, str):
        if inclusive not in ("both", "neither", "left", "right"):
            raise ValueError(
                "inclusive must be a bool or one of 'both', 'neither', 'left', 'right'"
            )
        include_lower = inclusive in ("both", "left")
        include_upper = inclusive in ("both", "right")
    else:
        include_lower = include_upper = bool(inclusive)

    column_entropy = summary_df["entropy"]
    above_lower = (column_entropy >= lower) if include_lower else (column_entropy > lower)
    below_upper = (column_entropy <= upper) if include_upper else (column_entropy < upper)
    mask_entropy = above_lower & below_upper

    removed_cols = list(summary_df.loc[~mask_entropy, "column_name"].values)
    summary_df["Removed"] = (~mask_entropy).astype(float)

    return data.drop(labels=removed_cols, axis=1), summary_df.set_index("column_name")

In [178]:
# Entropy filter with the default thresholds and inclusive setting.
# NOTE(review): like the numeric stage, this runs on `df` rather than on the
# previous stage's `data` - confirm intent.
data, summary_cat = filter_categorical_variance(data=df)

In [179]:
# Display nulls + numeric + categorical summaries side by side (not stored).
# Both describe() outputs contain a `count` column, so pandas suffixes them
# as count_x / count_y in the merged view.
pd.merge(
    summary,
    summary_cat,
    how="left",
    left_index=True,
    right_index=True,
)

Unnamed: 0,Nulls Count,Nulls Proportions,Removed due to Nulls,count_x,mean,std,min,25%,50%,75%,max,Removed due to Variance,unique,count_y,top,freq,entropy,Removed
A_x,0,0.0,0.0,5.0,2.0,1.581139,0.0,1.0,2.0,3.0,4.0,0.0,,,,,,
B_x,0,0.0,0.0,5.0,0.4,0.547723,0.0,0.0,0.0,1.0,1.0,0.0,,,,,,
C_x,0,0.0,0.0,,,,,,,,,,5.0,5.0,foo5,1.0,1.609438,0.0
D_x,0,0.0,0.0,,,,,,,,,,,,,,,
A_y,0,0.0,0.0,5.0,2.0,1.581139,0.0,1.0,2.0,3.0,4.0,0.0,,,,,,
B_y,0,0.0,0.0,5.0,0.4,0.547723,0.0,0.0,0.0,1.0,1.0,0.0,,,,,,
C_y,0,0.0,0.0,,,,,,,,,,5.0,5.0,foo5,1.0,1.609438,0.0
D_y,0,0.0,0.0,,,,,,,,,,,,,,,


In [180]:
# Display the dask frame returned by the most recent filter call; dask shows
# its structure (column dtypes and divisions), not the values.
data

Unnamed: 0_level_0,A_x,B_x,C_x,D_x,A_y,B_y,C_y,D_y
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,float64,float64,object,datetime64[ns],float64,float64,object,datetime64[ns]
4,...,...,...,...,...,...,...,...
