# IMPORTS

In [80]:
import pandas as pd
import numpy as np

# Fake Data Functions

In [99]:
def get_dataset(size):
    """Build a random demo DataFrame with ``size`` rows.

    The column labels are deliberately untidy (leading space, mixed case,
    embedded space) so that the column-cleaning step has something to fix.

    Parameters
    ----------
    size : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``'size'``, ``'age'``, ``' TeaM'``, ``'win'``,
        ``'Date'`` and ``'prob Uniform'``.
    """
    size_labels = ['big', 'medium', 'small']
    team_labels = ['red', 'blue', 'yellow', 'green']
    outcome_labels = ['yes', 'no']
    # Every calendar day from 2020-01-01 through 2022-12-31 (both inclusive).
    calendar = pd.date_range(start='2020-01-01', end='2022-12-31')

    # Columns are drawn in this exact order so a seeded NumPy global RNG
    # yields the same frame on every run.
    frame = pd.DataFrame()
    frame['size'] = np.random.choice(size_labels, size=size)
    frame['age'] = np.random.randint(1, 50, size=size)  # upper bound is exclusive
    frame[' TeaM'] = np.random.choice(team_labels, size=size)
    frame['win'] = np.random.choice(outcome_labels, size=size)
    frame['Date'] = np.random.choice(calendar, size=size)
    frame['prob Uniform'] = np.random.uniform(0, 1, size=size)
    return frame

def set_dtypes(df):
    """Downcast the fake dataset's columns to compact dtypes, in place.

    Expects the raw column labels produced by ``get_dataset`` (``'size'``,
    ``'age'``, ``' TeaM'``, ``'win'``, ``'prob Uniform'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for chaining.

    Raises
    ------
    KeyError
        If any of the expected columns is missing.
    """
    df['size'] = df['size'].astype('category')
    # Read from the same label that is written: the original looked up
    # 'team' / 'prob', which do not exist in the generated dataset.
    df[' TeaM'] = df[' TeaM'].astype('category')
    df['age'] = df['age'].astype('int16')
    # Values other than 'yes'/'no' map to NaN.
    df['win'] = df['win'].map({'yes': True, 'no': False})
    df['prob Uniform'] = df['prob Uniform'].astype('float32')
    return df

In [100]:
# Generate a one-million-row fake dataset, then preview its first two rows.
df = get_dataset(1_000_000)
df.head(2)

Unnamed: 0,size,age,TeaM,win,Date,prob Uniform
0,small,35,yellow,no,2020-08-01,0.241785
1,medium,46,red,yes,2022-09-08,0.984789


# Pandas Pipelining

### Data Functions

In [105]:
import datetime as dt

def loggg(f):
    """Decorator that prints the run time and result shape of a pipeline step.

    Parameters
    ----------
    f : callable
        Step taking the data as its first positional argument, followed by
        any extra positional or keyword arguments.

    Returns
    -------
    callable
        Wrapper with the same call signature that returns ``f``'s result
        unchanged after printing one log line.
    """
    import functools  # local import so the block does not rely on a new file-level import

    # Preserve f's __name__/__doc__ so decorated steps stay introspectable.
    @functools.wraps(f)
    def wrapper(dataf, *args, **kwargs):
        tic = dt.datetime.now()
        result = f(dataf, *args, **kwargs)
        toc = dt.datetime.now()
        # A step that returns something without ``.shape`` must not fail
        # inside the logging code after it has already succeeded.
        shape = getattr(result, "shape", None)
        print(f"****::{f.__name__}, took = {toc - tic}, shape:{shape}")
        return result

    return wrapper


@loggg
def clean_dataset_columns(dataf):
    """Normalise the column labels of ``dataf`` in place and return it.

    Each label has surrounding whitespace removed, is lower-cased, and has
    its remaining spaces replaced by underscores.
    """
    tidy_labels = []
    for label in dataf:  # iterating a DataFrame yields its column labels
        tidy_labels.append(label.strip().lower().replace(" ", "_"))
    dataf.columns = tidy_labels
    return dataf

@loggg
def remove_outliers(dataf):
  """
  Placeholder pipeline step: no outlier filtering is implemented, so the
  frame is returned unchanged.
  """
  return dataf

@loggg
def start_pipeline(dataf):
    """Return a copy of ``dataf`` so later steps never touch the caller's frame."""
    return dataf.copy()

@loggg
def clean_dataset(dataf):
  """
  Placeholder for dataset-specific cleaning steps.

  Keeping those steps in one piped function lets the pipeline report their
  processing time and keeps the notebook layout simple.  No cleaning is
  implemented yet: the frame is returned unchanged.
  """
  return dataf

@loggg
def missing_data(dataf):
  """
  Placeholder for a missing-data check.

  Intended approach: calculate the max count across columns and compare each
  column's count against that max.  Not implemented yet: the frame is
  returned unchanged.
  """
  return dataf


@loggg
def assess_NA(data):
    """
    Print a per-column summary of missing values, then pass the frame through.

    The printed table is indexed by column name and has two columns:
    ``num_NA`` (count of null values) and ``pct_NA`` (that count as a
    percentage of the row count, rounded to two decimals).

    Parameters
    ----------
    data: dataframe

    Returns
    -------
    The input ``data``, unchanged, so the function can sit in a ``.pipe`` chain.
    """
    # Null count per column, and the same figure as a percentage of all rows.
    missing_counts = data.isna().sum()
    row_count = len(data.index)
    missing_pct = ((missing_counts / row_count) * 100).round(2)

    # Sort each series in descending order, then join them side by side.
    ranked = [
        missing_counts.sort_values(ascending=False),
        missing_pct.sort_values(ascending=False),
    ]
    df_NA = pd.concat(ranked, axis=1, keys=['num_NA', 'pct_NA'])

    # To show only columns that actually have missing data, filter here, e.g.
    # df_NA = df_NA[ (df_NA.T != 0).any() ]

    rule = '-'*35
    print(rule)
    print(df_NA)
    print(rule)
    return data


@loggg
def info_and_describe(dataf):
    """Print ``dataf.info()`` and ``dataf.describe()`` between rule lines.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        The input ``dataf``, unchanged, so the step can sit in a ``.pipe`` chain.
    """
    rule = '-'*35
    print(rule)
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() would add a stray "None" line.
    dataf.info()
    print(rule)
    print(dataf.describe())
    print(rule)
    return dataf

@loggg
def basic_return(dataf):
  """
  Pass-through step: returns the frame unchanged.
  """
  return dataf

In [106]:
# Run the pipeline: each step takes a DataFrame and returns one, so the steps
# chain through .pipe().
clean_df = (df
            .pipe(start_pipeline)           # copy, leaving the original df untouched
            .pipe(clean_dataset_columns)    # normalise the column labels
            .pipe(assess_NA)                # print the missing-value summary
            .pipe(info_and_describe)        # print info() and describe()
            .pipe(basic_return)             # pass-through
            )

****::start_pipeline, took = 0:00:00.197910, shape:(1000000, 6)
****::clean_dataset_columns, took = 0:00:00.000549, shape:(1000000, 6)
-----------------------------------
              num_NA  pct_NA
size               0     0.0
age                0     0.0
team               0     0.0
win                0     0.0
date               0     0.0
prob_uniform       0     0.0
-----------------------------------
****::assess_NA, took = 0:00:00.975006, shape:(1000000, 6)
-----------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   size          1000000 non-null  object        
 1   age           1000000 non-null  int64         
 2   team          1000000 non-null  object        
 3   win           1000000 non-null  object        
 4   date          1000000 non-null  datetime64[ns]
 5   prob_uniform  1