# Machine Learning Pipeline

## Create Data

In [1]:
import os
import yaml
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

from generate_data_module import calc_wts, generate_dates

## Load Configuration Parameters

In [2]:
# Load the pipeline settings from params.yaml (resolved relative to the
# notebook's working directory).
# The encoding is pinned explicitly: without it open() falls back to the
# platform locale, which can mis-decode non-ASCII values on some systems.
# yaml.safe_load is used so the file can only produce plain Python objects.
with open('params.yaml', 'r', encoding='utf-8') as config_file:
    config_params = yaml.safe_load(config_file)

# Name of the label column used when building the synthetic dataset.
target_name = config_params['target_name']

In [4]:
# Create ground truth
#
# Optional reproducibility hook: when params.yaml defines `random_seed`, seed
# Python's `random` module so the sampled labels are repeatable. Seeding here
# also fixes every later `random` draw made in the same kernel session.
# When the key is absent the draw stays unseeded, exactly as before.
if config_params.get('random_seed') is not None:
    random.seed(config_params['random_seed'])

# Binary labels, drawn independently for each row using the configured class
# weights (`target_wts` pairs positionally with `target_vals`).
target_vals = [0, 1]
target_col = random.choices(target_vals,
                            weights=config_params['target_wts'],
                            k=config_params['sample_size'])

gt_df = pd.DataFrame({target_name: target_col})

# Report the requested size next to what was actually produced.
print(f"Ground truth provided sample size: {config_params['sample_size']}")
print(f'Ground truth dataframe shape: {gt_df.shape}')
print(f'Ground truth distribution:\n{gt_df[target_name].value_counts()}')

# Persist the labels on their own; the positional index carries no information.
gt_df.to_parquet(os.path.join(config_params['project_dir'], config_params['data_dir'], 'ground_truth.parquet'), index=False)

Ground truth provided sample size: 10000
Ground truth dataframe shape: (10000, 1)
Ground truth distribution:
target
0    5007
1    4993
Name: count, dtype: int64


In [6]:
# Create Features
sample_df = gt_df.copy()

# Row selectors for each class. The target column is not modified in this
# cell, so the masks and counts are computed once and reused for every feature.
is_positive = sample_df[target_name] == 1
is_negative = sample_df[target_name] == 0
n_positive = int(is_positive.sum())
n_negative = int(is_negative.sum())

# Every single-value pool below except `xrand_vals` contains NaN, and the
# list-type columns emit NaN for empty lists, to simulate missing data.

# single value observations
# Examples include 0, 1, 5, 9, or 0.85
# These simulate numeric features, categories, and booleans stored as numbers
col_names = ['true_false', 'one_hot', 'floats', 'random_col', 'other']

tf_vals = ['true', 'false', np.nan, '1', '0']
onehot_vals = ['red', 'orange', 'yellow', np.nan, 'green', 'blue', 'purple']
float_vals = list(range(0,10)) + [np.nan] + [x/10 for x in range(0, 100, 5)]
drop_vals = [np.nan] + list(range(0,10))
xrand_vals = list(range(5))

# Boolean-like weights: NaN gets a fixed 1% share and the remaining 99% is
# split so the five weights sum to 1. `tf_wts` puts the `tf_high` weight on
# 'true'/'1'; `tf_xwts` mirrors it onto 'false'/'0'.
tf_high = config_params['predictability']/2
tf_low = (1 - config_params['predictability'] - 0.01)/2
tf_wts = [tf_high, tf_low, 0.01, tf_high, tf_low]
tf_xwts = [tf_low, tf_high, 0.01, tf_low, tf_high]

# calc_wts comes from generate_data_module; its first return value is applied
# to target == 1 rows and its second to target == 0 rows below.
# NOTE(review): 'random_col' is built with predictability 0 — presumably to
# make it uninformative; confirm against calc_wts.
onehot_wts, onehot_xwts = calc_wts(onehot_vals, config_params['predictability'])
float_wts, float_xwts = calc_wts(float_vals, config_params['predictability'])
drop_wts, drop_xwts = calc_wts(drop_vals, 0)
xrand_wts, xrand_xwts = calc_wts(xrand_vals, config_params['predictability'])

feature_specs = zip(col_names,
                    [tf_vals, onehot_vals, float_vals, drop_vals, xrand_vals],
                    [tf_wts, onehot_wts, float_wts, drop_wts, xrand_wts],
                    [tf_xwts, onehot_xwts, float_xwts, drop_xwts, xrand_xwts])

# Draw each feature separately per class so its distribution depends on the label.
for col, vals, col_wts, col_xwts in feature_specs:
    sample_df.loc[is_positive, col] = random.choices(vals, col_wts, k=n_positive)
    sample_df.loc[is_negative, col] = random.choices(vals, col_xwts, k=n_negative)

# date observations
# These simulate date features in the YYYY-MM-DD (2022-12-15) date format.
true_dates, false_dates = generate_dates(sample_df, target_name)
sample_df.loc[is_positive, 'dates'] = true_dates
sample_df.loc[is_negative, 'dates'] = false_dates

# multivalue observations
# These simulate list type features such as [red, blue, purple], [1, 2, 3, 4]
nbr_vals = list(range(0,10))
nbr_strs = [str(x) for x in nbr_vals]  # built once instead of once per row
# NOTE(review): 'apple' appears twice, so it is drawn twice as often as the
# other fruits — confirm whether that is intended.
str_vals = ['apple', 'orange', 'grape', 'pineapple', 'strawberry', 'blueberry', 'grapefruit', 'apple']


def _make_list_column(n_rows, choose_pool, max_items=6):
    """Build one list-type feature column as comma-joined strings.

    For every row a length is drawn uniformly from 0..max_items (inclusive).
    Length 0 yields NaN (a missing value); otherwise that many items are
    sampled with replacement from the pool returned by ``choose_pool()``.

    Args:
        n_rows: number of values to generate.
        choose_pool: zero-argument callable returning the list of strings to
            sample from; called once per non-empty row.
        max_items: largest list length that can be drawn.

    Returns:
        A list of ``n_rows`` entries, each a comma-joined string or NaN.
    """
    column = []
    for _ in range(n_rows):
        n_items = random.randint(0, max_items)
        if n_items < 1:
            column.append(np.nan)
            continue
        column.append(','.join(random.choices(choose_pool(), k=n_items)))
    return column


def _fruit_or_digit_pool():
    """Pick the fruit pool or the digit pool for a single row."""
    # NOTE(review): randint(0, 10) has 11 outcomes, so fruits are chosen with
    # probability 5/11 rather than 1/2 — confirm whether that is intended.
    return str_vals if random.randint(0, 10) < 5 else nbr_strs


n_rows = len(sample_df)

# Generation order is kept as nunique -> desc_stats -> max -> multi_label so
# that a seeded run consumes random numbers in a stable sequence.
nunique_col = _make_list_column(n_rows, _fruit_or_digit_pool)
descstat_col = _make_list_column(n_rows, lambda: nbr_strs)
max_col = _make_list_column(n_rows, lambda: nbr_strs)
multi_col = _make_list_column(n_rows, lambda: str_vals)

# Add to dataframe
sample_df['max_of_list'] = max_col
sample_df['nunique_of_list'] = nunique_col
sample_df['desc_stats'] = descstat_col
sample_df['multi_label'] = multi_col

print(f"Dataset provided sample size: {config_params['sample_size']}")
print(f'Full dataframe shape: {sample_df.shape}')

# Stratified split: first carve out the training set, then halve the remainder
# into test and validation sets. Fixed random_state keeps the split repeatable.
train, other = train_test_split(sample_df, train_size=config_params['train_size'], random_state=12, stratify=sample_df[target_name])
test, validate = train_test_split(other, train_size=0.5, random_state=12, stratify=other[target_name])

assert len(train) + len(validate) + len(test) == len(sample_df), "splits must partition the dataset"

# Persist each split next to the ground-truth file.
for split_name, split_df in (('train', train), ('validate', validate), ('test', test)):
    split_df.to_parquet(os.path.join(config_params['project_dir'], config_params['data_dir'], f'{split_name}.parquet'), index=False)

Dataset provided sample size: 10000
Full dataframe shape: (10000, 11)


In [7]:
# Preview five randomly selected rows of the training split.
train.sample(n=5)

Unnamed: 0,target,true_false,one_hot,floats,random_col,other,dates,max_of_list,nunique_of_list,desc_stats,multi_label
2951,0,false,purple,9.5,7.0,3.0,,951,84633.0,4035,
7032,1,false,red,0.0,7.0,1.0,2022-01-01,465089,67.0,367,
7338,0,1,blue,9.5,0.0,4.0,2022-01-30,85565,99.0,107,"grapefruit,pineapple"
484,0,false,purple,0.0,9.0,4.0,2022-04-30,29,38.0,1,"blueberry,orange,grape,apple,apple,blueberry"
7052,1,1,,0.0,4.0,0.0,2022-09-14,21433,,918,"apple,apple,pineapple,apple"
