# Create Sample Dataset

In [2]:
import pandas as pd
import numpy as np
import datetime
import random

import sagemaker
import sagemaker.session

from sklearn.model_selection import train_test_split

In [3]:
session = sagemaker.session.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()
bucket = session.default_bucket()
prefix = 'custom_preprocessing'

In [4]:
tags = [
    {"Key": "PLATFORM", "Value": "FO-ML"},
    {"Key": "BUSINESS_REGION", "Value": "GLOBAL"},
    {"Key": "BUSINESS_UNIT", "Value": "MOBILITY"},
    {"Key": "CLIENT", "Value": "MULTI_TENANT"}
   ]

## Create sample data

In [5]:
sample_ct = 10000

In [6]:
target_col = []
tf_col = []
onehot_col = []
float_col = []
drop_col = []
xrand_col = []

target_vals = [0,1]
tf_vals = ['true', 'false', np.nan, '1', '0']
onehot_vals = [np.nan, 'red', 'orange', 'yellow', 'green', 'blue', 'purple']
float_vals = list(range(0,10)) + [x/10 for x in range(0, 100, 5)] +[np.nan]
drop_vals = [np.nan] + list(range(0,10))
xrand_vals = list(range(5))

col_list = zip([target_col, tf_col, onehot_col, float_col, drop_col, xrand_col],
               [target_vals, tf_vals, onehot_vals, float_vals, drop_vals, xrand_vals])

for col, vals in col_list:
    for _ in range(sample_ct):
        col.append(random.choice(vals))

In [7]:
date_col = []

for _ in range(sample_ct):
    try:
        date = datetime.date(2022, random.randint(1, 12), random.randint(1, 31))
        date_col.append(date)
    except ValueError:
        date_col.append(np.nan)

In [8]:
nbr_vals = list(range(0,10))
str_vals = ['apple', 'orange', 'grape', 'pineapple', 'strawberry', 'blueberry', 'grapefruit', 'apple']

In [9]:
nunique_col = []

for _ in range(sample_ct):
    val_size = random.randint(0,6)
    if val_size < 1:
        nunique_col.append(np.nan)
    else:
        if random.randint(0,10) < 5:
            val_type = nbr_vals
            val_type = [str(x) for x in val_type]
        else:
            val_type = str_vals
        val = random.choices(val_type,k=val_size)
        if len(val) <= 1:
            nunique_col.append(val)
        else:
            strified = ','.join(val)
            nunique_col.append(strified)

In [10]:
descstat_col = []
max_col = []

nbrlst_cols = [descstat_col, max_col]

for col in nbrlst_cols:
    for _ in range(sample_ct):
        val_size = random.randint(0,6)
        if val_size < 1:
            col.append(np.nan)
        else:
            val_type = [str(x) for x in nbr_vals]
            val = random.choices(val_type,k=val_size)
            strified = ','.join(val)
            col.append(strified)

In [11]:
multi_col = []

for _ in range(sample_ct):
    val_size = random.randint(0,6)
    if val_size < 1:
        multi_col.append(np.nan)
    else:
        val = random.choices(str_vals, k=val_size)
        strified = ','.join(val)
        multi_col.append(strified)

In [12]:
sample_df = pd.DataFrame({
    'target':target_col,
    'true_false':tf_col,
    'one_hot':onehot_col,
    'dates':date_col,
    'floats':float_col,
    'max_of_list':max_col,
    'nunique_of_list':nunique_col,
    'desc_stats':descstat_col,
    'multi_label':multi_col,
    'random_col':drop_col,
    'other':xrand_col})
sample_df

Unnamed: 0,target,true_false,one_hot,dates,floats,max_of_list,nunique_of_list,desc_stats,multi_label,random_col,other
0,1,1,orange,2022-01-04,9.0,09,5084,522476,"strawberry,grape,pineapple,blueberry,pineapple...",5.0,2
1,1,0,blue,2022-04-16,6.0,6,"blueberry,pineapple,strawberry,grape,orange",73,,3.0,1
2,0,,orange,,7.0,4,[pineapple],42253,"grapefruit,grapefruit,grapefruit,strawberry,gr...",9.0,4
3,0,1,orange,2022-05-09,9.5,5832,74551,9,"pineapple,grape,apple,strawberry,orange",3.0,0
4,0,,orange,2022-06-25,0.5,2089,"grape,orange,grapefruit",4387,"pineapple,strawberry,grape,orange",,3
...,...,...,...,...,...,...,...,...,...,...,...
9995,1,1,yellow,2022-11-24,7.0,83270,"apple,pineapple,strawberry,orange",22312,blueberry,8.0,2
9996,0,0,blue,2022-11-01,6.0,3,"apple,apple,apple,blueberry,grapefruit,orange",277,"pineapple,orange",6.0,1
9997,0,0,purple,2022-03-07,4.0,616070,656466,31475,orange,3.0,1
9998,1,1,green,2022-04-06,7.0,,"grape,apple",2019,pineapple,7.0,0


In [13]:
train, other = train_test_split(sample_df, train_size=0.8, random_state=12, stratify=sample_df['target'])
test, validate = train_test_split(other, train_size=0.5, random_state=12, stratify=other['target'])

In [14]:
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)
validate.to_csv('validate.csv', index=False)

train_input = session.upload_data('train.csv', bucket=bucket, key_prefix=f'{prefix}/sample_data')
test_input = session.upload_data('test.csv', bucket=bucket, key_prefix=f'{prefix}/sample_data')
validate_input = session.upload_data('validate.csv', bucket=bucket, key_prefix=f'{prefix}/sample_data')