# Data Preprocessing for PetFinder6000

In [1]:
import pandas as pd
import numpy as np
import re
import os
import glob
import shutil

import boto3
import sagemaker

## Interactions

## Load Data from S3

In [2]:
# Set the AWS_PROFILE environment variable for this kernel so that the AWS CLI
# and boto3 calls below pick up the credentials of that named profile.
%env AWS_PROFILE=aeroxye-sagemaker

env: AWS_PROFILE=aeroxye-sagemaker


In [3]:
# Sanity check: print the identity the active credentials resolve to.
# NOTE(review): the recorded output contains the account id and role ARN;
# consider clearing it before sharing or committing the notebook.
!aws sts get-caller-identity

{
    "UserId": "AROAWC4YSIQL5OBFCNGEX:botocore-session-1686898676",
    "Account": "418542404631",
    "Arn": "arn:aws:sts::418542404631:assumed-role/SageMaker-UserRole/botocore-session-1686898676"
}


In [4]:
# Resolve the IAM role ARN used for SageMaker jobs. If the SDK lookup raises
# ValueError, fetch the ARN of the named role directly through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='SageMaker-UserRole')
    role = role_description['Role']['Arn']

Couldn't call 'get_role' to get Role ARN from role name SageMaker-UserRole to get Role path.


In [5]:
region = boto3.Session().region_name

data_bucket = "dynamodbpetfinder"
object_prefix = "interaction_"
local_path = "../data/interaction/"

# Start from an empty download directory so files from an older export cannot
# be picked up by the glob-based load that follows.
if os.path.exists(local_path):
    shutil.rmtree(local_path)
os.makedirs(local_path)

pattern = r'[0-9]+'
s3 = boto3.client("s3")
# A single list call returns at most 1000 entries; paginate so that large
# listings are seen completely.
paginator = s3.get_paginator("list_objects_v2")

# Each export sits under "<object_prefix><digits>/"; collect the digit runs.
subfolders = []
for page in paginator.paginate(Bucket=data_bucket, Prefix=object_prefix, Delimiter="/"):
    # Pages without sub-prefixes carry no "CommonPrefixes" key at all.
    for common_prefix in page.get("CommonPrefixes", []):
        match = re.search(pattern, common_prefix.get("Prefix"))
        if match:
            subfolders.append(match.group())

if not subfolders:
    raise RuntimeError(
        f"No '{object_prefix}<digits>/' folder found in bucket '{data_bucket}'"
    )

# Compare the ids as numbers: a plain string sort would rank "9" above "10".
subfolders.sort(key=int, reverse=True)

# The highest id is taken as the export to download.
object_path = object_prefix + subfolders[0] + "/"

for page in paginator.paginate(Bucket=data_bucket, Prefix=object_path, Delimiter="/"):
    for content in page.get("Contents", []):
        file_path = content.get("Key")
        # Strip the folder prefix by slicing; no regex, so nothing in
        # object_path can be misread as a pattern.
        filename = file_path[len(object_path):]
        if not filename:
            # A key equal to the prefix itself is a folder placeholder, not a file.
            continue
        print(filename)

        with open(os.path.join(local_path, filename), 'wb') as file:
            s3.download_fileobj(
                Bucket=data_bucket,
                Key=file_path,
                Fileobj=file
            )

run-1686726857238-part-r-00000
run-1686726857238-part-r-00001
run-1686726857238-part-r-00002
run-1686726857238-part-r-00003


In [6]:
# glob returns matches in arbitrary order; sort them so the concatenated row
# order, and with it the seeded splits later in the notebook, is the same on
# every run.
file_list = sorted(glob.glob(os.path.join(local_path, "*")))

print(file_list)

# Parse every downloaded part file with pandas' default CSV settings.
dfs = [pd.read_csv(part_file) for part_file in file_list]

# Stack the parts into one frame with a fresh 0..n-1 index.
interactions = pd.concat(dfs, ignore_index=True)

['../data/interaction\\run-1686726857238-part-r-00000', '../data/interaction\\run-1686726857238-part-r-00001', '../data/interaction\\run-1686726857238-part-r-00002', '../data/interaction\\run-1686726857238-part-r-00003']


In [7]:
# Preview the first rows of the concatenated raw export.
interactions.head()

Unnamed: 0,id,__typename,_lastChangedAt,_version,catID,userID,updatedAt,like,createdAt,dwell_time_ms,click
0,c54a5a6b-dcfb-41cf-9090-5e35eef55af0,Interaction,1685988876889,1,0b7a6618-7c9b-4169-9916-a2e06b7b352e,2fea2054-830c-4399-ab5b-b25df10850b2,2023-06-05T18:14:36.859Z,False,2023-06-05T18:14:36.859Z,1173,True
1,490f7835-d803-4c5e-8543-28fdb24e6dd6,Interaction,1686149751346,1,23eccd95-56d8-407e-98fa-32c49158d4dd,43c4ff06-20cc-489d-83c4-5074ad36efb5,2023-06-07T14:55:51.343Z,False,2023-06-07T14:55:51.343Z,797,True
2,5ace4bcc-cf60-4e62-9ae7-822617a86c8d,Interaction,1684813208891,1,bd8c1b25-bb4b-4d0d-ac37-3d6799d885f9,082a5d87-5cb2-41d1-a29d-601b38d9f380,2023-05-23T03:40:08.864Z,False,2023-05-23T03:40:08.864Z,3809,True
3,a2b1ca94-068f-460e-876d-70d6d4568a47,Interaction,1684463433803,1,0076ee93-e587-4b01-b916-56d008f1f233,9753bff1-0f69-4cde-8a3a-9c4c7cabc8b8,2023-05-19T02:30:33.800Z,False,2023-05-19T02:30:33.800Z,733,True
4,f6a70902-c175-404d-a5f7-d85a64eccb51,Interaction,1684486874155,1,16949c4a-5b34-475b-8d9c-d180caaf6154,9af16262-416a-466a-973d-4d54ea0c2172,2023-05-19T09:01:14.153Z,False,2023-05-19T09:01:14.153Z,1335,False


In [28]:
# Disabled alternative: read a local CSV snapshot into `interactions` instead
# of using the frame built from the S3 export. Left commented out.
# interactions = pd.read_csv('../data/interactions_5-23-2023.csv', header=0)

## Process Data

In [8]:
# Rename the camelCase timestamp headers to snake_case and cast the two flag
# columns to integers in one pass over the raw frame.
header_map = {'createdAt': 'created_at', 'updatedAt': 'updated_at'}
flag_types = {'like': 'int', 'click': 'int'}
cl_interactions = interactions.rename(columns=header_map).astype(flag_types)

# Re-serialise both timestamp columns as strings in a fixed format that keeps
# whole seconds only.
for ts_column in ('created_at', 'updated_at'):
    parsed = pd.to_datetime(cl_interactions[ts_column])
    cl_interactions[ts_column] = parsed.dt.strftime('%Y-%m-%dT%H:%M:%SZ')

In [9]:
# Remove the bookkeeping ("glue") columns that are not needed downstream.
glue_columns = ['__typename', '_lastChangedAt', '_version']
cl_interactions = cl_interactions.drop(columns=glue_columns)

In [10]:
# Check column dtypes and non-null counts after the cleaning steps.
cl_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4841 entries, 0 to 4840
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             4841 non-null   object
 1   catID          4841 non-null   object
 2   userID         4841 non-null   object
 3   updated_at     4841 non-null   object
 4   like           4841 non-null   int32 
 5   created_at     4841 non-null   object
 6   dwell_time_ms  4841 non-null   int64 
 7   click          4841 non-null   int32 
dtypes: int32(2), int64(1), object(5)
memory usage: 264.9+ KB


In [11]:
# Preview the cleaned frame.
cl_interactions.head()

Unnamed: 0,id,catID,userID,updated_at,like,created_at,dwell_time_ms,click
0,c54a5a6b-dcfb-41cf-9090-5e35eef55af0,0b7a6618-7c9b-4169-9916-a2e06b7b352e,2fea2054-830c-4399-ab5b-b25df10850b2,2023-06-05T18:14:36Z,0,2023-06-05T18:14:36Z,1173,1
1,490f7835-d803-4c5e-8543-28fdb24e6dd6,23eccd95-56d8-407e-98fa-32c49158d4dd,43c4ff06-20cc-489d-83c4-5074ad36efb5,2023-06-07T14:55:51Z,0,2023-06-07T14:55:51Z,797,1
2,5ace4bcc-cf60-4e62-9ae7-822617a86c8d,bd8c1b25-bb4b-4d0d-ac37-3d6799d885f9,082a5d87-5cb2-41d1-a29d-601b38d9f380,2023-05-23T03:40:08Z,0,2023-05-23T03:40:08Z,3809,1
3,a2b1ca94-068f-460e-876d-70d6d4568a47,0076ee93-e587-4b01-b916-56d008f1f233,9753bff1-0f69-4cde-8a3a-9c4c7cabc8b8,2023-05-19T02:30:33Z,0,2023-05-19T02:30:33Z,733,1
4,f6a70902-c175-404d-a5f7-d85a64eccb51,16949c4a-5b34-475b-8d9c-d180caaf6154,9af16262-416a-466a-973d-4d54ea0c2172,2023-05-19T09:01:14Z,0,2023-05-19T09:01:14Z,1335,0


## Export Processed Data as CSV to Local Folder

In [12]:
processed_folder = "../data/processed/auxiliary/"

# exist_ok=True creates the folder when it is missing and does nothing when it
# is already there, replacing the separate exists() check.
os.makedirs(processed_folder, exist_ok=True)

# NOTE(review): to_csv also writes the index as an unnamed first column, which
# is presumably why the processing jobs log 9 columns while the frame has 8.
# Confirm the split scripts expect that before switching to index=False.
cl_interactions.to_csv(f'{processed_folder}interactions.csv')

# Train/Test Split

In [13]:
from sklearn.model_selection import train_test_split

# Split configuration used by both the leave-some-users-out and the
# stratified split cells.
test_size = 0.2  # share held out from training
validation_size = 0.5 # of test size: applied to the held-out portion, not to the full data set
random_state = 2023  # seed passed to every train_test_split / sample call

In [19]:
output_path = '../data/output/'

# Always start from an empty output directory: remove an existing one first,
# then create it afresh.
if os.path.exists(output_path):
    shutil.rmtree(output_path)
os.makedirs(output_path)

## Leave some users out

### In-notebook processing

In [14]:
# Split on distinct user ids so that every user ends up in exactly one set.
users = cl_interactions['userID'].unique()

# Step 1: hold out `test_size` of the users from training.
train_users, holdout_users = train_test_split(
    users, test_size=test_size, shuffle=True, random_state=random_state
)

# Step 2: divide the held-out users. validation_size is the validation share,
# so it is passed as train_size: the first returned array (validation_users)
# receives that fraction and the remainder becomes test_users. Passing it as
# test_size would hand validation the complement instead.
validation_users, test_users = train_test_split(
    holdout_users, train_size=validation_size, shuffle=True, random_state=random_state
)

In [15]:
# Select each split's rows by user membership; every interaction of a user
# follows that user into the same set.
user_ids = cl_interactions['userID']
train_set = cl_interactions.loc[user_ids.isin(train_users)]
validation_set = cl_interactions.loc[user_ids.isin(validation_users)]
test_set = cl_interactions.loc[user_ids.isin(test_users)]

In [20]:
# Write the three leave-some-users-out splits into output_path (the directory
# this notebook recreates earlier) instead of repeating the literal path, so
# the files always land in the folder that was just prepared.
for split_name, split_frame in (
    ('train', train_set),
    ('validation', validation_set),
    ('test', test_set),
):
    split_frame.to_csv(os.path.join(output_path, f'lsuo_{split_name}.csv'))

### Container processing

In [21]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.session import Session

# Resolve the role the same way as the first role lookup in this notebook:
# fall back to an explicit IAM query when the SDK lookup raises ValueError,
# rather than letting this cell fail on its own.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    role = boto3.client('iam').get_role(RoleName='SageMaker-UserRole')['Role']['Arn']
region = boto3.Session().region_name

bucket_name = "petfinder6000-training"
base_job_name = "lsuo"

# Processor for the leave-some-users-out job: one ml.m5.xlarge instance, with
# bucket_name as the session's default bucket.
sklearn_processor = SKLearnProcessor(
    framework_version="1.2-1",
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    sagemaker_session=Session(default_bucket=bucket_name),
    base_job_name=base_job_name,
)

Couldn't call 'get_role' to get Role ARN from role name SageMaker-UserRole to get Role path.


In [22]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

processed_folder = "../data/processed/auxiliary/"

# The exported interactions file is the job's only input.
interactions_input = ProcessingInput(
    source=f"{processed_folder}interactions.csv",
    destination="/opt/ml/processing/input",
)

# One output channel per split directory inside the container.
split_outputs = [
    ProcessingOutput(source="/opt/ml/processing/train", output_name="train"),
    ProcessingOutput(source="/opt/ml/processing/validation", output_name="validation"),
    ProcessingOutput(source="/opt/ml/processing/test", output_name="test"),
]

# Run the leave-some-users-out script as a processing job.
sklearn_processor.run(
    code="../scripts/leave-some-users-out.py",
    # arguments = ["arg1", "arg2"], # Arguments can optionally be specified here
    inputs=[interactions_input],
    outputs=split_outputs,
)

INFO:sagemaker:Creating processing-job with name lsuo-2023-06-16-07-03-00-111


......................[34mShape of data is: (4841, 9)[0m
[34mSuccessfully created directories[0m
[34mWrote files successfully[0m
[34mCompleted running the processing job[0m



## Stratified Split

### In-notebook processing

In [23]:
# Sample the training share of every user's interactions; whatever is not
# sampled forms the held-out remainder.
interactions_by_user = cl_interactions.groupby('userID')
train_set = interactions_by_user.sample(frac=1-test_size, random_state=random_state)
holdout_set = cl_interactions.drop(train_set.index)

# Split the remainder per user again: the sampled part is validation, the rest
# is test.
holdout_by_user = holdout_set.groupby('userID')
validation_set = holdout_by_user.sample(frac=validation_size, random_state=random_state)
test_set = holdout_set.drop(validation_set.index)

In [24]:
# Report how many distinct users appear overall and in each split.
user_count_report = {
    'Total number of users': cl_interactions,
    'Number of users in training': train_set,
    'Number of users in validation': validation_set,
    'Number of users in test': test_set,
}
for report_label, report_frame in user_count_report.items():
    print(f'{report_label}: {report_frame["userID"].nunique()}')

Total number of users: 104
Number of users in training: 104
Number of users in validation: 96
Number of users in test: 100


In [25]:
# Write the three stratified splits into output_path (the directory this
# notebook recreates earlier) instead of repeating the literal path, so the
# files always land in the folder that was just prepared.
for split_name, split_frame in (
    ('train', train_set),
    ('validation', validation_set),
    ('test', test_set),
):
    split_frame.to_csv(os.path.join(output_path, f'strat_{split_name}.csv'))

### Container processing

In [26]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.session import Session

# Resolve the role the same way as the first role lookup in this notebook:
# fall back to an explicit IAM query when the SDK lookup raises ValueError,
# rather than letting this cell fail on its own.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    role = boto3.client('iam').get_role(RoleName='SageMaker-UserRole')['Role']['Arn']
region = boto3.Session().region_name

bucket_name = "petfinder6000-training"
base_job_name = "strat"

# Processor for the stratified-split job: one ml.m5.xlarge instance, with
# bucket_name as the session's default bucket.
sklearn_processor = SKLearnProcessor(
    framework_version="1.2-1",
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    sagemaker_session=Session(default_bucket=bucket_name),
    base_job_name=base_job_name,
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [27]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

processed_folder = "../data/processed/auxiliary/"

# The exported interactions file is the job's only input.
interactions_input = ProcessingInput(
    source=f"{processed_folder}interactions.csv",
    destination="/opt/ml/processing/input",
)

# One output channel per split directory inside the container.
split_outputs = [
    ProcessingOutput(source="/opt/ml/processing/train", output_name="train"),
    ProcessingOutput(source="/opt/ml/processing/validation", output_name="validation"),
    ProcessingOutput(source="/opt/ml/processing/test", output_name="test"),
]

# Run the stratified-split script as a processing job.
sklearn_processor.run(
    code="../scripts/stratified-split.py",
    # arguments = ["arg1", "arg2"], # Arguments can optionally be specified here
    inputs=[interactions_input],
    outputs=split_outputs,
)

INFO:sagemaker:Creating processing-job with name strat-2023-06-16-07-09-28-233


.......................[34mShape of data is: (4841, 9)[0m
[34mSuccessfully created directories[0m
[34mWrote files successfully[0m
[34mCompleted running the processing job[0m

