# [Module 4] Interactions, Items, Users 데이터셋을 S3에 업로드

이 워크샵은 기본 커널로 conda_python3를 사용합니다.

## 0. 환경 설정

#### Library Import 

파이썬에는 광범위한 라이브러리 모음이 포함되어 있으며, 본 LAB을 위해서 핵심 Data Scientist용 Tool 인 boto3 (AWS SDK) 및 Pandas/Numpy와 같은 라이브러리를 가져와야 합니다.

In [1]:
import boto3
import json
import numpy as np
import pandas as pd
import time
from datetime import datetime

import matplotlib.pyplot as plt

변수 로딩

In [2]:
%store -r

## 1. 데이터셋 확인

In [3]:
# Preview the first five rows of the interactions dataset.
interactions_df.head(5)

Unnamed: 0,ITEM_ID,USER_ID,EVENT_TYPE,TIMESTAMP
0,26bb732f-9159-432f-91ef-bad14fedd298,3156,View,1591803788
1,26bb732f-9159-432f-91ef-bad14fedd298,3156,View,1591803788
2,dc073623-4b95-47d9-93cb-0171c20baa04,332,View,1591803812
3,dc073623-4b95-47d9-93cb-0171c20baa04,332,View,1591803812
4,31efcfea-47d6-43f3-97f7-2704a5397e22,3981,View,1591803830


In [4]:
# Preview the first five rows of the items dataset.
items_df.head(5)

Unnamed: 0,ITEM_ID,NAME,CATEGORY_L1,STYLE,PRODUCT_DESCRIPTION,PRICE
0,e1669081-8ffc-4dec-97a6-e9176d7f6651,Sans Pareil Scarf,apparel,scarf,Sans pareil scarf for women,124.99
1,cfafd627-7d6b-43a5-be05-4c7937be417d,Chef Knife,housewares,kitchen,A must-have for your kitchen,57.99
2,6e6ad102-7510-4a02-b8ce-5a0cd6f431d1,Gainsboro Jacket,apparel,jacket,This gainsboro jacket for women is perfect for...,133.99
3,49b89871-5fe7-4898-b99d-953e15fb42b2,High Definition Speakers,electronics,speaker,High definition speakers to fill the house wit...,196.99
4,5cb18925-3a3c-4867-8f1c-46efd7eba067,Spiffy Sandals,footwear,sandals,This spiffy pair of sandals for woman is perfe...,9.99


In [5]:
# Preview the first five rows of the users dataset.
users_df.head(5)

Unnamed: 0,USER_ID,USER_NAME,AGE,GENDER
0,1,user1,31,M
1,2,user2,58,F
2,3,user3,43,M
3,4,user4,38,M
4,5,user5,24,M


## 2. 데이터 분리

이 데이터에는 USER_ID, ITEM_ID, EVENT_TYPE 및 TIMESTAMP 컬럼이 포함되어 있습니다.<br>

- 사용자별로 시간순으로 정렬한 후에 앞의 90%는 학습용 데이터로, 가장 최근 10%는 검증용 데이터로 분리합니다.

#### Data Set을 Train, Validation(holdout) 데이터로 분리하기 

모든 사용자의 마지막(Timestamp기준으로) 10%의 데이터를 Validation(Holdout) 데이터로 분리합니다.

In [6]:
# Cap pandas DataFrame display at 5 rows so the notebook output stays short.
pd.options.display.max_rows = 5
def split_holdout(data, pct):
    """Split interactions into train and holdout sets, per user, by time.

    Within each ``USER_ID`` group, rows are ranked by ``TIMESTAMP`` as a
    percentile (ties are broken by order of appearance). Rows whose
    percentile rank is strictly greater than ``pct`` go to the holdout set;
    every other row goes to the training set.

    Args:
        data: DataFrame containing at least the ``USER_ID`` and
            ``TIMESTAMP`` columns. It is not modified.
        pct: Percentile cut-off between 0 and 1. For example ``0.9`` keeps
            roughly the earliest 90% of each user's rows for training.

    Returns:
        A ``(train, holdout)`` tuple of new DataFrames that keep the
        columns, row order and index labels of ``data``.
    """
    # Rank per each subgroup, 'USER_ID'
    ranks = data.groupby('USER_ID').TIMESTAMP.rank(pct=True, method='first')

    # Select rows with a positional boolean mask instead of joining a helper
    # 'holdout' column: an index-based join multiplies rows when the index
    # has duplicate labels and fails when 'holdout' is already a column.
    is_holdout = (ranks > pct).to_numpy()

    holdout = data[is_holdout].copy()
    train = data[~is_holdout].copy()

    return train, holdout

# Per user, rows ranked above the 90th percentile by TIMESTAMP become the holdout set; the rest is training data.
df_warm_train, df_warm_holdout = split_holdout(interactions_df, pct=0.9)

train과 holdout으로 분리된 데이터의 정보입니다. holdout의 Row 개수는 전체 대비 대략 10%입니다. 사용자별로 보면 holdout의 timestamp가 train보다 더 미래의 값임을 확인할 수 있습니다. (숫자가 클수록 더 최근 날짜를 의미함)

In [7]:
# Summarize the training split: column dtypes and non-null counts, then the number of distinct values per column.
df_warm_train.info()
df_warm_train.nunique()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 526581 entries, 0 to 664340
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   ITEM_ID     526581 non-null  object
 1   USER_ID     526581 non-null  object
 2   EVENT_TYPE  526581 non-null  object
 3   TIMESTAMP   526581 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 20.1+ MB


ITEM_ID         2449
USER_ID         5250
EVENT_TYPE         2
TIMESTAMP     282956
dtype: int64

In [8]:
# Summarize the holdout split: column dtypes and non-null counts, then the number of distinct values per column.
df_warm_holdout.info()
df_warm_holdout.nunique()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61138 entries, 432222 to 675003
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ITEM_ID     61138 non-null  object
 1   USER_ID     61138 non-null  object
 2   EVENT_TYPE  61138 non-null  object
 3   TIMESTAMP   61138 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


ITEM_ID        2446
USER_ID        5250
EVENT_TYPE        2
TIMESTAMP     34393
dtype: int64

In [9]:
# Show the training rows ordered by user and then by time (display only; the result is not assigned).
df_warm_train.sort_values(['USER_ID','TIMESTAMP'])

Unnamed: 0,ITEM_ID,USER_ID,EVENT_TYPE,TIMESTAMP
21464,35efa417-357d-465e-99cb-b208bbc63f8b,1,View,1592007327
21465,35efa417-357d-465e-99cb-b208bbc63f8b,1,View,1592007327
...,...,...,...,...
585017,072ded32-2903-4f35-9f28-d6284c5f5605,5250,View,1597351374
585018,072ded32-2903-4f35-9f28-d6284c5f5605,5250,View,1597351374


In [10]:
# Show the holdout rows ordered by user and then by time (display only; the result is not assigned).
df_warm_holdout.sort_values(['USER_ID','TIMESTAMP'])

Unnamed: 0,ITEM_ID,USER_ID,EVENT_TYPE,TIMESTAMP
612212,be14695b-f8cb-46b8-aecd-ef28f0218514,1,View,1597609245
621508,079ab14b-3435-4a95-ba1d-fc0b21e0cf4b,1,View,1597697421
...,...,...,...,...
609116,e66109bf-9ad5-430a-90e5-900c00119f39,5250,View,1597579890
662349,072ded32-2903-4f35-9f28-d6284c5f5605,5250,View,1598084705


## 3. 로컬에 train, item, user, validation(holdout) 및 warm 데이터를 CSV로 저장

완료되면 파일을 새 CSV로 저장한 다음, S3에 업로드합니다.<br>

In [11]:
import os
# Create the local output directory; exist_ok=True keeps this cell safe to re-run.
os.makedirs('base-dataset', exist_ok=True)

train, item, user, validation(holdout) 데이터를 로컬에 CSV 파일로 저장합니다.

In [12]:
# Save the train, item, user and validation (holdout) datasets locally as CSV.
base_warm_train_interaction_filename = "base-dataset/training_interaction.csv"
base_items_filename = "base-dataset/training_item.csv"
base_users_filename = "base-dataset/training_user.csv"
base_validation_interaction_filename = "base-dataset/validation_interaction.csv"

# Each frame is written without its index column, in the same order as before.
for frame, csv_path in (
    (df_warm_train, base_warm_train_interaction_filename),
    (items_df, base_items_filename),
    (users_df, base_users_filename),
    (df_warm_holdout, base_validation_interaction_filename),
):
    frame.to_csv(csv_path, index=False)

In [13]:
# Save the full, unsplit interactions dataset locally as the "warm" dataset.
# NOTE: the variable name spelling ("interation") is kept as-is because the
# name is persisted with %store at the end of this notebook.
base_warm_interation_filename="base-dataset/warm_interaction.csv"
interactions_df.to_csv(base_warm_interation_filename,index=False)


## 4. S3에 로컬 CSV 업로드

In [14]:
import sagemaker
# To target a specific bucket instead, uncomment the next line and set its name.
#bucket='<YOUR BUCKET NAME>' # replace with the name of your S3 bucket
# Otherwise use the default bucket of the current SageMaker session.
bucket = sagemaker.Session().default_bucket() 

In [15]:
# Upload the training files (interactions, users, items) to S3.
# Each object key is identical to the file's local relative path.
# One bucket handle is shared by all uploads instead of building a new
# session and resource per file, and the return value of upload_file() is
# not captured because it is always None.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for local_path in (
    base_warm_train_interaction_filename,
    base_users_filename,
    base_items_filename,
):
    s3_bucket.Object(local_path).upload_file(local_path)

# Full S3 URIs of the uploaded files.
s3_base_warm_train_interaction_filename = "s3://{}/{}".format(bucket, base_warm_train_interaction_filename)
s3_base_items_filename = "s3://{}/{}".format(bucket, base_items_filename)
s3_base_users_filename = "s3://{}/{}".format(bucket, base_users_filename)

print("s3_base_warm_train_interaction_filename: \n", s3_base_warm_train_interaction_filename)
print("s3_base_items_filename: \n", s3_base_items_filename)
print("s3_base_users_filename: \n", s3_base_users_filename)

s3_base_warm_train_interaction_filename: 
 s3://sagemaker-us-east-1-376278017302/base-dataset/training_interaction.csv
s3_base_items_filename: 
 s3://sagemaker-us-east-1-376278017302/base-dataset/training_item.csv
s3_base_users_filename: 
 s3://sagemaker-us-east-1-376278017302/base-dataset/training_user.csv


In [16]:
! aws s3 ls {s3_base_warm_train_interaction_filename} --recursive
! aws s3 ls {s3_base_items_filename} --recursive
! aws s3 ls {s3_base_users_filename} --recursive

2023-06-20 10:49:49   30451496 base-dataset/training_interaction.csv
2023-06-20 10:49:50     300071 base-dataset/training_item.csv
2023-06-20 10:49:50      97565 base-dataset/training_user.csv


## 5. 변수 저장

다음 노트북에서 활용할 변수를 저장 합니다.

In [17]:
%store bucket

%store s3_base_warm_train_interaction_filename
%store s3_base_users_filename
%store s3_base_items_filename
%store base_warm_train_interaction_filename
%store base_items_filename
%store base_users_filename
%store base_validation_interaction_filename
%store base_warm_interation_filename


Stored 'bucket' (str)
Stored 's3_base_warm_train_interaction_filename' (str)
Stored 's3_base_users_filename' (str)
Stored 's3_base_items_filename' (str)
Stored 'base_warm_train_interaction_filename' (str)
Stored 'base_items_filename' (str)
Stored 'base_users_filename' (str)
Stored 'base_validation_interaction_filename' (str)
Stored 'base_warm_interation_filename' (str)
