In [1]:
# load data
import pandas as pd
import ast
import os
import json
import shutil
from shutil import copyfile
from PIL import Image
from sklearn.model_selection import train_test_split

In [2]:
!pip install openpyxl

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [3]:
#download data
#!mkdir data_0731
#!aws s3 cp s3://jackie-test/bumingjueli/data0731/ ./data_0731/ --recursive
#!unzip ./data_0731/Sports.zip -d ./data_0731

# Data preparation

In [6]:
#filter not 13 list
def get_feature_len(x):
    """Return the number of top-level entries in the JSON document ``x``.

    ``x`` is a JSON-encoded string; the result is ``len()`` of the decoded
    value.
    """
    return len(json.loads(x))

def get_key_list(x):
    """Return the first key of every value in the JSON object ``x``.

    ``x`` must decode to a mapping whose values are themselves mappings.
    For each inner mapping, its first key (in document order) is collected,
    and the collected keys are returned as a list in the same order.
    """
    decoded = json.loads(x)
    first_keys = []
    for inner in decoded.values():
        first_keys.append(list(inner.keys())[0])
    return first_keys

def get_keys(df):
    """Return the distinct feature names found in ``df['feature_dict']``.

    ``df['feature_dict']`` is iterated as a sequence of lists of names. The
    lists are flattened and de-duplicated.

    Names are returned in first-seen order so the result (and anything
    derived from it, such as CSV column order) is identical on every run.
    Going through a plain ``set`` would order strings differently between
    interpreter sessions because of hash randomization.
    """
    ordered_unique = dict.fromkeys(
        name for names in df['feature_dict'] for name in names
    )
    return list(ordered_unique)
    
def map_feature(x,leng):
    """Decode JSON object ``x`` and pad it with empty-string slots.

    Every index ``0 .. leng-1`` (as a string key) that is absent from the
    decoded mapping is added with the value ``''``; existing entries are left
    untouched. Returns the decoded, padded mapping.
    """
    features = json.loads(x)
    missing = [str(idx) for idx in range(leng) if str(idx) not in features.keys()]
    for slot in missing:
        features[slot] = ''
    return features

def get_res(x):
    """Parse ``x`` as a Python literal, falling back to a default on failure.

    ``str(x)`` is handed to ``ast.literal_eval``. If conversion or parsing
    raises an ordinary exception, ``{'res': 'others'}`` is returned instead.

    Only ``Exception`` subclasses are caught: ``KeyboardInterrupt`` and
    ``SystemExit`` propagate, so a long ``.map(get_res)`` can still be
    interrupted from the notebook.
    """
    try:
        return ast.literal_eval(str(x))
    except Exception:
        return {'res':'others'}
        
#map back labels
def get_label_txt(path='./data/label.txt'):
    """Load the index -> label mapping from a tab-separated label file.

    Each non-blank line is read as ``<number><TAB><label>``. The number is
    shifted down by one and used, as a string, for the key; the second field
    is the value. When a key occurs more than once the last line wins.

    Args:
        path: Location of the label file. Defaults to ``./data/label.txt``.

    Returns:
        dict mapping ``str(number - 1)`` to the label text.

    Raises:
        ValueError: if the first field of a line is not an integer.
        IndexError: if a non-blank line has no tab-separated second field.
    """
    mapping = {}
    with open(path) as f:
        for line in f:
            # Remove only the line terminator. Slicing off the last character
            # would truncate the label on a final line without a newline.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing empty line
            fields = line.split('\t')
            mapping[str(int(fields[0]) - 1)] = fields[1]
    return mapping

def get_key_value(x,i):
    """Return the value stored under feature name ``i`` in JSON document ``x``.

    ``x`` decodes to a mapping whose values are mappings. From each inner
    mapping only its first key/value pair is considered. If several inner
    mappings share the same first key, the last one wins. When ``i`` is not
    among those first keys the string ``'other'`` is returned.
    """
    lookup = {}
    for entry in json.loads(x).values():
        name = list(entry.keys())[0]
        lookup[name] = list(entry.values())[0]
    return lookup.get(i, 'other')


def test_path(x,category):
    root_path = os.path.join('/home/ec2-user/SageMaker/bumingjueli/img-cls/data/data_0731',category)
    img_name = os.path.join(root_path,str(x)+'.png')
    #print ('img_name',img_name)
    if os.path.exists(img_name):  
        # for local training
        #return img_name
        # for sagemaker only
        img = Image.open(img_name)
        if len(img.getbands())==3:
            return os.path.join('/opt/ml/input/data/training',str(x)+'.png')
        else:
            return 'none'
    else:
        return 'none'

def self_mkdir(folder):
    """Create ``folder`` (including parents) unless the path already exists.

    A confirmation line is printed only when the directory is newly created.
    """
    if os.path.exists(folder):
        return
    os.makedirs(folder)
    print('path of %s is build' % (folder))

def copy_files(df,category,output_dir):
    """Copy every image listed in ``df['md5_url']`` into ``output_dir``.

    For each entry ``name`` the file ``./data_0731/<category>/<name>.png`` is
    copied to ``<output_dir>/<name>.png``. ``output_dir`` must already exist.
    """
    for stem in df['md5_url']:
        file_name = stem + '.png'
        source = os.path.join('./data_0731', category, file_name)
        destination = os.path.join(output_dir, file_name)
        copyfile(source, destination)
        
        
def get_data(path,category,output_dir,sample_size=50):
    """Build the per-category training sample and write it to ``output_dir``.

    Steps:
      1. Read the Excel sheet at ``path`` and keep rows whose ``creg`` column
         equals ``category``.
      2. Derive one column per feature name found in the ``data`` JSON column
         (``'other'`` where a row lacks that feature).
      3. Resolve each ``md5_url`` to an image path with ``test_path`` and drop
         rows it rejects (``'none'``).
      4. Write ``total.csv`` (feature columns only, all remaining rows).
      5. Keep the first ``sample_size`` rows, copy their images into
         ``output_dir``, and write an 80/20 ``train.csv`` / ``test.csv`` split
         (fixed ``random_state=0``; these two files include the index column).

    Args:
        path: Excel file to read (needs the ``openpyxl`` engine).
        category: Value of ``creg`` to select; also the image sub-directory.
        output_dir: Directory for the CSVs and copied images (created if absent).
        sample_size: Number of leading rows to keep for the split. Defaults to
            50, the value previously hard-coded; ``None`` keeps every row.

    Returns:
        The sampled DataFrame that was split into train/test.

    NOTE(review): ``test_path`` validates images under its own root while
    ``copy_files`` reads from ``./data_0731`` — confirm both point at the same
    data.
    """
    df = pd.read_excel(path,engine="openpyxl")
    # .copy(): the column assignments below must write to an independent frame,
    # not to a slice of the full sheet (avoids SettingWithCopyWarning).
    df = df[df['creg']==category].copy()
    df['feature_dict'] = df['data'].map(get_key_list)
    res_keys = get_keys(df)
    print ("<<< predict for keys: ", ','.join(res_keys))

    for key in res_keys:
        # bind `key` per iteration so the lambda never sees a later loop value
        df[key] = df['data'].map(lambda x, key=key: get_key_value(x, key))

    # repath: translate ids to container paths and drop unusable images
    df['image_path'] = df['md5_url'].map(lambda x: test_path(x,category))
    df = df[df['image_path']!='none']

    # make dir if not exist
    self_mkdir(output_dir)
    # save data (written before sampling, so it covers every valid row)
    df[res_keys].to_csv(os.path.join(output_dir, 'total.csv'),index=False)

    # sample
    if sample_size is not None:
        df = df.head(sample_size)
    # copy images
    copy_files(df,category,output_dir)

    train, test = train_test_split(df,test_size=0.2,random_state=0)
    train.to_csv(os.path.join(output_dir, 'train.csv'))
    test.to_csv(os.path.join(output_dir, 'test.csv'))
    print ("train size {}, test size{}".format(train.shape,test.shape))

    return df

# Category to build the sample for: get_data filters the sheet's 'creg' column
# on it and uses it as the image sub-directory name.
category = 'Women-Sweatshirts'
# Local folder that receives total/train/test CSVs and the copied images.
output_dir = os.path.join("./train_sample",category)
df = get_data('./data_0731/shein_info.xlsx',category=category,output_dir = output_dir)

<<< predict for keys:  Size Fit,Fit Type,Details,Sleeve Length,Color,Quantity,Sheer,Lining,Fabric,Warm Lined,Composition,Body,Care Instructions,Pattern Type,Pockets,Length,Type,Hem Shaped,Material,Belt,Neckline,Sleeve Type,Arabian Clothing,Style
path of ./train_sample/Women-Sweatshirts is build
train size (40, 33), test size(10, 33)


In [7]:
output_dir

'./train_sample/Women-Sweatshirts'

# train

In [8]:
import sagemaker as sage
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch

sess = sage.Session()

# Local folder produced by get_data (CSV files + images) to upload as training data.
WORK_DIRECTORY = output_dir

# S3 prefix
prefix = "bmjl-train-"+category

role = get_execution_role()

# Upload the folder under `prefix`; the returned S3 location is what
# estimator.fit() receives later in the notebook.
data_location = sess.upload_data(WORK_DIRECTORY, key_prefix=prefix)

In [9]:
# Passed to the training entry point through the PyTorch estimator's
# `hyperparameters` argument. 'model_name' is also appended to the job name.
# NOTE(review): values look smoke-test sized (1 epoch) — raise for a real run.
hyperparameters = {
    "epoch":1, # test 1
    "batch_size":4,
    "num_workers":8,  
    'val_epoch':1,
    'save_epoch':1,
    'model_name':'resnet'
}

In [10]:
# Settings consumed by the PyTorch estimator constructed in the next cell.
entry_point = 'train_general_sagemaker.py'
source_dir = './code'
git_config = None
# Same call as in the upload cell; repeated so this cell can run on its own.
role = get_execution_role()
framework_version = '1.7.1'
py_version='py36'
# Also reused as the endpoint instance type in the deploy cell.
instance_type='ml.p3.2xlarge'
#instance_type='local_gpu'
instance_count=1
volume_size=50

In [11]:
# Training job definition; every value comes from the configuration cells above.
estimator = PyTorch(
    entry_point = entry_point,
    source_dir = source_dir,
    git_config = git_config,
    role = role,
    debugger_hook_config=False,
    hyperparameters = hyperparameters,
    framework_version = framework_version, 
    py_version = py_version,
    instance_type = instance_type,
    instance_count = instance_count,
    # NOTE(review): prefix and model name are concatenated with no separator
    # (e.g. "...Sweatshirtsresnet") — confirm this is the intended job name.
    base_job_name = prefix+hyperparameters['model_name'],
    volume_size=volume_size
)

In [None]:
# Start the training job on the data uploaded to S3 above.
# NOTE(review): `response` is not used anywhere below — confirm it is needed.
response = estimator.fit(data_location)

2022-08-16 08:59:35 Starting - Starting the training job.

# endpoint

In [41]:
from sagemaker.pytorch.model import PyTorchModel

# Model artifact of the training job run above. To deploy an earlier job
# instead, point s3_model at its model.tar.gz as in the commented line.
s3_model = estimator.model_data 
#s3_model = "s3://sagemaker-us-east-1-726335585155/bmjl-train-Bottoms-2022-08-08-10-34-18-585/output/model.tar.gz"

# Serving model: inference.py from ./code is the entry point.
pytorch_model = PyTorchModel(model_data=s3_model, 
                             role=role,
                             entry_point='inference.py', 
                             source_dir='./code', 
                             framework_version='1.7.1', 
                             py_version='py36'
                ) # TODO set model_server_workers=1 to avoid torchhub bug

# Create a real-time endpoint on the same instance type used for training.
predictor = pytorch_model.deploy(instance_type=instance_type, initial_instance_count=1)

----------!

In [43]:
import json
import numpy as np
from boto3.session import Session

session = Session()
runtime = session.client("runtime.sagemaker")

# Raw PNG bytes are sent as the request body.
# NOTE(review): this sample image lives outside ./train_sample — confirm the
# path exists in the working directory.
with open('train_data/Women-Bottoms/00017534ec95a66ab3518dc71bbc970d.html.png', "rb") as f:
    payload = f.read()
    payload = bytearray(payload)

# Call the endpoint created by pytorch_model.deploy() above rather than a
# hard-coded name from an earlier session, which may no longer exist.
response = runtime.invoke_endpoint(
    EndpointName=predictor.endpoint_name, ContentType="application/x-image", Body=payload
)

result = response["Body"].read()
# The response body is JSON; decode it into a Python dict.
result = json.loads(result)

In [44]:
result

{'result': {'Warm Lined': 'other',
  'Details': 'Zipper',
  'Type': 'Pullovers',
  'Composition': '100% Polyester',
  'Quantity': '1 piece',
  'Belt': 'other',
  'Fit Type': 'Regular Fit',
  'Length': 'Regular',
  'Sleeve Length': 'Long Sleeve',
  'Pattern Type': 'Plain',
  'Color': 'Black',
  'Lining': 'other',
  'Sleeve Type': 'Drop Shoulder',
  'Pockets': 'other',
  'Sheer': 'No',
  'Size Fit': 'other',
  'Neckline': 'Hooded',
  'Fabric': 'Slight Stretch',
  'Material': 'Polyester',
  'Hem Shaped': 'other',
  'Care Instructions': 'Machine wash or professional dry clean',
  'Body': 'other',
  'Arabian Clothing': 'other',
  'Style': 'Casual'}}

# batch transform

In [47]:
# Upload the local images in ./test to S3 under the "batch_transform" prefix;
# the returned S3 path is the input of the transform job below.
image_dir = './test'
inference_prefix = "batch_transform"
inference_inputs = sess.upload_data(
    path=image_dir, key_prefix=inference_prefix
)
print("Input S3 path: {}".format(inference_inputs))

Input S3 path: s3://sagemaker-us-east-1-726335585155/batch_transform


In [54]:
# Create transformer from PyTorchModel object
# Reuses the PyTorchModel defined for the endpoint; batch inference runs on
# a single ml.m5.xlarge instance instead of the training instance type.
transformer = pytorch_model.transformer(instance_count=1, instance_type="ml.m5.xlarge")

In [55]:
# Run the batch job over every object under the uploaded S3 prefix, sending
# each one with the image content type. wait=True blocks until the job ends.
transformer.transform(
    data=inference_inputs,
    data_type="S3Prefix",
    content_type="application/x-image",
    wait=True,
)

..............................
[34m2022-08-10 08:58:58,084 [INFO ] main org.pytorch.serve.ModelServer - [0m
[34mTorchserve version: 0.3.1[0m
[34mTS Home: /opt/conda/lib/python3.6/site-packages[0m
[34mCurrent directory: /[0m
[34mTemp directory: /home/model-server/tmp[0m
[34mNumber of GPUs: 0[0m
[34mNumber of CPUs: 4[0m
[34mMax heap size: 2950 M[0m
[34mPython executable: /opt/conda/bin/python3.6[0m
[34mConfig file: /etc/sagemaker-ts.properties[0m
[34mInference address: http://0.0.0.0:8080[0m
[34mManagement address: http://0.0.0.0:8080[0m
[34mMetrics address: http://127.0.0.1:8082[0m
[34mModel Store: /.sagemaker/ts/models[0m
[34mInitial Models: model.mar[0m
[34mLog dir: /logs[0m
[34mMetrics dir: /logs[0m
[34mNetty threads: 0[0m
[34mNetty client threads: 0[0m
[34mDefault workers per model: 4[0m
[34mBlacklist Regex: N/A[0m
[34mMaximum Response Size: 6553500[0m
[34mMaximum Request Size: 6553500[0m
[34mPrefer direct buffer: false[0m
[34mAllowed

In [57]:
import pprint as pp
# Low-level SageMaker client taken from the SDK session.
sm_cli = sess.sagemaker_client

# Pretty-print the summary of each transform job returned by the listing call.
transform_jobs = sm_cli.list_transform_jobs()["TransformJobSummaries"]
for job in transform_jobs:
    pp.pprint(job)

{'CreationTime': datetime.datetime(2022, 8, 10, 8, 54, 8, 975000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2022, 8, 10, 8, 59, 9, 644000, tzinfo=tzlocal()),
 'TransformEndTime': datetime.datetime(2022, 8, 10, 8, 59, 9, 265000, tzinfo=tzlocal()),
 'TransformJobArn': 'arn:aws:sagemaker:us-east-1:726335585155:transform-job/pytorch-inference-2022-08-10-08-54-08-963',
 'TransformJobName': 'pytorch-inference-2022-08-10-08-54-08-963',
 'TransformJobStatus': 'Completed'}
{'CreationTime': datetime.datetime(2020, 1, 7, 8, 6, 7, 55000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2020, 1, 7, 8, 11, 17, 354000, tzinfo=tzlocal()),
 'TransformEndTime': datetime.datetime(2020, 1, 7, 8, 11, 17, tzinfo=tzlocal()),
 'TransformJobArn': 'arn:aws:sagemaker:us-east-1:726335585155:transform-job/image-classification-model-2020-01-07-08-06-02',
 'TransformJobName': 'image-classification-model-2020-01-07-08-06-02',
 'TransformJobStatus': 'Completed'}
{'CreationTime': datetime.datet

In [59]:
# Full description of the job started by transformer.transform() above;
# its "TransformOutput" entry is read later to locate the results in S3.
job_info = sm_cli.describe_transform_job(
    TransformJobName=transformer.latest_transform_job.name
)

pp.pprint(job_info)

{'CreationTime': datetime.datetime(2022, 8, 10, 8, 54, 8, 975000, tzinfo=tzlocal()),
 'DataProcessing': {'InputFilter': '$',
                    'JoinSource': 'None',
                    'OutputFilter': '$'},
 'ModelName': 'pytorch-inference-2022-08-10-08-54-07-174',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '895',
                                      'content-type': 'application/x-amz-json-1.1',
                                      'date': 'Wed, 10 Aug 2022 09:03:53 GMT',
                                      'x-amzn-requestid': 'c11b8ac6-4177-452f-a12d-10a3e75bac68'},
                      'HTTPStatusCode': 200,
                      'RequestId': 'c11b8ac6-4177-452f-a12d-10a3e75bac68',
                      'RetryAttempts': 0},
 'TransformEndTime': datetime.datetime(2022, 8, 10, 8, 59, 9, 265000, tzinfo=tzlocal()),
 'TransformInput': {'CompressionType': 'None',
                    'ContentType': 'application/x-image',
                    'DataSource': {'S3DataSource':

In [61]:
import re


def get_bucket_and_prefix(s3_output_path):
    """Split an ``s3://bucket/key/prefix`` URI into ``(bucket, prefix)``.

    Only a leading ``s3://`` scheme is removed. The split happens at the first
    ``/`` so nested prefixes (``a/b/c``) are kept intact instead of raising
    on tuple unpacking. A URI with no key part yields an empty prefix.

    Args:
        s3_output_path: S3 URI, e.g. ``"s3://my-bucket/job-output"``.

    Returns:
        Tuple ``(bucket, prefix)`` of strings.
    """
    trim = re.sub(r"^s3://", "", s3_output_path)
    bucket, _, prefix = trim.partition("/")
    return bucket, prefix


local_path = "output"  # Where to save the output locally

# Locate the job's results from the S3OutputPath reported in its description.
bucket, output_prefix = get_bucket_and_prefix(job_info["TransformOutput"]["S3OutputPath"])
print(bucket, output_prefix)

# Fetch everything under that prefix into the local folder.
sess.download_data(path=local_path, bucket=bucket, key_prefix=output_prefix)


sagemaker-us-east-1-726335585155 pytorch-inference-2022-08-10-08-54-08-963


In [62]:
# Inspect the output

import json

# Print every prediction file downloaded from the transform job in a stable
# (sorted) order. The directory entry and the open file handle get distinct
# names so neither shadows the other.
for out_name in sorted(os.listdir(local_path)):
    out_path = os.path.join(local_path, out_name)
    if not os.path.isfile(out_path):
        continue  # skip any sub-directory the download may have created
    with open(out_path, "r") as out_file:
        pred = json.load(out_file)
    print(pred)

{'result': {'Warm Lined': 'other', 'Details': 'Zipper', 'Type': 'Pullovers', 'Composition': '100% Polyester', 'Quantity': '1 piece', 'Belt': 'other', 'Fit Type': 'Regular Fit', 'Length': 'Regular', 'Sleeve Length': 'Long Sleeve', 'Pattern Type': 'Plain', 'Color': 'Black', 'Lining': 'other', 'Sleeve Type': 'Drop Shoulder', 'Pockets': 'other', 'Sheer': 'No', 'Size Fit': 'other', 'Neckline': 'Hooded', 'Fabric': 'Slight Stretch', 'Material': 'Polyester', 'Hem Shaped': 'other', 'Care Instructions': 'Machine wash or professional dry clean', 'Body': 'other', 'Arabian Clothing': 'other', 'Style': 'Casual'}}
{'result': {'Warm Lined': 'other', 'Details': 'Zipper', 'Type': 'Pullovers', 'Composition': '100% Polyester', 'Quantity': '1 piece', 'Belt': 'other', 'Fit Type': 'Regular Fit', 'Length': 'Regular', 'Sleeve Length': 'Long Sleeve', 'Pattern Type': 'Plain', 'Color': 'Black', 'Lining': 'other', 'Sleeve Type': 'Drop Shoulder', 'Pockets': 'other', 'Sheer': 'No', 'Size Fit': 'other', 'Neckline': '