In [24]:
import sagemaker as sage
import pandas as pd
from time import gmtime, strftime
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch
import os
import numpy as np
import shutil

# data prepare

In [None]:
! python data_prepare.py\
--label_dir '../data/good'

In [None]:
! python data_prepare.py\
--label_dir '../data/bad'

In [None]:
#merge data
good = '../data/good/aspect_category.csv'
bad = '../data/bad/aspect_category.csv'
df1 = pd.read_csv(good)
df2 = pd.read_csv(bad)
df_res = pd.concat([df1,df2])
df_res.to_csv('./aspect_category.csv')

In [25]:
import pandas as pd   
import numpy as np
import os
import shutil

#preprocess data
def write_txt(df,path):
    '''
    write back to txt
    '''
    #output txt file
    df = df.reset_index()
    with open(path,'a')as f:
        for i in range(len(df)):
            f.write("{} #### {}".format(df.loc[i,'text'].strip(),df.loc[i,'label']))
            f.write('\n')
            
            
def mkdir_rm(folder):
    '''
    make directory if not exists
    '''
    if os.path.exists(folder):
        shutil.rmtree(folder) 
    os.mkdir(folder)
    print ("<< path valid!")
    

def preprocess_data(input_file,output_path,over_sample=True):
    jsonObj = pd.read_csv(input_file)
    jsonObj = jsonObj[jsonObj['label']!='[]']
    print (jsonObj.head())
    
    #remove & remake the output folder 
    mkdir_rm(output_path)
    
    #generate tag.txt
    #a_list = ['consumer','zone','target','consequence','product','product_spec']
    #with open('tag.txt', 'w') as filehandle:
     #   filehandle.writelines("%s\n" % tag for tag in a_list)
    
    #train/test/val split
    train, validate, test = np.split(jsonObj.sample(frac=1), [int(.8*len(jsonObj)), int(.9*len(jsonObj))])
   
    print ("training size: ",train.shape)
    print ("test size: ",test.shape)
    print ("validate size: ",validate.shape)
    
    # write train/test/dev
    write_txt(train,os.path.join(output_path,'train.txt'))
    write_txt(test,os.path.join(output_path,'test.txt'))
    write_txt(validate,os.path.join(output_path,'dev.txt'))
    print ("<<<finish data preparing!")
    
input_file = './aspect_category.csv'
output_path = './data/tasd/bmjl'
preprocess_data(input_file,output_path,over_sample=False)

   Unnamed: 0  Unnamed: 0.1  sent_num  \
0           0             0         0   
1           1             0         0   
2           2             0         0   
3           3             0         0   
4           4             0         0   

                                                text  sent_start  sent_end  \
0  I like the shorts. They're comfortable. But I ...           0       313   
1  Absolutely love these shorts! The green color ...           0       215   
2  I ordered the purple tie dye color and was sen...           0       322   
3  I am 5’1 and around 110-112 lbs. I love these ...           0       453   
4  I am obsessed with these shorts. I’ve ordered ...           0       351   

   sent_len                                              label  
0       313  [('comfortable', 'feelings'), ('being able to ...  
1       215  [('green color was perfect', 'color'), ('runni...  
2       322  [('Just wish I was given the correct color', '...  
3       453  [('I love t

# train

In [26]:
sess = sage.Session()

WORK_DIRECTORY = "./data"

# S3 prefix
prefix = "bmjl"

role = get_execution_role()

data_location = sess.upload_data(WORK_DIRECTORY, key_prefix=prefix)

In [27]:
hyperparameters = {
    "task" : "tasd", 
    "dataset" : "bmjl", 
    "model_name_or_path" : "t5-base", 
    "paradigm": "extraction",
    "eval_batch_size" :"16",
    "train_batch_size" :"2",
    "learning_rate" :"3e-4",
    "num_train_epochs":"30",
    "n_gpu": "1"
}

In [28]:
entry_point = 'finetune.py'
source_dir = './'
git_config = None
role = get_execution_role()
framework_version = '1.7.1'
py_version='py36'
instance_type='ml.p3.2xlarge'
#instance_type='local_gpu'
instance_count=1

In [29]:
estimator = PyTorch(
    entry_point = entry_point,
    source_dir = source_dir,
    git_config = git_config,
    role = role,
    debugger_hook_config=False,
    hyperparameters = hyperparameters,
    framework_version = framework_version, 
    py_version = py_version,
    instance_type = instance_type,
    instance_count = instance_count
)

In [30]:
inputs = {'tasd': data_location+'/tasd/'}

In [None]:
response = estimator.fit(inputs)

2022-08-12 07:19:56 Starting - Starting the training job...
2022-08-12 07:20:25 Starting - Preparing the instances for trainingProfilerReport-1660288795: InProgress
.........
2022-08-12 07:21:52 Downloading - Downloading input data...
2022-08-12 07:22:22 Training - Downloading the training image................

# deploy 

In [2]:
import sagemaker

instance_type = 'ml.m5.4xlarge'
role = sagemaker.get_execution_role()

In [14]:
s3_model = estimator.model_data 

In [15]:
from sagemaker.pytorch.model import PyTorchModel

pytorch_model = PyTorchModel(model_data=s3_model, 
                             role=role,
                             entry_point='inference.py', 
                             source_dir='./', 
                             framework_version='1.7.1', 
                             py_version='py36'
                ) # TODO set model_server_workers=1 to avoid torchhub bug

predictor = pytorch_model.deploy(instance_type=instance_type, initial_instance_count=1)

--------------!

In [4]:
from boto3.session import Session
import json

body = {"inputs": "I am pretty new to pickleball and finally decided to try out some different paddles."}

session = Session()
runtime = session.client("runtime.sagemaker")
response = runtime.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType="application/json",
    Body=json.dumps(body),
)
result = json.loads(response["Body"].read())
print (result)


NameError: name 'predictor' is not defined

In [17]:
%%time

predictor.serializer = sagemaker.serializers.JSONSerializer()
predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

body = {"inputs": "I am pretty new to pickleball and finally decided to try out some different paddles."}

predictor.predict(body,initial_args={"ContentType":"application/json"})

CPU times: user 12.2 ms, sys: 376 µs, total: 12.5 ms
Wall time: 3.54 s


{'result': '(pretty new to pickleball, scene); (finally decided to try out some different paddles, purchase_behavior); (pretty new to pickleball, scene); (pretty new to pickleball, scene); (pretty new to pickleball, scene); (pretty new to pickleball, scene); (pretty new to pickleball, scene); (pretty new to pickleball, scene); (pretty new to pickleball, scene)'}

# batch transform

In [5]:
import csv
from sagemaker.s3 import S3Uploader,s3_path_join

# get the s3 bucket
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
sagemaker_session_bucket = sess.default_bucket()

#prepare data
dataset_csv_file = 'predict_0811.csv'
dataset_jsonl_file = "predict.jsonl"


i = 0
with open(dataset_csv_file, "r+") as infile, open(dataset_jsonl_file, "w+") as outfile:
    reader = csv.DictReader(infile)
    for row in reader:
        if i <5:
            json.dump({"inputs":row["0"]}, outfile)
            outfile.write('\n')
        i = i+1
                
# uploads a given file to S3.
input_s3_path = s3_path_join("s3://",sagemaker_session_bucket,"batch_transform/input")
output_s3_path = s3_path_join("s3://",sagemaker_session_bucket,"batch_transform/output")
s3_file_uri = S3Uploader.upload(dataset_jsonl_file,input_s3_path)

print(f"{dataset_jsonl_file} uploaded to {s3_file_uri}")

predict.jsonl uploaded to s3://sagemaker-us-east-1-726335585155/batch_transform/input/predict.jsonl


In [68]:
# create transformer to run a batch job
batch_job = pytorch_model.transformer(
    instance_count=1,
    instance_type='ml.g4dn.xlarge',
    output_path=output_s3_path,
    strategy='SingleRecord'
)

In [69]:
# starts batch transform job and uses S3 data as input
batch_job.transform(
    data=input_s3_path,
    content_type='application/json',    
    split_type='Line'
)

........................................................[34mCollecting transformers==4.6.0
  Downloading transformers-4.6.0-py3-none-any.whl (2.3 MB)[0m
[34mCollecting datasets==1.11.0
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)[0m
[34mCollecting sentencepiece==0.1.91
  Downloading sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1 MB)[0m
[34mCollecting pytorch_lightning==0.8.1
  Downloading pytorch_lightning-0.8.1-py3-none-any.whl (293 kB)[0m
[34mCollecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'[0m
[34mCollecting editdistance
  Downloading editdistance-0.6.0-cp36-cp36m-manylinux2010_x86_64.whl (284 kB)[0m
[34mCollecting filelock
  Downloading filelock-3.4.1-py3-none-any.whl (9.9 kB)[0m
[34mCollecting regex!=2019.12.17
  Downloading regex-2022.7.25-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (751 kB)[0m
[34mCollecting huggin

In [7]:
import json
from sagemaker.s3 import S3Downloader
from ast import literal_eval
# creating s3 uri for result file -> input file + .out
output_file = f"{dataset_jsonl_file}.out"
output_path = s3_path_join("s3://sagemaker-us-east-1-726335585155/batch_transform/output",output_file)

local_path = "output"  # Where to save the output locally

S3Downloader.download(output_path,local_path)

In [13]:
!pip install jsonlines

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting jsonlines
  Downloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.1.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [23]:
# Inspect the output

import os
import jsonlines
import json
from ast import literal_eval

output_file = f"{dataset_jsonl_file}.out"

batch_transform_result = []

path = os.path.join(local_path, output_file)
with open(path, "r") as f:
    for line in f:
        print (line)

{"result": "(extra large, size); (very large, size); (very large, size); (very large, size); (very large, size); (very large, size); (very large, size); (very large, size)"}{"result": "(liked the look of this dress, feelings); (wasn\u2019t cute on me, feelings); (liked the look of this dress, feelings); (liked the look of this dress, feelings); (liked the look of this dress, feelings); (liked the look of this dress, feelings); (liked the look of this dress, feelings); (liked the look of this dress, feelings); (liked the look of this dress, feelings); (liked the look of this dress, feelings); (liked the look of this dress, feelings)"}{"result": "(Beautifully made, quality); (Can't believe it was made in China, China, China)"}{"result": "(a vow renewal in Vegas, scene); (great quality, quality); (lined, fabric); (light weight, fabric); (very happy with this purchase, feelings); (highly recommend, purchase_behavior)"}{"result": "(Not true to size, size); (Not true to size, size)"}
