In [1]:
import sagemaker as sage
import pandas as pd
from time import gmtime, strftime
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch
import os
import numpy as np
import shutil

# Data preparation

In [2]:
%%time
#preprocess data
def write_txt(df,path):
    """Append one annotated example per row of ``df`` to the text file at ``path``.

    Each row is written as a single line of the form
    ``<退货原因翻译>,<客户评价翻译> ####[("<品质不良归类>","<汇总原因>")]``.

    Args:
        df: DataFrame containing the columns '退货原因翻译', '客户评价翻译',
            '品质不良归类' and '汇总原因'. Rows are written in their current order.
        path: Destination file. It is opened in append mode, so existing
            content is kept and the new lines are added after it.
    """
    columns = ['退货原因翻译', '客户评价翻译', '品质不良归类', '汇总原因']
    # The fields hold Chinese text, so pin the encoding instead of relying on
    # the platform default, which may not be able to encode it.
    with open(path, 'a', encoding='utf-8') as f:
        # Walk the four columns positionally: no per-cell label lookup and no
        # dependency on the frame's index, so no reset_index() is needed.
        for reason_text, review_text, category, summary in zip(*(df[col] for col in columns)):
            f.write('{},{} ####[("{}","{}")]\n'.format(reason_text, review_text, category, summary))
            
def mkdir_rm(folder):
    """Recreate ``folder`` as an empty directory.

    If ``folder`` already exists it is deleted together with everything in
    it, then a fresh empty directory is created at the same path.

    Args:
        folder: Path of the directory to (re)create. May be nested.
    """
    if os.path.exists(folder):
        shutil.rmtree(folder)
    # makedirs (rather than mkdir) so that a nested path such as
    # './data/tasd/haofang' works even when its parent directories are missing.
    os.makedirs(folder)
    print ("<< path valid!")
    
def flag_oversample(x,over_sample_list):
    """Return 1 when any entry of ``x`` carries a 'tag' listed in ``over_sample_list``, else 0.

    Args:
        x: Iterable of mappings, each with a 'tag' key.
        over_sample_list: Container of tags that should be flagged.
    """
    # Materialise the full list so every entry's 'tag' is read, even after a hit.
    matches = [entry['tag'] in over_sample_list for entry in x]
    return int(any(matches))
        
def preprocess_data(input_file,output_path,over_sample=True,max_rows=1000,random_state=None):
    """Turn the Excel workbook into shuffled train/test/dev text files.

    Steps: read the workbook, keep the first ``max_rows`` rows, drop rows that
    lack either label column, write the distinct '汇总原因' values to
    'tag.txt' in the current working directory, recreate ``output_path`` and
    write an 80/10/10 train/dev/test split into it via ``write_txt``.

    Args:
        input_file: Path of the Excel workbook (read with the openpyxl engine).
        output_path: Folder that receives train.txt, test.txt and dev.txt.
            It is deleted and recreated on every call.
        over_sample: Accepted for backward compatibility; not used yet.
        max_rows: Number of leading rows to keep for the experiment.
            ``None`` keeps every row.
        random_state: Optional seed for the shuffle so the split can be
            reproduced. ``None`` gives a different split on each call.
    """
    df = pd.read_excel(input_file,engine='openpyxl')

    # only use the first `max_rows` rows for the experiment
    if max_rows is not None:
        df = df.head(max_rows)

    # keep only the rows where both label columns are present
    df_sub = df.dropna(subset=['品质不良归类', '汇总原因'])

    # write the tag list (UTF-8, since the tags are Chinese text)
    a_list = df_sub['汇总原因'].unique()
    with open('tag.txt', 'w', encoding='utf-8') as filehandle:
        filehandle.writelines("%s\n" % tag for tag in a_list)

    # remove & remake the output folder
    mkdir_rm(output_path)

    # train/validate/test split: shuffle once, then cut at 80% and 90%.
    # Plain iloc slices replace np.split, which is deprecated for DataFrames.
    shuffled = df_sub.sample(frac=1, random_state=random_state)
    train_end = int(.8*len(shuffled))
    validate_end = int(.9*len(shuffled))
    train = shuffled.iloc[:train_end]
    validate = shuffled.iloc[train_end:validate_end]
    test = shuffled.iloc[validate_end:]

    print ("training size: ",train.shape)
    print ("test size: ",test.shape)
    print ("validate size: ",validate.shape)

    # write train/test/dev
    write_txt(train,os.path.join(output_path,'train.txt'))
    write_txt(test,os.path.join(output_path,'test.txt'))
    write_txt(validate,os.path.join(output_path,'dev.txt'))
    print ("<<<finish data preparing!")
    
# Source workbook and the folder that will receive train.txt / test.txt / dev.txt.
input_file = './数据提供2：退货数据_20220523.xlsx'
output_path = './data/tasd/haofang'
# over_sample is passed explicitly, but preprocess_data does not currently use it.
preprocess_data(input_file,output_path,over_sample=False)

<< path valid!
training size:  (554, 9)
test size:  (70, 9)
validate size:  (69, 9)
<<<finish data preparing!
CPU times: user 37.5 s, sys: 151 ms, total: 37.6 s
Wall time: 37.6 s


# Training

In [3]:
# SageMaker session used to upload the local data.
sess = sage.Session()

# Local folder whose contents are uploaded.
WORK_DIRECTORY = "./data"

# S3 prefix
prefix = "haofang"

# Execution role of the current environment.
role = get_execution_role()

# Upload WORK_DIRECTORY under `prefix`; the returned location is reused
# later to build the training inputs.
data_location = sess.upload_data(WORK_DIRECTORY, key_prefix=prefix)

In [4]:
# Hyperparameters passed to the PyTorch estimator; every value is kept as a string.
hyperparameters = dict(
    task="tasd",
    dataset="haofang",
    model_name_or_path="lemon234071/t5-base-Chinese",
    paradigm="extraction",
    eval_batch_size="16",
    train_batch_size="2",
    learning_rate="3e-4",
    num_train_epochs="1",
    n_gpu="1",
)

In [9]:
# Settings consumed by the PyTorch estimator built in the next cell.
entry_point = 'finetune.py'
source_dir = './'
git_config = None
role = get_execution_role()
framework_version = '1.7.1'
py_version='py36'
instance_type='ml.p3.2xlarge'
# Alternative instance type, currently disabled.
#instance_type='local_gpu'
instance_count=1

In [10]:
# Build the PyTorch estimator from the settings and hyperparameters defined
# in the previous cells; the debugger hook is explicitly disabled.
estimator = PyTorch(
    entry_point = entry_point,
    source_dir = source_dir,
    git_config = git_config,
    role = role,
    debugger_hook_config=False,
    hyperparameters = hyperparameters,
    framework_version = framework_version, 
    py_version = py_version,
    instance_type = instance_type,
    instance_count = instance_count
)

In [11]:
# Training input handed to estimator.fit: the 'tasd' key points at the
# tasd/ folder under the uploaded data location.
inputs = {'tasd': data_location+'/tasd/'}

In [None]:
# Run training on the inputs defined above.
response = estimator.fit(inputs)

# Deployment

In [None]:
# Deployment settings.
# NOTE: this cell rebinds `instance_type` (set to the training instance type
# earlier) and `role`; re-running the estimator cell after this one would
# pick up the deployment instance type instead.
# `sagemaker` is also available as `sage` from the first cell.
import sagemaker

instance_type = 'ml.m5.4xlarge'
role = sagemaker.get_execution_role()

In [21]:
# Display where the estimator's model artifact is stored.
estimator.model_data

's3://sagemaker-us-east-1-726335585155/pytorch-training-2022-06-16-05-59-18-675/output/model.tar.gz'

In [20]:
from sagemaker.pytorch.model import PyTorchModel

# Wrap the trained artifact in a PyTorchModel that serves it with inference.py
# from the current directory, using the same framework and Python versions
# as training.
pytorch_model = PyTorchModel(model_data=estimator.model_data, 
                             role=role,
                             entry_point='inference.py', 
                             source_dir='./', 
                             framework_version='1.7.1', 
                             py_version='py36'
                ) # TODO set model_server_workers=1 to avoid torchhub bug

# Deploy the model to a single instance of the type chosen in the deployment settings cell.
predictor = pytorch_model.deploy(instance_type=instance_type, initial_instance_count=1)

ClientError: An error occurred (404) when calling the HeadObject operation: Not Found

In [None]:
# Send one sample review to the deployed endpoint.
# NOTE(review): initial_args only sets the ContentType header here; confirm that
# the predictor's serializer actually produces a JSON body for this dict.
predictor.predict({"inputs": "商品存在瑕疵,不会像宣传的那样上传到wifi,致电客户服务进行验证"}, initial_args={'ContentType': 'application/json'})