In [1]:
#preprocess
import json
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# data process

In [2]:
def load_json(json_file):
    # Opening JSON file 
    f = open(json_file) 

    # returns JSON object as  
    # a dictionary 
    data = json.load(f) 
    content_ls = [' '.join(data['content'][str(i)]['word_list']) for i in range(len(data['content']))]
    #label_ls = [data['content'][str(i)]['dialogue_label'] for i in range(len(data['content']))]
    label_ls = [int(data['content'][str(i)]['ner_label'][0]) for i in range(len(data['content']))]
    role_dict = data['role_id']
    return content_ls,label_ls,role_dict

def edit_b(x,role):
    res = []
    for i in range(len(x)):
        if i<3:
            res_str = "prefix: "+','.join(x[:i])+" center: "+x[i] + " after: "+ ','.join(x[i:i+2]) + " roles: "+str(list(role.values()))
            res.append(res_str) 
        elif len(x)-i<3:
            res_str = "prefix: "+','.join(x[i-2:i])+" center: "+x[i] + " after: "+ ','.join(x[i:])+" roles: "+str(list(role.values()))
            res.append(res_str)
        else:
            res_str = "prefix: "+','.join(x[i-2:i])+" center: "+x[i] + " after: "+ ','.join(x[i:i+2])+" roles: "+str(list(role.values()))
            res.append(res_str)
    return res
            

In [3]:
#load one book
def load_book(book_path,tag):
    chapter_ls = os.listdir(book_path)
    cut=int(len(chapter_ls)*0.9)
    if tag is True:
        chapter_ls = chapter_ls[:cut]
    else:
        chapter_ls = chapter_ls[cut:]
        
    print ("<<< books: ", chapter_ls)
    res = []
    for i in chapter_ls:
        if i[-4:]=="json":
            json_file = os.path.join(book_path,i)
            content_ls, label_ls, role_dict = load_json(json_file)
            content_ls = edit_b(content_ls,role_dict)
            
            df_res = pd.DataFrame({'sentence1_key':content_ls,'label':label_ls})
            df_res = df_res[df_res['label']!=0]
            df_res["label"] = df_res["label"].map(lambda x: role_dict[str(x)])
            df_res["label"] = df_res["label"].map(lambda x: x+" said the sentence")

            df_res['sentence1_key'] = df_res['sentence1_key'].map(lambda x: x.replace('“','"'))
            df_res['sentence1_key'] = df_res['sentence1_key'].map(lambda x: x.replace('”','"'))
            res.append(df_res)
    res_table = pd.concat(res)

    return res_table

In [6]:
book_path = '../modelA/new_example_ten_json'
train = load_book(book_path,True)
test = load_book(book_path,False)

<<< books:  ['60%_(2040244Lunar wolvesHis to own Book 1Complete).json', "20%(2271377The Mafia's Good Wife)(1).json", '80%(2059119Heart of Freeman).json', '85%_(2061307His Ruthless Assistant (completed )).json', '60%_(2164082New Husband For My Wife) .json', '30%_(2165912The Curse Of Violet Wraith).json', '80%(2192588love&mate) (1).json', '60%_(2144894A Moonlit Encounter).json', '20%_(1993322ASHER RICK).json']
<<< books:  ['20%_(2070697Revenge on my Ex-Husband).json']


In [7]:
test1, test2 = train_test_split(test,test_size=0.25,random_state=0)
train = pd.concat([train,test1])
print ("train size {}, test size{}".format(train.shape,test2.shape))

train size (2652, 2), test size(28, 2)


In [8]:
test1.head()

Unnamed: 0,sentence1_key,label
182,"prefix: realized, that I had to pay attention ...",Aubrey said the sentence
23,prefix: She is supposed to be here for me on t...,Aubrey said the sentence
283,prefix: She was a woman in her mid-forties. Sh...,Aubrey said the sentence
479,prefix: My dad offered to drive me to the airp...,Aubrey said the sentence
639,"prefix: ""Really?"",""Yes,"" center: ""Victoria, st...",Eli said the sentence


In [9]:
import os

path = '../model_b_data'

# Check whether the specified path exists or not
isExist = os.path.exists(path)

if not isExist:
    os.makedirs(path)
    print("The new directory is created!")
    
train[["label","sentence1_key"]].to_csv('../model_b_data/train.csv',index=False,encoding='utf-8')
test[["label","sentence1_key"]].to_csv('../model_b_data/test.csv',index=False,encoding='utf-8')

The new directory is created!


# train - model b

In [10]:
import boto3
import sagemaker
import os
from sagemaker import get_execution_role

sess = sagemaker.Session()
role = get_execution_role()

prefix='stary-datalab-modelb'

bucket = sess.default_bucket() 
boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "train/train.csv")
).upload_file("../model_b_data/train.csv")
boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "test/test.csv")
).upload_file("../model_b_data/test.csv")

training_input_path = f's3://{sess.default_bucket()}/{prefix}/train/train.csv'
test_input_path = f's3://{sess.default_bucket()}/{prefix}/test/test.csv'

In [None]:
from sagemaker.huggingface import HuggingFace
from sagemaker.huggingface import TrainingCompilerConfig

#speed up use sagemaker compiler https://towardsdatascience.com/speed-up-hugging-face-training-jobs-on-aws-by-up-to-50-with-sagemaker-training-compiler-9ad2ac5b0eb

# hyperparameters which are passed to the training job
hyperparameters={'reference_column':'sentence1_key',
                 'hypothesis_column':'label',
                 'train_file':'/opt/ml/input/data/train/train.csv',
                 'validation_file':'/opt/ml/input/data/validation/test.csv',
                 'test_file':'/opt/ml/input/data/test/test.csv',
                 'output_dir':'/opt/ml/model',
                 'do_train':True,
                 'do_eval':True,
                 'max_source_length': 128,
                 'max_target_length': 128,
                 'model_name_or_path': 't5-base',
                 'learning_rate': 3e-4,
                 'num_train_epochs': 1,
                 'per_device_train_batch_size': 2,#16
                 'gradient_accumulation_steps':2, 
                 'save_strategy':'steps',
                 'evaluation_strategy':'epoch',
                 'save_total_limit':1,
                 'eval_steps':5000,
                 'predict_with_generate':True # customerized accuracy
                 }

# create the Estimator
huggingface_estimator = HuggingFace(
        entry_point='run_train.py',
        source_dir='./scripts',
        instance_type='ml.p3.2xlarge',#'ml.p3dn.24xlarge'
        instance_count=1,
        role=role,
        max_run=24*60*60,
        transformers_version='4.6',
        pytorch_version='1.7',
        py_version='py36',
        volume_size=128,
        #compiler_config=TrainingCompilerConfig(),
        base_job_name='train-modelb-stary-1epoch',
        hyperparameters = hyperparameters,
#         distribution=distribution
)

huggingface_estimator.fit({'train':training_input_path,'test':test_input_path,'validation': test_input_path})

2022-07-26 02:56:26 Starting - Starting the training job...
2022-07-26 02:56:54 Starting - Preparing the instances for trainingProfilerReport-1658804186: InProgress
.........
2022-07-26 02:58:23 Downloading - Downloading input data
2022-07-26 02:58:23 Training - Downloading the training image............

# deploy

In [1]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

role = sagemaker.get_execution_role()

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
   model_data="s3://sagemaker-us-east-1-726335585155/train-modelb-stary-0713-crossbook-2022-07-14-06-27-10-862/output/model.tar.gz",  # path to your trained sagemaker model
   role=role, # iam role with permissions to create an Endpoint
   transformers_version="4.6", # transformers version used
   pytorch_version="1.7", # pytorch version used
   py_version="py36", # python version of the DLC
)


In [2]:
predictor = huggingface_model.deploy(
   initial_instance_count=1,
   instance_type="ml.g4dn.xlarge"
)

-------!

In [3]:
%%time
# example request, you always need to define "inputs"
import time


data = {
   "inputs": 'prefix: "Whatever. Chris has a heir to the pack now." She says.,"Um last time I remembered in the laws it was clearly written that the offspring of the  Alpha can only take over if he is the son of the Alpha and the Luna and clearly after today no one will take this mistake," I point to her stomach , center: "seriously again as you are just a mistress not the mate." I say putting a lot of emphasis on the word \'mistress\' as if it is the world\'s most disgusting word. after: "seriously again as you are just a mistress not the mate." I say putting a lot of emphasis on the word \'mistress\' as if it is the world\'s most disgusting word.,"Well at least he loves me." She says desperately. roles: [\'Skylar\', \'Logan\']'
}

# request
predictor.predict(data)


CPU times: user 13.3 ms, sys: 0 ns, total: 13.3 ms
Wall time: 864 ms


[{'generated_text': 'Annie said the sentence'}]

In [4]:
import pandas as pd
df = pd.read_csv('../model_b_data/test.csv')

In [5]:
res = []
for i in df['sentence1_key']:
    data = {"inputs": i}

    # request
    res.append(predictor.predict(data))

In [6]:
#p f r
from sklearn.metrics import classification_report

y_pred = [i[0]['generated_text'] for i in res]
y_true = df['label']
 
print(classification_report(y_true, y_pred))

                         precision    recall  f1-score   support

Annie said the sentence       0.96      0.99      0.97       211
  Ava said the sentence       0.96      0.99      0.97       210
 Kade said the sentence       0.97      0.93      0.95       120
 Zach said the sentence       0.98      0.95      0.96       146

               accuracy                           0.97       687
              macro avg       0.97      0.96      0.96       687
           weighted avg       0.97      0.97      0.97       687



In [60]:
df['sentence1_key'][0]

'prefix: "Whatever. Chris has a heir to the pack now." She says.,"Um last time I remembered in the laws it was clearly written that the offspring of the  Alpha can only take over if he is the son of the Alpha and the Luna and clearly after today no one will take this mistake," I point to her stomach , center: "seriously again as you are just a mistress not the mate." I say putting a lot of emphasis on the word \'mistress\' as if it is the world\'s most disgusting word. after: "seriously again as you are just a mistress not the mate." I say putting a lot of emphasis on the word \'mistress\' as if it is the world\'s most disgusting word.,"Well at least he loves me." She says desperately. roles: [\'Skylar\', \'Logan\']'

In [61]:
df['label'][0]

'Logan said the sentence'

In [57]:
x = 0
for i in range(len(y_true)):
    if y_true[i]==y_pred[i]:
        x = x+1

In [59]:
x/len(y_true)

0.838

In [23]:
df['sentence1_key'][362]

'"Who do you think I am? Of course I want it." She whispers and look at her stomach. '

In [24]:
df['sentence1_key'][370]

'"What do you mean by that?" She asks me surprised. '