In [22]:
#preprocess
import json
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [23]:
def load_json(json_file):
    # Opening JSON file 
    f = open(json_file) 

    # returns JSON object as  
    # a dictionary 
    data = json.load(f) 
    content_ls = [' '.join(data['content'][str(i)]['word_list']) for i in range(len(data['content']))]
    label_ls = [data['content'][str(i)]['dialogue_label'] for i in range(len(data['content']))]
    return content_ls,label_ls

In [24]:
#load one book
def load_books(book_path):
    chapter_ls = os.listdir(book_path)
    content_res = []
    label_res = []
    for i in chapter_ls:
        try:
            json_file = os.path.join(book_path,i)
            content_ls, label_ls = load_json(json_file)
            content_res = content_res + content_ls
            label_res = label_res + label_ls
        except:
            continue
            
    return content_res,label_res

In [25]:
content_res,label_res = load_books('../new_example_ten_json')
df_res = pd.DataFrame({'sentence1_key':content_res,'label':label_res})

In [26]:
# 半角全角转换
df_res['sentence1_key'] = df_res['sentence1_key'].map(lambda x: x.replace('“','"'))
df_res['sentence1_key'] = df_res['sentence1_key'].map(lambda x: x.replace('”','"'))

In [27]:
train, test = train_test_split(df_res,test_size=0.25,random_state=0)
print ("train size {}, test size{}".format(train.shape,test.shape))

train size (7335, 2), test size(2446, 2)


In [28]:
import os

path = '../model_a_data'

# Check whether the specified path exists or not
isExist = os.path.exists(path)

if not isExist:
    os.makedirs(path)
    print("The new directory is created!")
    
train[["label","sentence1_key"]].to_csv('../model_a_data/train.csv',index=False,encoding='utf-8')
test[["label","sentence1_key"]].to_csv('../model_a_data/test.csv',index=False,encoding='utf-8')

In [29]:
res = []
for i in test['sentence1_key']:
    if '"' in i:
        res.append(1)
    else:
        res.append(0)

In [30]:
#p f r
from sklearn.metrics import classification_report

y_pred = [int(i) for i in res]
y_true = test['label']
 
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.76      0.86      1769
           1       0.61      0.97      0.75       677

    accuracy                           0.82      2446
   macro avg       0.80      0.87      0.81      2446
weighted avg       0.88      0.82      0.83      2446



In [40]:
sum(y_pred==y_true)/len(y_true)

0.821749795584628

In [71]:
train.groupby('label')['label'].count()

label
0    5332
1    2003
Name: label, dtype: int64

In [63]:
test.groupby('label')['label'].count()

label
0    1769
1     677
Name: label, dtype: int64

# train - model a

In [72]:
import boto3
import sagemaker
import os
from sagemaker import get_execution_role

sess = sagemaker.Session()
role = get_execution_role()

prefix='stary-datalab-modela'

bucket = sess.default_bucket() 
boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "train/train.csv")
).upload_file("../model_a_data/train.csv")
boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "test/test.csv")
).upload_file("../model_a_data/test.csv")

training_input_path = f's3://{sess.default_bucket()}/{prefix}/train/train.csv'
test_input_path = f's3://{sess.default_bucket()}/{prefix}/test/test.csv'

In [None]:
import sagemaker
from sagemaker.huggingface import HuggingFace

hyperparameters={'per_device_train_batch_size':4,
                 'per_device_eval_batch_size': 4,
                 'model_name_or_path': 'bert-base-uncased',
                 'train_file':'/opt/ml/input/data/train/train.csv',
                 'validation_file':'/opt/ml/input/data/test/test.csv',
                 'test_file':'/opt/ml/input/data/test/test.csv',
                 'do_train': True,
                 'do_predict': True,
                 'do_eval': True,
                 'save_total_limit':3,
                 'output_dir': '/opt/ml/model',
                 'num_train_epochs': 5,
                 'learning_rate': 5e-5,
                 'seed': 7,
                 'fp16': False,
                 'eval_steps': 1000,
                 }


# create the Estimator
huggingface_estimator = HuggingFace(
      entry_point='run_glue.py', # script
      source_dir='./', # relative path to example
      instance_type='ml.p3.2xlarge',
      instance_count=1,
      volume_size=500,
      transformers_version='4.6',
      pytorch_version='1.7',
      py_version='py36',
      role=role,
      base_job_name='stary-modela-bert-base-epoch5',
      hyperparameters = hyperparameters
)

huggingface_estimator.fit({'train':training_input_path,'test':test_input_path})

2022-07-13 10:29:12 Starting - Starting the training job...
2022-07-13 10:29:36 Starting - Insufficient capacity error from EC2 while launching instances, retrying!ProfilerReport-1657708151: InProgress
...
2022-07-13 10:29:56 Starting - Preparing the instances for training......
2022-07-13 10:31:09 Downloading - Downloading input data...
2022-07-13 10:31:36 Training - Downloading the training image.....................
2022-07-13 10:35:00 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-07-13 10:35:04,081 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-07-13 10:35:04,105 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-07-13 10:35:04,112 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34

# deploy

In [14]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

role = sagemaker.get_execution_role()

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
   model_data="s3://sagemaker-us-east-1-726335585155/stary-modela-bert-base-epoch5-2022-07-13-10-29-11-407/output/model.tar.gz",  # path to your trained sagemaker model
   role=role, # iam role with permissions to create an Endpoint
   transformers_version="4.6", # transformers version used
   pytorch_version="1.7", # pytorch version used
   py_version="py36", # python version of the DLC
)

In [15]:
predictor = huggingface_model.deploy(
   initial_instance_count=1,
   instance_type="ml.g4dn.xlarge"
)

--------!

In [4]:
from sagemaker.predictor import Predictor
import sagemaker

predictor = Predictor(endpoint_name="huggingface-pytorch-inference-2022-07-05-05-41-57-137")
predictor.serializer = sagemaker.serializers.JSONSerializer()
predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

In [16]:
%%time
# example request, you always need to define "inputs"
import time


data = {
   "inputs": "Desktop Photo Printers: Liene Photo Printer Paper amp Cartridge Cartridge Refill amp Change\xa0Modify Photo Paper 40 Pack"
}

# request
predictor.predict(data)


CPU times: user 12.5 ms, sys: 798 µs, total: 13.3 ms
Wall time: 744 ms


[{'label': 0, 'score': 0.9999834895133972}]

In [17]:
import pandas as pd
df = pd.read_csv('../model_a_data/test.csv')

In [18]:
res = []
for i in df['sentence1_key']:
    data = {"inputs": i}

    # request
    res.append(predictor.predict(data))

In [19]:
#p f r
from sklearn.metrics import classification_report

y_pred = [int(i[0]['label']) for i in res]
y_true = df['label']
 
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1769
           1       0.91      0.90      0.91       677

    accuracy                           0.95      2446
   macro avg       0.94      0.93      0.94      2446
weighted avg       0.95      0.95      0.95      2446



In [22]:
df.tail(10)

Unnamed: 0,label,sentence1_key
362,0,"""Who do you think I am? Of course I want it."" ..."
363,1,"""Then fine. Give me time to think about what w..."
364,0,nods and makes her way to the kitchen.
365,0,I quickly sit in a room and start to think for...
366,0,I shake my head and try to imagine what will h...
367,0,After I wake up I go downstairs to tell Emma t...
368,0,"""Hey did you have a good sleep?"" She asks me."
369,1,"""Yeah, by the way pack your things we are goin..."
370,0,"""What do you mean by that?"" She asks me surpri..."
371,1,"""You are coming to Lighthall with me."


In [19]:
y_pred[:10]

[1, 0, 0, 0, 0, 1, 0, 0, 1, 1]

In [23]:
df['sentence1_key'][362]

'"Who do you think I am? Of course I want it." She whispers and look at her stomach. '

In [24]:
df['sentence1_key'][370]

'"What do you mean by that?" She asks me surprised. '