## data-prepare - shulex

In [1]:
import pandas as pd   
import numpy as np
import os
import shutil
# !pip install imblearn

In [2]:
#preprocess data
def convert_label(x,only_tag=False):
    """Flatten a list of label dicts into plain Python values.

    Args:
        x: iterable of dicts; every dict must provide a 'tag' key and,
            unless ``only_tag`` is set, a 'span' key as well.
        only_tag: when True return only the tags, otherwise return
            ``(span, tag)`` tuples.

    Returns:
        A list with one entry per element of ``x``, in the same order.
    """
    if only_tag:
        return [entry['tag'] for entry in x]
    return [(entry['span'], entry['tag']) for entry in x]

def write_txt(df,path):
    """Append one text line per row of ``df`` to the file at ``path``.

    Every line has the form ``"<title> <content> ####<label_conv>"`` where the
    three values come from the columns 'title', 'content' and 'label_conv'
    and are rendered with ``str.format``.

    Args:
        df: DataFrame providing the columns 'title', 'content' and
            'label_conv'. Rows are written in their current order; the index
            is ignored.
        path: destination file. It is opened in append mode, so an existing
            file is extended rather than overwritten.
    """
    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding, which may not be able to encode non-ASCII text.
    with open(path, 'a', encoding='utf-8') as f:
        # Walk the three columns in lockstep instead of doing a label lookup
        # (df.loc[i, col]) three times per row.
        for title, content, label_conv in zip(df['title'], df['content'], df['label_conv']):
            f.write("{} {} ####{}".format(title, content, label_conv))
            f.write('\n')
def mkdir_rm(folder):
    """Recreate ``folder`` as an empty directory.

    An existing directory at ``folder`` is deleted together with everything
    inside it, then the directory is created again. Missing parent
    directories are created as well, so a nested path works on a fresh
    checkout.

    Args:
        folder: path of the directory to (re)create.
    """
    if os.path.exists(folder):
        shutil.rmtree(folder)
    # makedirs (not mkdir) so that absent parents do not raise FileNotFoundError.
    os.makedirs(folder)
    print ("<< path valid!")
    
def flag_oversample(x,over_sample_list):
    """Tell whether any label in ``x`` carries a tag from ``over_sample_list``.

    Args:
        x: iterable of dicts, each with a 'tag' key.
        over_sample_list: container of tags, tested with ``in``.

    Returns:
        1 if at least one element's tag is in ``over_sample_list``, else 0.
    """
    # Build the full list first so every element is inspected.
    matches = [item['tag'] in over_sample_list for item in x]
    return 1 if any(matches) else 0
        
def _explode_tags(frame):
    """Return ``frame`` with one row per tag of its comma-joined 'label_tag' column."""
    tags = (frame['label_tag'].str.split(',', expand=True)
            .stack()
            .reset_index(level=1, drop=True)
            .rename('tag'))
    return frame.drop('label_tag', axis=1).join(tags)


def _tag_frequency(exploded, count_name):
    """Count non-null 'title' values per tag; result columns are 'tag' and ``count_name``."""
    counts = exploded.groupby('tag').count().reset_index()[['tag', 'title']]
    counts.columns = ['tag', count_name]
    return counts


def preprocess_data(input_file,output_path,over_sample=True,random_state=None,
                    min_tag_frequency=50,over_sample_copies=50):
    """Turn a JSON-lines label export into train/test/dev text files.

    Steps, in order:
      1. Read ``input_file`` (JSON lines) and derive 'label_conv'
         (list of ``(span, tag)``), a newline-free 'content' and
         'label_tag' (comma-joined tags).
      2. Write every distinct tag to ``tag.txt`` in the current working
         directory.
      3. Delete and recreate ``output_path``.
      4. Shuffle and split the rows 80% / 10% / 10% into train / validate /
         test, and add a 0/1 'flag' column to train marking rows that
         contain a rare tag.
      5. Write the per-tag counts of the train split to
         ``train_tag_distribution.csv`` inside ``output_path``.
      6. Optionally append copies of the flagged train rows.
      7. Write ``train.txt``, ``test.txt`` and ``dev.txt`` into
         ``output_path``.

    Args:
        input_file: path of the JSON-lines file; rows need the fields
            'title', 'content' and 'label' (a list of dicts with 'span'
            and 'tag').
        output_path: directory that receives the generated files. Any
            existing content is removed.
        over_sample: when truthy, flagged train rows are duplicated.
        random_state: forwarded to ``DataFrame.sample`` for the shuffle;
            ``None`` (default) keeps the split random on every call.
        min_tag_frequency: tags counted fewer than this many times in the
            whole dataset are treated as rare.
        over_sample_copies: number of extra copies of the flagged rows that
            are appended to train when ``over_sample`` is set.
    """
    #load
    jsonObj = pd.read_json(path_or_buf=input_file, lines=True)
    #convert label to (sentence, tag) list
    jsonObj['label_conv'] = jsonObj['label'].map(convert_label)
    jsonObj['content'] = jsonObj['content'].map(lambda text: text.replace('\n',''))
    jsonObj['label_tag'] = jsonObj['label'].map(lambda x: ','.join(convert_label(x,only_tag=True)))

    #one row per (record, tag) for the whole dataset
    all_tags = _explode_tags(jsonObj)

    #rare tags are determined on the whole dataset
    tag_counts = _tag_frequency(all_tags, 'frequency')
    rare_tags = set(tag_counts.loc[tag_counts['frequency'] < min_tag_frequency, 'tag'])

    #write the tag list
    with open('tag.txt', 'w', encoding='utf-8') as filehandle:
        filehandle.writelines("%s\n" % tag for tag in all_tags['tag'].unique())

    #remove & remake the output folder
    mkdir_rm(output_path)

    #train/test/val split: plain positional slices of the shuffled frame
    #(np.split on a DataFrame relies on the deprecated DataFrame.swapaxes)
    shuffled = jsonObj.sample(frac=1, random_state=random_state)
    n_rows = len(shuffled)
    train_end, validate_end = int(.8*n_rows), int(.9*n_rows)
    train = shuffled.iloc[:train_end].copy()  # copy: a column is added below
    validate = shuffled.iloc[train_end:validate_end]
    test = shuffled.iloc[validate_end:]

    #flag over sample
    train['flag'] = train['label'].map(lambda x: flag_oversample(x, rare_tags))

    #write the tag distribution of the train split (before over sampling)
    df_res = _tag_frequency(_explode_tags(train), 'train_frequency')
    df_res.to_csv(os.path.join(output_path,'train_tag_distribution.csv'))

    #over sample: append all copies with a single concat
    if over_sample:
        train_sample = train[train['flag']==1]
        train = pd.concat([train] + [train_sample] * over_sample_copies)

    print ("training size: ",train.shape)
    print ("test size: ",test.shape)
    print ("validate size: ",validate.shape)

    # write train/test/dev
    write_txt(train,os.path.join(output_path,'train.txt'))
    write_txt(test,os.path.join(output_path,'test.txt'))
    write_txt(validate,os.path.join(output_path,'dev.txt'))
    print ("<<<finish data preparing!")
    
# Source JSON-lines file and the dataset folder that preprocess_data recreates.
input_file = './shulex/shulex_data.jsonl'
output_path = './data/tasd/shulex'
preprocess_data(
    input_file=input_file,
    output_path=output_path,
    over_sample=False,
)

<< path valid!
training size:  (4903, 6)
test size:  (613, 5)
validate size:  (613, 5)
<<<finish data preparing!


# model training 

In [3]:
!pip install -r requirement.txt



In [4]:
!pip install editdistance



## use pretrained model from amazon-review


In [8]:
#use pretrain from amazon-review
#!aws s3 cp s3://sagemaker-us-west-2-847380964353/train-amazon-review-t5-base-10epoch-stepeval/output/model.tar.gz ./
#!ls ./pretrain/checkpoint-44500/
#unzip model file
#!tar -zxvf model.tar.gz -C ./pretrain/

config.json	   scheduler.pt		    tokenizer.json
optimizer.pt	   special_tokens_map.json  trainer_state.json
pytorch_model.bin  spiece.model		    training_args.bin
rng_state.pth	   tokenizer_config.json


In [10]:
import os
# Workaround for the following runtime error:
#   mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1 library.
#   Try to import numpy first or set the threading layer accordingly. Set MKL_SERVICE_FORCE_INTEL to force it.
# The threading layer is set to GNU through the process environment.
os.environ.update({'MKL_THREADING_LAYER': 'GNU'})

In [14]:
%%time
!python main.py --task tasd \
            --dataset shulex \
            --paradigm extraction \
            --n_gpu '0' \
            --model_name_or_path t5-base \
            --do_train \
            --train_batch_size 2 \
            --gradient_accumulation_steps 2 \
            --eval_batch_size 2 \
            --learning_rate 3e-4 \
            --num_train_epochs 1  



Here is an example (from dev set) under `extraction` paradigm:
Total examples = 613 for data/tasd/shulex/dev.txt
Input : Alabaster glass Nice piece but was to small for me to use.
Output: (was to small for me to use, short)

****** Conduct Training ******
Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores

In [22]:
!python main.py --task tasd \
            --dataset shulex \
            --ckpoint_path outputs/tasd/shulex/extraction/cktepoch=1.ckpt \
            --paradigm extraction \
            --n_gpu '0' \
            --do_direct_eval \
            --eval_batch_size 128 \
            --customer_jj False



Here is an example (from dev set) under `extraction` paradigm:
Total examples = 613 for data/tasd/shulex/dev.txt
Input : Alabaster glass Nice piece but was to small for me to use.
Output: (was to small for me to use, short)

****** Conduct Evaluating with the last state ******

Load the trained model from outputs/tasd/shulex/extraction/cktepoch=1.ckpt...
Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSe

In [18]:
!python main.py --task tasd \
            --dataset shulex \
            --ckpoint_path outputs/tasd/shulex/extraction/cktepoch=1.ckpt \
            --text "very high quality and sturdy Very high quality. We love this and intend to buy more. My 15 year old took mine!" \
            --paradigm extraction \
            --n_gpu 0 \
            --do_direct_predict \



Here is an example (from dev set) under `extraction` paradigm:
Total examples = 613 for data/tasd/shulex/dev.txt
Input : Alabaster glass Nice piece but was to small for me to use.
Output: (was to small for me to use, short)

****** Conduct predicting with the last state ******

Load the trained model from outputs/tasd/shulex/extraction/cktepoch=1.ckpt...
Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSe

In [19]:
%%time
!python main.py --task tasd \
            --dataset shulex \
            --ckpoint_path outputs/tasd/shulex/extraction/cktepoch=1.ckpt \
            --text "THE FLOOR LAMP THE LAMP IS THE BEST EVER!!!" \
            --paradigm extraction \
            --n_gpu 0 \
            --do_direct_predict \



Here is an example (from dev set) under `extraction` paradigm:
Total examples = 613 for data/tasd/shulex/dev.txt
Input : Alabaster glass Nice piece but was to small for me to use.
Output: (was to small for me to use, short)

****** Conduct predicting with the last state ******

Load the trained model from outputs/tasd/shulex/extraction/cktepoch=1.ckpt...
Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSe