In [1]:
!pip install -r requirements.txt 

Collecting transformers==4.6.0
  Using cached transformers-4.6.0-py3-none-any.whl (2.3 MB)
Collecting datasets==1.11.0
  Using cached datasets-1.11.0-py3-none-any.whl (264 kB)
Collecting sentencepiece==0.1.91
  Using cached sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1 MB)
Collecting pytorch_lightning==0.8.1
  Using cached pytorch_lightning-0.8.1-py3-none-any.whl (293 kB)
Collecting jieba
  Using cached jieba-0.42.1-py3-none-any.whl
Collecting editdistance
  Using cached editdistance-0.6.0-cp36-cp36m-manylinux2010_x86_64.whl (284 kB)
Collecting huggingface-hub==0.0.8
  Using cached huggingface_hub-0.0.8-py3-none-any.whl (34 kB)
Collecting sacremoses
  Using cached sacremoses-0.0.53-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
  Using cached tokenizers-0.10.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
Collecting filelock
  Downloading filelock-3.4.1-py3-none-any.whl (9.9 kB)
Collecting fsspec>=2021.05.0


In [1]:
import pandas as pd   
import numpy as np
import os
import shutil

In [2]:
#preprocess data
def write_txt(df,path):
    '''
    Append one line per row of ``df`` to the text file at ``path``.

    Each line has the form ``<text> #### <label>``: the row's ``text`` value with
    surrounding whitespace stripped, the separator `` #### ``, then the row's
    ``label`` value as rendered by ``str.format``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``text`` column (string values) and a ``label`` column.
    path : str
        Destination file. It is opened in append mode, so existing content is kept.
    '''
    # Write UTF-8 explicitly: the text may contain non-ASCII characters (for example
    # typographic apostrophes) and the platform default encoding may not accept them.
    with open(path, 'a', encoding='utf-8') as f:
        # Iterate the two columns directly; row order is the frame's current order.
        for text, label in zip(df['text'], df['label']):
            f.write("{} #### {}\n".format(text.strip(), label))
            
            
def mkdir_rm(folder):
    '''
    Recreate ``folder`` as an empty directory.

    If ``folder`` already exists it is deleted together with everything inside it;
    the directory is then created again, including any missing parent directories.

    Parameters
    ----------
    folder : str
        Path of the directory to (re)create.
    '''
    if os.path.exists(folder):
        shutil.rmtree(folder)
    # makedirs instead of mkdir: nested paths must work even when the parent
    # directories have not been created yet.
    os.makedirs(folder)
    print ("<< path valid!")
    

def preprocess_data(input_file,output_path,over_sample=True,seed=None):
    '''
    Turn a labelled CSV into shuffled train/dev/test text files.

    Steps:
      1. Read ``input_file`` with ``pandas.read_csv`` and drop rows whose ``label``
         column equals the literal string ``'[]'``.
      2. Delete and recreate ``output_path`` (via ``mkdir_rm``).
      3. Write the fixed tag list to ``tag.txt`` in the current working directory.
      4. Shuffle the rows and split them 80% / 10% / 10% into train / validate / test.
      5. Write ``train.txt``, ``test.txt`` and ``dev.txt`` (the validate split) into
         ``output_path`` via ``write_txt``.

    Parameters
    ----------
    input_file : str
        Path of the CSV file; it must contain ``text`` and ``label`` columns.
    output_path : str
        Directory that receives the three text files. Existing content is removed.
    over_sample : bool
        Accepted for compatibility; this function does not use it.
    seed : int or None
        Passed as ``random_state`` to the shuffle. ``None`` (the default) gives a
        different split on every call; pass an integer for a reproducible split.
    '''
    reviews = pd.read_csv(input_file)
    reviews = reviews[reviews['label']!='[]']
    print (reviews.head())

    #remove & remake the output folder
    mkdir_rm(output_path)

    #generate tag.txt
    a_list = ['consumer','zone','target','consequence','product','product_spec']
    with open('tag.txt', 'w') as filehandle:
        filehandle.writelines("%s\n" % tag for tag in a_list)

    #train/test/val split: rows [0, 80%) -> train, [80%, 90%) -> validate, rest -> test
    shuffled = reviews.sample(frac=1, random_state=seed)
    n_rows = len(shuffled)
    train_end = int(.8*n_rows)
    validate_end = int(.9*n_rows)
    train = shuffled.iloc[:train_end]
    validate = shuffled.iloc[train_end:validate_end]
    test = shuffled.iloc[validate_end:]

    print ("training size: ",train.shape)
    print ("test size: ",test.shape)
    print ("validate size: ",validate.shape)

    # write train/test/dev
    write_txt(train,os.path.join(output_path,'train.txt'))
    write_txt(test,os.path.join(output_path,'test.txt'))
    write_txt(validate,os.path.join(output_path,'dev.txt'))
    print ("<<<finish data preparing!")
    
# Build the train/dev/test text files from the labelled review CSV.
input_file = './aspect_category.csv'
output_path = './data/tasd/haofangReview'
preprocess_data(
    input_file=input_file,
    output_path=output_path,
    over_sample=False,
)

   Unnamed: 0  sent_num                                               text  \
0           0         0  We are new to the sport and have not used othe...   
1           1         1  Bought for my parents retirement. They are lov...   
2           2         2  Good set.  Paddles and balls are both good qua...   
3           3         3  Got into Pickleball this year and researched a...   
4           4         4  I love these paddles and the case but didn’t r...   

   sent_start  sent_end  sent_len  \
0           0        85        85   
1          85       140        55   
2         140       191        51   
3         191       590       399   
4         590       768       178   

                                               label  
0                           [('paddles', 'product')]  
1                          [('parents', 'consumer')]  
2                 [('Paddles and balls', 'product')]  
3  [('paddle', 'product'), ('me and my wife', 'co...  
4  [('paddles', 'product'), ('did

In [3]:
import os

# Avoids the following error, as recorded by the notebook's author:
#   "mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with
#    libgomp.so.1 library. Try to import numpy first or set the threading layer
#    accordingly. Set MKL_SERVICE_FORCE_INTEL to force it."
# Of the options the message offers, this cell sets the threading layer, to GNU.
os.environ.update(MKL_THREADING_LAYER='GNU')

# train

In [4]:
%%time
# Run main.py in training mode (--do_train) for task `tasd` on dataset `haofangReview`,
# using the `extraction` paradigm and `t5-base` as --model_name_or_path.
# `-u` makes Python's stdout/stderr unbuffered so the log streams into the cell output.
# The meaning of each main.py flag is defined in main.py, which is not part of this notebook.
!python -u main.py --task tasd \
            --dataset haofangReview \
            --paradigm extraction \
            --n_gpu '0' \
            --model_name_or_path t5-base \
            --do_train \
            --train_batch_size 2 \
            --gradient_accumulation_steps 2 \
            --eval_batch_size 2 \
            --learning_rate 3e-4 \
            --num_train_epochs 1 



Downloading: 100%|███████████████████████████| 792k/792k [00:00<00:00, 25.5MB/s]
Downloading: 100%|█████████████████████████| 1.39M/1.39M [00:00<00:00, 42.3MB/s]
Here is an example (from dev set) under `extraction` paradigm:
Total examples = 127 for data/tasd/haofangReview/dev.txt
Input : My job provided me a headset to use as we are switching to Skype for all of our calls. This works great with Skype! And way better than having to be strapped to my desk with a headset. Awesome product!
Output: (product, product); (This, product)

****** Conduct Training ******
Downloading: 100%|█████████████████████████| 1.20k/1.20k [00:00<00:00, 1.78MB/s]
Downloading: 100%|███████████████████████████| 892M/892M [00:28<00:00, 31.3MB/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
Total examples = 1

# eval

In [5]:
# Run main.py with --do_direct_eval against the checkpoint given by --ckpoint_path.
# NOTE(review): the training cell runs with --num_train_epochs 1; confirm that the
# checkpoint it writes is really named `cktepoch=1.ckpt`.
# NOTE(review): the meaning of `--customer_jj False` is defined in main.py (not visible here).
!python main.py --task tasd \
            --dataset haofangReview \
            --ckpoint_path outputs/tasd/haofangReview/extraction/cktepoch=1.ckpt \
            --model_name_or_path  t5-base \
            --paradigm extraction \
            --n_gpu '0' \
            --do_direct_eval \
            --eval_batch_size 128 \
            --customer_jj False



Here is an example (from dev set) under `extraction` paradigm:
Total examples = 127 for data/tasd/haofangReview/dev.txt
Input : My job provided me a headset to use as we are switching to Skype for all of our calls. This works great with Skype! And way better than having to be strapped to my desk with a headset. Awesome product!
Output: (product, product); (This, product)

****** Conduct Evaluating with the last state ******

Load the trained model from outputs/tasd/haofangReview/extraction/cktepoch=1.ckpt...
<<< read lines
Total examples = 128 for data/tasd/haofangReview/test.txt
<<< load test data
Total examples = 128 for data/tasd/haofangReview/test.txt
<<<< start evaluate
100%|█████████████████████████████████████████████| 1/1 [00:16<00:00, 16.46s/it]

Results of raw output, only tag category
<<<< res {'product': {'n_tp': 47, 'n_gold': 76, 'n_pred': 61, 'precision': 0.7704918032786885, 'recall': 0.618421052631579, 'f1': 0.6861313868613139}, '': {'n_tp': 5, 'n_gold': 8, 'n_pred': 3

# test

In [6]:
%%time
%%time
 #### [('lightweight easy to hold', 'product_spec')]

!python main.py --task tasd \
            --dataset haofangReview \
            --ckpoint_path outputs/tasd/haofangReview/extraction/cktepoch=1.ckpt \
            --text "I am pretty new to pickleball and finally decided to try out some different paddles. This one so far is my favorite of the ones I've purchased. It's very lightweight easy to hold and I would highly recommend for those of you that are looking for an affordable“ paddle" \
            --paradigm extraction \
            --n_gpu 0 \
            --do_direct_predict \



Here is an example (from dev set) under `extraction` paradigm:
Total examples = 127 for data/tasd/haofangReview/dev.txt
Input : My job provided me a headset to use as we are switching to Skype for all of our calls. This works great with Skype! And way better than having to be strapped to my desk with a headset. Awesome product!
Output: (product, product); (This, product)

****** Conduct predicting with the last state ******

Load the trained model from outputs/tasd/haofangReview/extraction/cktepoch=4.ckpt...
Traceback (most recent call last):
  File "main.py", line 460, in <module>
    model_ckpt = torch.load(checkpoint, map_location=device)
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/torch/serialization.py", line 581, in load
    with _open_file_like(f, 'rb') as opened_file:
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/torch/serialization.py", line 230, in _open_file_like
    return _open_file(name_or_b

## output

In [11]:
# output

import pickle
import pandas as pd

# NOTE(review): the other cells use the dataset name `haofangReview`, while this path
# uses `haofang` — confirm it points at the results file that main.py actually wrote.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only load
# result files produced by your own runs.
with open('./outputs/tasd/haofang/extraction/results-tasd-haofang-extraction-pred.pickle','rb') as f:
    # the context manager closes the file once the object has been loaded
    x = pickle.load(f)

# Each entry of x['sent'] is joined with single spaces into one sentence string.
sent = [' '.join(i) for i in x['sent']]
# One row per sentence: the sentence text, x['label'] and x['pred'] side by side.
res_table = pd.DataFrame({'sentence':sent,"label":x['label'],"prediction":x['pred']})