In [1]:
!pip install transformers
!pip install accelerate
!pip install openpyxl



## prepare test data

In [4]:
%%writefile prepare_test_data.py
import pandas as pd
import os
import shutil
import json

def write_to_dict(df,file_path):
    '''
    write back to txt
    '''
    res = []
    # output txt file
    df = df.reset_index()
   
    for i in range(len(df)):
        data = {'instruction': df.loc[i, 'review_content'].strip(),
                'context': '',
                'response': df.loc[i, 'label'],
                 'category': ''}
        res.append(data)
    with open(file_path,'w') as file:
        json.dump(res,file)
    return res


def mkdir_rm(folder):
    '''
    make directory if not exists
    '''
    os.makedirs(folder,exist_ok=True)
    print("<< path valid!")

def get_label(row):
    #try:
     #   if pd.isna(row['Remark']):
      #      return (str(row['aspect_term']),str(row['aspect_category']),str(row['opinion_term']),str(row['sentiment_polarity']))
       # elif row['Remark']=='改':
        #    return (str(row['sentiment_polarity_1']),str(row['aspect_category_1']),str(row['opinion_term_1']),str(row['sentiment_polarity_1']))
        #elif row['Remark']=='修改':
         #   return (str(row['sentiment_polarity_1']),str(row['aspect_category_1']),str(row['opinion_term_1']),str(row['sentiment_polarity_1']))
    #except:
    return (str(row['aspect_term']),str(row['aspect_category']),str(row['opinion_term']),str(row['sentiment_polarity']))

def get_data(type,excelname,sheetname=None):
    if type == 'direct':
        jsonObj = pd.read_excel(excelname)
    elif type == 'subsheet':
        jsonObj = pd.read_excel(excelname, sheet_name=sheetname)
    #revise content
    jsonObj['review_content'] = jsonObj['review_content'].fillna(method='ffill')
    jsonObj = jsonObj[-jsonObj['review_content'].isnull()]
    jsonObj['review_content'] = jsonObj['review_content'].apply(lambda x: x.replace('\n', ' '))

    # capitalize
    jsonObj['aspect_category'] = jsonObj['aspect_category'].map(lambda x: str(x).capitalize())
    
    #map sentiment
    if 'sentiment_polarity_1' in jsonObj.columns:
        jsonObj['sentiment_polarity_1'] = jsonObj['sentiment_polarity_1'].map(lambda x: 'Positive' if str(x).capitalize()=='Neutral' else x)
    jsonObj['sentiment_polarity'] = jsonObj['sentiment_polarity'].map(lambda x: 'Positive' if str(x).capitalize()=='Neutral' else x)

    # generate label
    jsonObj['label'] = jsonObj.apply(lambda row: get_label(row), axis=1)
    
    #print ('jsonObj: ', jsonObj.head())

    jsonObj = jsonObj[['review_content', 'label']]
    jsonObj = jsonObj[-jsonObj['label'].isnull()]

    # agg label
    jsonObj_grouped = jsonObj.groupby('review_content')['label'].apply(list).reset_index()
    return jsonObj_grouped


def preprocess_data(df,output_path, save_path, over_sample=True):
    # remove & remake the output folder
    mkdir_rm(output_path)

    
    save_path=os.path.join(output_path, save_path)
    res = write_to_dict(df,save_path)
    print("<<<finish data preparing!")
    return res

def main():
    # get data
    
    dataset = get_data('subsheet','../../2600条评论汇总-20240103.xlsx','测试集')
    output_path = '../data/'
    save_path='test_data.json'

    dataset = preprocess_data(dataset,output_path,save_path, over_sample=False)


if __name__ == "__main__":
    main()


Overwriting prepare_test_data.py


In [5]:
!python prepare_test_data.py

<< path valid!
<<<finish data preparing!


## eval

In [8]:
%%writefile eval.py
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm
import ast
import argparse
import json
import re
import pandas as pd

def format_dolly(sample):
    sample['instruction']=sample['instruction'].replace("'","")
    format_mess = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"What is the aspect based sentiment of the following customer content, answer in format [aspect term, aspect category, opinion term, sentiment polarity]? {sample['instruction']}" }
    ]
    return format_mess

def get_res(model,tokenizer,messages):
    
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)

def get_llama3_res(model,tokenizer,dataset):
    for i in tqdm(range(len(dataset))):
        input_data = dataset[i]
        res = get_res(model,tokenizer,format_dolly(input_data))
        dataset[i]['pred_result'] = res

def llama3_predict(model,tokenizer,dataset):
    get_llama3_res(model,tokenizer,dataset)
    return dataset

def get_llama(x1):
    x1 = x1.split('### Answer\n')[-1]
    list_obj = ast.literal_eval(x1.replace("'s "," ").replace("'ll "," ").replace("'t "," ")) 
    list_res = [(i[1],i[3]) for i in list_obj]
    return list_res

def get_label_prediction(x):
    x1 = x['response']
    res_label = [(i[1],i[3])for i in x1]
    res_pred = get_llama(x['pred_result'])
    return res_label,res_pred

def compute_f1_scores(pred_pt, gold_pt):
    """
    Function to compute F1 scores with pred and gold pairs/triplets
    The input needs to be already processed
    """
    # number of true postive, gold standard, predicted aspect terms
    res = {}

    n_tp, n_gold, n_pred = 0, 0, 0

    for i in range(len(pred_pt)):
        gold_pt[i] = list(set(gold_pt[i]))
        pred_pt[i] = list(set(pred_pt[i]))

        n_gold += len(gold_pt[i])
        n_pred += len(pred_pt[i])

        for t in pred_pt[i]:
            if t in gold_pt[i]:
                n_tp += 1

    precision = float(n_tp) / float(n_pred) if n_pred != 0 else 0
    recall = float(n_tp) / float(n_gold) if n_gold != 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision != 0 or recall != 0 else 0
    scores = {'total precision': precision, 'total recall': recall, 'total f1': f1}

    return scores


def get_subset(original_list, element):
    return [sublist for sublist in original_list if element in sublist[0]]


def calc_tag(element, all_predictions, all_labels):
    pred_pt_sub = [get_subset(original_list, element) for original_list in all_predictions]
    gold_pt_sub = [get_subset(original_list, element) for original_list in all_labels]
    score = compute_f1_scores(pred_pt_sub, gold_pt_sub)
    return score

def get_tags(x):
    y = [i[0] for i in x]
    return y 

def get_tag_list(all_labels):
    merged = [get_tags(x) for x in all_labels]
    flat_list = [item for sublist in merged for item in sublist]
    unique_elements = list(set(flat_list))
    return unique_elements

def get_eval_res(tags,all_predictions, all_labels):
    precisons = []
    recall = []
    f1 = []
    for i in tags:
        score = calc_tag(i, all_predictions, all_labels)
        precisons.append(score['total precision'])
        recall.append(score['total recall'])
        f1.append(score['total f1'])

    res = pd.DataFrame({'tag':tags, 'precision':precisons, 'recall':recall,'f1':f1})
    return res

def eval_function(dataset):
    all_labels = []
    all_predictions = []
    for i in tqdm(range(len(dataset))):
        x = dataset[i]
        res_label, res_pred = get_label_prediction(x) 
        all_labels.append(res_label)
        all_predictions.append(res_pred)
        
    print("\nResults of raw output, only tag category & sentiment")
    raw_scores_2 = compute_f1_scores(all_predictions, all_labels)
    print (raw_scores_2)
    
    tags = get_tag_list(all_labels)
    res = get_eval_res(tags,all_predictions, all_labels)
    res.sort_values(by=['recall'],ascending=False,inplace=True)
    return res

def parse_arge():
    """Parse the arguments."""
    parser = argparse.ArgumentParser()
    # add model id and dataset path argument
    parser.add_argument(
        "--model_path_or_name",
        type=str,
        help="Model id to use for inference.",
    )
    parser.add_argument(
        "--dataset_path", type=str, default="lm_dataset", help="Path to inference dataset."
    )
    args, _ = parser.parse_known_args()
    return args

def main():
    args = parse_arge()
    
    with open(args.dataset_path,'r') as file:
        dataset=json.load(file)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path_or_name)
    model = AutoModelForCausalLM.from_pretrained(args.model_path_or_name, torch_dtype=torch.float16,device_map="auto")
    
    dataset=llama3_predict(model,tokenizer,dataset)
    metric=eval_function(dataset)
    print(metric)
    with open('pred_result.json','w') as file:
        json.dump(dataset,file)
    metric.to_csv('eval_metric.csv',index=False)


if __name__ == "__main__":
    main()


Overwriting eval.py


In [9]:
!python eval.py --model_path_or_name "../llama3_model" --dataset_path ../data/test_data.json

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:18<00:00,  4.70s/it]
  0%|                                                    | 0/99 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  1%|▍                                           | 1/99 [00:06<09:54,  6.07s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  2%|▉                                           | 2/99 [00:06<04:52,  3.01s/it]The attention mask and the pad token id were not set. As 

# metric results

In [11]:
import pandas as pd
data=pd.read_csv('eval_metric.csv')
data
data[~data['tag'].isin(['Product durability','Product failure','Charging accessories','Brand','Easy to use','Easy to carry/move'])]

Unnamed: 0,tag,precision,recall,f1
0,Power station quality control,1.0,1.0,1.0
1,Correct delivery content and quantity,1.0,1.0,1.0
2,Workmanship craftsmanship,1.0,1.0,1.0
3,Recommendation,0.526316,1.0,0.689655
4,Warranty policy,1.0,1.0,1.0
5,Self-discharge rate,1.0,1.0,1.0
6,Appearance design,1.0,1.0,1.0
7,User manual,1.0,1.0,1.0
9,Promotions,1.0,1.0,1.0
10,Input charging method,0.894737,1.0,0.944444
