In [None]:
from datasets import load_metric, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
import torch 
import numpy as np
from tqdm import tqdm
metrics = load_metric('accuracy')
import gc
import os

def inference(path):
  """Run seq2seq inference over ``test.xlsx`` and score the decoded outputs.

  Loads the tokenizer and model stored at ``path``, tokenizes the ``text`` and
  ``label`` columns of ``test.xlsx``, generates in batches of 32 and decodes both
  the generations and the label ids back to strings.

  Args:
    path: Value passed to ``from_pretrained`` for tokenizer and model. Inputs get
      the ``'summarize: '`` prefix when the path contains ``'mt5'``.

  Returns:
    A tuple ``(scores, predictions, references)``: ``scores`` is the result of
    ``metrics.compute`` on the module-level ``metrics`` object (loaded as
    ``'accuracy'`` in this cell); the other two are the decoded strings.
  """
  # pandas and Dataset are not imported by this cell's import block; import them locally.
  import pandas as pd
  from datasets import Dataset

  prefix = 'summarize: ' if 'mt5' in path else ''
  tokenizer = AutoTokenizer.from_pretrained(path)
  model = AutoModelForSeq2SeqLM.from_pretrained(path)
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model.to(device)

  def preprocess_function(examples):
    # Tokenize prefixed inputs and the string labels; label ids go under "labels".
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)
    labels = tokenizer(text_target=examples["label"], max_length=5, truncation=True, padding=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

  testset = pd.read_excel('test.xlsx')
  testset['label'] = testset['label'].astype(str)
  dataset = Dataset.from_pandas(testset[['text', 'label']])

  test_tokenized_datasets = dataset.map(preprocess_function, batched=True)
  test_tokenized_datasets = test_tokenized_datasets.remove_columns(['text', 'label'])
  data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")
  dataloader = torch.utils.data.DataLoader(test_tokenized_datasets, collate_fn=data_collator, batch_size=32)

  max_target_length = 5
  predictions = []
  references = []
  for batch in tqdm(dataloader):
    outputs = model.generate(
      input_ids=batch['input_ids'].to(device),
      max_length=max_target_length,
      attention_mask=batch['attention_mask'].to(device),
    )
    with tokenizer.as_target_tokenizer():
      outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]

      # The collator pads labels with -100; swap those for the pad id so they decode cleanly.
      labels = np.where(batch['labels'] != -100, batch['labels'], tokenizer.pad_token_id)
      actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]
    predictions.extend(outputs)
    references.extend(actuals)

  # The accuracy metric works on integer class ids; anything that is not a decimal string
  # becomes -1 so a malformed generation counts as a wrong answer instead of raising.
  prediction_ids = [int(text) if text.isdecimal() else -1 for text in predictions]
  reference_ids = [int(text) if text.isdecimal() else -1 for text in references]
  scores = metrics.compute(predictions=prediction_ids, references=reference_ids)
  return scores, predictions, references
#   new_file_path = './r_scores_faq'
#   # Write to the file
#   try:
#   # Attempt to append to the file
#   with open(new_file_path, 'a') as file:
#     file.write(path.split('/')[-2] + '\n')
#     for new_content_str in rouges:
#       result = next(iter(new_content_str))
#       file.write(f"{result}: {new_content_str[result]}\n")
#     file.write('\n')
#   action_result = "Content appended to the existing file."
#   except FileNotFoundError:
#   # File doesn't exist, create it and write the content
#   with open(new_file_path, 'w') as file:
#     file.write(path)
#     file.write(new_content_str)
#   action_result = "File did not exist, so it was created with the new content."
  
#   del model
#   gc.collect()


In [None]:
import pandas as pd
from datasets import Dataset
import evaluate
from datasets import load_metric, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
import torch 
import numpy as np
from tqdm import tqdm
metrics = load_metric('accuracy')
import gc
import os

# Classification metrics consumed by compute_metrics() later in this cell.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

path = './multitask/distilling-step-by-step/ckpts/VietAI/vit5-base_human_justification/'
# path = 'multitask/distilling-step-by-step/ckpts//'
# path = 'results/flan-t5-base/'
prefix = 'gt: ' if 'distilling' in path else ''
device = 'cuda:2'  # single place to change the target GPU
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSeq2SeqLM.from_pretrained(path)
max_length = 1024 if 'bert' not in path else 256  # not referenced below in this cell
def preprocess_function(examples):
  """Tokenize prefixed ``text`` inputs and string ``label`` targets for a batch."""
  inputs = [prefix + doc for doc in examples["text"]]
  model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)
  labels = tokenizer(text_target=examples["label"], max_length=5, truncation=True, padding=True)
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

testset =  pd.read_excel('test.xlsx')
test_with_asr = pd.read_excel('test_asr.xlsx')
# Column assignment aligns on the DataFrame index; assumes both sheets list the same
# rows in the same order -- TODO confirm.
testset['text'] = test_with_asr['asr']
testset['label'] = testset['label'].astype(str)
dataset = Dataset.from_pandas(testset[['text', 'label']])

test_tokenized_datasets = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")
model.to(device)


max_target_length = 25
test_tokenized_datasets = test_tokenized_datasets.remove_columns(['text', 'label'])

# `idx` (the rows to evaluate) is not assigned anywhere in this notebook. Use it when an
# earlier session defined it, otherwise evaluate the whole test set so a fresh kernel runs.
try:
  idx
except NameError:
  idx = range(len(test_tokenized_datasets))
dataloader = torch.utils.data.DataLoader(test_tokenized_datasets.select(idx), collate_fn=data_collator, batch_size=32)

predictions = []
references = []
for i, batch in enumerate(tqdm(dataloader)):
  outputs = model.generate(
    input_ids=batch['input_ids'].to(device),
    max_length=max_target_length,
    attention_mask=batch['attention_mask'].to(device),
  )
  with tokenizer.as_target_tokenizer():
      outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]

      # The collator pads labels with -100; swap those for the pad id so they decode cleanly.
      labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)
      actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]
      predictions.extend(outputs)
      references.extend(actuals)

def compute_metrics(predictions, references):
  """Score decoded sentiment predictions against decoded gold labels.

  Entries that are not decimal strings are mapped to class -1, so a malformed
  generation is counted as a wrong answer.

  Args:
    predictions: iterable of decoded model outputs (strings), one per example.
    references: iterable of gold labels (strings), aligned with ``predictions``.

  Returns:
    dict with ``accuracy``, ``macro_f1`` and the F1 of classes 0, 1 and 2 under
    ``f1_neg``, ``f1_neu`` and ``f1_pos`` respectively.
  """
  pred_ids = [int(pred) if pred.isdecimal() else -1 for pred in predictions]
  label_ids = [int(label) if label.isdecimal() else -1 for label in references]
  # Pin the per-class report to classes 0/1/2. Without `labels`, any -1 entry adds a fourth
  # score (and a missing class removes one), which breaks the three-way unpacking.
  neg, neu, pos = f1.compute(predictions=pred_ids, references=label_ids, average=None, labels=[0, 1, 2])['f1']
  metrics_result = {
    "accuracy": accuracy.compute(predictions=pred_ids, references=label_ids)['accuracy'],
    # Macro F1 stays over every class present, so invalid (-1) outputs still pull it down.
    "macro_f1": f1.compute(predictions=pred_ids, references=label_ids, average='macro')['f1'],
    "f1_neg": neg,
    "f1_neu": neu,
    "f1_pos": pos,
  }
  return metrics_result
# rouges = [{k: v.mid.fmeasure} for k,v in metrics.compute(predictions=predictions, references=references).items()]
# Drop the model, collect garbage, then hand cached GPU blocks back to the driver
# (same cleanup sequence the causal-LM cells in this notebook use).
del model
gc.collect()
torch.cuda.empty_cache()

print(compute_metrics(predictions, references))

In [None]:
from tqdm import tqdm
import random
from datasets import Dataset, load_metric
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
import pandas as pd
import evaluate
import torch
import nltk
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
import argparse
import numpy as np
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from trl import AutoModelForCausalLMWithValueHead
from transformers import TrainingArguments, Trainer
from trl import SFTTrainer
import evaluate
from transformers import AutoModelForCausalLM
from peft import PeftModel


base_model_name = 'vtrungnhan9/vmlu-llm'
print("loading")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models')
model = PeftModel.from_pretrained(model, './results/vmlu-llm_human_justificationv2/')
print('finished loadding')
model = model.merge_and_unload()
model = model.cuda()

# Batched generation with a decoder-only model: pad on the left so every prompt ends at the
# same position and outputs[:, prompt_len:] holds only newly generated tokens.
tokenizer.padding_side = 'left'
if tokenizer.pad_token is None:
    # padding=True needs a pad token; reuse EOS, which generate() already uses as pad id below.
    tokenizer.pad_token = tokenizer.eos_token


train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)
train_df['label'] = train_df['label'].astype(str)

train_dataset = Dataset.from_pandas(train_df)

testset =  pd.read_excel('test.xlsx')

testset['label'] = testset['label'].astype(str)
print(train_df['label'].unique())
print(testset['label'].unique())
test_dataset = Dataset.from_pandas(testset[['text', 'label']])

def template(inp, out, include_answer=True):
    """Render one example as a chat prompt string.

    Args:
        inp: input text; surrounding whitespace is stripped before it is quoted.
        out: gold label, appended as the final turn when ``include_answer`` is true.
        include_answer: pass False for inference prompts so the label is not in the prompt.

    Returns:
        The string produced by ``tokenizer.apply_chat_template(..., tokenize=False)``.
    """
    conversation = [{"role": "system", "content": "Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực." },
                    {"role": "user", "content": f"""nhận diện cảm xúc: '{inp.strip()}'"""},
                   ]
    if include_answer:
        # NOTE(review): the role is spelled 'asssistant'. Left untouched because the prompt
        # format the adapter was fine-tuned on is not visible here -- confirm before fixing.
        conversation.append({'role': 'asssistant', 'content': str(out)})
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    return prompt

new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]
train_dataset= train_dataset.add_column("train_text", new_column_train)
# Test prompts must not contain the gold label, otherwise the answer is leaked to the model.
# This matches the later cells of this notebook, which build label-free prompts.
# NOTE(review): whether apply_chat_template needs add_generation_prompt=True for this model's
# template cannot be determined from this notebook -- confirm against the fine-tuning code.
new_column_train = [template(inp, out, include_answer=False) for inp, out in zip(test_dataset['text'], test_dataset['label'])]
test_dataset= test_dataset.add_column("train_text", new_column_train)

outs = []
batch_size=32
print("Start inference")
for i in tqdm(range(0, len(test_dataset), batch_size)):
    batch = test_dataset[i:i + batch_size]
    inputs = tokenizer(batch['train_text'], return_tensors='pt', padding=True, truncation=True).to(model.device)
    # Pass the attention mask so padded positions are ignored during generation.
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=2,
        pad_token_id=tokenizer.eos_token_id,
    )
    prompt_len = inputs['input_ids'].size(1)
    decoded_outputs = tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)
    outs.extend([output.strip() for output in decoded_outputs])

# Cleanup
del model
import gc
gc.collect()
torch.cuda.empty_cache()




In [None]:
import pandas as pd
from datasets import Dataset
import evaluate
from datasets import load_metric, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
import torch 
import numpy as np
from tqdm import tqdm
metrics = load_metric('accuracy')
import gc
import os

# Load the four classification metrics in one pass; the names are bound in the same order
# as the metric ids listed on the right.
accuracy, f1, precision, recall = (
  evaluate.load(metric_id)
  for metric_id in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(predictions, references):
  """Score decoded sentiment predictions against decoded gold labels.

  Entries that are not decimal strings are mapped to class -1, so a malformed
  generation is counted as a wrong answer.

  Args:
    predictions: iterable of decoded model outputs (strings), one per example.
    references: iterable of gold labels (strings), aligned with ``predictions``.

  Returns:
    dict with ``accuracy``, ``macro_f1`` and the F1 of classes 0, 1 and 2 under
    ``f1_neg``, ``f1_neu`` and ``f1_pos`` respectively.
  """
  pred_ids = [int(pred) if pred.isdecimal() else -1 for pred in predictions]
  label_ids = [int(label) if label.isdecimal() else -1 for label in references]
  # Pin the per-class report to classes 0/1/2. Without `labels`, any -1 entry adds a fourth
  # score (and a missing class removes one), which breaks the three-way unpacking.
  neg, neu, pos = f1.compute(predictions=pred_ids, references=label_ids, average=None, labels=[0, 1, 2])['f1']
  metrics_result = {
    "accuracy": accuracy.compute(predictions=pred_ids, references=label_ids)['accuracy'],
    # Macro F1 stays over every class present, so invalid (-1) outputs still pull it down.
    "macro_f1": f1.compute(predictions=pred_ids, references=label_ids, average='macro')['f1'],
    "f1_neg": neg,
    "f1_neu": neu,
    "f1_pos": pos,
  }
  return metrics_result

# Gold labels of the full test sheet. `outs` must hold one prediction per row of `testset`;
# fail early with a readable message when it was produced on a subset or by another cell.
references = testset['label'].tolist()
if len(outs) != len(references):
  raise ValueError(
    f"outs has {len(outs)} predictions but testset has {len(references)} labels; "
    "re-run inference on the full test set before scoring"
  )
compute_metrics(outs, references)

In [None]:
from tqdm import tqdm
import random
from datasets import Dataset, load_metric
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
import pandas as pd
import evaluate
import torch
import nltk
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
import argparse
import numpy as np
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from trl import AutoModelForCausalLMWithValueHead
from transformers import TrainingArguments, Trainer
from trl import SFTTrainer
import evaluate
from transformers import AutoModelForCausalLM
from peft import PeftModel


base_model_name = 'vtrungnhan9/vmlu-llm'
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models')
# model = PeftModel.from_pretrained(model, './Vistral-7B-Chat_no')

# model = model.merge_and_unload()
model = model.cuda()


train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)
train_df['label'] = train_df['label'].astype(str)

train_dataset = Dataset.from_pandas(train_df)

# Load the test sheet first and only then swap in the ASR transcripts. Previously the swap
# ran before `testset` was (re)loaded, so it either raised NameError on a fresh kernel or
# was silently discarded by the reload.
testset =  pd.read_excel('test.xlsx')
test_with_asr = pd.read_excel('test_asr.xlsx')
# Assignment aligns on the DataFrame index; assumes both sheets list the same rows in the
# same order -- TODO confirm.
testset['text'] = test_with_asr['asr']

testset['label'] = testset['label'].astype(str)
print(train_df['label'].unique())
print(testset['label'].unique())
test_dataset = Dataset.from_pandas(testset[['text', 'label']])

def template(inp, out):
    """Render ``inp`` as a two-turn (system + user) chat prompt, print it and return it.

    ``out`` is accepted for call-site compatibility but does not appear in the prompt.
    """
    system_turn = {
        "role": "system",
        "content": "Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.",
    }
    # The user turn repeats the instruction text before quoting the stripped input.
    user_turn = {
        "role": "user",
        "content": "Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực. Nhận diện cảm xúc: '" + inp.strip() + "'",
    }
    rendered = tokenizer.apply_chat_template([system_turn, user_turn], tokenize=False)
    print(rendered)
    return rendered

# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]
# Render one chat prompt per row and store it in a "train_text" column on each split.
train_dataset = train_dataset.add_column(
    "train_text",
    [template(text, label) for text, label in zip(train_dataset['text'], train_dataset['label'])],
)
new_column_train = [
    template(text, label)
    for text, label in zip(test_dataset['text'], test_dataset['label'])
]
test_dataset = test_dataset.add_column("train_text", new_column_train)

# Fresh prediction buffer and progress counter for the inference loop that follows.
outs, i = [], 0
# for tt in (test_dataset['train_text']):
#     if i % 500 == 0:
#         print(i)
#         print(outs)
#     input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()
#     out_ids = model.generate(input_ids, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id)

#     assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()
#     outs.append(assistant)
# #     print(outs)
    
#     i += 1

In [None]:
# Greedy single-example inference over every test prompt; the stripped continuation
# (at most 2 new tokens) is collected in `outs`.
outs = []
# enumerate() keeps the progress counter local to this cell, so a re-run starts at 0
# together with the freshly reset `outs`.
for i, tt in enumerate(test_dataset['train_text']):
    if i % 500 == 0:
        print(i)
        print(outs)
    input_ids = tokenizer(tt, return_tensors='pt').input_ids.to(model.device)
    out_ids = model.generate(input_ids, max_new_tokens=2, pad_token_id=tokenizer.eos_token_id)

    # Keep only the tokens generated after the prompt.
    assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()
    outs.append(assistant)

In [None]:
def template(inp, out):
    """Render ``inp`` as a two-turn (system + user) chat prompt, print it and return it.

    ``out`` is accepted for call-site compatibility but does not appear in the prompt.
    """
    system_turn = {
        "role": "system",
        "content": "Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.",
    }
    # Quote the whitespace-stripped input after the task cue.
    user_turn = {"role": "user", "content": "Nhận diện cảm xúc: '" + inp.strip() + "'"}
    rendered = tokenizer.apply_chat_template([system_turn, user_turn], tokenize=False)
    print(rendered)
    return rendered

# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]
# Rebuild the prompts with the template defined in this cell. Both datasets may already carry
# a "train_text" column from an earlier cell, and Dataset.add_column rejects duplicate column
# names, so drop the stale column first. This also makes the cell safe to re-run.
if "train_text" in train_dataset.column_names:
    train_dataset = train_dataset.remove_columns("train_text")
new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]
train_dataset= train_dataset.add_column("train_text", new_column_train)

if "train_text" in test_dataset.column_names:
    test_dataset = test_dataset.remove_columns("train_text")
new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]
test_dataset= test_dataset.add_column("train_text", new_column_train)

outs = []
i = 0


In [None]:
from tqdm import tqdm
import random
from datasets import Dataset, load_metric
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
import pandas as pd
import evaluate
import torch
import nltk
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
import argparse
import numpy as np
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from trl import AutoModelForCausalLMWithValueHead
from transformers import TrainingArguments, Trainer
from trl import SFTTrainer
import evaluate
from transformers import AutoModelForCausalLM
from peft import PeftModel


# --- Model: base checkpoint + LoRA adapter, merged and moved to GPU 7 ---
base_model_name = 'Viet-Mistral/Vistral-7B-Chat'
print("loading")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models'),
    './results/Vistral-7B-Chat_human_justification/',
)
print('finished laoding')
model = model.merge_and_unload().to('cuda:7')

# --- Data: labels are cast to strings on both splits ---
train_df = pd.read_excel('train.xlsx').assign(label=lambda frame: frame['label'].astype(str))
train_dataset = Dataset.from_pandas(train_df)

testset = pd.read_excel('test.xlsx')
test_with_asr = pd.read_excel('test_asr.xlsx')
# The test text is replaced by the `asr` column of the second sheet (index-aligned).
testset = testset.assign(
    text=test_with_asr['asr'],
    label=lambda frame: frame['label'].astype(str),
)

print(train_df['label'].unique())
print(testset['label'].unique())
test_dataset = Dataset.from_pandas(testset[['text', 'label', 'human_justification']])

def template(inp, out):
    """Render ``inp`` as a two-turn (system + user) chat prompt followed by one space.

    ``out`` is accepted for call-site compatibility but does not appear in the prompt.
    """
    system_turn = {
        "role": "system",
        "content": "Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.",
    }
    # Quote the whitespace-stripped input after the task cue.
    user_turn = {"role": "user", "content": "nhận diện cảm xúc: '" + inp.strip() + "'"}
    rendered = tokenizer.apply_chat_template([system_turn, user_turn], tokenize=False)
    # A single trailing space is appended to the rendered chat string.
    return rendered + ' '

# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]
new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]
train_dataset= train_dataset.add_column("train_text", new_column_train)
new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]
test_dataset= test_dataset.add_column("train_text", new_column_train)

# `idx` (the rows to evaluate) is not assigned anywhere in this notebook. Use it when an
# earlier session defined it, otherwise evaluate the whole test set so a fresh kernel runs
# and the scoring cells that also read `idx` stay aligned with `outs`.
try:
    idx
except NameError:
    idx = range(len(test_dataset))

outs = []
print("Start inference")
for i, tt in enumerate(test_dataset.select(idx)['train_text']):
    if i % 100 == 0:
        print(i, set(outs))
    input_ids = tokenizer(tt, return_tensors='pt').input_ids.to(model.device)
    # output_scores was dropped: without return_dict_in_generate only the token ids are
    # returned, so it had no effect here.
    out_ids = model.generate(input_ids, max_new_tokens=25, pad_token_id=tokenizer.eos_token_id)

    # Keep only the tokens generated after the prompt.
    assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()
    outs.append(assistant)

# Collect garbage before emptying the CUDA cache so freed tensors can actually be released.
del model
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
from evaluate import load
# BERTScore between the generated text and the human-written justifications.
bertscore = load("bertscore")
# The first two characters of every output are dropped -- presumably the label digit and
# its separator; confirm against the generation format.
predictions = [o[2:] for o in outs]
references = test_dataset.select(idx)['human_justification']
results = bertscore.compute(predictions=predictions, references=references, lang="vi")
# Mean F1 over however many samples were scored (the divisor used to be a hard-coded 100).
sum(results['f1']) / len(results['f1'])

In [None]:
from evaluate import load
# ROUGE between the generated text (first two characters removed) and the human-written
# justifications of the rows selected by `idx`.
rouge = load("rouge")
references = test_dataset.select(idx)['human_justification']
predictions = list(map(lambda generated: generated[2:], outs))
results = rouge.compute(references=references, predictions=predictions)
results

In [None]:
# Hand-picked test sentences, one per line. splitlines() is used instead of split('\n') so
# the newline before the closing quotes does not add an empty string to the list (which
# would make isin() also match rows whose text is empty).
test_samples = """trả lại cho họ chất lượng cuộc sống bình thường như bao người khác là được nghe được nói thế nhưng điều kỳ diệu đã xảy
những chia sẻ vô cùng hữu ích và thiết thực vừa rồi ạ có thể thấy là hầu hết người bệnh nằm điều trị trong
khám suốt tiểu đường nó vẫn mệt mỏi vô khám tai biến bộ não vô khám nhưng mà xương thì nó loãng xương rất là nhiều
""".splitlines()

# Show the test rows whose text exactly matches one of the samples.
testdf = test_dataset.to_pandas()
testdf[testdf.text.isin(test_samples)]

In [None]:
# For each hand-picked sample: generate up to 20 tokens, record the softmax probability of
# the first generated token in `confidence` and the decoded continuation in `outs`.
confidence = []
outs = []
sample_prompts = testdf[testdf.text.isin(test_samples)]['train_text']
for i, tt in enumerate(sample_prompts):
    if i % 100 == 0:
        print(i, set(outs))
    input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()
    # output_attentions was dropped: the attention tensors were never read and are costly
    # to keep for every generated step.
    output = model.generate(input_ids, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, output_scores=True, return_dict_in_generate=True)

    # scores[0] holds the scores of the first generation step; batch size is 1 here.
    first_token_id = output.sequences[0, input_ids.size(1)].item()
    confidence.append(output.scores[0].softmax(dim=1)[0, first_token_id].item())

    assistant = tokenizer.batch_decode(output.sequences[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()
    outs.append(assistant)

In [None]:
import pandas as pd
from datasets import Dataset
import evaluate
from datasets import load_metric, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
import torch 
import numpy as np
from tqdm import tqdm
metrics = load_metric('accuracy')
import gc
import os

# Load the four classification metrics in one pass; the names are bound in the same order
# as the metric ids listed on the right.
accuracy, f1, precision, recall = (
  evaluate.load(metric_id)
  for metric_id in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(predictions, references):
  """Score decoded sentiment predictions against decoded gold labels.

  Entries that are not decimal strings are mapped to class -1, so a malformed
  generation is counted as a wrong answer.

  Args:
    predictions: iterable of decoded model outputs (strings), one per example.
    references: iterable of gold labels (strings), aligned with ``predictions``.

  Returns:
    dict with ``accuracy``, ``macro_f1`` and the F1 of classes 0, 1 and 2 under
    ``f1_neg``, ``f1_neu`` and ``f1_pos`` respectively.
  """
  pred_ids = [int(pred) if pred.isdecimal() else -1 for pred in predictions]
  label_ids = [int(label) if label.isdecimal() else -1 for label in references]
  # Pin the per-class report to classes 0/1/2. Without `labels`, any -1 entry adds a fourth
  # score (and a missing class removes one), which breaks the three-way unpacking.
  neg, neu, pos = f1.compute(predictions=pred_ids, references=label_ids, average=None, labels=[0, 1, 2])['f1']
  metrics_result = {
    "accuracy": accuracy.compute(predictions=pred_ids, references=label_ids)['accuracy'],
    # Macro F1 stays over every class present, so invalid (-1) outputs still pull it down.
    "macro_f1": f1.compute(predictions=pred_ids, references=label_ids, average='macro')['f1'],
    "f1_neg": neg,
    "f1_neu": neu,
    "f1_pos": pos,
  }
  return metrics_result

# Gold labels of the full test sheet. `outs` must hold one prediction per row of `testset`;
# fail early with a readable message when it was produced on a subset or by another cell.
references = testset['label'].tolist()
if len(outs) != len(references):
  raise ValueError(
    f"outs has {len(outs)} predictions but testset has {len(references)} labels; "
    "re-run inference on the full test set before scoring"
  )
compute_metrics(outs, references)

In [None]:
from tqdm import tqdm
import random
from datasets import Dataset, load_metric
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
import pandas as pd
import evaluate
import torch
import nltk
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
import argparse
import numpy as np
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from trl import AutoModelForCausalLMWithValueHead
from transformers import TrainingArguments, Trainer
from trl import SFTTrainer
import evaluate
from transformers import AutoModelForCausalLM
from peft import PeftModel


# --- Model: base checkpoint + LoRA adapter, merged and moved to the default GPU ---
base_model_name = 'Viet-Mistral/Vistral-7B-Chat'
print("loading")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models'),
    './results/Vistral-7B-Chat_human_justification/',
)
print('finished laoding')
model = model.merge_and_unload().to('cuda')

# --- Data: labels are cast to strings on both splits ---
train_df = pd.read_excel('train.xlsx').assign(label=lambda frame: frame['label'].astype(str))
train_dataset = Dataset.from_pandas(train_df)

# ASR variant of the test text is disabled in this cell:
# test_with_asr = pd.read_excel('test_asr.xlsx')
# testset['text'] = test_with_asr['asr']
testset = pd.read_excel('test.xlsx').assign(label=lambda frame: frame['label'].astype(str))

print(train_df['label'].unique())
print(testset['label'].unique())
test_dataset = Dataset.from_pandas(testset[['text', 'label']])

def template(inp, out):
    """Render ``inp`` as a two-turn (system + user) chat prompt followed by one space.

    ``out`` is accepted for call-site compatibility but does not appear in the prompt.
    """
    system_turn = {
        "role": "system",
        "content": "Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.",
    }
    # Quote the whitespace-stripped input after the task cue.
    user_turn = {"role": "user", "content": "nhận diện cảm xúc: '" + inp.strip() + "'"}
    rendered = tokenizer.apply_chat_template([system_turn, user_turn], tokenize=False)
    # A single trailing space is appended to the rendered chat string.
    return rendered + ' '

# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]
new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]
train_dataset= train_dataset.add_column("train_text", new_column_train)
new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]
test_dataset= test_dataset.add_column("train_text", new_column_train)

# One greedy token per test prompt: `outs` collects the decoded token, `confidence` its
# softmax probability. Both lists are reset together so they stay index-aligned.
outs = []
confidence = []
print("Start inference")
for i, tt in enumerate(test_dataset['train_text']):
    if i % 100 == 0:
        print(i, set(outs))
    input_ids = tokenizer(tt, return_tensors='pt').input_ids.to(model.device)
    # output_attentions was dropped: the attention tensors were never read.
    output = model.generate(input_ids, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id, output_scores=True, return_dict_in_generate=True)

    # scores[0] holds the scores of the first generation step; batch size is 1 here.
    first_token_id = output.sequences[0, input_ids.size(1)].item()
    confidence.append(output.scores[0].softmax(dim=1)[0, first_token_id].item())

    assistant = tokenizer.batch_decode(output.sequences[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()
    outs.append(assistant)

In [None]:
print("Start inference")

# Reset both result lists. Previously only `confidence` was cleared, so re-running this cell
# kept appending to an existing `outs` and the two lists stopped lining up.
outs = []
confidence = []
for i, tt in enumerate(test_dataset['train_text']):
    if i % 100 == 0:
        print(i, set(outs))
    input_ids = tokenizer(tt, return_tensors='pt').input_ids.to(model.device)
    # output_attentions was dropped: the attention tensors were never read.
    output = model.generate(input_ids, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id, output_scores=True, return_dict_in_generate=True)

    # scores[0] holds the scores of the first generation step; batch size is 1 here.
    first_token_id = output.sequences[0, input_ids.size(1)].item()
    confidence.append(output.scores[0].softmax(dim=1)[0, first_token_id].item())

    assistant = tokenizer.batch_decode(output.sequences[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()
    outs.append(assistant)