<a href="https://colab.research.google.com/github/hirokiOS/SentimentAnalysisWithDownloadedDataSource/blob/main/3_sentiment_error_analysis_finbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

DATAMODEは次の３項のいずれかからご選択ください。 

DATAMODE == 'WRIME' : 感情分析のデータセット。モデルは東北大学の日本語LLM.  

DATAMODE == 'ACCERN-API' : URLとAPIを指定してACCERNのAPIを用いる.

DATAMODE == 'ACCERN-DRIVE' : ノートブック１においてURLとAPIを指定して保存したデータを用いる.

In [None]:
# Select the data source for the whole notebook by leaving exactly one
# DATAMODE assignment uncommented:
#   'WRIME'        - WRIME sentiment-analysis dataset
#   'ACCERN-DRIVE' - dataset preprocessed in 1_original_accern_parse.ipynb
#   'ACCERN-API'   - dataset downloaded directly from the Accern-provided API
# DATAMODE = 'WRIME'
# DATAMODE = 'ACCERN-DRIVE'
DATAMODE = 'ACCERN-API'

# Fail fast on a mistyped mode: the later cells branch on DATAMODE, and an
# unknown value would otherwise only surface there as a confusing error.
_VALID_DATAMODES = ('WRIME', 'ACCERN-DRIVE', 'ACCERN-API')
if DATAMODE not in _VALID_DATAMODES:
    raise ValueError(
        f"DATAMODE must be one of {_VALID_DATAMODES}, got {DATAMODE!r}"
    )

if DATAMODE == 'ACCERN-API':
    # Placeholders: replace both strings with the values issued by the vendor.
    accern_api_url = '< Vender provided URL should come here >'
    accern_token_url = '< Vendor provided token should come here >'

## 3 感情分析モデルのエラー分析

### 0. 環境構築

**注意**
こちらのノートブックは、感情分析のデータセットであるWRIMEを用いた書籍と同様の内容を、FinBERTで実行するために用意されています。
参考書籍の該当する部分を今回のケースに適用しています。

In [None]:
# Mount Google Drive at the relative path "drive" and keep the base folder
# used by this notebook in colab_path (the pickled dataset is read from
# colab_path + 'data/...' further down).
from google.colab import drive
drive.mount("drive")
colab_path = "drive/MyDrive/Colaboratory/"

In [None]:
# The following cells were tested on 3.11.3 (presumably the Python version - confirm).
# Upgrade pip itself before installing the notebook's dependencies.
!pip install --upgrade pip

In [None]:
# Install the packages used by this notebook.
# Optional clean-up steps, left disabled:
#!pip uninstall -y numpy # reset numpy for dependency
#!pip uninstall -y setuptools
!pip install setuptools
!pip install numpy
!pip install accern-data datasets
# NOTE: datasets is already listed on the previous line as well.
!pip install datasets "transformers[ja,torch]" matplotlib scikit-learn 
!pip install pickle5

# PyTorch installation instructions can be found here:
# https://pytorch.org/get-started/locally/
# GPU environment
!pip install torch torchvision torchaudio
# CPU environment (alternative commands, left disabled)
#!pip3 install torch torchvision torchaudio                                                                   # on mac
#!pip install torch==2.0.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html  # on windows

In [None]:
if DATAMODE == 'ACCERN-API':
    import requests

    # Download the helper modules into the working directory so that they can
    # be imported below. When pointing at GitHub, use the "Raw" file URLs.
    giturl1 = 'https://raw.githubusercontent.com/hirokiOS/SentimentAnalysisWithDownloadedDataSource/main/acc_function/ACCDFConcatenator.py'
    giturl2 = 'https://raw.githubusercontent.com/hirokiOS/SentimentAnalysisWithDownloadedDataSource/main/acc_function/ConvertDfToHFData.py'

    giturls = [giturl1, giturl2]

    for giturl in giturls:
        # The timeout keeps the cell from hanging on a stalled connection.
        r = requests.get(giturl, timeout=30)
        # Stop on HTTP errors so that an error page is never written out as a
        # .py file, which would only fail later with a confusing import error.
        r.raise_for_status()

        # The saved file name must match the module name imported below.
        filename = giturl.split("/")[-1]
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(r.text)

    # The modules now exist locally and can be imported.
    from ACCDFConcatenator import DFConcatenator
    from ConvertDfToHFData import Convert4SentimentAnalysis

乱数を固定

In [None]:
# Fix the random seed for this notebook.
import torch
from transformers.trainer_utils import set_seed
# from transformers import set_seed

# Fix the random seed to 42
set_seed(42)

### 3.1 モデルの予測結果の取得

model_savepathは実行環境に依存しますので、適切に変更してください。
私の環境ではノートブック2をそのまま実行した際、checkpoint-685が訓練での最後の出力となりました。
したがって、このノートブックではcheckpoint-685をモデル出力として評価を行います。

In [None]:
from transformers import pipeline
import torch
from transformers import BertModel

# Run on the current CUDA device when one is available, otherwise on the CPU.
device = (
    f"cuda:{torch.cuda.current_device()}" if torch.cuda.is_available() else "cpu"
)

if DATAMODE == 'WRIME':
    # Published WRIME sentiment model, loaded by name.
    model_name = "llm-book/bert-base-japanese-v3-wrime-sentiment"
    sentiment_pipeline = pipeline(model=model_name, device=device)
elif DATAMODE in ('ACCERN-API', 'ACCERN-DRIVE'):
    # Notebook 2 must have been run first: the Drive folder has to contain the
    # model it exported. Point model_savepath at your last output.
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    # Adjust these names to your environment / number of epochs if necessary.
    model_dir = '/content/drive/My Drive/Colaboratory/data/output_finbert/'
    model_savepath = "finbert_finetuned/model"  # this depends on your output

    # Load the saved model from local files only.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_dir + model_savepath, local_files_only=True
    )
    # The tokenizer is taken from the "ProsusAI/finbert" checkpoint.
    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    sentiment_pipeline = pipeline(
        "text-classification", model=model, device=device, tokenizer=tokenizer
    )
else:
    # Without this branch an unknown mode would leave sentiment_pipeline
    # undefined and only fail in a later cell.
    raise ValueError(f"Unknown DATAMODE: {DATAMODE!r}")

In [None]:
from datasets import load_dataset

# pickle5 is only a backport of pickle protocol 5 for older Python versions;
# fall back to the standard-library module when it is not installed.
try:
    from pickle5 import pickle
except ImportError:
    import pickle

if DATAMODE == 'WRIME':
    # Load the data from the llm-book/wrime-sentiment repository on the
    # Hugging Face Hub.
    train_dataset = load_dataset("llm-book/wrime-sentiment", split="train", remove_neutral=False)
    valid_dataset = load_dataset("llm-book/wrime-sentiment", split="validation", remove_neutral=False)

elif DATAMODE in ['ACCERN-DRIVE', 'ACCERN-API']:
    if DATAMODE == 'ACCERN-DRIVE':
        # Read the pickle file created in notebook 1.
        # NOTE(security): unpickling can execute arbitrary code; only load a
        # file that you produced yourself.
        with open(colab_path + 'data/accern_dataset.pkl', 'rb') as f:
            accern_dataset = pickle.load(f)

    elif DATAMODE == 'ACCERN-API':
        # Fetch the data from the API and convert it for Hugging Face
        # fine-tuning.
        df = DFConcatenator(
            url=accern_api_url,
            token=accern_token_url,
            start_date="2016-09-08", end_date="2016-09-09",
            output_pattern="oct31", output_path="./accern_raw_json/",
            mode="json", split_dates=False)

        accern_dataset = Convert4SentimentAnalysis(df, threshold=20, target_label='event_sentiment')

    # Hold out 10% of the examples as the validation set.
    # NOTE(review): no explicit seed is passed here; confirm that this split
    # matches the one used when the model was trained in notebook 2.
    train_valid_dataset = accern_dataset.train_test_split(test_size=0.10)
    # required_labels = ['sentence', 'label', 'datetime']
    train_dataset = train_valid_dataset['train']  # [required_labels]
    valid_dataset = train_valid_dataset['test']  # [required_labels]

else:
    # Without this branch an unknown mode would only fail at the print below
    # with a NameError.
    raise ValueError(f"Unknown DATAMODE: {DATAMODE!r}")

# Show one training example as a sanity check.
print(train_dataset[3])


In [None]:
from tqdm import tqdm

# ClassLabel instance used to look up label names
class_label = valid_dataset.features["label"]

# One record per validation example: its index, the predicted label with its
# score, and the gold label name.
results: list[dict[str, int | float | str]] = []
# tqdm wraps the dataset itself (not the enumerate iterator) so that it can
# read the length and show progress against a total.
for i, example in enumerate(tqdm(valid_dataset)):
    # Get the model prediction; truncation is requested explicitly so that
    # inputs longer than max_length are cut instead of relying on the
    # tokenizer's implicit fallback.
    model_prediction = sentiment_pipeline(
        example["sentence"], truncation=True, max_length=512
    )[0]
    # Convert the gold label ID into its label name
    true_label = class_label.int2str(example["label"])

    # Store the information needed for the analysis
    results.append(
        {
            "example_id": i,
            "pred_prob": model_prediction["score"],
            "pred_label": model_prediction["label"],
            "true_label": true_label,
        }
    )

### 3.2 全体的な傾向の分析

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

plt.rcParams["font.size"] = 18  # enlarge the font size

# Build the confusion matrix. The result is stored under its own name so that
# the imported confusion_matrix function is not overwritten; otherwise
# re-running this cell would fail.
conf_mat = confusion_matrix(
    y_true=[result["true_label"] for result in results],
    y_pred=[result["pred_label"] for result in results],
    labels=class_label.names,
)
# Display the confusion matrix as an image
ConfusionMatrixDisplay(
    conf_mat, display_labels=class_label.names
).plot()

### 3.3 モデルのショートカットに注意

In [None]:
# 予測が誤った事例を収集
failed_results = [
    res for res in results if res["pred_label"] != res["true_label"]
]
# モデルの予測確率が高い順にソート
sorted_failed_results = sorted(
    failed_results, key=lambda x: -x["pred_prob"]
)
# 高い確率で予測しながら誤った事例の上位2件を表示
for top_result in sorted_failed_results[:5]:
    review_text = valid_dataset[top_result["example_id"]]["sentence"]
    print(f"レビュー文：{review_text}")
    print(f"予測：{top_result['pred_label']}")
    print(f"正解：{top_result['true_label']}")
    print(f"予測確率: {top_result['pred_prob']:.4f}")
    print("----------------")