In [1]:
import re
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [2]:
# Checkpoint used for emotion classification of Japanese text.
# NOTE(review): the name suggests a Japanese LUKE model fine-tuned on WRIME — confirm on the model card.
# Alternative checkpoint tried earlier (not used):
#   'lxyuan/distilbert-base-multilingual-cased-sentiments-student'
model_name = 'Mizuiro-sakura/luke-japanese-large-sentiment-analysis-wrime'

# Tokenizer and classifier are loaded from the same checkpoint; the tokenizer's
# maximum sequence length is set to 512.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# A `pipeline('sentiment-analysis', ...)` wrapper was considered here but is not used;
# the model is called directly further down.

# Pick the compute device: Apple's MPS backend when available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Move the model onto the selected device.
model = model.to(device)


In [3]:
# Load the Φ table stored for the EM run; its column labels carry the words.
pd_Φ = pd.read_parquet('./parquet_data/20250227_tweet_EM_Φ.parquet')

# Strip every '単語<digits>:' marker from each column label; what remains is used as the word.
word_prefix = re.compile(r'単語\d+:')
words = []
for column_name in pd_Φ.columns:
    words.append(word_prefix.sub('', column_name))

# Emotion names, hard-coded rather than read from `model.config.id2label`.
# NOTE(review): the order presumably matches the model's class ids (0..7) — confirm.
sentiment_list = ['喜び', '悲しみ', '期待', '驚き', '怒り', '恐れ', '嫌悪', '信頼']
sentiment_list

['喜び', '悲しみ', '期待', '驚き', '怒り', '恐れ', '嫌悪', '信頼']

In [4]:
# Classify every word with the model, a few words at a time, and collect one
# 1-based class id per word (same order as `words`).
res_class  = []
batch_size = 10

# Make sure dropout is disabled; this is a no-op if the model is already in eval mode.
model.eval()

# Inference only: without no_grad() every forward pass would record an autograd
# graph that is never used, wasting memory and time.
with torch.no_grad():
    for idx in range(0, len(words), batch_size):
        targets = words[idx:idx + batch_size]

        # Pad to the longest word in the batch and truncate at 512 tokens.
        inputs  = tokenizer(targets, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs  = {key: value.to(device) for key, value in inputs.items()}
        outputs = model(**inputs)

        # softmax is monotonic, so the argmax of the raw logits picks the same class.
        label   = outputs.logits.argmax(dim=1).tolist()

        # Shift to 1-based ids: 1 is the first entry of `sentiment_list`, 2 the second, ...
        # (consumers look the name up with `sentiment_list[id - 1]`).
        res     = [1 + elem for elem in label]
        res_class.extend(res)

# Save the per-word classification result.
pd_sentiment = pd.DataFrame({'word': words, 'sentiment': res_class})
pd_sentiment.to_parquet('./parquet_data/20250227_tweet_sentiment.parquet')
    

In [5]:
def sum_word_weights_by_sentiment(phi, word_classes, labels):
    """Collapse the word columns of a Φ table into one column per sentiment label.

    Each word column is added to the column of the label its class id points at,
    so every output column is the sum of the word columns assigned to that label.

    Parameters
    ----------
    phi : pandas.DataFrame
        Table with one column per word; columns are matched to ``word_classes``
        by position.
    word_classes : sequence of int
        1-based class id for each column of ``phi`` (``labels[id - 1]`` is the
        label name).
    labels : sequence of str
        Label names; also the column order of the result.

    Returns
    -------
    pandas.DataFrame
        Frame with the index of ``phi`` and one float column per label. ``phi``
        itself is not modified.

    Raises
    ------
    ValueError
        If the number of columns in ``phi`` differs from ``len(word_classes)``;
        positional matching would otherwise silently drop or mis-assign words.
    """
    word_columns = list(phi.columns)
    if len(word_columns) != len(word_classes):
        raise ValueError(
            f'Φ has {len(word_columns)} columns but {len(word_classes)} word classes were given'
        )

    # Accumulate in a separate frame so the label columns can never be confused
    # with (or overwrite) word columns of the input.
    totals = pd.DataFrame(0.0, index=phi.index, columns=list(labels))
    for col, res in zip(word_columns, word_classes):
        label = labels[res - 1]
        totals[label] = totals[label] + phi[col]
    return totals


# Save the per-sentiment aggregation for each estimation algorithm:
# EM, variational Bayes (VB) and Gibbs sampling (CGS).
# NOTE(review): `res_class` was computed from the EM table's columns; the VB and CGS
# tables are assumed to list the same words in the same order — confirm.
for algorithm in ('EM', 'VB', 'CGS'):
    phi_words = pd.read_parquet(f'./parquet_data/20250227_tweet_{algorithm}_Φ.parquet')
    # `pd_Φ` is rebound on purpose: after the loop it holds the CGS result,
    # the same value the name had at the end of the original cell.
    pd_Φ = sum_word_weights_by_sentiment(phi_words, res_class, sentiment_list)
    pd_Φ.to_parquet(f'./parquet_data/20250227_tweet_{algorithm}_sentiment.parquet')