In [1]:
import re
import pandas as pd
from tqdm.notebook import tqdm

import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [2]:
# Checkpoint of the pre-trained Japanese sentiment-analysis model to use.
# Alternative checkpoint (currently disabled):
#   'Mizuiro-sakura/luke-japanese-large-sentiment-analysis-wrime'
model_name = 'patrickramos/bert-base-japanese-v2-wrime-fine-tune'

# Pick the compute device: Apple's MPS backend when available, otherwise the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Load the tokenizer (inputs are capped at 512 tokens) and the classification
# model, and move the model onto the selected device right away.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

# The high-level pipeline API is not used; kept here for reference only.
# nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, truncation=True)


In [3]:
# Topic-word table produced by the EM run; its column names carry the vocabulary.
pd_Φ = pd.read_parquet('./parquet_data/20250227_tweet_EM_Φ.parquet')

# Drop the "単語<number>:" prefix from every column name to obtain the bare words.
words = [re.sub(r'単語\d+:', '', column_name) for column_name in pd_Φ.columns]

# Labels used for the stored scores. Only the eight "客観感情" (objective emotion)
# labels are active; the "主観感情" (subjective emotion) counterparts
# (喜び, 悲しみ, 期待, 驚き, 怒り, 恐れ, 嫌悪, 信頼) are intentionally left out.
# The labels could also be read from the model itself:
# sentiment_list = [label for label in model.config.id2label.values()]
sentiment_list = [
    '客観感情：喜び',
    '客観感情：悲しみ',
    '客観感情：期待',
    '客観感情：驚き',
    '客観感情：怒り',
    '客観感情：恐れ',
    '客観感情：嫌悪',
    '客観感情：信頼',
]
sentiment_list

['客観感情：喜び',
 '客観感情：悲しみ',
 '客観感情：期待',
 '客観感情：驚き',
 '客観感情：怒り',
 '客観感情：恐れ',
 '客観感情：嫌悪',
 '客観感情：信頼']

In [4]:
# Score every vocabulary word with the model and store, per word, the strengths
# of the labels listed in `sentiment_list`.
#
# Pipeline-based alternative (not used, kept for reference):
# classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
# classifier('お、お前が犯人だったのか･･････！')

output_size = len(sentiment_list)
res_class   = []
batch_size  = 64

# Inference only: without no_grad() every forward pass records an autograd graph,
# which wastes memory and time for results that are never back-propagated.
with torch.no_grad():
    for idx in range(0, len(words), batch_size):
        targets = words[idx:idx+batch_size]

        inputs  = tokenizer(targets, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs  = {key: value.to(device) for key, value in inputs.items()}
        outputs = model(**inputs)
        # Negative scores are clamped to 0. clamp() is equivalent to the former
        # torch.maximum(logits, torch.tensor(0)) but does not mix a CPU tensor
        # with logits that live on `device`.
        outputs = torch.clamp(outputs.logits, min=0)
        # Logit columns 0-7 are the writer-side scores
        # (0:writer_joy, 1:writer_sadness, 2:writer_anticipation, 3:writer_surprise,
        #  4:writer_anger, 5:writer_fear, 6:writer_disgust, 7:writer_trust).
        # They are skipped here: the columns kept start at index 8 and are stored
        # under the labels of `sentiment_list`, in that order.
        streng  = outputs[:, 8:8+output_size].tolist()

        res_class.extend(streng)

# Save the per-word scores: one row per word, one column per label.
pd_sentiment = pd.DataFrame(
    data=[[word] + scores for word, scores in zip(words, res_class)],
    columns=['word'] + sentiment_list,
)
pd_sentiment.to_parquet('./parquet_data/20250227_tweet_sentiment.parquet')
    

In [5]:
def compute_topic_sentiment(topic_word, word_sentiment, labels):
    """Turn a topic-word table into a normalised topic-sentiment table.

    Each output cell is the sum over words of (topic-word weight) x
    (that word's score for the label), i.e. the matrix product
    ``topic_word @ word_sentiment[labels]``. Every row is then divided by its
    own sum and rounded to 4 decimals.

    Parameters
    ----------
    topic_word : pd.DataFrame
        One row per topic, one column per word.
    word_sentiment : pd.DataFrame
        One row per word with at least the columns in ``labels``. Rows are
        matched to the columns of ``topic_word`` BY POSITION, so both must list
        the words in the same order.
    labels : list of str
        Sentiment columns to use; also the column order of the result.

    Returns
    -------
    pd.DataFrame
        Indexed like ``topic_word``, with one column per label.

    Raises
    ------
    ValueError
        If the number of words differs between the two inputs. The earlier
        zip()-based loop silently truncated in that case and produced
        misaligned results.
    """
    if topic_word.shape[1] != len(word_sentiment):
        raise ValueError(
            f'topic-word table has {topic_word.shape[1]} word columns but '
            f'{len(word_sentiment)} word scores were given'
        )

    # A single matrix product replaces the former per-word / per-label column loop.
    weighted = topic_word.to_numpy(dtype=float) @ word_sentiment[labels].to_numpy(dtype=float)
    topic_sentiment = pd.DataFrame(weighted, index=topic_word.index, columns=labels)

    # Normalise each topic so its label weights sum to 1 (before rounding).
    # A topic whose weighted scores are all zero yields NaN here.
    topic_sentiment = topic_sentiment.div(topic_sentiment.sum(axis=1), axis=0)
    return topic_sentiment.round(4)


# (topic-word table Φ, output file) for each inference algorithm.
topic_word_files = [
    # EM algorithm
    ('./parquet_data/20250227_tweet_EM_Φ.parquet',  './parquet_data/20250227_tweet_EM_sentiment.parquet'),
    # Variational Bayes
    ('./parquet_data/20250227_tweet_VB_Φ.parquet',  './parquet_data/20250227_tweet_VB_sentiment.parquet'),
    # Collapsed Gibbs sampling
    ('./parquet_data/20250227_tweet_CGS_Φ.parquet', './parquet_data/20250227_tweet_CGS_sentiment.parquet'),
]

# The per-word scores are taken from `pd_sentiment` rather than `res_class`:
# `res_class` is rebound to a list of label strings by the per-document cell,
# so relying on it made this cell fail when re-run after that one.
for source_path, result_path in topic_word_files:
    pd_Φ = compute_topic_sentiment(pd.read_parquet(source_path), pd_sentiment, sentiment_list)
    pd_Φ.to_parquet(result_path)

In [6]:
# Score every tweet with the model, then store the strongest label and the
# per-label strengths next to the tweet text.
df_tweet  = pd.read_parquet('./parquet_data/20250227_tweet_normalized.parquet')
documents = df_tweet['text'].tolist()

output_size = len(sentiment_list)
res_class   = []
res_streng  = []
batch_size  = 4

# Inference only: without no_grad() every forward pass records an autograd graph,
# which wastes memory and time for results that are never back-propagated.
with torch.no_grad():
    for idx in tqdm(range(0, len(documents), batch_size)):
        targets = documents[idx:idx+batch_size]

        inputs  = tokenizer(targets, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs  = {key: value.to(device) for key, value in inputs.items()}
        outputs = model(**inputs)
        # Logit columns 0-7 are the writer-side scores
        # (0:writer_joy, 1:writer_sadness, 2:writer_anticipation, 3:writer_surprise,
        #  4:writer_anger, 5:writer_fear, 6:writer_disgust, 7:writer_trust).
        # They are skipped here: the columns kept start at index 8 and are stored
        # under the labels of `sentiment_list`, in that order.
        logits  = outputs.logits[:, 8:8+output_size]

        # Choose the label from the RAW logits. Taking argmax after clamping made
        # every document without a positive score tie at 0, and argmax then
        # always returned the first label.
        results = [sentiment_list[label_idx] for label_idx in torch.argmax(logits, dim=1).tolist()]

        # Stored strengths keep the original convention: negatives clamped to 0.
        # clamp() also avoids mixing a CPU scalar tensor with logits on `device`.
        streng  = torch.clamp(logits, min=0).tolist()

        res_streng.extend(streng)
        res_class.extend(results)

# Save the per-document results: the winning label plus one strength column per label.
df_tweet['classification'] = res_class
for col_idx, label in enumerate(sentiment_list):
    df_tweet[label] = [row[col_idx] for row in res_streng]

# NOTE: this overwrites the file the tweets were loaded from.
df_tweet.to_parquet('./parquet_data/20250227_tweet_normalized.parquet')


  0%|          | 0/511 [00:00<?, ?it/s]

In [7]:
# Reload the saved VB topic-sentiment table and append each row's total as a
# 'sum' column so the per-topic totals can be inspected in the output.
pd_vb = pd.read_parquet('./parquet_data/20250227_tweet_VB_sentiment.parquet')
pd_vb = pd_vb.assign(sum=pd_vb.sum(axis=1))
pd_vb

Unnamed: 0,客観感情：喜び,客観感情：悲しみ,客観感情：期待,客観感情：驚き,客観感情：怒り,客観感情：恐れ,客観感情：嫌悪,客観感情：信頼,sum
トピック1,0.2359,0.1462,0.2697,0.1311,0.0235,0.0586,0.1055,0.0295,1.0
トピック2,0.224,0.1736,0.2556,0.1023,0.021,0.0655,0.1286,0.0293,0.9999
トピック3,0.2179,0.1605,0.2549,0.1233,0.0223,0.0564,0.132,0.0328,1.0001
トピック4,0.2503,0.1663,0.2464,0.1155,0.0209,0.0553,0.1153,0.03,1.0
トピック5,0.1949,0.1874,0.253,0.1047,0.0224,0.0674,0.1485,0.0218,1.0001
トピック6,0.2651,0.1596,0.2672,0.0959,0.0174,0.0473,0.1208,0.0266,0.9999
トピック7,0.2586,0.1641,0.2435,0.1094,0.0187,0.0538,0.1198,0.0322,1.0001
トピック8,0.196,0.1858,0.2528,0.1168,0.0238,0.0592,0.1423,0.0233,1.0
トピック9,0.2318,0.1706,0.2597,0.105,0.0179,0.0651,0.1238,0.0262,1.0001
トピック10,0.224,0.1817,0.2492,0.1028,0.0157,0.0636,0.1365,0.0265,1.0
