In [1]:
%load_ext autoreload
%autoreload 2

import re
import json
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from Latent_Dirichlet_Allocation import normalization_string, create_stop_word
from Latent_Dirichlet_Allocation import Harmonized_Sentiment_Topic_Model_In_VB

%matplotlib inline

In [None]:
# Read each raw CSV export and strip the '[' / ']' wrapped around every 'text' cell.
raw_frames = []
for csv_path in (
    'csv_data/20250127_tweet_data.csv',
    'csv_data/20250304_tweet_data.csv',
    'csv_data/20250313_tweet_data.csv',
):
    frame = pd.read_csv(csv_path)
    frame['text'] = frame['text'].str.removeprefix('[').str.removesuffix(']')
    raw_frames.append(frame)

# Keep the per-file frames reachable under their original names.
df_tweet1, df_tweet2, df_tweet3 = raw_frames

# Stack the exports, keep timestamp + text, run the project's normalization_string
# helper on the 'text' column, drop repeated texts (first occurrence wins) and
# order the rows by 'created_at'.
df_tweet = (
    pd.concat(raw_frames, ignore_index=True)[['created_at', 'text']]
    .pipe(normalization_string, 'text')
    .drop_duplicates(subset='text', keep='first', ignore_index=True)
    .sort_values(by='created_at', ascending=True, ignore_index=True)
)

# Persist the cleaned posts for the downstream cells, then display them.
df_tweet.to_parquet('./parquet_share_data/20250313_X_post_normalized.parquet')
df_tweet

In [None]:
# Pick the compute device first: "mps" when torch reports it available, otherwise "cpu".
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Pretrained Japanese sentiment checkpoint used for emotional-polarity classification.
# Tokenizer and model are loaded from the same checkpoint; the model is moved to `device`.
model_name = 'christian-phu/bert-finetuned-japanese-sentiment'
tokenizer_polarity = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model_polarity = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

# Checkpoint used for the detailed (per-emotion) analysis.
model_name = 'patrickramos/bert-base-japanese-v2-wrime-fine-tune'
tokenizer_emotional = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model_emotional = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

# Unused alternative: a ready-made sentiment-analysis pipeline.
# nlp = pipeline('sentiment-analysis', model=model_polarity, tokenizer=tokenizer_polarity, truncation=True)

In [None]:
# Classify the emotional polarity of every normalized post with `model_polarity`,
# in mini-batches, and store the predicted label name per post.
df_tweet  = pd.read_parquet('./parquet_share_data/20250313_X_post_normalized.parquet')
documents = df_tweet['text'].tolist()

res_class  = []
batch_size = 16

# Inference only: without no_grad() autograd records a graph for every forward
# pass, which costs memory and time for gradients that are never used.
with torch.no_grad():
    for idx in tqdm(range(0, len(documents), batch_size)):
        targets = documents[idx:idx+batch_size]

        # Tokenize the batch (padded to its longest item, truncated at 512 tokens)
        # and move every tensor to the device the model lives on.
        inputs  = tokenizer_polarity(targets, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs  = {key: value.to(device) for key, value in inputs.items()}

        # softmax is monotonic, so argmax over the raw logits selects the same class.
        outputs = model_polarity(**inputs).logits
        label   = torch.argmax(outputs, dim=1).tolist()

        # Map class indices to the checkpoint's label names
        # (original note: 0:negative, 1:neutral, 2:positive).
        res_class.extend(model_polarity.config.id2label[elem] for elem in label)

# Save the per-document classification result.
df_tweet['emotional polarity'] = res_class
df_tweet.to_parquet('./parquet_share_data/20250313_X_post_polarity.parquet')

In [2]:
# Column labels for the eight writer-side emotion scores, in the order the later
# cells use to index the model output.
# The labels could instead be read from the checkpoint:
#   list(model_emotional.config.id2label.values())
# The matching reader-side ('客観感情：…') labels are not used here.
sentiment_list = [
    f'主観感情：{emotion}'
    for emotion in ('喜び', '悲しみ', '期待', '驚き', '怒り', '恐れ', '嫌悪', '信頼')
]
sentiment_list

['主観感情：喜び',
 '主観感情：悲しみ',
 '主観感情：期待',
 '主観感情：驚き',
 '主観感情：怒り',
 '主観感情：恐れ',
 '主観感情：嫌悪',
 '主観感情：信頼']

In [None]:
# Score every post with `model_emotional`, keep the first len(sentiment_list)
# outputs as per-emotion strengths, and label each post with its strongest emotion.
df_tweet  = pd.read_parquet('./parquet_share_data/20250313_X_post_polarity.parquet')
documents = df_tweet['text'].tolist()

output_size  = len(sentiment_list)
res_class    = []
res_strength = []
batch_size   = 4

# Inference only: without no_grad() autograd records a graph for every forward
# pass, which costs memory and time for gradients that are never used.
with torch.no_grad():
    for idx in tqdm(range(0, len(documents), batch_size)):
        targets = documents[idx:idx+batch_size]

        # Tokenize the batch (padded to its longest item, truncated at 512 tokens)
        # and move every tensor to the device the model lives on.
        inputs   = tokenizer_emotional(targets, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs   = {key: value.to(device) for key, value in inputs.items()}
        outputs  = model_emotional(**inputs)

        # Bound the raw outputs to [0, 4]; clamp does this in one call without
        # building scalar tensors on the CPU every iteration.
        outputs  = torch.clamp(outputs.logits, min=0, max=4)

        # Columns 0..output_size-1 are used as the writer-side scores.
        # (The original also sketched `outputs[:, 8:8+output_size]` as an alternative slice.)
        strength = outputs[:, 0:output_size]

        # Strongest emotion per post; a distinct name avoids shadowing the batch offset `idx`.
        results  = [sentiment_list[best] for best in torch.argmax(strength, dim=1).tolist()]
        strength = strength.tolist()

        # Original note on column order:
        # 0:writer_joy, 1:writer_sadness, 2:writer_anticipation, 3:writer_surprise,
        # 4:writer_anger, 5:writer_fear, 6:writer_disgust, 7:writer_trust
        res_strength.extend(strength)
        res_class.extend(results)

# Save the per-document classification result: one label column plus one
# strength column per emotion.
df_tweet['classification'] = res_class
for col_idx, label in enumerate(sentiment_list):
    df_tweet[label] = [elem[col_idx] for elem in res_strength]

df_tweet.to_parquet('./parquet_share_data/20250313_X_post_emotional_polarity.parquet')


In [3]:
# Reload the posts together with their polarity and per-emotion columns, then display them.
df_tweet = pd.read_parquet(
    path='./parquet_share_data/20250313_X_post_emotional_polarity.parquet',
)
df_tweet

Unnamed: 0,created_at,text,emotional polarity,classification,主観感情：喜び,主観感情：悲しみ,主観感情：期待,主観感情：驚き,主観感情：怒り,主観感情：恐れ,主観感情：嫌悪,主観感情：信頼
0,2025-01-20 18:01:41+09:00,'jinsの目が小さくなりにくい眼鏡ってどんな度数でも意味あるのかな',positive,主観感情：期待,0.240769,0.231313,0.400943,0.313888,0.149386,0.100177,0.305792,0.195014
1,2025-01-20 18:09:05+09:00,"'in有楽町jins:sparkles:', 'jinsウォヌ好きすぎて辛い:smiling...",positive,主観感情：期待,1.763595,0.053120,1.921968,0.237734,0.000000,0.000000,0.000000,1.749680
2,2025-01-20 18:09:37+09:00,"'jins「ラバー外してみたらこのように内部の痛みがひどく...」', '私(さすがに買い替...",positive,主観感情：喜び,1.850866,0.418788,0.467625,0.371383,0.081674,0.148506,0.101506,1.512607
3,2025-01-20 18:16:48+09:00,'「ジンズ」がロサンゼルスに新店をオープン\u3000デジタルを活用し新たな顧客体験を提供\...,positive,主観感情：期待,1.230775,0.000000,1.369880,0.181928,0.008648,0.000000,0.079932,0.658114
4,2025-01-20 18:22:49+09:00,'少し前のヘラルボニーのイベントで、jinsの田中社長がパネルディスカッションのパネリストや...,positive,主観感情：期待,0.323086,0.384026,1.363942,0.310979,0.095131,0.177809,0.000000,0.943650
...,...,...,...,...,...,...,...,...,...,...,...,...
2691,2025-03-12 22:11:32+09:00,'てぃんと谷口さんメガネが一緒だと知り萌えましたjinsの「誰でも似合うメガネ」だそうです',positive,主観感情：喜び,1.764935,0.005849,0.884837,0.639886,0.019598,0.000000,0.033895,0.692446
2692,2025-03-12 22:22:51+09:00,'ずっとjinsでしたが、もういい歳なのでちょいええメガネ買いました!さらに男前に!!なるは...,positive,主観感情：期待,1.888680,0.149448,1.918642,0.310616,0.000000,0.000000,0.047065,0.686199
2693,2025-03-12 23:17:15+09:00,'久々にjins memeを発掘したけど充電器が行方不明',neutral,主観感情：悲しみ,0.264304,1.214582,0.073240,0.686179,0.165172,0.058393,0.000000,0.077262
2694,2025-03-12 23:27:37+09:00,'............jinsにドラパルトデザインの眼鏡発売されてた.........ブ...,positive,主観感情：喜び,1.297899,0.016761,0.466384,0.565725,0.000000,0.000000,0.073803,0.163657


In [4]:
# Build the stop-word list: curated English/Japanese words from JSON plus
# generated single-token noise (numbers, single letters, kana, symbols).

# Read the JSON explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the Japanese file in particular).
with open('json_data/custom_stopwords_en.json', 'r', encoding='utf-8') as f:
    stop_word_en = json.load(f)['stopwords']

with open('json_data/custom_stopwords_ja.json', 'r', encoding='utf-8') as f:
    stop_word_ja = json.load(f)['stopwords']

# Numbers 0..9999 as written, plus zero-padded 2-, 3- and 4-digit forms.
stop_word_digit1 = [str(idx) for idx in range(10000)]
stop_word_digit2 = [str(idx).zfill(2) for idx in range(100)]
stop_word_digit3 = [str(idx).zfill(3) for idx in range(1000)]
stop_word_digit4 = [str(idx).zfill(4) for idx in range(10000)]

# Single characters from several scripts, generated from code-point ranges.
stop_word_alpha  = [chr(idx) for idx in range(ord('a'), ord('z')+1)]
stop_word_ALPHA  = [chr(idx) for idx in range(ord('A'), ord('Z')+1)]
stop_word_hira   = [chr(idx) for idx in range(ord('あ'), ord('ん')+1)]
stop_word_kata   = [chr(idx) for idx in range(ord('ァ'), ord('ン')+1)]
stop_word_kanji  = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']
stop_word_greece = [chr(idx) for idx in range(ord('α'), ord('ω')+1)]
stop_word_GREECE = [chr(idx) for idx in range(ord('Α'), ord('Ω')+1)]
stop_word_cyril  = [chr(idx) for idx in range(ord('а'), ord('я')+1)]
stop_word_CYRIL  = [chr(idx) for idx in range(ord('А'), ord('Я')+1)]
stop_word_symbol = ['・', '゚', '!', '。', "'", '_', '%']

# Same concatenation order as before; parentheses replace the backslash
# continuations that mixed tabs and spaces.
stop_word_custom = (
    stop_word_digit1 + stop_word_digit2 + stop_word_digit3 + stop_word_digit4
    + stop_word_alpha + stop_word_ALPHA
    + stop_word_hira + stop_word_kata + stop_word_kanji
    + stop_word_greece + stop_word_GREECE
    + stop_word_cyril + stop_word_CYRIL
    + stop_word_symbol
)

# Variant without the generated tokens, kept for reference:
# stop_word = create_stop_word(df_tweet, 'text', stop_word_en + stop_word_ja, 5)
# NOTE(review): the meaning of the trailing 5 is defined by create_stop_word,
# which is not visible in this notebook - confirm before changing it.
stop_word = create_stop_word(df_tweet, 'text', stop_word_en + stop_word_ja + stop_word_custom, 5)

In [19]:
# Split the frame into the document columns and the per-emotion score columns,
# then fit the topic model on both.
df_tweet_data  = df_tweet.loc[:, ['created_at', 'text', 'emotional polarity', 'classification']]
df_tweet_label = df_tweet.loc[:, sentiment_list]

model = Harmonized_Sentiment_Topic_Model_In_VB(
    df_tweet_data,
    df_tweet_label,
    stop_word=stop_word,
    label_max_value=4,
    topic_num=10,
    max_iterate=1000,
)
model.fit()

print('topic of EM algorithm: finished')

idx:0 BB変分部: idx2:0  総微分量：56529.77994869048
                     要素あたりの微分量：4.986630718073603
                     q(Ψ)： 総微分量：231.5006304772243
                     q(Ψ)： 要素あたりの微分量：2.8937578809653037
                     q(θ)： 総微分量：56298.27931821325
                     q(θ)： 要素あたりの微分量：2.0928728371082994
idx:0 BB変分部: idx2:100  総微分量：93118.84998264426
                       要素あたりの微分量：9.170874483671383
                       q(Ψ)： 総微分量：458.0989519108605
                       q(Ψ)： 要素あたりの微分量：5.726236898885756
                       q(θ)： 総微分量：92660.7510307334
                       q(θ)： 要素あたりの微分量：3.444637584785628
idx:0 BB変分部: idx2:200  総微分量：135728.85758225867
                       要素あたりの微分量：21.32370860943102
                       q(Ψ)： 総微分量：1306.126484747012
                       q(Ψ)： 要素あたりの微分量：16.32658105933765
                       q(θ)： 総微分量：134422.73109751166
                       q(θ)： 要素あたりの微分量：4.99712755009337
idx:0 BB変分部: idx2:300  総微分量：156724.05395166727
                  

In [20]:
# Pull the five result tables out of the fitted model and write each one to its own parquet file.
pd_θ, pd_Ψ, pd_Φ1, pd_Φ0, pd_R = model.stats_info()

for stats_frame, parquet_path in (
    (pd_θ,  './parquet_data_original/20250227_tweet_VB_θ.parquet'),
    (pd_Ψ,  './parquet_data_original/20250227_tweet_VB_Ψ.parquet'),
    (pd_Φ1, './parquet_data_original/20250227_tweet_VB_Φ1.parquet'),
    (pd_Φ0, './parquet_data_original/20250227_tweet_VB_Φ0.parquet'),
    (pd_R,  './parquet_data_original/20250227_tweet_VB_R.parquet'),
):
    stats_frame.to_parquet(parquet_path)