In [None]:
%load_ext autoreload
%autoreload 2

import re
import json
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

%matplotlib inline

In [2]:
# Raw CSV exports of the collected posts, in the order they are concatenated.
tweet_csv_paths = [
    'csv_data/20250127_tweet_data.csv',
    'csv_data/20250304_tweet_data.csv',
    'csv_data/20250313_tweet_data.csv',
]


def load_tweet_csv(path):
    """Read one tweet CSV and strip a leading '[' / trailing ']' from its `text` column.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with the cleaned `text` column.
    """
    frame = pd.read_csv(path)
    frame['text'] = frame['text'].str.removeprefix('[').str.removesuffix(']')
    return frame


# Stack every export into one frame and keep only the columns used downstream.
df_tweet = pd.concat([load_tweet_csv(path) for path in tweet_csv_paths]).reset_index(drop=True)
df_tweet = df_tweet[['created_at', 'text']]

# NOTE(review): normalization_string is neither defined nor imported in the cells
# shown here; presumably it comes from a local module -- confirm it is imported,
# otherwise this cell fails on a fresh kernel.
df_tweet = normalization_string(df_tweet, 'text')

# De-duplicate on the normalized text (first occurrence wins), then order by timestamp.
df_tweet = df_tweet.drop_duplicates(subset='text', keep='first', ignore_index=True)
df_tweet = df_tweet.sort_values(by='created_at', ascending=True, ignore_index=True)

# Persist the normalized posts; the inference cells reload them from this file.
df_tweet.to_parquet('./parquet_share_data/20250313_X_post_normalized.parquet')
df_tweet

Unnamed: 0,created_at,text
0,2025-01-20 18:01:41+09:00,'jinsの目が小さくなりにくい眼鏡ってどんな度数でも意味あるのかな'
1,2025-01-20 18:09:05+09:00,"'in有楽町jins:sparkles:', 'jinsウォヌ好きすぎて辛い:smiling..."
2,2025-01-20 18:09:37+09:00,"'jins「ラバー外してみたらこのように内部の痛みがひどく...」', '私(さすがに買い替..."
3,2025-01-20 18:16:48+09:00,'「ジンズ」がロサンゼルスに新店をオープン\u3000デジタルを活用し新たな顧客体験を提供\...
4,2025-01-20 18:22:49+09:00,'少し前のヘラルボニーのイベントで、jinsの田中社長がパネルディスカッションのパネリストや...
...,...,...
2691,2025-03-12 22:11:32+09:00,'てぃんと谷口さんメガネが一緒だと知り萌えましたjinsの「誰でも似合うメガネ」だそうです'
2692,2025-03-12 22:22:51+09:00,'ずっとjinsでしたが、もういい歳なのでちょいええメガネ買いました!さらに男前に!!なるは...
2693,2025-03-12 23:17:15+09:00,'久々にjins memeを発掘したけど充電器が行方不明'
2694,2025-03-12 23:27:37+09:00,'............jinsにドラパルトデザインの眼鏡発売されてた.........ブ...


In [3]:
# Load the pre-trained Japanese sentiment models together with their tokenizers.
# Each checkpoint gets its own constant so the two names cannot be confused.
POLARITY_MODEL_NAME = 'christian-phu/bert-finetuned-japanese-sentiment'
EMOTION_MODEL_NAME  = 'patrickramos/bert-base-japanese-v2-wrime-fine-tune'

# Polarity model and tokenizer.
model_polarity = AutoModelForSequenceClassification.from_pretrained(POLARITY_MODEL_NAME)
tokenizer_polarity = AutoTokenizer.from_pretrained(POLARITY_MODEL_NAME, model_max_length=512)

# Detailed-emotion model and tokenizer.
model_emotional = AutoModelForSequenceClassification.from_pretrained(EMOTION_MODEL_NAME)
tokenizer_emotional = AutoTokenizer.from_pretrained(EMOTION_MODEL_NAME, model_max_length=512)

# Pick the compute device: CUDA when present, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Move both models to the device and put them in eval mode explicitly,
# since the later cells only run inference.
model_polarity  = model_polarity.to(device).eval()
model_emotional = model_emotional.to(device).eval()

In [4]:
# Reload the normalized posts so this cell does not depend on kernel state from the load cell.
df_tweet  = pd.read_parquet('./parquet_share_data/20250313_X_post_normalized.parquet')
documents = df_tweet['text'].tolist()

res_class  = []
batch_size = 16

# Inference only: without no_grad() every forward pass would build an autograd
# graph and hold on to activations that are never used.
with torch.no_grad():
    for start in tqdm(range(0, len(documents), batch_size)):
        targets = documents[start:start + batch_size]

        inputs = tokenizer_polarity(targets, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        logits = model_polarity(**inputs).logits

        # softmax is monotonic, so argmax over the raw logits selects the same class.
        label_ids = torch.argmax(logits, dim=1).tolist()

        # Label strings come from the model config
        # (original note: 0:negative, 1:neutral, 2:positive -- TODO confirm against id2label).
        res_class.extend(model_polarity.config.id2label[label_id] for label_id in label_ids)

# Store the predicted polarity per post and persist it for the next stage.
df_tweet['emotional polarity'] = res_class
df_tweet.to_parquet('./parquet_share_data/20250313_X_post_polarity.parquet')

  0%|          | 0/169 [00:00<?, ?it/s]

In [5]:
# Display labels for the emotion scores kept by the emotion-inference cell.
# Only the "客観感情" (objective emotion) labels are active; the matching
# "主観感情" (subjective emotion) set is intentionally left out, as is deriving
# the labels from model_emotional.config.id2label.
sentiment_list = [
    '客観感情：喜び',
    '客観感情：悲しみ',
    '客観感情：期待',
    '客観感情：驚き',
    '客観感情：怒り',
    '客観感情：恐れ',
    '客観感情：嫌悪',
    '客観感情：信頼',
]
sentiment_list

['客観感情：喜び',
 '客観感情：悲しみ',
 '客観感情：期待',
 '客観感情：驚き',
 '客観感情：怒り',
 '客観感情：恐れ',
 '客観感情：嫌悪',
 '客観感情：信頼']

In [7]:
# Reload the polarity-annotated posts so this cell does not depend on earlier kernel state.
df_tweet  = pd.read_parquet('./parquet_share_data/20250313_X_post_polarity.parquet')
documents = df_tweet['text'].tolist()

output_size  = len(sentiment_list)
# First model output column that belongs to the scores labelled by sentiment_list.
# The original note lists outputs 0-7 as writer_joy, writer_sadness, writer_anticipation,
# writer_surprise, writer_anger, writer_fear, writer_disgust, writer_trust; this cell keeps
# the columns that follow them -- TODO confirm the layout against the model card.
label_offset = 8
res_class    = []
res_strength = []
batch_size   = 4

# Inference only: without no_grad() every forward pass would build an autograd
# graph and hold on to activations that are never used.
with torch.no_grad():
    for start in tqdm(range(0, len(documents), batch_size)):
        targets = documents[start:start + batch_size]

        inputs = tokenizer_emotional(targets, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        logits = model_emotional(**inputs).logits

        # Clip the raw scores into the [0, 4] range.
        scores   = torch.clamp(logits, min=0, max=4)
        strength = scores[:, label_offset:label_offset + output_size]

        # NOTE: argmax runs on the clipped scores, so rows whose selected scores tie
        # (e.g. all clipped to 0) resolve to the first label in sentiment_list.
        top_ids = torch.argmax(strength, dim=1).tolist()

        res_class.extend(sentiment_list[label_id] for label_id in top_ids)
        res_strength.extend(strength.tolist())

# Store the top label plus one score column per label, then persist the result.
df_tweet['classification'] = res_class
for column_idx, label in enumerate(sentiment_list):
    df_tweet[label] = [row[column_idx] for row in res_strength]

df_tweet.to_parquet('./parquet_share_data/20250313_X_post_emotional_polarity.parquet')


  0%|          | 0/674 [00:00<?, ?it/s]