# About the code

This notebook is the main entry point for collecting the data and producing the analysis output that answers the question "Quantifying the political stance of Japanese Diet members regarding specific political topics through the use of LLMs and statistical methods." (Tentative)
## Procedure
1. Create embeddings for each opinion-based sentence with regard to the different topics and store them in a retrievable manner.
2. Create one single "opinion-embedding" for each politician and store it in a retrievable manner.
3. Create a stance axis vector by either generating two reference points or picking two "opinion-embeddings" from the data as reference points
4. Collapse all the other vectors onto this axis by projecting them onto the axis
5. Create a scalar measurement for how far each politician is from the two reference points

## Notes
- Data is stored under `data/data_repr` directory
- We will attempt the procedure with different models to seek the best output


In [1]:
import os
from sentence_transformers import models, SentenceTransformer
import h5py
import torch
from params.paths import ROOT_DIR
from logger.Logger import Logger
from file_handling.file_read_writer import read_json, write_json, create_dir, write_file

# Notebook-wide configuration: logging verbosity, data locations and the model checkpoint.
VERBOSE = False
logger = Logger(verbose=VERBOSE)

DATA_DIR = os.path.join(ROOT_DIR, 'data')
DATA_REPR_SPEECHES_DIR = os.path.join(DATA_DIR, 'data_repr')
MODEL_NAME = "cl-tohoku/bert-base-japanese-v3"

# Party names are the entries of the speeches directory whose name contains no dot.
PARTIES = [entry for entry in os.listdir(DATA_REPR_SPEECHES_DIR) if '.' not in entry]

# Echo the resolved configuration so the cell output records what was used.
print('-----------------------------------')
print('DATA_DIR: ', DATA_DIR)
print('DATA_REPR_SPEECHES_DIR: ', DATA_REPR_SPEECHES_DIR)
print('PARTIES: ', PARTIES)
print('-----------------------------------')

-----------------------------------
DATA_DIR:  /root/projects/kokkai_analysis/data_prepping/params/../data
DATA_REPR_SPEECHES_DIR:  /root/projects/kokkai_analysis/data_prepping/params/../data/data_repr
PARTIES:  ['自民', '国民', '公明', '立憲', 'れ新', '維新', '無', '共産', '有志']
-----------------------------------


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# adapted from: https://osima.jp/posts/sentence-bert/

# Wrap the checkpoint named by MODEL_NAME as the token-level encoder module.
sentence_transformer = models.Transformer(MODEL_NAME)

# Pooling layer sized to the encoder's word-embedding dimension. Only CLS-token
# pooling is enabled; mean-token and max-token pooling are switched off.
# NOTE(review): MODEL_NAME looks like a base checkpoint rather than one tuned for
# sentence similarity - confirm that CLS pooling gives usable sentence vectors.
pooling = models.Pooling(
    sentence_transformer.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=False,
    pooling_mode_cls_token=True,
    pooling_mode_max_tokens=False)

# Chain encoder and pooling into the single model object `st` used for encoding.
st = SentenceTransformer(modules=[sentence_transformer, pooling])


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v3 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 1. Creating the embeddings for each opinion-based sentence

In [3]:
def embed_speeches(speeches):
	"""Encode `speeches` with the notebook-level model `st`.

	The call asks `st.encode` for a tensor result (`convert_to_tensor=True`)
	and shows a progress bar while encoding. The encoder's result is returned
	unchanged.
	"""
	return st.encode(speeches, show_progress_bar=True, convert_to_tensor=True)

def read_opinion_sentences_and_dates(file_path):
	"""Collect every extracted opinion in a speeches JSON file with its speech date.

	The file is loaded with `read_json`; each entry of its 'speeches' list
	contributes the items of its 'extracted_opinions' list, and the entry's
	'date' is repeated once per opinion so both results stay index-aligned.

	Returns:
		A pair `(opinion_sentences, dates)` of equally long lists.
	"""
	logger.message(f'Reading {file_path}')
	payload = read_json(file_path)

	sentences = []
	stamps = []
	for speech in payload['speeches']:
		extracted = speech['extracted_opinions']
		sentences.extend(extracted)
		# One date per opinion; the date is only looked up when there is an opinion.
		stamps.extend(speech['date'] for _ in extracted)

	return sentences, stamps

def iterate_topics_for_repr(repr_path):
	"""Embed the opinion sentences of every topic of one representative and store them.

	For each topic directory under `repr_path`, the opinion sentences and their
	dates are gathered from all `.json` files in that directory, the sentences
	are embedded with `embed_speeches`, and the result is written to
	`<topic>/embeddings.hdf5` (overwriting an existing file) with three
	datasets: `embeddings`, `dates` and `opinions`. The latter two are stored
	as UTF-8 strings, in the order in which the opinions were collected.

	Entries of `repr_path` that are not directories are skipped. Topics without
	any opinion sentence are skipped as well, and no file is written for them.

	Args:
		repr_path: Directory of a single representative, containing one
			sub-directory per topic.
	"""
	for topic in os.listdir(repr_path):
		topic_path = os.path.join(repr_path, topic)
		if not os.path.isdir(topic_path):
			# A stray file here would make the os.listdir call below raise.
			continue

		logger.message(f'Working on {topic}')
		file_paths = [os.path.join(topic_path, file) for file in os.listdir(topic_path) if file.endswith('.json')]
		topic_opinions = []
		topic_dates = []
		for file_path in file_paths:
			opinion_sentences, dates = read_opinion_sentences_and_dates(file_path)
			topic_opinions.extend(opinion_sentences)
			topic_dates.extend(dates)

		if not topic_opinions:
			# There is nothing to embed, and stacking an empty batch raises in torch.
			logger.message(f'No opinions found for {topic}, skipping')
			continue

		# Move the whole batch to host memory in one step and hand h5py a NumPy array,
		# instead of copying row by row and re-stacking.
		embeddings = embed_speeches(topic_opinions).cpu().numpy()
		logger.message(f'Number of dates {len(topic_dates)}\nNumber of opinions {len(topic_opinions)} \nNumber of embeddings {len(embeddings)}')
		with h5py.File(os.path.join(topic_path, 'embeddings.hdf5'), 'w') as f:
			f.create_dataset('embeddings', data=embeddings)
			f.create_dataset('dates', data=topic_dates, dtype=h5py.string_dtype(encoding='utf-8'))
			f.create_dataset('opinions', data=topic_opinions, dtype=h5py.string_dtype(encoding='utf-8'))

# Walk <DATA_REPR_SPEECHES_DIR>/<party>/<representative> and build the embeddings
# of every topic for each representative found there.
for party in PARTIES:
	party_path = os.path.join(DATA_REPR_SPEECHES_DIR, party)
	# Every entry of the party directory is treated as one representative.
	repr_names = os.listdir(party_path)
	for repr_name in repr_names:
		logger.message(f'{party} ----- {repr_name}')
		repr_path = os.path.join(party_path, repr_name)
		iterate_topics_for_repr(repr_path)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.64it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 10.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 51.62it/s]
Batches: 100%|██████████| 3/3 [00:00<00:00, 14.25it/s]
Batches: 100%|██████████| 7/7 [00:00<00:00, 15.31it/s]
Batches: 100%|██████████| 9/9 [00:00<00:00, 14.50it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 15.17it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.47it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 19.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.64it/s]
Batches: 100%|██████████| 10/10 [00:00<00:00, 16.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 67.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 61.34it/s]
Batches: 100%|██████████| 5/5 [00:00<00:00, 16.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 37.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 72.94it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 14.97it/s]
Batches:

KeyboardInterrupt: 

## 2. Create one single "opinion-embedding" for each politician and store it in a retrievable manner.

In [15]:
class EmbeddingHandler:
	"""Look up representatives, topics and stored embeddings in the speeches tree.

	The tree is laid out as `<speeches_dir>/<party>/<representative>/<topic>/`,
	with an `embeddings.hdf5` file inside each topic directory.
	"""

	def __init__(self, data_dir=DATA_DIR, speeches_dir = DATA_REPR_SPEECHES_DIR, parties=PARTIES):
		"""Remember the data locations and the list of accepted party names."""
		self.data_dir = data_dir
		self.speeches_dir = speeches_dir
		self.parties = parties

	def check_party_exists(self, party):
		"""Raise ValueError when `party` is not one of `self.parties`."""
		if party in self.parties:
			return
		raise ValueError(f'{party} not in {self.parties}')

	def get_reprs_for_party(self, party):
		"""Return the entry names of the party's directory (one per representative).

		Raises:
			ValueError: if `party` is not a known party.
		"""
		self.check_party_exists(party)
		party_dir = os.path.join(self.speeches_dir, party)
		return os.listdir(party_dir)

	def get_topics_for_repr(self, party, repr_name):
		"""Return the entry names of a representative's directory (one per topic).

		Raises:
			ValueError: if `party` is not a known party.
		"""
		self.check_party_exists(party)
		repr_dir = os.path.join(self.speeches_dir, party, repr_name)
		return os.listdir(repr_dir)

	def get_embeddings_for_topic(self, party, repr_name, topic):
		"""Load the stored embeddings of one topic of one representative.

		Returns:
			A triple `(embeddings, dates, opinions)`: the full content of the
			'embeddings' dataset, and the 'dates' and 'opinions' datasets as
			lists of strings decoded from UTF-8.
		"""
		hdf5_path = os.path.join(self.speeches_dir, party, repr_name, topic, 'embeddings.hdf5')
		with h5py.File(hdf5_path, 'r') as store:
			vectors = store['embeddings'][:]
			# The string datasets are read as bytes; decode each entry for the caller.
			decoded_dates = [raw.decode('utf-8') for raw in store['dates'][:]]
			decoded_opinions = [raw.decode('utf-8') for raw in store['opinions'][:]]
		return vectors, decoded_dates, decoded_opinions


In [17]:
# Smoke-test the handler with its default directories and party list.
eh = EmbeddingHandler()
# List the representatives of one party, then the topics of one representative.
print(eh.get_reprs_for_party('自民'))
print(eh.get_topics_for_repr('自民', '新藤義孝'))
# Load one topic's stored embeddings and show the array shape plus a sample of the
# accompanying dates and opinion sentences.
embeddings, dates, opinions = eh.get_embeddings_for_topic('自民', '新藤義孝', '少子化')
print('embeddings shape', embeddings.shape)
print(f"{len(dates)} dates ---- sample: {dates[0]}")
print(f"{len(opinions)} opinions ---- sample: {opinions[0]}")

['関芳弘', '新藤義孝', '鷲尾英一郎', '江藤拓', '義家弘介', '高村正大', '高鳥修一', '中谷真一', '森英介', '藤丸敏', '柴山昌彦', '中谷元', '下村博文', '岸田文雄', '小渕優子', '勝俣孝明', '小泉進次郎', '吉野正芳', '中曽根康隆', '宮下一郎', '平井卓也', '菅義偉', '石井拓', '西村明宏', '木村次郎', '上杉謙太郎', '八木哲也', '小林茂樹', '松島みどり', '菅家一郎', '鈴木憲和', '渡辺孝一', '林幹雄', '石破茂', '堀井学', '石原正敬', '上田英俊', '井上信治', '城内実', '古賀篤', '東国幹', '橘慶一郎', '林芳正', '遠藤利明', '村上誠一郎', '小森卓郎', '高見康裕', '後藤茂之', '神田潤一', '牧島かれん', '小林史明', '国光あやの', '田所嘉徳', '茂木敏充', '高木宏壽', '山本左近', '泉田裕彦', '宮崎政久', '山下貴司', '野中厚', '加藤勝信', '稲田朋美', '伊藤達也', '齋藤健', '船田元', '土屋品子', '上川陽子', '五十嵐清', '和田義明', '棚橋泰文', '甘利明', '門山宏哲', '工藤彰三', '大塚拓', '加藤鮎子', '山田賢司', '松本洋平', '伊藤信太郎', '石田真敏', '江崎鐵磨', '福田達夫', '務台俊介', '斎藤洋明', '越智隆雄', '岩屋毅', '山本ともひろ', '木原誠二', '古川直季', '鬼木誠', '永岡桂子', '江渡聡徳', '国定勇人', '根本幸典', '辻清人', '小倉將信', '今枝宗一郎', '井野俊郎', '黄川田仁志', 'あかま二郎', '田野瀬太道', '柿沢未途', '保岡宏武', '宮澤博行', '山際大志郎', '杉田水脈', '梶山弘志', '小野寺五典', '鈴木英敬', '鈴木隼人', '松野博一', '鈴木俊一', '大西英男', '宮路拓馬', '古川康', '武部新', '山口俊一', '奥野信亮', '田中良生', 'あべ俊子', '田中英之', '野田聖子', '神田憲次', '田畑裕明', '熊田裕通', '上野賢一郎', '中山展宏

In [4]:
# Inspect one stored embeddings file: list its datasets and print the first row of each.
path = '/root/projects/kokkai_analysis/data_prepping/data/data_repr/自民/関芳弘/原発/embeddings.hdf5'
# Open through a context manager so the handle is closed when the inspection ends;
# a handle left open can get in the way of rewriting the same file later.
with h5py.File(path, 'r') as f:
	print(list(f.keys()))
	for key in f.keys():
		print(key)
		print(key, f[key])
		if key in ['dates', 'opinions']:
			# The string datasets return bytes, so decode them for display.
			print(key, f[key][0].decode('utf-8'))
		else:
			print(type(f[key][0]))
			print(key, f[key][0])

['dates', 'embeddings', 'opinions']
dates
dates <HDF5 dataset "dates": shape (16,), type "|O">
dates 2020-11-18
embeddings
embeddings <HDF5 dataset "embeddings": shape (16, 768), type "<f4">
<class 'numpy.ndarray'>
embeddings [ 1.82691485e-01 -8.33204567e-01 -7.30217636e-01 -3.05814683e-01
 -4.69669342e-01 -2.35474840e-01 -4.55190688e-01 -4.76663932e-02
  1.69568747e-01  6.94899142e-01  7.32448623e-02  1.59747511e-01
 -3.05168420e-01 -3.97550136e-01 -2.92605877e-01  5.09004235e-01
  3.04416865e-01 -5.72520912e-01 -4.88299459e-01  5.72173595e-01
  7.25738525e-01  3.68589103e-01 -2.78492123e-01 -4.46505338e-01
  6.59366608e-01  4.19738382e-01  3.29563767e-01  7.63414800e-01
 -6.88293278e-02  1.80018738e-01 -4.01491970e-01 -8.77472281e-01
  2.08504617e-01 -1.04402504e+01 -5.11266589e-02 -2.26977080e-01
  2.69378394e-01 -4.79521491e-02  4.14126694e-01 -2.90959626e-01
 -5.23690701e-01 -5.25138676e-01  2.63480186e-01 -1.26261767e-02
 -4.57496196e-01 -8.76290262e-01  6.87399656e-02 -1.7752227

## 3. Create a stance axis vector by either generating two reference points or picking two "opinion-embeddings" from the data as reference points

## 4. Collapse all the other vectors onto this axis by projecting them onto the axis

## 5. Create a scalar measurement for how far each politician is from the two reference points