In [1]:
import os
import json
from scrape.general_scraper import GeneralScraper
from selenium.webdriver.common.by import By
import openai
from secret_keys.api_keys import OPENAI_API_KEY
from file_handling.file_read_writer import read_json, write_json, create_dir
from params.paths import ROOT_DIR
from logger.Logger import Logger
from json import JSONDecodeError
RESOURCE_DIR = os.path.join(ROOT_DIR, 'resource')
DATA_DIR = os.path.join(ROOT_DIR, 'data')
LOCAL_DATA_DIR = os.path.join(DATA_DIR, 'data_local_gov')
LOCAL_GOV_MEMBER_DIR = os.path.join(LOCAL_DATA_DIR, 'members')
os.makedirs(LOCAL_DATA_DIR, exist_ok=True)
os.makedirs(LOCAL_GOV_MEMBER_DIR, exist_ok=True)
logger = Logger(os.path.join(LOCAL_DATA_DIR, 'log.txt'))

gs = GeneralScraper(firefox=True)

os.environ['PYTORCH_CUDA_ALLOC_CONF']='max_split_size_mb:512'


In [2]:
openai.organization = "org-KwrqfnvZUjabTOAFL3QUAhk2"
openai.api_key = OPENAI_API_KEY

In [13]:
def gpt(content):
	sys_prompt = "あなたはこれからユーザーからもらうテキストをJSONデータに変換するアシスタントです。"
	user_prompt = '以下のテキストからすべての議員の情報を略すことなく完全な辞書のリスト形式で取得してください。結果はJSON形式で返してください。抽出する情報は名前（漢字）、名前（フリガナ）、会派、年齢、所属委員会、名前から推定される性別、です。\n'
	format_prompt = '出力フォーマットは次のようにしてください。\n [ {"name_kana":, "name_kanji":, "party":, "age":, "gender":, "commitee":[], "term":, "district":, }, { }, { }, ... ]\n'
	print('Sending to GPT')
	response = openai.ChatCompletion.create(
			model="gpt-3.5-turbo-1106",
			messages=[
				{"role": "system", "content":sys_prompt},
				{"role": "user", "content": user_prompt},
				{"role": "user", "content": format_prompt},
				{'role': "user", "content": content},
			],
			response_format= { "type":"json_object" },
			temperature=1,
		)
	try:
		response_content = json.loads(response.choices[0].message.content)
		return response_content
	except (JSONDecodeError, UnicodeEncodeError):
		print('failed to parse json')
		logger.write(f"Failed to parse json: {response_content}\n")

def handle_reprs_on_multiple_pages(city_dict):
	output = []
	for url in city_dict['urls']:
		gs.get_url(url)
		indep_repr_page_links = gs.get_site_components_by(By.XPATH, city_dict['ind_reprs_xpath'])
		links = [link.get_attribute('href') for link in indep_repr_page_links]
		print("Getting independant links")
		for link in links:
			print("getting link", link)
			gs.get_url(link)
			repr_info_component = gs.get_site_components_by(By.XPATH, city_dict['ind_reprs_info_xpath'])
			repr_info = repr_info_component[0].text
			response = gpt(repr_info)
			if 'members' in response.keys():
				output += response['members']
			else:
				output.append(response)
	return output

def handle_reprs_on_single_page(city_dict, city_name):
	output = []
	for url in city_dict['urls']:
		print('getting', url)
		try:
			if not url == gs.driver.current_url:
				gs.get_url(url)
		except:
			print('failed to get url', url)
			logger.write(f"Could not get {city_name} : {url}\n")
		try:
			reprs_component = gs.get_site_components_by(By.XPATH, city_dict['reprs_xpath'])
		except:
			logger.write(f"Check xpath for {city_name} : {city_dict['reprs_xpath']} : {url}\n")
		if len(reprs_component) > 0:
			all_text = reprs_component[0].text
			print(f"Extracted all text for {city_name} : {url}")
			response = gpt(all_text)
			if 'members' in response.keys():
				output += response["members"]
			else:
				output.append(response)
		else:
			print('No reprs found on this page.')
			logger.write(f"No reprs found on this page. {city_name} : {url}\n")
	return output



IndentationError: expected an indented block after 'else' statement on line 63 (2309060617.py, line 64)

In [12]:
scraping_resource_path = os.path.join(RESOURCE_DIR, 'local_gov_repr_scrape.json')
scraping_resource = read_json(scraping_resource_path)
for city_name, city_dict in scraping_resource.items():
	city_dir = os.path.join(LOCAL_GOV_MEMBER_DIR, city_name)
	os.makedirs(city_dir, exist_ok=True)
	if os.path.exists(os.path.join(city_dir, 'members.json')):
		continue
	count = 1
	print(f"Getting {city_name} data [{count}]")
	while True:
		if count == 3:
			break
		try:
			profile_on_multiple_pages = 'multiple_pages' in city_dict.keys()
			if profile_on_multiple_pages:
				output = handle_reprs_on_multiple_pages(city_dict, city_name)
				print('writing json')
				write_json([{"members":output}], os.path.join(city_dir, 'members.json'))
				continue
			scrapable = city_dict['reprs_xpath'] != ''
			if not scrapable:
				break
			output = handle_reprs_on_single_page(city_dict, city_name)
			print('writing json')
			write_json({"members":output}, os.path.join(city_dir, 'members.json'))
			break
		except:
			count += 1
			logger.write(f"Could not get {city_name} : {city_dict['urls']}\n")
	

Getting 北海道　岩見沢市 data [1]
Getting 北海道　美唄市 data [1]
Getting 北海道　芦別市 data [1]
Getting 北海道　江別市 data [1]
getting https://www.city.ebetsu.hokkaido.jp/site/gikai/2002.html
Extracted all text for 北海道　江別市 : https://www.city.ebetsu.hokkaido.jp/site/gikai/2002.html
Sending to GPT
writing json
Getting 北海道　赤平市 data [1]
Getting 北海道　紋別市 data [1]
getting https://mombetsu.jp/gikai/member/
Extracted all text for 北海道　紋別市 : https://mombetsu.jp/gikai/member/
Sending to GPT


UnicodeEncodeError: 'charmap' codec can't encode characters in position 14-20: character maps to <undefined>

In [None]:
## getting one sample for the city
city = "北海道　札幌市"
gs.get_url(scraping_resource[city]['urls'][0])
reprs_component = gs.get_site_components_by(By.XPATH, scraping_resource[city]['reprs_xpath'])
all_text = reprs_component[0].text
file = open(os.path.join(RESOURCE_DIR, 'sample_text.txt'), 'w', encoding='utf-8')
file.write(all_text)
file.close()