# 研究課題の主要データをDBに保存するプログラム

## プログラムの概要

- 前提：研究課題のXMLファイルが../kaken_parse_grants_masterxml/xmlフォルダに保存されていること
- 内定時点のデータを保存する
- 原則としてデータはsummary要素から取得する。研究機関データのみgrantList要素から取得する。
研究者情報は、grantAward/summary/memberとgrantAward/memberList/memberの2箇所にある。
前者は同じ人は複数出てこなくてまとまっているが、所属機関等のコードがない。
後者は所属機関コードがあるが、毎年度の実績報告書があるので同じ人が複数回出てくる。
差し当たって前者からデータを取得することにする。そのうち余裕が出たら、後者のデータと突合したい。

### 流れ

1. grantaward : 研究課題のメインになる部分。課題番号、研究種目、開始年度、終了年度、直接経費総額など。


- 部品1：研究課題データのうち、課題番号や研究種目など、変更にならない項目で、課題番号に対して一対一になる項目
- 部品2：採択年度の研究機関
- 部品3：採択年度の研究代表者

上記の3つの部品を課題番号をキーにして、結合して一つのテーブルを作り、DBに書き込む

以下のテーブルは、grantawardに対して、基本的に一対多のリレーションになっている。DBに書き込む。

2. grantaward_member : 研究代表者、研究分担者など
3. grantaward_field : 研究分野。系分野分科細目表に基づくもの。2017年度まで。
4. grantaward_review_section : 審査区分。審査区分表に基づくもの。2018年度以降。
5. grantaward_annual : 年度ごとの直接経費金額
6. grantaward_keyword : 研究課題のキーワード
7. grantaward_paragraph : 研究概要等のテキストデータ
8. grantaward_product : 研究成果物

## 事前準備

In [None]:
import configparser
import os
import pickle
import re
import shutil
from glob import glob

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from lxml import etree
from sqlalchemy import create_engine
from sqlalchemy.types import Date, Integer, String, BigInteger
from tqdm import tqdm_notebook as tqdm

In [None]:
# Database connection settings, read from the shared config.ini.
from urllib.parse import quote

config = configparser.ConfigParser()
config.read("../../settings/config.ini")
username = config["mariadb"]["username"]
password = config["mariadb"]["password"]
# Percent-encode the credentials so that characters such as "@", ":" or "/"
# cannot be mistaken for URL delimiters when the URL is parsed.
url = (
    "mysql+pymysql://"
    + quote(username, safe="")
    + ":"
    + quote(password, safe="")
    + "@localhost:3306/"
    + "kaken"
    + "?charset=UTF8MB4"
)
engine = create_engine(url, echo=True)

データセットを作成する年度を指定する

In [None]:
# Range of years to build the dataset for. endyear is inclusive: the file list
# is built with range(startyear, endyear + 1), matching XML file names that
# start with each year.
startyear = 1964
endyear = 2020

## XMLファイルからデータ抽出

関数を定義する

In [None]:
# Main attributes of each research project (one row per grantAward).
def kadai(xmlfile):
    """Extract the main attributes of every grantAward and pickle them.

    One row is built per ``grantAward`` element, in this column order:
    awardnumber, projecttype, projectstatus_fiscalyear,
    projectstatus_statuscode, startfiscalyear, endfiscalyear,
    category_niicode, category, section_niicode, section, title_ja,
    title_en, directcost.  All values except ``title_en`` are read from the
    Japanese summary (``summary[@xml:lang='ja']``).  An element that is
    absent yields ``None`` for every value read from it.

    Args:
        xmlfile: Path of the XML file.  It must contain a substring matching
            ``[0-9]{4}_[0-9]+-[0-9]+.xml``, which is reused in the dump name.

    Returns:
        None.  The list of rows is written with pickle to
        ``dump_kadai/main/main_<matched name>.dump``.

    Raises:
        AttributeError: If ``xmlfile`` does not match the file name pattern.
    """
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}

    def attr_of(parent, path, name):
        # Attribute `name` of the first `path` match under `parent`, else None.
        node = parent.find(path) if parent is not None else None
        return node.get(name) if node is not None else None

    def text_of(parent, path):
        # Text of the first `path` match under `parent`, else None.
        node = parent.find(path) if parent is not None else None
        return node.text if node is not None else None

    tree = etree.parse(xmlfile)
    kadailist = []
    for grant_award in tree.iterfind("grantAward"):
        summary_ja = grant_award.find("summary[@xml:lang='ja']", nsmap)
        # The English title used to be read from the Japanese summary, which
        # made title_en a copy of title_ja.
        # NOTE(review): assumes the English summary is published as
        # summary[@xml:lang='en'] -- confirm against the XML; title_en is
        # None when no such element exists.
        summary_en = grant_award.find("summary[@xml:lang='en']", nsmap)
        row = [
            grant_award.get("awardNumber"),
            grant_award.get("projectType"),
            attr_of(summary_ja, "projectStatus", "fiscalYear"),
            attr_of(summary_ja, "projectStatus", "statusCode"),
            attr_of(summary_ja, "periodOfAward", "searchStartFiscalYear"),
            attr_of(summary_ja, "periodOfAward", "searchEndFiscalYear"),
            attr_of(summary_ja, "category", "niiCode"),
            text_of(summary_ja, "category"),
            attr_of(summary_ja, "section", "niiCode"),
            text_of(summary_ja, "section"),
            text_of(summary_ja, "title"),
            text_of(summary_en, "title"),
            text_of(summary_ja, "overallAwardAmount/directCost"),
        ]
        kadailist.append(row)
    dumpfilename = (
        "dump_kadai/main/main_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(kadailist, f)

In [None]:
# Research institutions listed on each grant (source of the institution codes).
def institution(xmlfile):
    """Extract the institutions listed under grantList and pickle them.

    For every ``grantAward`` the Japanese ``grant`` elements of its
    ``grantList`` are scanned and one row is produced per ``institution``
    element: awardnumber, fiscalyear, grant_sequence, institution_sequence,
    institution_niicode, institution_mextcode, institution_jspscode,
    institution_name.  A ``grantAward`` without a ``grantList`` contributes
    no rows.

    Args:
        xmlfile: Path of the XML file.  It must contain a substring matching
            ``[0-9]{4}_[0-9]+-[0-9]+.xml``, which is reused in the dump name.

    Returns:
        None.  The list of rows is written with pickle to
        ``dump_kadai/institution/institution_<matched name>.dump``.

    Raises:
        AttributeError: If ``xmlfile`` does not match the file name pattern.
    """
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    institutionlist = []
    for grant_award in tree.iterfind("grantAward"):
        awardnumber = grant_award.get("awardNumber")
        grantlist = grant_award.find("grantList")
        if grantlist is None:
            # The old except-branch built a placeholder row here but never
            # appended it, so skipping keeps the output identical.
            continue
        for grant in grantlist.iterfind("grant[@xml:lang='ja']", nsmap):
            fiscalyear = grant.get("fiscalYear")
            grant_sequence = grant.get("sequence")
            for inst in grant.iterfind("institution"):
                institutionlist.append(
                    [
                        awardnumber,
                        fiscalyear,
                        grant_sequence,
                        inst.get("sequence"),
                        inst.get("niiCode"),
                        inst.get("mextCode"),
                        inst.get("jspsCode"),
                        inst.text,
                    ]
                )
    dumpfilename = (
        "dump_kadai/institution/institution_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(institutionlist, f)

In [None]:
# Researcher numbers, roles and names of the project members.
def member(xmlfile):
    """Extract the members listed in the Japanese summary and pickle them.

    One row is produced per ``member`` element of
    ``summary[@xml:lang='ja']``: awardnumber, sequence, participate,
    eradcode, role, fullname, familyname, givenname, familyname_yomi,
    givenname_yomi.  Missing attributes or name elements yield ``None``.
    A ``grantAward`` without a Japanese summary contributes no rows.

    Args:
        xmlfile: Path of the XML file.  It must contain a substring matching
            ``[0-9]{4}_[0-9]+-[0-9]+.xml``, which is reused in the dump name.

    Returns:
        None.  The list of rows is written with pickle to
        ``dump_kadai/member/member_<matched name>.dump``.

    Raises:
        AttributeError: If ``xmlfile`` does not match the file name pattern.
    """
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    memberlist = []
    for grant_award in tree.iterfind("grantAward"):
        awardnumber = grant_award.get("awardNumber")
        summary = grant_award.find("summary[@xml:lang='ja']", nsmap)
        if summary is None:
            # Skip instead of raising, as the keyword/paragraph extractors
            # already tolerate a missing Japanese summary.
            continue
        for person in summary.iterfind("member", nsmap):
            # Look each name element up once; compare with None explicitly
            # because lxml elements without children are falsy.
            fullname = person.find("personalName/fullName")
            familyname = person.find("personalName/familyName")
            givenname = person.find("personalName/givenName")
            row = [
                awardnumber,
                person.get("sequence"),
                person.get("participate"),
                person.get("eradCode"),
                person.get("role"),
                None if fullname is None else fullname.text,
                None if familyname is None else familyname.text,
                None if givenname is None else givenname.text,
                None if familyname is None else familyname.get("yomi"),
                None if givenname is None else givenname.get("yomi"),
            ]
            memberlist.append(row)
    dumpfilename = (
        "dump_kadai/member/member_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(memberlist, f)

In [None]:
# Research field data (field elements of the Japanese summary).
def field(xmlfile):
    """Extract the ``field`` elements of each Japanese summary and pickle them.

    One row is produced per ``field`` element: awardnumber, field_sequence,
    field_path, field_niicode, field_table, field_name.  A ``grantAward``
    without a Japanese summary contributes no rows.

    Args:
        xmlfile: Path of the XML file.  It must contain a substring matching
            ``[0-9]{4}_[0-9]+-[0-9]+.xml``, which is reused in the dump name.

    Returns:
        None.  The list of rows is written with pickle to
        ``dump_kadai/field/field_<matched name>.dump``.

    Raises:
        AttributeError: If ``xmlfile`` does not match the file name pattern.
    """
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    fieldlist = []
    for grant_award in tree.iterfind("grantAward"):
        awardnumber = grant_award.get("awardNumber")
        summary = grant_award.find("summary[@xml:lang='ja']", nsmap)
        if summary is None:
            # Skip instead of raising, as the keyword/paragraph extractors
            # already tolerate a missing Japanese summary.
            continue
        for node in summary.iterfind("field"):
            fieldlist.append(
                [
                    awardnumber,
                    node.get("sequence"),
                    node.get("path"),
                    node.get("niiCode"),
                    node.get("fieldTable"),
                    node.text,
                ]
            )
    dumpfilename = (
        "dump_kadai/field/field_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(fieldlist, f)

In [None]:
# Research areas based on the review-section table.
def review_section(xmlfile):
    """Extract the ``review_section`` elements of each summary and pickle them.

    One row is produced per ``review_section`` element of
    ``summary[@xml:lang='ja']``: awardnumber, review_section_sequence,
    review_section_niicode, review_section_table_type, review_section_name.
    A ``grantAward`` without a Japanese summary contributes no rows.

    Args:
        xmlfile: Path of the XML file.  It must contain a substring matching
            ``[0-9]{4}_[0-9]+-[0-9]+.xml``, which is reused in the dump name.

    Returns:
        None.  The list of rows is written with pickle to
        ``dump_kadai/review_section/review_section_<matched name>.dump``.

    Raises:
        AttributeError: If ``xmlfile`` does not match the file name pattern.
    """
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    review_sectionlist = []
    for grant_award in tree.iterfind("grantAward"):
        awardnumber = grant_award.get("awardNumber")
        summary = grant_award.find("summary[@xml:lang='ja']", nsmap)
        if summary is None:
            # Skip instead of raising, as the keyword/paragraph extractors
            # already tolerate a missing Japanese summary.
            continue
        for node in summary.iterfind("review_section", nsmap):
            review_sectionlist.append(
                [
                    awardnumber,
                    node.get("sequence"),
                    node.get("niiCode"),
                    node.get("tableType"),
                    node.text,
                ]
            )
    # Built once, after the loop: it used to be assigned inside the loop, so a
    # file without any grantAward raised NameError at open() below.
    dumpfilename = (
        "dump_kadai/review_section/review_section_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(review_sectionlist, f)

In [None]:
# Direct cost amount per fiscal year.
def annual(xmlfile):
    """Extract the yearly direct costs of every grantAward and pickle them.

    One row is produced per ``awardAmount`` element found under each
    ``awardAmountList``: awardnumber, sequence (of the awardAmountList),
    fiscalyear, directcost.  ``directcost`` is ``None`` when the
    ``directCost`` child is absent.

    Args:
        xmlfile: Path of the XML file.  It must contain a substring matching
            ``[0-9]{4}_[0-9]+-[0-9]+.xml``, which is reused in the dump name.

    Returns:
        None.  The list of rows is written with pickle to
        ``dump_kadai/annual/annual_<matched name>.dump``.
    """
    directcostlist = []
    for award in etree.parse(xmlfile).iterfind("grantAward"):
        award_no = award.get("awardNumber")
        for amount_list in award.iterfind("awardAmountList"):
            list_seq = amount_list.get("sequence")
            for amount in amount_list.iterfind("awardAmount"):
                cost_node = amount.find("directCost")
                cost = cost_node.text if cost_node is not None else None
                directcostlist.append(
                    [award_no, list_seq, amount.get("fiscalYear"), cost]
                )
    matched = re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
    with open("dump_kadai/annual/annual_" + matched + ".dump", "wb") as f:
        pickle.dump(directcostlist, f)

In [None]:
# Keywords of each research project.
def keyword(xmlfile):
    """Extract the keywords of each Japanese summary and pickle them.

    One row is produced per ``keyword`` element of
    ``summary[@xml:lang='ja']/keywordList``: awardnumber, keyword_sequence,
    keyword_text.  A ``grantAward`` without such a ``keywordList``
    contributes no rows.

    Args:
        xmlfile: Path of the XML file.  It must contain a substring matching
            ``[0-9]{4}_[0-9]+-[0-9]+.xml``, which is reused in the dump name.

    Returns:
        None.  The list of rows is written with pickle to
        ``dump_kadai/keyword/keyword_<matched name>.dump``.

    Raises:
        AttributeError: If ``xmlfile`` does not match the file name pattern.
    """
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    keywordlist = []
    for grant_award in tree.iterfind("grantAward"):
        awardnumber = grant_award.get("awardNumber")
        keyword_list = grant_award.find(
            "summary[@xml:lang='ja']/keywordList", nsmap
        )
        if keyword_list is None:
            # The old except-branch built a placeholder row here but never
            # appended it, so skipping keeps the output identical.
            continue
        for node in keyword_list.iterfind("keyword"):
            keywordlist.append([awardnumber, node.get("sequence"), node.text])
    dumpfilename = (
        "dump_kadai/keyword/keyword_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(keywordlist, f)

In [None]:
# Free-text data of each research project.
def paragraph(xmlfile):
    """Extract the paragraphs of each Japanese summary and pickle them.

    One row is produced per ``paragraph`` element under each
    ``paragraphList`` of ``summary[@xml:lang='ja']``: awardnumber,
    paragraphlist_sequence, paragraphlist_parentid, paragraphlist_type,
    paragraph_sequence, paragraph_text.  A ``grantAward`` without a Japanese
    summary contributes a single placeholder row holding only the
    awardnumber.

    Args:
        xmlfile: Path of the XML file.  It must contain a substring matching
            ``[0-9]{4}_[0-9]+-[0-9]+.xml``, which is reused in the dump name.

    Returns:
        None.  The list of rows is written with pickle to
        ``dump_kadai/paragraph/paragraph_<matched name>.dump``.
    """
    ns = {"xml": "http://www.w3.org/XML/1998/namespace"}
    textlist = []
    for award in etree.parse(xmlfile).iterfind("grantAward"):
        award_no = award.get("awardNumber")
        ja_summary = award.find("summary[@xml:lang='ja']", ns)
        if ja_summary is None:
            # Placeholder row: awardnumber followed by five empty columns.
            textlist.append([award_no, None, None, None, None, None])
            continue
        for plist in ja_summary.iterfind("paragraphList"):
            list_attrs = [
                plist.get("sequence"),
                plist.get("parentId"),
                plist.get("type"),
            ]
            textlist.extend(
                [award_no, *list_attrs, para.get("sequence"), para.text]
                for para in plist.iterfind("paragraph")
            )
    matched = re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
    with open("dump_kadai/paragraph/paragraph_" + matched + ".dump", "wb") as f:
        pickle.dump(textlist, f)

In [None]:
# Research products (outputs) of each research project.
def product(xmlfile):
    """Extract the products under productListEnriched and pickle them.

    One row is produced per ``product`` element: awardnumber, product_type,
    sequence, reviewed, doi, author_ja, author_en, title_ja, title_en,
    journaltitle_ja, journaltitle_en, year.  Missing attributes or child
    elements yield ``None``.  A ``grantAward`` without a
    ``productListEnriched`` element contributes a single placeholder row
    holding only the awardnumber.

    Args:
        xmlfile: Path of the XML file.  It must contain a substring matching
            ``[0-9]{4}_[0-9]+-[0-9]+.xml``, which is reused in the dump name.

    Returns:
        None.  The list of rows is written with pickle to
        ``dump_kadai/product/product_<matched name>.dump``.

    Raises:
        AttributeError: If ``xmlfile`` does not match the file name pattern.
    """
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}

    def text_of(node, path):
        # Text of the first `path` match under `node`, or None if absent.
        found = node.find(path, nsmap)
        return None if found is None else found.text

    tree = etree.parse(xmlfile)
    productlist = []
    for grant_award in tree.iterfind("grantAward"):
        awardnumber = grant_award.get("awardNumber")
        enriched = grant_award.find("productListEnriched")
        if enriched is None:
            # Explicit check replaces the former bare `except:`, which also
            # swallowed unrelated errors such as KeyboardInterrupt.
            productlist.append([awardnumber] + [None] * 11)
            continue
        for item in enriched.iterfind("product"):
            productlist.append(
                [
                    awardnumber,
                    item.get("type"),
                    item.get("sequence"),
                    item.get("reviewed"),
                    text_of(item, "doi"),
                    text_of(item, "author[@xml:lang='ja']"),
                    text_of(item, "author[@xml:lang='en']"),
                    text_of(item, "title[@xml:lang='ja']"),
                    text_of(item, "title[@xml:lang='en']"),
                    text_of(item, "journalTitle[@xml:lang='ja']"),
                    text_of(item, "journalTitle[@xml:lang='en']"),
                    text_of(item, "year"),
                ]
            )

    dumpfilename = (
        "dump_kadai/product/product_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(productlist, f)

XMLファイルから研究課題に関するデータを抽出して保存する

In [None]:
# Start from an empty dump_kadai tree: remove any previous one, then create
# one sub-folder per extracted part.
target_dir = "dump_kadai"
parts = [
    "main",
    "institution",
    "member",
    "field",
    "review_section",
    "annual",
    "keyword",
    "paragraph",
    "product",
]
dirlist = [f"{target_dir}/{p}" for p in parts]
if os.path.isdir(target_dir):
    shutil.rmtree(target_dir)
for d in dirlist:
    os.makedirs(d)

In [None]:
# Collect the XML files whose names start with a year in startyear..endyear.
filenames = [
    path
    for year in range(startyear, endyear + 1)
    for path in glob("../kaken_parse_grants_masterxml/xml/" + str(year) + "*.xml")
]

# Bundle all per-file extraction functions into one call.
def parse(xmlfile):
    """Run every extractor on ``xmlfile``, in a fixed order.

    Each extractor parses the file itself and writes its own dump file;
    nothing is returned.
    """
    extractors = (
        kadai,
        institution,
        member,
        field,
        review_section,
        annual,
        keyword,
        paragraph,
        product,
    )
    for extract in extractors:
        extract(xmlfile)


# Process the XML files in parallel with joblib, using all available cores.
Parallel(n_jobs=-1, verbose=1)(delayed(parse)(path) for path in filenames)

## データ処理用の関数の準備

In [None]:
def merge_list(parts):
    """Concatenate the row lists pickled under ``dump_kadai/<parts>/``.

    Args:
        parts: Name of the part (e.g. ``"main"``); used both as the folder
            name and as the dump file name prefix.

    Returns:
        A single list holding the rows of every matching ``*.dump`` file.
        Files are read in sorted path order so that the row order (and any
        positional index derived from it) is reproducible between runs.
    """
    pattern = "dump_kadai/" + parts + "/" + parts + "*.dump"
    lists = []
    # glob() returns paths in arbitrary order, hence the explicit sort.
    for dump in tqdm(sorted(glob(pattern))):
        with open(dump, mode="rb") as f:
            # NOTE: unpickling is only safe for trusted files; these are
            # expected to be the dumps written by the extraction functions.
            lists.extend(pickle.load(f))
    return lists

## 研究課題基礎テーブル

### 基礎テーブル部品1. 研究課題メインデータ

In [None]:
# Column layout of the rows stored in the "main" dumps.
columns = [
    "awardnumber",
    "projecttype",
    "projectstatus_fiscalyear",
    "projectstatus_statuscode",
    "startfiscalyear",
    "endfiscalyear",
    "category_niicode",
    "category",
    "section_niicode",
    "section",
    "title_ja",
    "title_en",
    "directcost",
]
# Load all dumped rows and turn them into a DataFrame.
lists = merge_list("main")
base_main = pd.DataFrame(lists, columns=columns)
# The award number must be unique before it can become the index.
assert not base_main["awardnumber"].duplicated().any(), "awardnumber is duplicated."
# Category and section names have inconsistent spellings and are hard to use,
# so only their niicodes are kept.
base_main = base_main.set_index("awardnumber").drop(columns=["category", "section"])
base_main

### 基礎テーブル部品2. 採択時の代表研究機関

In [None]:
# Column layout of the rows stored in the "institution" dumps.
columns = [
    "awardnumber",
    "fiscalyear",
    "grant_sequence",
    "institution_sequence",
    "institution_niicode",
    "institution_mextcode",
    "institution_jspscode",
    "institution_name",
]
# Load all dumped rows and turn them into a DataFrame.
lists = merge_list("institution")
base_institution = pd.DataFrame(lists, columns=columns)
# Smallest fiscalyear per awardnumber (= the institution at adoption time).
oldest = base_institution.groupby("awardnumber")["fiscalyear"].min().reset_index()
# Keep only the rows matching that (awardnumber, fiscalyear) pair.
base_institution = oldest.merge(base_institution, on=["awardnumber", "fiscalyear"])
# The award number must be unique before it can become the index.
assert not base_institution["awardnumber"].duplicated().any(), "awardnumber is duplicated."
# Index by award number and drop the columns that are no longer needed.
base_institution = base_institution.set_index("awardnumber").drop(
    columns=["fiscalyear", "grant_sequence", "institution_sequence"]
)
base_institution

### 基礎テーブル部品3. 採択時の研究代表者

In [None]:
# Column layout of the rows stored in the "member" dumps.
columns = [
    "awardnumber",
    "sequence",
    "participate",
    "eradcode",
    "role",
    "fullname",
    "familyname",
    "givenname",
    "familyname_yomi",
    "givenname_yomi",
]
# Load all dumped rows and turn them into a DataFrame.
lists = merge_list("member")
base_member = pd.DataFrame(lists, columns=columns)
# Roles treated as the representative of a project.
daihyou = [
    "principal_investigator",
    "area_organizer",
    "principal_investigator_support",
    "research_fellow",
    "foreign_research_fellow",
]
# Keep representatives only and make sequence numeric so max() compares numbers.
base_member = base_member[base_member["role"].isin(daihyou)].astype({"sequence": int})
# Keep, per awardnumber, only the record with the largest sequence: judging
# from the raw XML, a larger sequence means an older fiscal year.
seqmax = base_member.groupby("awardnumber")["sequence"].max().reset_index()
base_member = seqmax.merge(base_member, on=["awardnumber", "sequence"])
# The award number must be unique before it can become the index.
assert not base_member["awardnumber"].duplicated().any(), "awardnumber is duplicated."
base_member = base_member.set_index("awardnumber")
base_member

### 基礎テーブルの3つの部品を結合してDBに書き込む

In [None]:
# Join the three parts on their shared awardnumber index (left joins onto
# the main table).
base = base_main.join(base_institution).join(base_member)
base

In [None]:
# Drop the foreign keys and primary keys before the tables are rewritten.
from sqlalchemy.exc import SQLAlchemyError

# Each statement is attempted on its own: previously a single failure (for
# example a constraint that does not exist yet) silently skipped every
# remaining statement.
try:
    with engine.connect() as con:
        for statement in (
            "ALTER TABLE grantaward_review_section DROP FOREIGN KEY fk_grantaward_review_section_grantaward;",
            "ALTER TABLE grantaward_field DROP FOREIGN KEY fk_grantaward_field_grantaward;",
            "ALTER TABLE grantaward_annual DROP FOREIGN KEY fk_grantaward_annual_grantaward;",
            "ALTER TABLE grantaward_member DROP FOREIGN KEY fk_grantaward_member_grantaward;",
            "ALTER TABLE grantaward_paragraph DROP FOREIGN KEY fk_grantaward_paragraph_grantaward;",
            "ALTER TABLE grantaward_keyword DROP FOREIGN KEY fk_grantaward_keyword_grantaward;",
            "ALTER TABLE grantaward_product DROP FOREIGN KEY fk_grantaward_product_grantaward;",
            "ALTER TABLE grantaward DROP PRIMARY KEY;",
            "ALTER TABLE grantaward_member DROP PRIMARY KEY;",
            "ALTER TABLE grantaward_field DROP PRIMARY KEY;",
            "ALTER TABLE grantaward_review_section DROP PRIMARY KEY;",
            "ALTER TABLE grantaward_annual DROP PRIMARY KEY;",
            "ALTER TABLE grantaward_keyword DROP PRIMARY KEY;",
            "ALTER TABLE grantaward_paragraph DROP PRIMARY KEY;",
            "ALTER TABLE grantaward_product DROP PRIMARY KEY;",
        ):
            try:
                con.execute(statement)
            except SQLAlchemyError as err:
                # Still best-effort, but report what was skipped and why.
                print(f"skipped: {statement} ({err})")
except SQLAlchemyError as err:
    # Failing to connect stays non-fatal here, as before, but is reported.
    print(f"could not drop constraints: {err}")

In [None]:
# Write the joined base table to the database, replacing any existing table.
grantaward_dtypes = {
    "awardnumber": String(255),
    "startfiscalyear": Integer,
    "endfiscalyear": Integer,
    "projectstatus_fiscalyear": Integer,
    "category_niicode": Integer,
    "section_niicode": Integer,
    "institution_niicode": String(7),
    "directcost": BigInteger,
    "sequence": Integer,
    "eradcode": String(8),
}
base.to_sql(name="grantaward", con=engine, if_exists="replace", dtype=grantaward_dtypes)

In [None]:
# Add the primary key and the foreign key constraints of grantaward.
with engine.connect() as con:
    for ddl in (
        "ALTER TABLE grantaward ADD PRIMARY KEY(awardnumber)",
        "ALTER TABLE grantaward ADD CONSTRAINT category_niicode_1 FOREIGN KEY (category_niicode) REFERENCES categories(category_niicode);",
        "ALTER TABLE grantaward ADD CONSTRAINT section_niicode_1 FOREIGN KEY (section_niicode) REFERENCES sections(section_niicode);",
        "ALTER TABLE grantaward ADD CONSTRAINT institution_niicode_1 FOREIGN KEY (institution_niicode) REFERENCES institutions(institution_niicode);",
    ):
        con.execute(ddl)

---

## 研究者テーブルを作る

In [None]:
# Column layout of the rows stored in the "member" dumps.
columns = [
    "awardnumber",
    "sequence",
    "participate",
    "eradcode",
    "role",
    "fullname",
    "familyname",
    "givenname",
    "familyname_yomi",
    "givenname_yomi",
]
# Load all dumped rows and turn them into a DataFrame.
# NOTE: this rebinds the module-level name `member`, replacing the member()
# extraction function defined earlier in the notebook.
lists = merge_list("member")
member = pd.DataFrame(lists, columns=columns)
# Researcher numbers must consist of digits only.
assert member["eradcode"].str.match('^[0-9]*$').all(), "eradcode contains non-integer letter."
member

In [None]:
# Write the member table to the database, replacing any existing table.
member_dtypes = {"awardnumber": String(255), "sequence": Integer, "eradcode": String(8)}
member.to_sql(name="grantaward_member", con=engine, if_exists="replace", dtype=member_dtypes)

In [None]:
# Add the primary key and the foreign key constraint of grantaward_member.
with engine.connect() as con:
    for ddl in (
        "ALTER TABLE `grantaward_member` ADD PRIMARY KEY(`index`);",
        "ALTER TABLE `grantaward_member` ADD CONSTRAINT fk_grantaward_member_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);",
    ):
        con.execute(ddl)

---

## 研究分野テーブルを作る

In [None]:
# Column layout of the rows stored in the "field" dumps.
columns = [
    "awardnumber",
    "field_sequence",
    "field_path",
    "field_niicode",
    "field_table",
    "field_name",
]
# Load all dumped rows and turn them into a DataFrame.
# NOTE: this rebinds the module-level name `field`, replacing the field()
# extraction function defined earlier in the notebook.
lists = merge_list("field")
field = pd.DataFrame(lists, columns=columns)
field

In [None]:
# Write the field table to the database, replacing any existing table.
field_dtypes = {
    "awardnumber": String(255),
    "field_niicode": Integer,
    "field_path": String(255),
}
field.to_sql(name="grantaward_field", con=engine, if_exists="replace", dtype=field_dtypes)

In [None]:
# Add the foreign key constraints of grantaward_field.
# NOTE(review): unlike the sibling tables, no PRIMARY KEY(`index`) is added
# here (that statement was commented out) -- confirm this is intentional.
with engine.connect() as con:
    for ddl in (
        "ALTER TABLE `grantaward_field` ADD CONSTRAINT fk_grantaward_field_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);",
        "ALTER TABLE `grantaward_field` ADD CONSTRAINT fk_grantaward_field_field_niicode FOREIGN KEY (`field_niicode`) REFERENCES `fields`(`field_niicode`);",
        "ALTER TABLE grantaward_field ADD CONSTRAINT fk_grantaward_field_field_path FOREIGN KEY (field_path) REFERENCES fields (field_path);",
    ):
        con.execute(ddl)

---

## 審査区分テーブルを作る 

In [None]:
# Column layout of the rows stored in the "review_section" dumps.
columns = [
    "awardnumber",
    "review_section_sequence",
    "review_section_niicode",
    "review_section_table_type",
    "review_section_name",
]
# Load all dumped rows and turn them into a DataFrame.
# NOTE: this rebinds the module-level name `review_section`, replacing the
# review_section() extraction function defined earlier in the notebook.
lists = merge_list("review_section")
review_section = pd.DataFrame(lists, columns=columns)
review_section

In [None]:
# Write the review-section table to the database, replacing any existing table.
review_section_dtypes = {"awardnumber": String(255), "review_section_niicode": Integer}
review_section.to_sql(
    name="grantaward_review_section",
    con=engine,
    if_exists="replace",
    dtype=review_section_dtypes,
)

In [None]:
# Add the primary key and the foreign key constraints of
# grantaward_review_section (best-effort).
from sqlalchemy.exc import SQLAlchemyError

# Each statement is attempted on its own and failures are reported: the
# former bare `except: pass` hid every error and let one failure skip the
# remaining statements.
try:
    with engine.connect() as con:
        for statement in (
            "ALTER TABLE `grantaward_review_section` ADD PRIMARY KEY(`index`);",
            "ALTER TABLE `grantaward_review_section` ADD CONSTRAINT fk_grantaward_review_section_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);",
            "ALTER TABLE `grantaward_review_section` ADD CONSTRAINT fk_grantaward_review_section_review_section_niicode FOREIGN KEY (`review_section_niicode`) REFERENCES `review_sections`(`review_section_niicode`);",
        ):
            try:
                con.execute(statement)
            except SQLAlchemyError as err:
                print(f"skipped: {statement} ({err})")
except SQLAlchemyError as err:
    # Failing to connect stays non-fatal here, as before, but is reported.
    print(f"could not add grantaward_review_section constraints: {err}")

---

## 年度ごとの直接経費金額テーブルを作る

In [None]:
# Column layout of the rows stored in the "annual" dumps.
columns = ["awardnumber", "sequence", "fiscalyear", "directcost"]
# Load all dumped rows and turn them into a DataFrame.
# NOTE: this rebinds the module-level name `annual`, replacing the annual()
# extraction function defined earlier in the notebook.
lists = merge_list("annual")
annual = pd.DataFrame(lists, columns=columns)
annual

In [None]:
# Write the per-year direct cost table to the database, replacing any
# existing table.
annual.to_sql(
    "grantaward_annual",
    engine,
    if_exists="replace",
    dtype={
        "awardnumber": String(255),
        "sequence": Integer,
        # The key must match the DataFrame column name exactly; it used to be
        # misspelled "fiscalyaer", so fiscalyear never got the Integer type.
        "fiscalyear": Integer,
        "directcost": BigInteger,
    },
)

# Add the primary key and the foreign key constraint.
with engine.connect() as con:
    con.execute("ALTER TABLE `grantaward_annual` ADD PRIMARY KEY(`index`);")
    con.execute(
        "ALTER TABLE `grantaward_annual` ADD CONSTRAINT fk_grantaward_annual_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);"
    )

---

## キーワードテーブルを作る

In [None]:
# Column layout of the rows stored in the "keyword" dumps.
columns = ["awardnumber", "keyword_sequence", "keyword_text"]
# Load all dumped rows and turn them into a DataFrame.
# NOTE: this rebinds the module-level name `keyword`, replacing the keyword()
# extraction function defined earlier in the notebook.
lists = merge_list("keyword")
keyword = pd.DataFrame(lists, columns=columns)
keyword

In [None]:
# Write the keyword table to the database, replacing any existing table.
keyword_dtypes = {"awardnumber": String(255)}
keyword.to_sql(name="grantaward_keyword", con=engine, if_exists="replace", dtype=keyword_dtypes)

# Add the primary key and the foreign key constraint.
with engine.connect() as con:
    for ddl in (
        "ALTER TABLE `grantaward_keyword` ADD PRIMARY KEY(`index`);",
        "ALTER TABLE `grantaward_keyword` ADD CONSTRAINT fk_grantaward_keyword_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);",
    ):
        con.execute(ddl)

---

## 研究概要等のテキストのテーブルを作る

In [None]:
# Column layout of the rows stored in the "paragraph" dumps.
columns = [
    "awardnumber",
    "paragraphlist_sequence",
    "paragraphlist_parentid",
    "paragraphlist_type",
    "paragraph_sequence",
    "paragraph_text",
]
# Load all dumped rows and turn them into a DataFrame.
# NOTE: this rebinds the module-level name `paragraph`, replacing the
# paragraph() extraction function defined earlier in the notebook.
lists = merge_list("paragraph")
paragraph = pd.DataFrame(lists, columns=columns)
paragraph

In [None]:
# Write the paragraph table to the database, replacing any existing table.
paragraph_dtypes = {"awardnumber": String(255)}
paragraph.to_sql(name="grantaward_paragraph", con=engine, if_exists="replace", dtype=paragraph_dtypes)

# Add the primary key and the foreign key constraint.
with engine.connect() as con:
    for ddl in (
        "ALTER TABLE `grantaward_paragraph` ADD PRIMARY KEY(`index`);",
        "ALTER TABLE `grantaward_paragraph` ADD CONSTRAINT fk_grantaward_paragraph_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);",
    ):
        con.execute(ddl)

## 成果物テーブルを作る

In [None]:
# Column layout of the rows stored in the "product" dumps.
columns = [
    "awardnumber",
    "product_type",
    "sequence",
    "reviewed",
    "doi",
    "author_ja",
    "author_en",
    "title_ja",
    "title_en",
    "journaltitle_ja",
    "journaltitle_en",
    "year",
]
# Load all dumped rows and turn them into a DataFrame.
# NOTE: this rebinds the module-level name `product`, replacing the product()
# extraction function defined earlier in the notebook.
lists = merge_list("product")
product = pd.DataFrame(lists, columns=columns)
product

In [None]:
# Write the product table to the database, replacing any existing table.
product_dtypes = {"awardnumber": String(255), "year": Integer}
product.to_sql(name="grantaward_product", con=engine, if_exists="replace", dtype=product_dtypes)

# Add the primary key and the foreign key constraint.
with engine.connect() as con:
    for ddl in (
        "ALTER TABLE `grantaward_product` ADD PRIMARY KEY(`index`);",
        "ALTER TABLE `grantaward_product` ADD CONSTRAINT fk_grantaward_product_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);",
    ):
        con.execute(ddl)

おしまい