# KAKENの各種マスタをローカルのDBに保存する

## 事前準備

- KAKENマスタデータは git のリポジトリで管理されており、最新のデータを利用可能。https://bitbucket.org/niijp/grants_masterxml_kaken/ のリポジトリを clone(取得済みの場合は pull)して、このノートブックから見て ../grants_masterxml_kaken フォルダに同期しておく。使用するマスタファイルは以下の5つ。

1. 研究種目：category_master_kakenhi.xml
2. 研究分野：field_master_kakenhi.xml
3. 研究機関：institution_master_kakenhi.xml
4. 審査区分：review_section_master_kakenhi.xml
5. 応募区分：section_master_kakenhi.xml

In [None]:
import configparser

import numpy as np
import pandas as pd
import pymysql
from lxml import etree
from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.types import Date, Integer, String, Text

In [None]:
# Load the MariaDB credentials from the shared settings file.
config = configparser.ConfigParser()
config.read("../../settings/config.ini")
mariadb_settings = config["mariadb"]
username = mariadb_settings["username"]
password = mariadb_settings["password"]
# Connection URL: PyMySQL driver, local server on port 3306, database "kaken".
url = f"mysql+pymysql://{username}:{password}@localhost:3306/kaken?charset=UTF8MB4"
# echo=True makes the engine log the SQL statements it issues.
engine = create_engine(url, echo=True)

## 1. 研究種目マスタ

In [None]:
# Build [nii code, Japanese name] rows from every <category> in the XML master.
tree = etree.parse("../grants_masterxml_kaken/category_master_kakenhi.xml")
categorylist = []
for table in tree.iterfind("category_table"):
    categorylist.extend(
        [node.find("code[@type='nii']").text, node.find("name[@lang='ja']").text]
        for node in table.iterfind("category")
    )
# Convert the rows to a DataFrame and drop rows that are exact duplicates.
columns = ["category_niicode", "category_name"]
df = pd.DataFrame(categorylist, columns=columns).drop_duplicates()
# The code must be unique before it can serve as the index.
has_duplicate_codes = df["category_niicode"].duplicated().any()
assert not has_duplicate_codes, "category_niicode is duplicated."
df = df.set_index("category_niicode")
df

In [None]:
# Drop the foreign key constraint category_niicode_1 on grantaward before the
# table is rewritten (presumably it references categories -- confirm against the schema).
# This step is best-effort: a database error (e.g. the constraint or the table
# does not exist) is reported and skipped instead of aborting the cell.
try:
    with engine.connect() as con:
        con.execute("ALTER TABLE grantaward DROP FOREIGN KEY category_niicode_1;")
except SQLAlchemyError as exc:
    # Only database errors are tolerated; KeyboardInterrupt, NameError, etc. propagate.
    print(f"Skipped dropping foreign key category_niicode_1: {exc}")
# Write the DataFrame to the "categories" table, replacing any existing table.
df.to_sql(
    "categories",
    engine,
    if_exists="replace",
    dtype={"category_niicode": Integer, "category_name": String(255)},
)

## 2. 研究分野マスタ

In [None]:
def _iter_field_rows(parent, table_attrs, layer=1, max_layer=4):
    """Yield one row per nested <field> element, parent before its children.

    Each row is ``[*table_attrs, path, nii code, Japanese name, layer]``.
    Descent stops once ``max_layer`` is reached, so deeper elements are ignored.
    """
    for node in parent.iterfind("field"):
        yield [
            *table_attrs,
            node.get("path"),
            node.find("code[@type='nii']").text,
            node.find("name[@lang='ja']").text,
            layer,
        ]
        if layer < max_layer:
            yield from _iter_field_rows(node, table_attrs, layer + 1, max_layer)


# Build the row list from the XML master, walking up to four nested layers of <field>.
tree = etree.parse("../grants_masterxml_kaken/field_master_kakenhi.xml")
fieldlist = []
for field_table in tree.iterfind("field_table"):
    # Attributes of the enclosing table are repeated on each of its rows.
    table_attrs = [field_table.get(key) for key in ("type", "start_date", "end_date")]
    fieldlist.extend(_iter_field_rows(field_table, table_attrs))
# Convert the rows to a DataFrame.
columns = [
    "field_table_type",
    "field_table_start_date",
    "field_table_end_date",
    "field_path",
    "field_niicode",
    "field_name",
    "layer",
]
df = pd.DataFrame(fieldlist, columns=columns)
df

In [None]:
# Drop the foreign key constraint field_niicode_1 on grantaward before the
# table is rewritten (presumably it references fields -- confirm against the schema).
# This step is best-effort: a database error (e.g. the constraint or the table
# does not exist) is reported and skipped instead of aborting the cell.
try:
    with engine.connect() as con:
        con.execute("ALTER TABLE grantaward DROP FOREIGN KEY field_niicode_1;")
except SQLAlchemyError as exc:
    # Only database errors are tolerated; KeyboardInterrupt, NameError, etc. propagate.
    print(f"Skipped dropping foreign key field_niicode_1: {exc}")
# Write the DataFrame to the "fields" table, replacing any existing table.
df.to_sql(
    "fields",
    engine,
    if_exists="replace",
    dtype={
        "field_table_type": String(255),
        "field_table_start_date": Date,
        "field_table_end_date": Date,
        "field_path": String(255),
        "field_niicode": Integer,
        "field_name": String(255),
        "layer": Integer,
    },
)

In [None]:
# Add indexes on the fields table for field_niicode and field_path.
# Note: only indexes are added here; no primary key or foreign key is created.
index_statements = (
    "ALTER TABLE fields ADD INDEX (field_niicode);",
    "ALTER TABLE fields ADD INDEX (field_path);",
)
with engine.connect() as con:
    for statement in index_statements:
        con.execute(statement)

## 3. 研究機関マスタ

In [None]:
# Build the row list from the XML master: one row per <institution>.
tree = etree.parse("../grants_masterxml_kaken/institution_master_kakenhi.xml")
institutionlist = []
for institution_table in tree.iterfind("institution_table"):
    for institution in institution_table.iterfind("institution"):
        institution_name_ja = institution.find("name[@lang='ja']").text
        institution_name_en = institution.find("name[@lang='en']").text
        institution_niicode = institution.find("code[@type='nii']").text
        institution_mextcode = institution.find("code[@type='mext']").text
        # The JSPS code element may be absent; record None in that case.
        # Compare with None explicitly: an lxml element without children is falsy.
        jsps_element = institution.find("code[@type='jsps']")
        institution_jspscode = jsps_element.text if jsps_element is not None else None
        row = [
            institution_niicode,
            institution_mextcode,
            institution_jspscode,
            institution_name_ja,
            institution_name_en,
        ]
        institutionlist.append(row)
# Convert the rows to a DataFrame.
columns = [
    "institution_niicode",
    "institution_mextcode",
    "institution_jspscode",
    "institution_name_ja",
    "institution_name_en",
]
df = pd.DataFrame(institutionlist, columns=columns)
# Drop exact duplicate rows, verify institution_niicode is unique, then use it as the index.
df = df.drop_duplicates()
assert not df["institution_niicode"].duplicated().any(), "institution_niicode is duplicated."
df = df.set_index("institution_niicode")
df

In [None]:
# Drop the foreign key constraint institution_niicode_1 on grantaward before the
# table is rewritten (presumably it references institutions -- confirm against the schema).
# This step is best-effort: a database error (e.g. the constraint or the table
# does not exist) is reported and skipped instead of aborting the cell.
try:
    with engine.connect() as con:
        con.execute("ALTER TABLE grantaward DROP FOREIGN KEY institution_niicode_1;")
except SQLAlchemyError as exc:
    # Only database errors are tolerated; KeyboardInterrupt, NameError, etc. propagate.
    print(f"Skipped dropping foreign key institution_niicode_1: {exc}")
# Write the DataFrame to the "institutions" table, replacing any existing table.
# The dtype key must match the DataFrame column name exactly ("institution_mextcode");
# a misspelled key does not match any column, so the requested type is not applied.
df.to_sql(
    "institutions",
    engine,
    if_exists="replace",
    dtype={
        "institution_niicode": Integer,
        "institution_mextcode": String(255),
        "institution_jspscode": String(255),
    },
)

## 4. 審査区分マスタ

In [None]:
def _iter_review_section_rows(parent, table_attrs, layer=1, max_layer=3):
    """Yield one row per nested <review_section> element, parent before its children.

    Each row is ``[*table_attrs, path, nii code, Japanese name, layer]``.
    Descent stops once ``max_layer`` is reached, so deeper elements are ignored.
    """
    for node in parent.iterfind("review_section"):
        yield [
            *table_attrs,
            node.get("path"),
            node.find("code[@type='nii']").text,
            node.find("name[@lang='ja']").text,
            layer,
        ]
        if layer < max_layer:
            yield from _iter_review_section_rows(node, table_attrs, layer + 1, max_layer)


# Build the row list from the XML master, walking up to three nested layers of <review_section>.
tree = etree.parse("../grants_masterxml_kaken/review_section_master_kakenhi.xml")
review_sectionlist = []
for review_section_table in tree.iterfind("review_section_table"):
    # Attributes of the enclosing table are repeated on each of its rows.
    table_attrs = [
        review_section_table.get(key) for key in ("type", "start_date", "end_date")
    ]
    review_sectionlist.extend(_iter_review_section_rows(review_section_table, table_attrs))
# Convert the rows to a DataFrame.
columns = [
    "review_section_table_type",
    "review_section_table_start_date",
    "review_section_table_end_date",
    "review_section_path",
    "review_section_niicode",
    "review_section_name",
    "layer",
]
df = pd.DataFrame(review_sectionlist, columns=columns)
df

In [None]:
# Drop the foreign key constraint review_section_niicode_1 on grantaward before the
# table is rewritten (presumably it references review_sections -- confirm against the schema).
# This step is best-effort: a database error (e.g. the constraint or the table
# does not exist) is reported and skipped instead of aborting the cell.
try:
    with engine.connect() as con:
        con.execute("ALTER TABLE grantaward DROP FOREIGN KEY review_section_niicode_1;")
except SQLAlchemyError as exc:
    # Only database errors are tolerated; KeyboardInterrupt, NameError, etc. propagate.
    print(f"Skipped dropping foreign key review_section_niicode_1: {exc}")
# Write the DataFrame to the "review_sections" table, replacing any existing table.
df.to_sql(
    "review_sections",
    engine,
    if_exists="replace",
    dtype={
        "review_section_table_type": String(255),
        "review_section_table_start_date": Date,
        "review_section_table_end_date": Date,
        "review_section_path": String(255),
        "review_section_niicode": Integer,
        "layer": Integer,
    },
)

In [None]:
# Add an index on review_sections.review_section_niicode.
# Note: only an index is added here; no primary key or foreign key is created.
add_index_sql = "ALTER TABLE review_sections ADD INDEX (review_section_niicode);"
with engine.connect() as con:
    con.execute(add_index_sql)

## 5. 応募区分マスタ

In [None]:
# Build [nii code, Japanese name] rows from every <section> in the XML master.
tree = etree.parse("../grants_masterxml_kaken/section_master_kakenhi.xml")
sectionlist = []
for table in tree.iterfind("section_table"):
    sectionlist.extend(
        [node.find("code[@type='nii']").text, node.find("name[@lang='ja']").text]
        for node in table.iterfind("section")
    )
# Convert the rows to a DataFrame and drop rows that are exact duplicates.
columns = ["section_niicode", "section_name"]
df = pd.DataFrame(sectionlist, columns=columns).drop_duplicates()
# The code must be unique before it can serve as the index.
has_duplicate_codes = df["section_niicode"].duplicated().any()
assert not has_duplicate_codes, "section_niicode is duplicated."
df = df.set_index("section_niicode")
df

In [None]:
# Drop the foreign key constraint section_niicode_1 on grantaward before the
# table is rewritten (presumably it references sections -- confirm against the schema).
# This step is best-effort: a database error (e.g. the constraint or the table
# does not exist) is reported and skipped instead of aborting the cell.
try:
    with engine.connect() as con:
        con.execute("ALTER TABLE grantaward DROP FOREIGN KEY section_niicode_1;")
except SQLAlchemyError as exc:
    # Only database errors are tolerated; KeyboardInterrupt, NameError, etc. propagate.
    print(f"Skipped dropping foreign key section_niicode_1: {exc}")
# Write the DataFrame to the "sections" table, replacing any existing table.
df.to_sql("sections", engine, if_exists="replace", dtype={"section_niicode": Integer})

ここまででマスタが完成