# KAKENの各種マスタをローカルのDBに保存する

## 事前準備

- KAKENマスタデータは、git のリポジトリで管理されており、最新のデータを利用可能。https://bitbucket.org/niijp/grants_masterxml_kaken/ からリポジトリを pull して、ローカルの ./grants_masterxml_kaken フォルダに同期しておく。

1. 研究種目：category_master_kakenhi.xml
2. 研究分野：field_master_kakenhi.xml
3. 研究機関：institution_master_kakenhi.xml
4. 審査区分：review_section_master_kakenhi.xml
5. 応募区分：section_master_kakenhi.xml

In [1]:
import configparser

import numpy as np
import pandas as pd
import pymysql
from lxml import etree
from sqlalchemy import create_engine
from sqlalchemy.types import Date, Integer, String, Text

In [2]:
config = configparser.ConfigParser()
config.read("../kaken_parse_grants_masterxml/config.ini")
username = config["mariadb"]["username"]
password = config["mariadb"]["password"]
url = (
    "mysql+pymysql://"
    + username
    + ":"
    + password
    + "@localhost:3306/"
    + "kaken"
    + "?charset=UTF8MB4"
)
engine = create_engine(url, echo=True)

## 1. 研究種目マスタ

In [3]:
# XMLファイルからリストを作成する
tree = etree.parse("../grants_masterxml_kaken/category_master_kakenhi.xml")
categorylist = []
for category_table in tree.iterfind("category_table"):
    for category in category_table.iterfind("category"):
        name = category.find("name[@lang='ja']").text
        niicode = category.find("code[@type='nii']").text
        row = [niicode, name]
        categorylist.append(row)
# リストをデータフレームに変換する
columns = ["category_niicode", "category_name"]
df = pd.DataFrame(categorylist, columns=columns)
# 重複を削除して、category_niicodeに重複がないことを確認して、インデックスに設定する
df = df.drop_duplicates()
assert not df["category_niicode"].duplicated().any(), "category_niicode is duplicated."
df = df.set_index("category_niicode")
df

Unnamed: 0_level_0,category_name
category_niicode,Unnamed: 1_level_1
1,機関研究
2,各個研究
3,特定研究
4,総合研究
5,試験研究
6,海外学術調査
7,研究成果刊行費
8,奨励研究
9,がん特別研究
10,総合研究(A)


In [4]:
# 外部キー制約を外す
try:
    with engine.connect() as con:
        con.execute("ALTER TABLE grantaward DROP FOREIGN KEY category_niicode_1;")
except:
    pass
# データベースに書き込む
df.to_sql(
    "categories",
    engine,
    if_exists="replace",
    dtype={"category_niicode": Integer, "category_name": String(255)},
)

2020-05-23 09:24:35,432 INFO sqlalchemy.engine.base.Engine SHOW VARIABLES LIKE 'sql_mode'
2020-05-23 09:24:35,433 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:24:35,439 INFO sqlalchemy.engine.base.Engine SELECT DATABASE()
2020-05-23 09:24:35,440 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:24:35,443 INFO sqlalchemy.engine.base.Engine show collation where `Charset` = 'utf8' and `Collation` = 'utf8_bin'
2020-05-23 09:24:35,444 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:24:35,449 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS CHAR(60)) AS anon_1
2020-05-23 09:24:35,450 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:24:35,452 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS CHAR(60)) AS anon_1
2020-05-23 09:24:35,453 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:24:35,455 INFO sqlalchemy.engine.base.Engine SELECT CAST('test collated returns' AS CHAR CHARACTER SET utf8) COLLATE utf8_bin AS anon_1
2020-05-23 09

## 2. 研究分野マスタ

In [5]:
# XMLファイルからリストを作成する
tree = etree.parse("../grants_masterxml_kaken/field_master_kakenhi.xml")
fieldlist = []
for field_table in tree.iterfind("field_table"):
    field_table_type = field_table.get("type")
    field_table_start_date = field_table.get("start_date")
    field_table_end_date = field_table.get("end_date")
    # layer 1
    for field in field_table.iterfind("field"):
        field_path = field.get("path")
        field_niicode = field.find("code[@type='nii']").text
        field_name = field.find("name[@lang='ja']").text
        row = [
            field_table_type,
            field_table_start_date,
            field_table_end_date,
            field_path,
            field_niicode,
            field_name,
            1,
        ]
        fieldlist.append(row)
        # layer 2
        for field in field.iterfind("field"):
            field_path = field.get("path")
            field_niicode = field.find("code[@type='nii']").text
            field_name = field.find("name[@lang='ja']").text
            row = [
                field_table_type,
                field_table_start_date,
                field_table_end_date,
                field_path,
                field_niicode,
                field_name,
                2,
            ]
            fieldlist.append(row)
            # layer 3
            for field in field.iterfind("field"):
                field_path = field.get("path")
                field_niicode = field.find("code[@type='nii']").text
                field_name = field.find("name[@lang='ja']").text
                row = [
                    field_table_type,
                    field_table_start_date,
                    field_table_end_date,
                    field_path,
                    field_niicode,
                    field_name,
                    3,
                ]
                fieldlist.append(row)
                # layer 4
                for field in field.iterfind("field"):
                    field_path = field.get("path")
                    field_niicode = field.find("code[@type='nii']").text
                    field_name = field.find("name[@lang='ja']").text
                    row = [
                        field_table_type,
                        field_table_start_date,
                        field_table_end_date,
                        field_path,
                        field_niicode,
                        field_name,
                        4,
                    ]
                    fieldlist.append(row)
# リストをデータフレームに変換する
columns = [
    "field_table_type",
    "field_table_start_date",
    "field_table_end_date",
    "field_path",
    "field_niicode",
    "field_name",
    "layer",
]
df = pd.DataFrame(fieldlist, columns=columns)
df

Unnamed: 0,field_table_type,field_table_start_date,field_table_end_date,field_path,field_niicode,field_name,layer
0,saimoku,1972-04-01,1973-03-31,000001,1,文学,1
1,saimoku,1972-04-01,1973-03-31,000001000002,2,哲学,2
2,saimoku,1972-04-01,1973-03-31,000001000002000003,3,哲学,3
3,saimoku,1972-04-01,1973-03-31,000001000002000004,4,中国哲学,3
4,saimoku,1972-04-01,1973-03-31,000001000002000005,5,印度哲学(含仏教学),3
5,saimoku,1972-04-01,1973-03-31,000001000002000006,6,宗教学,3
6,saimoku,1972-04-01,1973-03-31,000001000002000007,7,倫理学,3
7,saimoku,1972-04-01,1973-03-31,000001000002000008,8,美学(含芸術諸学),3
8,saimoku,1972-04-01,1973-03-31,000001000002000009,9,美術史,3
9,saimoku,1972-04-01,1973-03-31,000001000010,10,心理学・社会学・教育学・文化人類学,2


In [6]:
# 外部キー制約を外す
try:
    with engine.connect() as con:
#        con.execute("ALTER TABLE grantaward DROP FOREIGN KEY field_niicode_1;")
        con.execute("ALTER TABLE grantaward_field DROP FOREIGN KEY fk_grantaward_field_field_niicode;")
        con.execute("ALTER TABLE grantaward_field DROP FOREIGN KEY fk_grantaward_field_field_path;")
except:
    pass
# データベースに書き込む
df.to_sql(
    "fields",
    engine,
    if_exists="replace",
    dtype={
        "field_table_type": String(255),
        "field_table_start_date": Date,
        "field_table_end_date": Date,
        "field_path": String(255),
        "field_niicode": Integer,
        "field_name": String(255),
        "layer": Integer,
    },
)

2020-05-23 09:24:57,287 INFO sqlalchemy.engine.base.Engine ALTER TABLE grantaward_field DROP FOREIGN KEY fk_grantaward_field_field_niicode;
2020-05-23 09:24:57,288 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:24:57,295 INFO sqlalchemy.engine.base.Engine ROLLBACK
2020-05-23 09:24:57,299 INFO sqlalchemy.engine.base.Engine DESCRIBE `fields`
2020-05-23 09:24:57,300 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:24:57,303 INFO sqlalchemy.engine.base.Engine DESCRIBE `fields`
2020-05-23 09:24:57,304 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:24:57,309 INFO sqlalchemy.engine.base.Engine SHOW FULL TABLES FROM `kaken`
2020-05-23 09:24:57,310 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:24:57,313 INFO sqlalchemy.engine.base.Engine SHOW CREATE TABLE `fields`
2020-05-23 09:24:57,314 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:24:57,320 INFO sqlalchemy.engine.base.Engine 
DROP TABLE `fields`
2020-05-23 09:24:57,321 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 

In [7]:
# 主キーと外部キー制約を設定する
with engine.connect() as con:
    con.execute("ALTER TABLE fields ADD INDEX (field_niicode);")
    con.execute("ALTER TABLE fields ADD INDEX (field_path);")

2020-05-23 09:25:35,429 INFO sqlalchemy.engine.base.Engine ALTER TABLE fields ADD INDEX (field_niicode);
2020-05-23 09:25:35,430 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:25:35,453 INFO sqlalchemy.engine.base.Engine COMMIT
2020-05-23 09:25:35,455 INFO sqlalchemy.engine.base.Engine ALTER TABLE fields ADD INDEX (field_path);
2020-05-23 09:25:35,457 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:25:35,483 INFO sqlalchemy.engine.base.Engine COMMIT


# 3. 研究機関マスタ

In [8]:
# XMLファイルからリストを作成する
tree = etree.parse("../grants_masterxml_kaken/institution_master_kakenhi.xml")
institutionlist = []
for institution_table in tree.iterfind("institution_table"):
    for institution in institution_table.iterfind("institution"):
        institution_name_ja = institution.find("name[@lang='ja']").text
        institution_name_en = institution.find("name[@lang='en']").text
        institution_niicode = institution.find("code[@type='nii']").text
        institution_mextcode = institution.find("code[@type='mext']").text
        try:
            institution_jspscode = institution.find("code[@type='jsps']").text
        except AttributeError:
            institution_jspscode = None
        row = [
            institution_niicode,
            institution_mextcode,
            institution_jspscode,
            institution_name_ja,
            institution_name_en,
        ]
        institutionlist.append(row)
# リストからデータフレームに変換する
columns = [
    "institution_niicode",
    "institution_mextcode",
    "institution_jspscode",
    "institution_name_ja",
    "institution_name_en",
]
df = pd.DataFrame(institutionlist, columns=columns)
# 重複を削除して、category_niicodeに重複がないことを確認して、インデックスに設定する
df = df.drop_duplicates()
assert not df["institution_niicode"].duplicated().any(), "category_niicode is duplicated."
df = df.set_index("institution_niicode")
df

Unnamed: 0_level_0,institution_mextcode,institution_jspscode,institution_name_ja,institution_name_en
institution_niicode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0010101,10101,10101,北海道大学,Hokkaido University
0010102,10102,10102,北海道教育大学,Hokkaido University of Education
0010103,10103,10103,室蘭工業大学,Muroran Institute of Technology
0010104,10104,10104,小樽商科大学,Otaru University of Commerce
0010105,10105,10105,帯広畜産大学,Obihiro University of Agriculture and Veterina...
0010106,10106,10106,北見工業大学,Kitami Institute of Technology
0010107,10107,10107,旭川医科大学,Asahikawa Medical College
0011101,11101,11101,弘前大学,Hirosaki University
0011201,11201,11201,岩手大学,Iwate University
0011301,11301,11301,東北大学,Tohoku University


In [9]:
# 外部キー制約を外す
try:
    with engine.connect() as con:
        con.execute("ALTER TABLE grantaward DROP FOREIGN KEY institution_niicode_1;")
except:
    pass
# データベースに書き込む
df.to_sql(
    "institutions",
    engine,
    if_exists="replace",
    dtype={
        "institution_niicode": Integer,
        "institution_mextcodei": String(255),
        "institution_jspscode": String(255),
    },
)

2020-05-23 09:25:49,029 INFO sqlalchemy.engine.base.Engine ALTER TABLE grantaward DROP FOREIGN KEY institution_niicode_1;
2020-05-23 09:25:49,030 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:25:49,037 INFO sqlalchemy.engine.base.Engine COMMIT
2020-05-23 09:25:49,047 INFO sqlalchemy.engine.base.Engine DESCRIBE `institutions`
2020-05-23 09:25:49,048 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:25:49,052 INFO sqlalchemy.engine.base.Engine DESCRIBE `institutions`
2020-05-23 09:25:49,053 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:25:49,060 INFO sqlalchemy.engine.base.Engine SHOW FULL TABLES FROM `kaken`
2020-05-23 09:25:49,061 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:25:49,064 INFO sqlalchemy.engine.base.Engine SHOW CREATE TABLE `institutions`
2020-05-23 09:25:49,065 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:25:49,072 INFO sqlalchemy.engine.base.Engine 
DROP TABLE institutions
2020-05-23 09:25:49,073 INFO sqlalchemy.engine.base.Engine {}
2020-05-2

## 4. 審査区分マスタ

In [10]:
# XMLファイルからリストを作成する
tree = etree.parse("../grants_masterxml_kaken/review_section_master_kakenhi.xml")
review_sectionlist = []
for review_section_table in tree.iterfind("review_section_table"):
    review_section_table_type = review_section_table.get("type")
    review_section_table_start_date = review_section_table.get("start_date")
    review_section_table_end_date = review_section_table.get("end_date")
    # layer 1
    for review_section in review_section_table.iterfind("review_section"):
        review_section_path = review_section.get("path")
        review_section_niicode = review_section.find("code[@type='nii']").text
        review_section_name = review_section.find("name[@lang='ja']").text
        row = [
            review_section_table_type,
            review_section_table_start_date,
            review_section_table_end_date,
            review_section_path,
            review_section_niicode,
            review_section_name,
            1,
        ]
        review_sectionlist.append(row)
        # layer 2
        for review_section in review_section.iterfind("review_section"):
            review_section_path = review_section.get("path")
            review_section_niicode = review_section.find("code[@type='nii']").text
            review_section_name = review_section.find("name[@lang='ja']").text
            row = [
                review_section_table_type,
                review_section_table_start_date,
                review_section_table_end_date,
                review_section_path,
                review_section_niicode,
                review_section_name,
                2,
            ]
            review_sectionlist.append(row)
            # layer 3
            for review_section in review_section.iterfind("review_section"):
                review_section_path = review_section.get("path")
                review_section_niicode = review_section.find("code[@type='nii']").text
                review_section_name = review_section.find("name[@lang='ja']").text
                row = [
                    review_section_table_type,
                    review_section_table_start_date,
                    review_section_table_end_date,
                    review_section_path,
                    review_section_niicode,
                    review_section_name,
                    3,
                ]
                review_sectionlist.append(row)
# リストからデータフレームに変換する
columns = [
    "review_section_table_type",
    "review_section_table_start_date",
    "review_section_table_end_date",
    "review_section_path",
    "review_section_niicode",
    "review_section_name",
    "layer",
]
df = pd.DataFrame(review_sectionlist, columns=columns)
df

Unnamed: 0,review_section_table_type,review_section_table_start_date,review_section_table_end_date,review_section_path,review_section_niicode,review_section_name,layer
0,review_section,2018-04-01,,000001,1,大区分A,1
1,review_section,2018-04-01,,000001000002,2,中区分1:思想、芸術およびその関連分野,2
2,review_section,2018-04-01,,000001000002000003,3,小区分01010:哲学および倫理学関連,3
3,review_section,2018-04-01,,000001000002000004,4,小区分01020:中国哲学、印度哲学および仏教学関連,3
4,review_section,2018-04-01,,000001000002000005,5,小区分01030:宗教学関連,3
5,review_section,2018-04-01,,000001000002000006,6,小区分01040:思想史関連,3
6,review_section,2018-04-01,,000001000002000007,7,小区分01050:美学および芸術論関連,3
7,review_section,2018-04-01,,000001000002000008,8,小区分01060:美術史関連,3
8,review_section,2018-04-01,,000001000002000009,9,小区分01070:芸術実践論関連,3
9,review_section,2018-04-01,,000001000002000010,10,小区分01080:科学社会学および科学技術史関連,3


In [11]:
# 外部キー制約を外す
try:
    with engine.connect() as con:
#        con.execute("ALTER TABLE grantaward DROP FOREIGN KEY review_section_niicode_1;")
        con.execute("ALTER TABLE grantaward_review_section DROP FOREIGN KEY fk_grantaward_review_section_review_section_niicode;")
except:
    pass
# データベースに書き込む
df.to_sql(
    "review_sections",
    engine,
    if_exists="replace",
    dtype={
        "review_section_table_type": String(255),
        "review_section_table_start_date": Date,
        "review_section_table_end_date": Date,
        "review_section_path": String(255),
        "review_section_niicode": Integer,
        "layer": Integer,
    },
)

2020-05-23 09:26:10,066 INFO sqlalchemy.engine.base.Engine ALTER TABLE grantaward_review_section DROP FOREIGN KEY fk_grantaward_review_section_review_section_niicode;
2020-05-23 09:26:10,067 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:26:10,074 INFO sqlalchemy.engine.base.Engine ROLLBACK
2020-05-23 09:26:10,080 INFO sqlalchemy.engine.base.Engine DESCRIBE `review_sections`
2020-05-23 09:26:10,081 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:26:10,085 INFO sqlalchemy.engine.base.Engine DESCRIBE `review_sections`
2020-05-23 09:26:10,087 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:26:10,091 INFO sqlalchemy.engine.base.Engine SHOW FULL TABLES FROM `kaken`
2020-05-23 09:26:10,092 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:26:10,095 INFO sqlalchemy.engine.base.Engine SHOW CREATE TABLE `review_sections`
2020-05-23 09:26:10,096 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:26:10,102 INFO sqlalchemy.engine.base.Engine 
DROP TABLE review_sections
2020-05-23 0

In [12]:
# 主キーと外部キー制約を設定する
with engine.connect() as con:
    con.execute("ALTER TABLE review_sections ADD INDEX (review_section_niicode);")

2020-05-23 09:26:19,386 INFO sqlalchemy.engine.base.Engine ALTER TABLE review_sections ADD INDEX (review_section_niicode);
2020-05-23 09:26:19,388 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:26:19,400 INFO sqlalchemy.engine.base.Engine COMMIT


# 5. 応募区分マスタ

In [13]:
# XMLファイルからリストを作成する
tree = etree.parse("../grants_masterxml_kaken/section_master_kakenhi.xml")
sectionlist = []
for section_table in tree.iterfind("section_table"):
    for section in section_table.iterfind("section"):
        section_name = section.find("name[@lang='ja']").text
        section_niicode = section.find("code[@type='nii']").text
        row = [section_niicode, section_name]
        sectionlist.append(row)
# リストをデータフレームに変換する
columns = ["section_niicode", "section_name"]
df = pd.DataFrame(sectionlist, columns=columns)
# 重複を削除して、category_niicodeに重複がないことを確認して、インデックスに設定する
df = df.drop_duplicates()
assert not df["section_niicode"].duplicated().any(), "section_niicode is duplicated."
df = df.set_index('section_niicode')
df

Unnamed: 0_level_0,section_name
section_niicode,Unnamed: 1_level_1
1,本調査
2,成果とりまとめ
3,総括
4,学術定期刊行物
5,学術図書
6,二次刊行物
7,現地調査
8,調査総括
9,がん特別調査
10,現地調査


In [14]:
# 外部キー制約を外す
try:
    with engine.connect() as con:
        con.execute("ALTER TABLE grantaward DROP FOREIGN KEY section_niicode_1;")
except:
    pass
# データベースに書き込む
df.to_sql("sections", engine, if_exists="replace", dtype={"section_niicode": Integer})

2020-05-23 09:26:28,766 INFO sqlalchemy.engine.base.Engine ALTER TABLE grantaward DROP FOREIGN KEY section_niicode_1;
2020-05-23 09:26:28,767 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:26:28,778 INFO sqlalchemy.engine.base.Engine COMMIT
2020-05-23 09:26:28,783 INFO sqlalchemy.engine.base.Engine DESCRIBE `sections`
2020-05-23 09:26:28,784 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:26:28,789 INFO sqlalchemy.engine.base.Engine DESCRIBE `sections`
2020-05-23 09:26:28,790 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:26:28,795 INFO sqlalchemy.engine.base.Engine SHOW FULL TABLES FROM `kaken`
2020-05-23 09:26:28,796 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:26:28,799 INFO sqlalchemy.engine.base.Engine SHOW CREATE TABLE `sections`
2020-05-23 09:26:28,801 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:26:28,804 INFO sqlalchemy.engine.base.Engine 
DROP TABLE sections
2020-05-23 09:26:28,806 INFO sqlalchemy.engine.base.Engine {}
2020-05-23 09:26:28,810 INFO 

ここまででマスタが完成