# YAML関連プログラム 実行例

今回検討したYAML仕様に基づくプログラムの実行例です。  
主に以下の処理を行う関数を用意しています。  

- 符号表間のマッピング情報のYAMLファイルをまとめて１つのJSONファイルにする
- 変数間のマッピング情報からプログラムを生成する
- データフレームとテキストからYAMLファイルを作成する
- YAMLファイルをデータフレームに変換する

In [1]:
from newmeta.lib.util_yaml import * 
import pandas as pd
import yaml

In [2]:
# Merge the YAML files that hold code-table mapping information into a single JSON file.
# NOTE(review): presumably the first path is the source directory of YAML files and
# the second is the JSON file to write — confirm against create_codebook_file.
create_codebook_file('newmeta/data/code_mapping', 'newmeta/sample/codebook.json')

In [3]:
# Generate a program from the variable-to-variable mapping information.

column_yaml = 'newmeta/data/column.yaml'

# Parse the mapping definition; safe_load is used so only plain data is constructed.
with open(column_yaml, mode='r', encoding='utf-8') as stream:
    mapping = yaml.safe_load(stream)

# NOTE(review): the second argument looks like the output location — confirm
# against render_tpl.
render_tpl(mapping, 'newmeta/sample')

In [None]:
# Build the variable-mapping YAML text from a data frame plus free text.
# Columns after 'dict' are named <entry>_id and <entry>_val.

census_years = ["2000", "2005", "2010", "2015"]

# Descriptive columns: one list element per mapped variable.
columns = {
    'var': ['age', 'inctype', 'gender', 'employment'],
    'label': ['Age', 'Type of Household Income', 'Gender', 'Employment Status'],
    'type': ['dict', 'copy', 'assign', 'skip'],
    'dict': ['age2', None, None, None],
}

# Per-entry (id, val) column contents, keyed by entry name.
per_year = {
    "2000": ([21, 15, None, None], ['age', 'income', '2000', None]),
    "2005": ([20, None, None, None], ['age', None, '2005', None]),
    "2010": ([20, None, None, None], ['age', None, '2010', None]),
    "2015": ([19, None, None, None], ['age', None, '2015', None]),
}

# Append the pairs in entry order so the column layout is
# 2000_id, 2000_val, 2005_id, 2005_val, ...
for year in census_years:
    ids, vals = per_year[year]
    columns[year + '_id'] = ids
    columns[year + '_val'] = vals

df = pd.DataFrame(columns)

# Render the per-variable section first, then wrap it with the header fields.
map_text = create_vmap_elements(df)
vmap = generate_vmap_yaml(
    title='Population Census of Japan to IPUMS',
    entries=census_years,
    notes='',
    map_str=map_text,
)

print(vmap)

title: Population Census of Japan to IPUMS
entries: ['2000', '2005', '2010', '2015']
notes: ""
map: 
  age:
    label: Age
    type: dict
    dict: age2
    vars: [21: age, 20: age, 20: age, 19: age]

  inctype:
    label: Type of Household Income
    type: copy
    vars: [15: income, null, null, null]

  gender:
    label: Gender
    type: assign
    values: ['2000', '2005', '2010', '2015']

  employment:
    label: Employment Status
    type: skip


In [5]:
# Convert the variable-mapping YAML (the text held in `vmap`) back into a data frame.

vmap_df = create_vmapdf(vmap)
print(vmap_df)

          var                     label    type  dict  2000_id 2000_val  \
0         age                       Age    dict  age2     21.0      age   
1     inctype  Type of Household Income    copy  None     15.0   income   
2      gender                    Gender  assign  None      NaN     2000   
3  employment         Employment Status    skip  None      NaN     None   

   2005_id 2005_val  2010_id 2010_val  2015_id 2015_val  
0     20.0      age     20.0      age     19.0      age  
1      NaN     None      NaN     None      NaN     None  
2      NaN     2005      NaN     2010      NaN     2015  
3      NaN     None      NaN     None      NaN     None  


In [None]:
# Build the code-mapping YAML text from a data frame plus free text.
# Columns after 'code' are named <entry>_code, <entry>_com1 and <entry>_com2.

code_table = {
    'code': [' 2', ' 7', '12'],
    '2000_code': ['1', '2', '3'],
    '2000_com1': ['0 to 4', '5 to 9', '10 to 14'],
    '2000_com2': ['2 years', '7', '12'],
    '2005_code': ['1', '2', '3'],
    '2005_com1': ['0 to 4: 2 years', '5 to 9: 7', '10 to 14: 12'],
    '2010_code': ['1', '2', '3'],
}
# Every column is forced to str so the codes keep their leading spaces.
df = pd.DataFrame(code_table, dtype=str)

age_notes = (
    'AGE is a continuous variable. '
    'For samples that report age in 5-year groups, such as Japan, '
    'IPUMS codes AGE to the mid-point of the interval. '
    'In the Japanese censuses, AGE is top-coded at 85+ in 2000 and 2005 '
    'and 90+ in 2010 and 2015.'
)

# Render the per-entry section first, then wrap it with the header fields.
entry_text = create_cmap_elements(df)
cmap = generate_cmap_yaml(
    name='age',
    from_='Population Census of Japan',
    to='IPUMS',
    notes=age_notes,
    entries=entry_text,
)

print(cmap)

name: age
from: Population Census of Japan
to: IPUMS
notes: "AGE is a continuous variable. For samples that report age in 5-year groups, such as Japan, IPUMS codes AGE to the mid-point of the interval. In the Japanese censuses, AGE is top-coded at 85+ in 2000 and 2005 and 90+ in 2010 and 2015."
entries:
  "2000":
    "1": " 2" # 0 to 4: 2 years
    "2": " 7" # 5 to 9: 7
    "3": "12" # 10 to 14: 12
  "2005":
    "1": " 2" # 0 to 4: 2 years
    "2": " 7" # 5 to 9: 7
    "3": "12" # 10 to 14: 12
  "2010":
    "1": " 2"
    "2": " 7"
    "3": "12"


In [7]:
# Convert the code-mapping YAML (the text held in `cmap`) back into a data frame.

cmap_df = create_cmapdf(cmap)
print(cmap_df)

  code 2000_code 2000_com1 2000_com2 2005_code 2005_com1 2005_com2 2010_code  \
0    2         1    0 to 4   2 years         1    0 to 4   2 years         1   
1    7         2    5 to 9         7         2    5 to 9         7         2   
2   12         3  10 to 14        12         3  10 to 14        12         3   

  2010_com1 2010_com2  
0                      
1                      
2                      
