### 식품 DB

In [20]:
# 식품섭취조사 7차 / 8차 

import pandas as pd
import olefile
import zlib
import struct

# 7차 

def get_hwp_text(filename):
    """Extract the paragraph text of an HWP 5.x document (7th-survey file).

    The file is opened as an OLE compound document. Every
    ``BodyText/SectionN`` stream is read in numeric order, raw-inflated when
    the compression flag in ``FileHeader`` is set, and walked record by
    record. The payload of each record with tag 67 is decoded as UTF-16
    little-endian text and followed by ``"\\n"``; one extra ``"\\n"`` is
    appended after each section.

    Args:
        filename: Path (or file-like object) accepted by
            ``olefile.OleFileIO``.

    Returns:
        str: The concatenated text of all sections.

    Raises:
        ValueError: If the ``FileHeader`` or ``\\x05HwpSummaryInformation``
            stream is missing, or ``FileHeader`` is too short to hold the
            property flags.
    """
    # Exact record headers that are dropped even though their tag is 67.
    # Each has bits 10-19 equal to 1 and a payload of 50, 18 and 82 bytes
    # respectively (0x03200443, 0x01200443, 0x05200443). The debug notes kept
    # further down in this file show these payloads decoding to CJK residue
    # rather than real text.
    skipped_headers = (52429891, 18875459, 85984323)

    f = olefile.OleFileIO(filename)
    try:
        dirs = f.listdir()

        # Validate that this looks like an HWP container.
        if ["FileHeader"] not in dirs or \
           ["\x05HwpSummaryInformation"] not in dirs:
            raise ValueError("Not Valid HWP.")

        # Bit 0 of the byte at offset 36 of FileHeader is the compression flag.
        header_data = f.openstream("FileHeader").read()
        if len(header_data) < 37:
            raise ValueError("Not Valid HWP.")
        is_compressed = (header_data[36] & 1) == 1

        # Collect the body section numbers so they can be read in order.
        nums = sorted(
            int(d[1][len("Section"):])
            for d in dirs
            if len(d) == 2 and d[0] == "BodyText" and d[1].startswith("Section")
        )

        pieces = []
        for num in nums:
            data = f.openstream("BodyText/Section" + str(num)).read()
            if is_compressed:
                # Section streams are raw deflate data (no zlib header).
                data = zlib.decompress(data, -15)

            # Walk the record stream of this section.
            i = 0
            size = len(data)
            while i < size:
                header = struct.unpack_from("<I", data, i)[0]
                rec_type = header & 0x3ff
                rec_len = (header >> 20) & 0xfff
                i += 4

                # A size field of 0xFFF means the real payload length is
                # stored in the following 32-bit word (HWP 5.0 record header).
                if rec_len == 0xfff:
                    rec_len = struct.unpack_from("<I", data, i)[0]
                    i += 4

                if rec_type == 67 and header not in skipped_headers:
                    # HWP stores text as little-endian UTF-16; naming the byte
                    # order avoids platform/BOM-dependent decoding.
                    pieces.append(data[i:i + rec_len].decode("utf-16-le"))
                    pieces.append("\n")

                i += rec_len

            pieces.append("\n")

        return "".join(pieces)
    finally:
        # Always release the underlying file handle.
        f.close()

In [30]:
# 8차

def get_hwp_text2(filename):
    """Extract the paragraph text of an HWP 5.x document (8th-survey file).

    Works like the 7th-survey extractor but drops a different set of record
    headers. The file is opened as an OLE compound document; each
    ``BodyText/SectionN`` stream is read in numeric order, raw-inflated when
    the compression flag in ``FileHeader`` is set, and scanned record by
    record. Payloads of records with tag 67 are decoded as UTF-16
    little-endian text, each followed by ``"\\n"``, and every section is
    terminated by one more ``"\\n"``.

    Args:
        filename: Path (or file-like object) accepted by
            ``olefile.OleFileIO``.

    Returns:
        str: The concatenated text of all sections.

    Raises:
        ValueError: If the ``FileHeader`` or ``\\x05HwpSummaryInformation``
            stream is missing, or ``FileHeader`` is too short to hold the
            property flags.
    """
    # Exact record headers that are dropped even though their tag is 67:
    # 0x03200443 (50-byte payload) and 0x01200443 (18-byte payload), both
    # with bits 10-19 equal to 1. The debug notes kept further down in this
    # file show these payloads decoding to CJK residue rather than real text.
    skipped_headers = (52429891, 18875459)

    ole = olefile.OleFileIO(filename)
    try:
        entries = ole.listdir()

        # Validate that this looks like an HWP container.
        if ["FileHeader"] not in entries or \
           ["\x05HwpSummaryInformation"] not in entries:
            raise ValueError("Not Valid HWP.")

        # Bit 0 of the byte at offset 36 of FileHeader is the compression flag.
        file_header = ole.openstream("FileHeader").read()
        if len(file_header) < 37:
            raise ValueError("Not Valid HWP.")
        is_compressed = (file_header[36] & 1) == 1

        # Section numbers, sorted so the text comes out in document order.
        section_numbers = sorted(
            int(entry[1][len("Section"):])
            for entry in entries
            if len(entry) == 2
            and entry[0] == "BodyText"
            and entry[1].startswith("Section")
        )

        parts = []
        for number in section_numbers:
            payload = ole.openstream("BodyText/Section" + str(number)).read()
            if is_compressed:
                # Section streams are raw deflate data (no zlib header).
                payload = zlib.decompress(payload, -15)

            pos = 0
            end = len(payload)
            while pos < end:
                header = struct.unpack_from("<I", payload, pos)[0]
                rec_type = header & 0x3ff
                rec_len = (header >> 20) & 0xfff
                pos += 4

                # A size field of 0xFFF means the real payload length is
                # stored in the following 32-bit word (HWP 5.0 record header).
                if rec_len == 0xfff:
                    rec_len = struct.unpack_from("<I", payload, pos)[0]
                    pos += 4

                if rec_type == 67 and header not in skipped_headers:
                    # HWP stores text as little-endian UTF-16; naming the byte
                    # order avoids platform/BOM-dependent decoding.
                    parts.append(payload[pos:pos + rec_len].decode("utf-16-le"))
                    parts.append("\n")

                pos += rec_len

            parts.append("\n")

        return "".join(parts)
    finally:
        # Always release the underlying file handle.
        ole.close()

In [None]:
# Record of the entries that came out as Chinese characters.
# Each pair of lines below is a raw record header value followed by the text
# that record decoded to. The format matches the commented-out print()
# statements in the extractor functions, so this is presumably their output
# (NOTE(review): confirm). The header values listed here are the ones the
# extractors exclude. The strings are bare literals kept only as notes; they
# are evaluated and discarded.

# 7th survey
'''
52429891
捤獥汤捯氠瑢
18875459
氠瑢
18875459
氠瑢
18875459
氠瑢
18875459
氠瑢
18875459
氠瑢
18875459
氠瑢
18875459
氠瑢
85984323
氠瑢
'''

# 8th survey
'''
52429891
捤獥
18875459
氠瑢
18875459
氠瑢
18875459
氠瑢
18875459
氠瑢
18875459
氠瑢
18875459
氠瑢
18875459
氠瑢
18875459
氠瑢
18875459
桤灧
'''

In [None]:
# Load the HWP files: extract the raw text of the 7th and 8th food-intake
# survey documents with their respective extractor.

식품섭취조사7차 = get_hwp_text(
    filename='/sas_view/식품섭취조사_7.hwp',
)
식품섭취조사8차 = get_hwp_text2(
    filename='/sas_view/식품섭취조사_8.hwp',
)

In [None]:
# Split each extracted text on '\r\n'; the names are rebound from a single
# string to a list of strings.

식품섭취조사7차 = 식품섭취조사7차.split(sep='\r\n')
식품섭취조사8차 = 식품섭취조사8차.split(sep='\r\n')

In [None]:
## Build dictionaries: every even-indexed entry becomes a key and the entry
## right after it becomes its value. A trailing entry without a partner is
## ignored, and a repeated key keeps the last value seen.

식품7차_dic = dict(zip(식품섭취조사7차[0::2], 식품섭취조사7차[1::2]))

식품8차_dic = dict(zip(식품섭취조사8차[0::2], 식품섭취조사8차[1::2]))

In [None]:
# Create the data frames: one row per dictionary entry, with the key in the
# first column and the value in the second.

식품7차_df = pd.DataFrame(
    [(key, value) for key, value in 식품7차_dic.items()],
    columns=['변수명', '변수설명'],
)

식품8차_df = pd.DataFrame(
    [(key, value) for key, value in 식품8차_dic.items()],
    columns=['변수명', '변수설명'],
)

In [37]:
## Save the data frames as CSV files.
## cp949 is used instead of euc-kr: it is a superset that produces the same
## bytes for every character euc-kr can encode, but it also covers Hangul
## syllables outside KS X 1001, which make the euc-kr codec raise
## UnicodeEncodeError.

식품7차_df.to_csv('rc7.csv', encoding='cp949')
식품8차_df.to_csv('rc8.csv', encoding='cp949')