Reference: https://m.blog.naver.com/PostView.naver?isHttpsRedirect=true&blogId=39954&logNo=50120700741

In [1]:
from jupyterthemes import jtplot
# Set the jupyterthemes plot style for this notebook: 'solarizedl' theme,
# 'notebook' context, axis ticks shown, grid hidden, default figsize (10, 6).
jtplot.style(theme='solarizedl', context='notebook', ticks=True, grid=False, figsize=(10, 6))

In [2]:
from IPython.display import display, Markdown
from tqdm.notebook import tqdm

import random
import requests
import re
from copy import copy
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
# Use the 'Malgun Gothic' font for all plot text. The commented-out command
# below fetches the font file in case it is not installed locally.
# NOTE(review): presumably chosen so Korean labels render correctly — confirm.
# !wget "https://www.wfonts.com/download/data/2016/06/13/malgun-gothic/malgun.ttf"
plt.rc('font', family='Malgun Gothic') 
# Render minus signs with the ASCII hyphen rather than the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

from sklearn.preprocessing import normalize
from nltk.tokenize import word_tokenize
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from gensim.models import Word2Vec, FastText

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

___

## Load and Merge

In [5]:
# Load the main DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
main_df = pd.read_pickle("../data/main_df.pickle")

In [9]:
# Read the ingredient spreadsheet and keep an independent copy of only the
# five columns needed for the merge and the ATC parsing.
ingredient_df = (
    pd.read_excel("../data/ingredient_table.xlsx")
    .loc[:, ['제품명', '업체명', '주성분', '품목기준코드', 'ATC코드']]
    .copy()
)

In [10]:
# Cast the join key to int before joining it against '품목기준코드'.
main_df['itemseq'] = main_df['itemseq'].astype(int)

# Inner join: only rows whose key appears in both frames are kept.
merged_df = main_df.merge(
    ingredient_df,
    how='inner',
    left_on='itemseq',
    right_on='품목기준코드',
)

In [11]:
# Select a subset of columns and give the two Korean-named columns English
# names ('주성분' -> ingredient, 'ATC코드' -> atc_cd).
merged_df = merged_df[[
    'itemname', 'entpname', 'efcyqesitm', 'usemethodqesitm', 'atpnqesitm',
    'itemseq', '주성분', 'ATC코드',
]].rename(columns={'주성분': 'ingredient', 'ATC코드': "atc_cd"})

# Keep only the rows that actually carry an ATC code.
merged_df = merged_df[merged_df['atc_cd'].notnull()]

In [12]:
def parse_atc(x):
    """Split an ATC code string into its components, padded to five entries.

    The string is split on runs of digits. The segment that follows the first
    digit run (the third segment) is broken into single letters, so
    ``"V03AB32"`` becomes ``['V', '03', 'A', 'B', '32']`` and ``"A11A"``
    becomes ``['A', '11', 'A', '<PAD>', '<PAD>']``.

    Args:
        x: ATC code string, e.g. ``"D11AF"``.

    Returns:
        A list of string components, right-padded with ``'<PAD>'`` up to
        length five. Inputs that produce more than five components are
        returned without padding or truncation.
    """
    # Raw string so that \d is a regex class, not an invalid string escape.
    segments = [seg for seg in re.split(r'(\d+)', x) if seg]

    components = []
    for position, seg in enumerate(segments):
        if position != 2:
            components.append(seg)
        elif len(seg) <= 2 and seg.isupper():
            # One or two upper-case letters: emit one component per letter.
            # The single-letter case used to be dropped, which erased the
            # third level of codes such as "A11A".
            components.extend(seg)
        # Any other third segment is skipped, matching the previous behavior.

    return components + ['<PAD>'] * (5 - len(components))

In [13]:
# Parse every ATC code into its list of components, then stack the per-row
# lists into one 2-D array (one row per code).
splitted_atc_cd = merged_df['atc_cd'].map(parse_atc)
code_matrix = np.vstack(splitted_atc_cd.tolist())

In [14]:
def label_generator(mat, depth=0):
    """Build integer labels from the first ``depth + 1`` columns of ``mat``.

    Each row's leading components are concatenated into one string, and each
    distinct string is assigned an integer id.

    Args:
        mat: 2-D array-like of string components, one row per item.
        depth: Index of the last column to include; ``0`` uses only the
            first column.

    Returns:
        A tuple ``(label, names)``. ``label`` is an integer array giving, for
        each row, the position of its concatenated prefix among the sorted
        unique prefixes (computed before ``'<PAD>'`` is stripped). ``names``
        is a string array of the concatenated prefixes with every ``'<PAD>'``
        marker removed.
    """
    # Operate on the matrix that was passed in, not on a module-level global.
    prefix_columns = np.asarray(mat)[:, :depth + 1]
    joined = [''.join(row) for row in prefix_columns]

    # np.unique sorts the distinct values; the inverse index maps each row to
    # the position of its value in that sorted list, which is the label.
    _, label = np.unique(np.array(joined, dtype=str), return_inverse=True)

    names = np.array([text.replace('<PAD>', '') for text in joined], dtype=str)
    return label, names

In [15]:
# depth=3 labels each row by its first four code components (columns 0-3).
# `l` receives the integer labels, `a` the joined code strings with '<PAD>'
# removed.
l, a = label_generator(code_matrix, 3)

In [17]:
# Display the stacked, '<PAD>'-padded component matrix built from the ATC codes.
code_matrix

array([['D', '11', 'A', 'F', '<PAD>'],
       ['A', '02', 'A', 'H', '<PAD>'],
       ['V', '03', 'A', 'B', '32'],
       ...,
       ['R', '05', '<PAD>', '<PAD>', '<PAD>'],
       ['A', '07', 'B', 'C', '05'],
       ['A', '11', 'A', 'B', '<PAD>']], dtype='<U5')