## 농촌진흥청 한식요리 재료에 대한 정보를 이용한 유사성
#### 데이터 정규화 추가 - z-score(normalize), min-max, L2, L1
1. 음식 코드와 식품 코드(재료)의 중량 정보의 pivot 생성
2. 데이터 이용 cosine similarity
3. 계산된 similarity를 이용한 유사 식품 추출

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import json
from lib.similarity_model import SimilarityModel

In [2]:
data_raw = pd.read_csv('./dataset/(한식요리정보)FD_음식_신규-수정.csv',encoding='cp949')
data = data_raw.copy()
data['식품명'].fillna('',inplace=True)
data.head(2)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['식품명'].fillna('',inplace=True)


Unnamed: 0,음식_코드,음식명,식품_코드,식품명,정렬_순서,식품_중량,등록자_고유_번호,등록_일시,수정자_고유_번호,수정_일시,중량_1,단위_1,중량_2,단위_2,중량_3,단위_3,메인_식품_여부
0,D230046,우메기떡(개성주악),F02876,"소금, 천일염, 가는소금",6,6.0,SYSTEM,2023-02-13,SYSTEM,2023-02-13,0.5,682032,0.0,0,0.0,0,0
1,D230046,우메기떡(개성주악),F02712,콩기름,7,400.0,SYSTEM,2023-02-13,SYSTEM,2023-02-13,2.0,682031,0.0,0,362.8667,682002,0


In [3]:
data['음식_코드'].unique()

array(['D230046', 'D022018', 'D285006', ..., 'D082023', 'D053038',
       'D031022'], shape=(3093,), dtype=object)

In [5]:
data_pivot = data.pivot(index='음식_코드',columns='식품_코드',values='식품_중량')
data_source = data_pivot.fillna(0.0)
data_source.iloc[:,1295:].head()

식품_코드,F03108,F03109,F03110,F03111,F03112,F03113,F03114,F03115,F03116,F03117,F03118,F03119,F03123,F03338,F03485,F03489
음식_코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
D011001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D011002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D011003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,210.0,0.0,0.0,0.0
D011004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D011005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 유사도 구하기 위한 함수

In [30]:

def fit_similarity(data:pd.DataFrame, type:str = 'z'):
    ## type : z-standardScaler, m-MinMaxScaler, l1, l2 
    from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
    features = data.index
    scaler = StandardScaler()
    if type == 'm':
        scaler = MinMaxScaler()
    target = scaler.fit_transform(data_source)

    if type == 'l1':
        target = normalize(data_source, 'l1')
    elif type == 'l2':
        target = normalize(data_source, 'l2')
    similarity = cosine_similarity(target,target)
    return similarity, features

def similarity_map(similarity,features):
    argsort = np.argsort(similarity, axis=1)[:,::-1]
    valuesort = np.sort(similarity, axis=1)[:,::-1]
    ## 음식 코드에 대한 유사도 기준값보다 큰 리스트들 정리(유사도 높은순)
    threadhold = -100 ## 전체값 다가져오기
    feature_similarity_map = {}
    for i,arg in enumerate(argsort):
        top = (valuesort[i] > threadhold).sum()
        top_idx = 3 if top < 3 else top
        arg_top = argsort[i,:top_idx]
        if features[i] in feature_similarity_map:
            print(features[i],arg_top[:1],feature_similarity_map[features[i]])
        feature_similarity_map[features[i]] = features[arg_top].tolist()
    return feature_similarity_map

### Z-Score, Mim-Max, L1, L2
- z : 평균 0, 표준편차 1로 정규화 / 이상치에 민감
- Min-Max : 값의 범위를 동일하게
- L1 : 희소한 데이터 처리
- L2 : 벡터간 유사도 비교

In [34]:
similarity, features = fit_similarity(data=data_source, type='z')
similarity_map_z = similarity_map(similarity, features)
similarity, features = fit_similarity(data=data_source, type='m')
similarity_map_m = similarity_map(similarity, features)
similarity, features = fit_similarity(data=data_source, type='l1')
similarity_map_l1 = similarity_map(similarity, features)
similarity, features = fit_similarity(data=data_source, type='l2')
similarity_map_l2 = similarity_map(similarity, features)

In [48]:
def map_df(code):
    df = pd.DataFrame()
    df['z'] = similarity_map_z[code]
    food_name = data[['음식_코드','음식명']]
    food_name.columns = ['z','z_name']
    df['z_name'] = df[['z']].merge(food_name,on='z',how='left')['z_name']
    df['m'] = similarity_map_m[code]
    food_name.columns = ['m','m_name']
    df['m_name'] = df[['m']].merge(food_name,on='m',how='left')['m_name']
    df['l1'] = similarity_map_l1[code]
    food_name.columns = ['l1','l1_name']
    df['l1_name'] = df[['l1']].merge(food_name,on='l1',how='left')['l1_name']
    df['l2'] = similarity_map_l2[code]
    food_name.columns = ['l2','l2_name']
    df['l2_name'] = df[['l2']].merge(food_name,on='l2',how='left')['l2_name']
    return df
features[:10]

Index(['D011001', 'D011002', 'D011003', 'D011004', 'D011005', 'D011006',
       'D011007', 'D012001', 'D012002', 'D012003'],
      dtype='object', name='음식_코드')

In [78]:
df = map_df('D311040')
df.head(20)

Unnamed: 0,z,z_name,m,m_name,l1,l1_name,l2,l2_name
0,D311040,알밥,D311040,알밥,D311040,알밥,D311040,알밥
1,D180003,고추장,D081054,장어구이,D081054,장어구이,D081054,장어구이
2,D171002,무절임,D081053,장어구이,D081053,장어구이,D081053,장어구이
3,D281007,전분액(진주냉면),D081052,장어구이,D081052,장어구이,D081052,장어구이
4,D301014,전분액(진주냉면),D081051,장어구이,D081051,장어구이,D081051,장어구이
5,D292006,청주,D081050,장어구이,D081050,장어구이,D081050,장어구이
6,D180001,배추,D081049,장어구이,D081049,장어구이,D081049,장어구이
7,D180126,간장(양조),D081048,장어구이,D081048,장어구이,D081048,장어구이
8,D180315,쇠고기 양념(파산적),D081047,장어구이,D081047,장어구이,D081047,장어구이
9,D180190,쇠고기 양념(파산적),D081046,장어구이,D081046,장어구이,D081046,장어구이
