In [21]:
!pip install pydantic langchain-teddynote langchain_community langchain_huggingface langchain_openai

from google.colab import output

output.clear()

In [22]:
import os

from google.colab import userdata

# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# os.environ["LANGCHAIN_PROJECT"] = "LangChain 실습"
# Langchain API key 입력
# os.environ["LANGCHAIN_API_KEY"] = userdata.get('langsmith_api_key')

In [23]:
import zipfile

# 데이터 처리 및 분석
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm
import seaborn as sns
from matplotlib import pyplot as plt
import datetime

from sklearn.metrics.pairwise import cosine_similarity

# 모델 저장
import joblib

# To ignore all warnings
import warnings
warnings.filterwarnings('ignore')



# 데이터 불러오기

In [24]:
bl_data_sampled=pd.read_csv('bl_sample_data.csv')

display(bl_data_sampled.head())

Unnamed: 0,HSCD,SUPLY_CO_NAME,PRDT_DC_VAL
0,330499,"MESOESTETIC,. S.L.",BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM)
1,330499,"MESOESTETIC,. S.L.",COSMELAN 2 (FACE CREAM)
2,330499,"MESOESTETIC,. S.L.",ANTI STRESS FACE MASK 100ML (FACE CREAM)
3,330499,KRYOLAN GMBH CHEMISCHE FABRIK,BLOT POWDER
4,330499,KRYOLAN GMBH CHEMISCHE FABRIK,DERMACOLOR CAMOUFLAGE FLUID


In [25]:
bk_data_sampled=pd.read_excel('BK_330499_수정.xlsx')

display(bk_data_sampled.head())

Unnamed: 0,NO,PID,CORPNAME,PRD_NAME,PRD,URL
0,1,3723376,"CELLVIO COSMETIC CO.,LTD.",Doldori Premium Gold Essence,Our patented mask pack redefines skincare with...,https://buykorea.org/ec/prd/selectGoodsDetail....
1,2,3722918,Brade,Exosome Exo-V Skin Boos,The EXO-V Exosome Skin Booster is used in Kore...,https://buykorea.org/ec/prd/selectGoodsDetail....
2,3,3722956,"BIOVAIKOREA Co.,LTD",Oil-Free Ultra-Moisturizing Lotion with Birch Sap,DETAILS\nBENEFITS\n\n\n• Soothing irritated sk...,https://buykorea.org/ec/prd/selectGoodsDetail....
3,4,3722734,Hit Company,CHARNE - Shiny Brightening Cream,Charne Shiny Brightening Cream\n\nVarious natu...,https://buykorea.org/ec/prd/selectGoodsDetail....
4,5,3717513,"Wooanjoo Co.,Ltd",DEAR OHNEUL Singreen Donkey Toner Pad,Product Features:\n\n－ Hypoallergenic: Dermato...,https://buykorea.org/ec/prd/selectGoodsDetail....


# Langsmith , LLM API KEY 설정

In [26]:
from langchain_teddynote import logging

# logging.langsmith("KOTRA_BL_data_matching")

In [27]:
from langchain_community.chat_models import ChatPerplexity
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate

from langchain_core.output_parsers import CommaSeparatedListOutputParser
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field


os.environ["OPENAI_API_KEY"] = userdata.get('KOTRA2_openai_api_key')


# API 사용 rate 제한 설정


In [28]:
from langchain_core.rate_limiters import InMemoryRateLimiter

rate_limiter_openai = InMemoryRateLimiter(
    requests_per_second=8,  # maximum query per day - 10000 (tier 1)
    check_every_n_seconds=0.1,  # Wake up every 100 ms to check whether allowed to make a request,
    max_bucket_size=10,  # Controls the maximum burst size.
    )

# 사용 모델 설정

In [29]:
chat_openai = ChatOpenAI(temperature=0,  model="gpt-4o-mini", rate_limiter=rate_limiter_openai)

# OpenAI API 이용 결과 (model : chatgpt 4o-mini)

# buyKOREA 설명 처리 프롬프트문

---------------------

+ 업무 배경 설명 (제품명, 상품 설명에 대한 데이터임을 인식)
+ HSCODE 4자리 설명을 추가함 (6자리보다 더 광범위한 개념 인식)
+ 제품명과 상품 정보에 기반한 제품 설명 생성
+ 지시 내용별 예시 제공 (one-shot)

In [77]:
keyword_num=5

class DescriptionSummary(BaseModel):
    type : str = Field(description="product type")
    keyword : list[str] = Field(description=f"{keyword_num} keywords only, which focus on product's main features")

parser= PydanticOutputParser(pydantic_object=DescriptionSummary)

In [78]:
summary_prompt_buykorea = PromptTemplate.from_template(
    """You are given data about product's description(PRDT_DESC) and product name(PRDT_NAME).
    All products fall under HS Code 330499, which is a subcategory of 3304.
    The description for 3304 is as follows:
    33.04 - Beauty or make-up preparations and preparations for the care of the skin (other than medicaments), including sunscreen or sun tan preparations; manicure or pedicure preparations.

    Using this information, complete the following tasks:
    Provide a simple explanation for PRDT_DESC using both PRDT_NAME and PRDT_DESC.
    Use publicly available product descriptions from websites to help.
    If the meaning is unclear, do not force an explanation.

    Example :
    PRDT_DESC : 'FIORESE Pure Hydra Water Cream contains Centella Asiatica and Hyaluronic Acid, effectively delivering hydration and providing instant soothing benefits.
    With Niacinamide and Adenosine, it is a dual-functional cosmetic for brightening and wrinkle care. This cream helps create healthy, smooth, and radiant skin.
    The lightweight, hydrating formula absorbs quickly, delivering moisture deep into the skin with a fresh, non-sticky finish. Suitable for all skin types, especially sensitive, dull, and uneven skin.

    For Sensitive Skin: Gentle, irritation-free formula, suitable even for sensitive skin.

    FIORESE Brand: Fiorese is dedicated to clean beauty, using ingredients sourced from nature. 100% cruelty-free.'

    Output Example : The FIORESE Pure Hydra Water Cream is a lightweight, hydrating cream containing Centella Asiatica and Hyaluronic Acid, providing deep moisture and soothing benefits.
    It also includes Niacinamide and Adenosine, making it a dual-functional cosmetic for brightening and wrinkle care.
    The formula absorbs quickly, leaving a fresh, non-sticky finish, and is suitable for all skin types, especially sensitive, dull, and uneven skin.
    The product is gentle and irritation-free, ideal for sensitive skin. Fiorese is committed to clean beauty with cruelty-free, nature-sourced ingredients.

    PRODUCT DESCRIPTION(PRDT_DESC) : {prdt_desc}
    PRODUCT NAME(PRDT_NAME) : {prdt_name}
    """
)


summary_chain_buykorea = summary_prompt_buykorea | chat_openai

In [91]:
buykorea_preprocessed_dict_openai={}

for index, row in bk_data_sampled.iterrows():
  response_buykorea = summary_chain_buykorea.invoke({"prdt_desc": row['PRD'], "prdt_name": row['PRD_NAME']})
  print(response_buykorea.content)
  buykorea_preprocessed_dict_openai[row['PRD_NAME']]=[response_buykorea.content]

The Doldori Premium Gold Essence is an innovative mask pack designed to enhance the skincare experience with its convenient and hygienic application method. This product stands out in the market due to its elegant packaging and practicality, making it a unique addition to any skincare routine. 

Available for export and sold in Korea at aesthetic clinics, department store pop-ups, and online shopping malls, the mask pack meets the high demand for effective and luxurious skincare solutions. The entire manufacturing process is handled in-house, ensuring quality control from the formulation of the essence to the development of the mask sheet. 

Enriched with high-quality ingredients, the essence hydrates, brightens, and revitalizes the skin, providing a refreshing and nourishing treatment. The mask sheet is designed for excellent adhesion, maximizing comfort and allowing for deep absorption of the essence. 

With a focus on detail and quality, the Doldori Premium Gold Essence delivers vis

In [92]:
extract_prompt_buykorea = PromptTemplate.from_template(
    """Extract product type and keywords from input, based on following rules:
    - Strictly exclude any information about capacity or volume.

    INPUT : {input}
    FORMAT :{{ "type": "<product type>", "keyword": ["<keyword1>", "<keyword2>", "<keyword3>", "<keyword4>", "<keyword5>"] }}
    """
)

extract_prompt_buykorea= extract_prompt_buykorea.partial(keyword_num=keyword_num)

extract_chain_buykorea = extract_prompt_buykorea | chat_openai | parser

In [93]:
for key, value in buykorea_preprocessed_dict_openai.items():
  response_buykorea = extract_chain_buykorea.invoke({"input": value})
  print(response_buykorea)
  buykorea_preprocessed_dict_openai[key].append(response_buykorea.type)
  buykorea_preprocessed_dict_openai[key].append(response_buykorea.keyword)

type='mask pack' keyword=['skincare', 'essence', 'hydration', 'brightening', 'luxurious']
type='skincare product' keyword=['Exosome Exo-V Skin Booster', 'plant-derived exosomes', 'growth factors', 'skin regeneration', 'anti-aging']
type='lotion' keyword=['oil-free', 'moisturizing', 'lightweight', 'soothing', 'skin-friendly']
type='Brightening Cream' keyword=['cosmetic', 'skin brightness', 'reduce wrinkles', 'natural ingredients', 'antioxidant effects']
type='Toner Pad' keyword=['hypoallergenic', 'skincare', 'dual-sided', 'deep cleansing', 'gentle hydration']
type='skincare product' keyword=['soothing gel', 'snail mucin', 'hydration', 'watermelon extract', 'antioxidants']
type='sunscreen' keyword=['Vegan', 'Herb', 'Non-Nano', 'UV protection', 'hypoallergenic']
type='Moisture Cream' keyword=['hydrating', 'Squalane', 'moisturizing', 'protecting', 'skincare']
type='Peeling Master Pad' keyword=['exfoliation', 'skin soothing', 'coriander extract', 'sensitive skin', 'biodegradable fabric']
ty

In [94]:
buykorea_preprocessed_dict_openai

{'Doldori Premium Gold Essence': ['The Doldori Premium Gold Essence is an innovative mask pack designed to enhance the skincare experience with its convenient and hygienic application method. This product stands out in the market due to its elegant packaging and practicality, making it a unique addition to any skincare routine. \n\nAvailable for export and sold in Korea at aesthetic clinics, department store pop-ups, and online shopping malls, the mask pack meets the high demand for effective and luxurious skincare solutions. The entire manufacturing process is handled in-house, ensuring quality control from the formulation of the essence to the development of the mask sheet. \n\nEnriched with high-quality ingredients, the essence hydrates, brightens, and revitalizes the skin, providing a refreshing and nourishing treatment. The mask sheet is designed for excellent adhesion, maximizing comfort and allowing for deep absorption of the essence. \n\nWith a focus on detail and quality, the 

In [95]:
buykorea_preprocessed_openai=pd.DataFrame.from_dict(buykorea_preprocessed_dict_openai, orient='index', columns=['description_summary', 'type', 'keyword'])
buykorea_preprocessed_openai.reset_index(inplace=True)
buykorea_preprocessed_openai.rename(columns={'index':'product_name'}, inplace=True)
buykorea_preprocessed_openai.to_csv('buykorea_preprocessed_openai.csv', index=False)

display(buykorea_preprocessed_openai.head())

Unnamed: 0,product_name,description_summary,type,keyword
0,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is an innovat...,mask pack,"[skincare, essence, hydration, brightening, lu..."
1,Exosome Exo-V Skin Boos,The Exosome Exo-V Skin Booster is a versatile ...,skincare product,"[Exosome Exo-V Skin Booster, plant-derived exo..."
2,Oil-Free Ultra-Moisturizing Lotion with Birch Sap,The Oil-Free Ultra-Moisturizing Lotion with Bi...,lotion,"[oil-free, moisturizing, lightweight, soothing..."
3,CHARNE - Shiny Brightening Cream,The CHARNE Shiny Brightening Cream is a dual-f...,Brightening Cream,"[cosmetic, skin brightness, reduce wrinkles, n..."
4,DEAR OHNEUL Singreen Donkey Toner Pad,The DEAR OHNEUL Singreen Donkey Toner Pad is a...,Toner Pad,"[hypoallergenic, skincare, dual-sided, deep cl..."


# BL 데이터 이름 처리 프롬프트문

----------------------------

+ 업무 배경 설명 (수입자, 공급자, 상품 설명이 존재함을 인식)
+ HSCODE 4자리 설명을 추가함 (6자리보다 더 광범위한 개념 인식)
+ 공급자와 상품 이름 정보에 기반한 제품 설명 생성
+ 인터넷 검색으로 추가 정보 검색 지시
+ 의미가 확실치 않은 경우 생성 제한
+ 지시 내용별 예시 제공 (one-shot)

In [37]:
generate_prompt_bldata = PromptTemplate.from_template(
    """You are given data where IMP_CO_NAME imports products (PRDT_DC_VAL) from SUPLY_CO_NAME.
    All products fall under HS Code 330499, which is a subcategory of 3304.
    The description for 3304 is as follows:
    33.04 - Beauty or make-up preparations and preparations for the care of the skin (other than medicaments), including sunscreen or sun tan preparations; manicure or pedicure preparations.

    Using this information, complete the following tasks:
    Provide a simple explanation for PRDT_DC_VAL using both SUPLY_CO_NAME and PRDT_DC_VAL.
    Use publicly available product descriptions from websites to help.
    If the meaning is unclear, do not force an explanation.

    Example:
    PRDT_DC_VAL: COSMELAN 2 (FACE CREAM)
    SUPLY_CO_NAME: MESOESTETIC, S.L.
    Output Example: This product is called COSMELAN 2, a face cream designed to reduce pigmentation and improve skin tone. It is often used for treating melasma and other skin discolorations by inhibiting melanin production.
    Extract up to {keyword_num} key keywords that describe the product's main features.
    Output Example: pigmentation, skin tone, melasma, melanin, face cream
    Identify the type of product.
    Output Example: face cream
    Extract only the product name from PRDT_DC_VAL, excluding any volume, symbols, special characters, or colors.
    Output Example: COSMELAN 2

    PRODUCT DESCRIPTION(PRDT_DC_VAL) : {prdt_desc}
    SUPPLEYER NAME(SUPLY_CO_NAME) : {suply_name}
    """
)

generate_prompt_bldata= generate_prompt_bldata.partial(keyword_num=keyword_num)

generate_chain_bldata = generate_prompt_bldata | chat_openai

In [38]:
bldata_preprocessed_dict_openai={}

for index, row in tqdm(bl_data_sampled.iterrows()):
  response_bldata = generate_chain_bldata.invoke({"prdt_desc": row['PRDT_DC_VAL'], "suply_name":row['SUPLY_CO_NAME']})
  if index%100==0:
    print(f"\nProduct Name : {row['PRDT_DC_VAL']} \n")
    print(response_bldata.content, '\n')
  bldata_preprocessed_dict_openai[row['PRDT_DC_VAL']]=[response_bldata.content]

1it [00:01,  1.53s/it]


Product Name : BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM) 

This product is called BODYSHOCK LOCAL REDUCER, a face cream designed to target localized fat and improve the appearance of the skin. It is often used in body contouring treatments to help reduce the appearance of cellulite and promote smoother skin texture.

**Keywords:** localized fat, cellulite, body contouring, skin texture, face cream

**Type of product:** face cream

**Product name:** BODYSHOCK LOCAL REDUCER 



101it [03:30,  2.17s/it]


Product Name : AVEENO BABY DERMEXA MOISTURING CREAM 206G(1 PAC X 12 PCS) 

This product is called AVEENO BABY DERMEXA MOISTURIZING CREAM, a moisturizing cream specifically formulated for babies to help relieve dry, itchy skin. It is designed to provide long-lasting hydration and is often used to soothe and protect sensitive skin, making it ideal for infants with eczema or other skin conditions.

**Key Features:**
1. moisturizing
2. dry skin relief
3. sensitive skin
4. eczema
5. baby care

**Type of Product:** moisturizing cream

**Product Name:** AVEENO BABY DERMEXA MOISTURIZING CREAM 



201it [06:42,  1.68s/it]


Product Name : TYA BRAND-5035 MAKEUP KIT-15g 

This product is called TYA BRAND-5035, a makeup kit that includes a variety of cosmetic products designed for enhancing beauty. The kit typically contains items such as foundation, eyeshadow, blush, and lip color, allowing users to create different makeup looks for various occasions.

**Key Features:**
1. makeup kit
2. variety of cosmetics
3. beauty enhancement
4. foundation
5. eyeshadow

**Type of Product:** makeup kit

**Product Name:** TYA BRAND-5035 



301it [09:54,  2.11s/it]


Product Name : FACIAL MASK (UNBRANDED COSMETIC BEAUTY PRODUCTS) 

This product is called a Facial Mask, which is a cosmetic treatment designed to improve the appearance and health of the skin. Facial masks can provide various benefits, such as hydration, cleansing, and nourishment, depending on their specific formulation. They are typically applied to the face for a certain period before being removed, allowing the active ingredients to penetrate the skin.

**Keywords:** hydration, cleansing, nourishment, cosmetic, skin

**Type of product:** facial mask

**Product name:** Facial Mask 



401it [12:54,  1.75s/it]


Product Name : BELLO YUGARD 30G (.15% RETINOL CREAM- 30 GRAM TUBES) 

This product is called BELLO YUGARD, a retinol cream designed to improve skin texture and reduce the appearance of fine lines and wrinkles. Retinol is a derivative of vitamin A that promotes cell turnover and helps to unclog pores, making it effective for anti-aging and acne treatment.

**Keywords:** retinol, skin texture, fine lines, wrinkles, anti-aging

**Type of product:** retinol cream

**Product name:** BELLO YUGARD 



501it [15:43,  1.59s/it]


Product Name : POLYAMIDE-8 - 80142 (COSMETIC RAW MATERIAL SAMPLE FOR TESTING PURPOSE ONLY) 

This product is called POLYAMIDE-8, which is a cosmetic raw material used primarily for testing purposes. It is often utilized in the formulation of various beauty and skincare products due to its properties that enhance texture and stability. As a raw material, it is not intended for direct consumer use but serves as an ingredient in the development of cosmetics.

**Key Keywords:** cosmetic, raw material, testing, formulation, texture

**Type of Product:** cosmetic raw material

**Product Name:** POLYAMIDE-8 



601it [18:32,  1.83s/it]


Product Name : MASKARA (UNBRANDED) 

This product is called MASKARA, which is a type of cosmetic used to enhance the eyelashes by making them appear longer, thicker, and darker. It is typically applied with a brush and is a staple in many makeup routines for achieving a more defined eye look.

**Key Features:**
1. enhances eyelashes
2. lengthening
3. thickening
4. darkening
5. cosmetic

**Type of Product:** mascara

**Product Name:** MASKARA 



701it [21:23,  1.33s/it]


Product Name : FE ALOVERA JUICE 100ML (RETURN TO ORIGIN) (EXP.VIDE S/BILL NO.9172066 & DT-24/12/19) 

This product is called FE ALOVERA JUICE, a natural juice derived from the aloe vera plant, known for its soothing and hydrating properties. It is often consumed for its health benefits, including aiding digestion, promoting skin health, and providing essential nutrients.

**Key Keywords:** aloe vera, juice, hydration, digestion, skin health

**Type of Product:** juice

**Product Name:** FE ALOVERA JUICE 



801it [24:24,  1.69s/it]


Product Name : CC CREAM (UNBRANDED COSMETIC PRODUCTS) RC NO.COS-471/COS-685/COS-686 5 

This product is called CC CREAM, an unbranded cosmetic product designed to provide coverage while also offering skincare benefits. CC creams typically help to even out skin tone, provide hydration, and may contain sun protection factors. They are often used as a lightweight alternative to foundation, suitable for daily wear.

**Keywords:** coverage, skin tone, hydration, sun protection, lightweight

**Type of product:** CC cream

**Product name:** CC CREAM 



901it [28:50,  1.57s/it]


Product Name : 2.14756 YURAQ (SAMPLE-NCV-R&D PURPOSE) (1 X 250 G) 

This product is called YURAQ, which is a sample intended for research and development purposes. It is packaged in a 250g container and is likely related to beauty or skincare preparations, given its classification under HS Code 330499. However, specific details about its intended use or benefits are not provided in the description.

**Key Keywords:** sample, research, development, skincare, 250g

**Type of Product:** sample for research and development

**Product Name:** YURAQ 



1001it [31:53,  1.78s/it]


Product Name : MIRACLE MASK DAMAGE REPAIR 200 ML (20 PCS) 

This product is called MIRACLE MASK DAMAGE REPAIR, a hair treatment designed to restore and repair damaged hair. It typically contains nourishing ingredients that help to strengthen hair, improve its texture, and enhance shine. This product is often used to revitalize hair that has been subjected to heat styling, chemical treatments, or environmental stressors.

**Keywords:** damage repair, hair treatment, nourishing, strengthen, shine

**Type of product:** hair treatment

**Product name:** MIRACLE MASK DAMAGE REPAIR 



1101it [34:47,  1.59s/it]


Product Name : 10001Rose (Rosa Damascena) Water l Organic BG-BIO-07 

This product is called Rose Water, derived from Rosa Damascena, which is known for its soothing and hydrating properties. It is often used in skincare routines for its ability to refresh the skin, reduce redness, and provide a natural fragrance. This organic rose water is suitable for various skin types and can be used as a toner or a facial mist.

**Key Features:**
1. soothing
2. hydrating
3. reduces redness
4. natural fragrance
5. organic

**Type of Product:** Rose Water

**Product Name:** Rose Water 



1201it [37:37,  1.74s/it]


Product Name : ILUMA INTENSE BLEACHING SERUM 1OZ(COSMATIC FOR SKIN CARE) 

This product is called ILUMA INTENSE BLEACHING SERUM, a cosmetic serum designed for skin care that aims to lighten and brighten the skin. It is often used to reduce the appearance of dark spots, hyperpigmentation, and uneven skin tone, providing a more radiant complexion.

**Key Features:**
1. skin lightening
2. hyperpigmentation
3. brightening
4. dark spots
5. serum

**Type of Product:** serum

**Product Name:** ILUMA INTENSE BLEACHING SERUM 



1301it [40:59,  1.59s/it]


Product Name : Cosmetics-Essence soft touch mousse make-up 04 matt ivory 

This product is called Cosmetics-Essence Soft Touch Mousse Make-up 04 Matt Ivory, a lightweight mousse foundation that provides a soft, matte finish for a natural look. It is designed to even out skin tone while offering a comfortable wear throughout the day, making it suitable for daily use.

**Keywords:** lightweight, matte finish, natural look, even skin tone, foundation

**Type of product:** foundation

**Product name:** Cosmetics-Essence Soft Touch Mousse Make-up 



1401it [44:02,  1.85s/it]


Product Name : SERUM - 909248 6 3 JARS/16 SACHET (COSMETIC RAW MATERIAL FOR TESTING PURPOSE ONLY) 

This product is referred to as "SERUM - 909248," which is a cosmetic raw material intended for testing purposes only. It is supplied by M/S. L'OREAL CHINA, a well-known company in the beauty and cosmetics industry. This serum may be used in various formulations to assess its effectiveness or compatibility in cosmetic products.

**Key Features:**
1. cosmetic raw material
2. testing purposes
3. serum
4. L'OREAL
5. 3 jars/16 sachets

**Type of Product:** cosmetic raw material

**Product Name:** SERUM - 909248 



1501it [47:27,  1.72s/it]


Product Name : CMC SKETCH EYELINER 0.6ML(COSMETIC PRODUCT)(RC/COS-001893) 

This product is called CMC SKETCH EYELINER, a cosmetic product designed for precise application to enhance the eyes. It typically features a fine tip for creating both thin and thick lines, allowing for versatile makeup looks. Eyeliners like this one are often used to define the eyes, making them appear larger and more expressive.

**Keywords:** eyeliner, precise application, cosmetic, fine tip, define eyes

**Type of product:** eyeliner

**Product name:** CMC SKETCH EYELINER 



1601it [50:43,  2.26s/it]


Product Name : ST. IVES AVOCADO&COCONUT OIL CRM ( PACKING 4 X3X45G) (MATERIAL CODE:67775077) (CDSCO REGN: RC/COS-001649) 1 

This product is called ST. IVES AVOCADO & COCONUT OIL CRM, a cream that combines the nourishing properties of avocado and coconut oil to hydrate and moisturize the skin. It is designed to provide a rich, creamy texture that helps to soften and smooth the skin, making it ideal for daily use.

**Key Features:**
1. Hydration
2. Nourishing
3. Moisturizing
4. Avocado oil
5. Coconut oil

**Type of Product:** Cream

**Product Name:** ST. IVES AVOCADO & COCONUT OIL 



1701it [54:16,  1.90s/it]


Product Name : FACE MASK 

This product is called FACE MASK, which is a skincare product designed to provide various benefits to the skin, such as hydration, cleansing, or treatment of specific skin concerns. Face masks can come in different forms, including sheet masks, clay masks, or cream masks, and are typically applied to the face for a certain period before being removed or rinsed off.

**Key Features:**
1. hydration
2. cleansing
3. treatment
4. skincare
5. rejuvenation

**Type of Product:** face mask

**Product Name:** FACE MASK 



1801it [57:26,  1.65s/it]


Product Name : LIGHT WONDER - 7 MEDIUM (EU) (FFDNX40X7R45) (COSMETICS) 40 ml 

This product is called LIGHT WONDER, a medium coverage foundation designed to provide a natural, radiant finish while enhancing the skin's appearance. It is formulated to give a lightweight feel and is suitable for daily wear, helping to even out skin tone and provide a healthy glow.

**Key Features:**
1. medium coverage
2. natural finish
3. lightweight
4. evens skin tone
5. radiant glow

**Type of Product:** foundation

**Product Name:** LIGHT WONDER 



1901it [1:00:29,  1.51s/it]


Product Name : CBRN50006 - VISBLWHITE MOIST CREAM 1.7 OZ/50ML [PERFUMERY / COSMETICS PRODUCTS] (BRAND: ELIZABETH ARDEN) 

This product is called VISBLWHITE MOIST CREAM, a moisturizing cream designed to enhance skin brightness and provide hydration. It is part of the Elizabeth Arden brand, known for its focus on skincare and beauty products. This cream is formulated to improve skin tone and texture, making it suitable for daily use in skincare routines.

**Keywords:** moisturizing, skin brightness, hydration, skincare, cream

**Type of product:** moisturizing cream

**Product name:** VISBLWHITE MOIST CREAM 



2001it [1:03:39,  2.89s/it]


Product Name : RV MICRO ESS FRE WTR 130ML AS (RV MICRO ESS FRE WTR 130ML AS) (130 GM EACH) 

This product is called RV MICRO ESS FRE WTR, a refreshing water essence designed to hydrate and revitalize the skin. It is often used as a part of a skincare routine to provide moisture and enhance the skin's overall appearance, making it feel fresh and rejuvenated.

**Key Features:**
1. hydration
2. revitalizing
3. refreshing
4. skincare
5. essence

**Type of Product:** essence

**Product Name:** RV MICRO ESS FRE WTR 



2101it [1:06:46,  1.88s/it]


Product Name : COSMETICS DR.JART ALL THANKS TO CERA SET-SEPHORA-EU-BZ SET0360O1 

This product is called "COSMETICS DR.JART ALL THANKS TO CERA SET," which is a skincare set that includes various products designed to hydrate and nourish the skin. The set typically features products that contain ceramides, which are known for their ability to strengthen the skin barrier and retain moisture, making it suitable for dry or sensitive skin types.

**Key Features:**
1. hydration
2. nourishment
3. ceramides
4. skin barrier
5. sensitive skin

**Type of Product:** skincare set

**Product Name:** COSMETICS DR.JART ALL THANKS TO CERA SET 



2201it [1:10:17,  1.69s/it]


Product Name : COSMETICS -LHS BASILLIME 500ML 

This product is called BASILLIME, a cosmetic product designed for skin care. It typically serves as a hydrating or nourishing treatment, often used to enhance the skin's appearance and texture. The 500ml size suggests it is suitable for regular use, making it a practical choice for both personal and professional settings.

**Key Features:**
1. hydrating
2. nourishing
3. skin care
4. cosmetic
5. 500ml

**Type of Product:** cosmetic

**Product Name:** BASILLIME 



2301it [1:13:39,  1.64s/it]


Product Name : COSMETICS- COLLAGEN DRY SKIN MASK 250ML 

This product is called the COLLAGEN DRY SKIN MASK, a cosmetic treatment designed to hydrate and rejuvenate dry skin. It typically contains collagen, which helps to improve skin elasticity and moisture levels, making it ideal for individuals looking to restore their skin's suppleness and overall appearance.

**Key Features:**
1. hydration
2. rejuvenation
3. dry skin
4. collagen
5. elasticity

**Type of Product:** skin mask

**Product Name:** COLLAGEN DRY SKIN MASK 



2401it [1:16:49,  1.73s/it]


Product Name : CHINESE BRAND NYN MAKEUP KIT 80114-37.4 GM 

This product is called the CHINESE BRAND NYN MAKEUP KIT, which includes a variety of makeup items designed for enhancing beauty. The kit typically contains essential makeup products such as foundation, eyeshadow, blush, and lip color, making it a versatile choice for both everyday use and special occasions.

**Key Features:**
1. makeup kit
2. versatile
3. beauty enhancement
4. essential products
5. everyday use

**Type of Product:** makeup kit

**Product Name:** CHINESE BRAND NYN MAKEUP KIT 



2501it [1:20:03,  1.58s/it]


Product Name : 7771SY1210 EIGHT HOUR LIP PROTECTANT TIN (COSMETICS) 

This product is called EIGHT HOUR LIP PROTECTANT TIN, a cosmetic designed to provide long-lasting moisture and protection for the lips. It is known for its soothing properties and is often used to prevent chapping and dryness, making it a staple in many beauty routines.

**Keywords:** moisture, protection, soothing, chapping, cosmetics  
**Type of product:** lip protectant  
**Product name:** EIGHT HOUR LIP PROTECTANT TIN 



2601it [1:22:55,  1.71s/it]


Product Name : DARK CIRCLE CREAM (BULK PACKING) (K.I.C.E.P.A CERTIFICATE NO. K001-20-0064786 DT.22.01.2020 

This product is called DARK CIRCLE CREAM, which is designed to reduce the appearance of dark circles under the eyes. It typically contains ingredients that hydrate the skin, improve circulation, and brighten the under-eye area, helping to diminish the signs of fatigue and aging.

**Key Features:**
1. dark circles
2. hydration
3. brightening
4. under-eye area
5. anti-aging

**Type of Product:** eye cream

**Product Name:** DARK CIRCLE CREAM 



2701it [1:26:04,  1.67s/it]


Product Name : COSMETIC PRODUCTS: SKIN CARE PRODUCT ( SAKURA WHITE BOOSTER ) #ZZ936210202 ( N.C.V. SAMPLE FOR R & D PURPOSE ONLY) 

This product is called SAKURA WHITE BOOSTER, a skin care product designed to enhance skin brightness and promote a more radiant complexion. It is often formulated with ingredients that help to improve skin tone and texture, providing a boost to overall skin health.

**Keywords:** brightness, skin tone, radiant, skin health, booster

**Type of product:** skin care product

**Product name:** SAKURA WHITE BOOSTER 



2801it [1:29:06,  1.54s/it]


Product Name : SWISS IMAGE BRAND - SI EN SOOTH.CLEAN.MILK 200ML 

This product is called SWISS IMAGE BRAND - SI EN SOOTH.CLEAN.MILK, a soothing cleansing milk designed to gently remove impurities and makeup from the skin while maintaining its natural moisture balance. It is often formulated with nourishing ingredients to provide a calming effect, making it suitable for sensitive skin types.

**Key Features:**
1. soothing
2. cleansing
3. moisturizing
4. gentle
5. sensitive skin

**Type of Product:** cleansing milk

**Product Name:** SWISS IMAGE BRAND - SI EN SOOTH.CLEAN.MILK 



2901it [1:32:08,  2.15s/it]


Product Name : FACE CARE PRODUCTS OTHER THAN FACE MASK - NS00903 NASHI ARGAN - THE BALM - MOISTURIZING AFTER SHAVE, 100 ML 

This product is called NASHI ARGAN - THE BALM, a moisturizing after shave designed to hydrate and soothe the skin after shaving. It contains argan oil, which is known for its nourishing properties, making it ideal for post-shave care to prevent irritation and dryness.

**Key Features:**
1. moisturizing
2. after shave
3. argan oil
4. soothing
5. hydrating

**Type of Product:** after shave balm

**Product Name:** NASHI ARGAN - THE BALM 



2932it [1:33:12,  1.91s/it]


In [39]:
joblib.dump(bldata_preprocessed_dict_openai, 'bldata_preprocessed_dict_openai.pkl')

['bldata_preprocessed_dict_openai_10.pkl']

In [40]:
bldata_preprocessed_dict_openai_ext=joblib.load('bldata_preprocessed_dict_openai.pkl')

In [41]:
parser.get_format_instructions()

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"type": {"description": "product type", "title": "Type", "type": "string"}, "keyword": {"description": "5 keywords which focus on product\'s main features", "items": {"type": "string"}, "title": "Keyword", "type": "array"}}, "required": ["type", "keyword"]}\n```'

In [96]:
extract_prompt_bldata = PromptTemplate.from_template(
    """Extract product type and keywords from input, based on following rules:
    - Strictly exclude any information about capacity or volume.

    INPUT : {input}
    FORMAT :{{ "type": "<product type>", "keyword": ["<keyword1>", "<keyword2>", "<keyword3>", "<keyword4>", "<keyword5>"] }}
    """
)

extract_prompt_bldata= extract_prompt_bldata.partial(keyword_num=keyword_num)

extract_chain_bldata = extract_prompt_bldata | chat_openai | parser

In [97]:
for i, (key, value) in tqdm(enumerate(bldata_preprocessed_dict_openai_ext.items())):
  response_bldata = extract_chain_bldata.invoke({"input": value[0]})
  if i%100==0:
    print(response_bldata)
  bldata_preprocessed_dict_openai_ext[key].append(response_bldata.type)
  bldata_preprocessed_dict_openai_ext[key].append(response_bldata.keyword)

1it [00:00,  1.24it/s]

type='face cream' keyword=['localized fat', 'cellulite', 'body contouring', 'skin texture', 'face cream']


101it [01:54,  1.21it/s]

type='highlighter palette' keyword=['luminous glow', 'complexion enhancement', 'multiple shades', 'highlighting', 'sculpted appearance']


201it [03:12,  1.03it/s]

type='after waxing lotion' keyword=['menthol', 'soothing', 'cooling', 'cosmetics', 'post-waxing']


301it [04:43,  1.08it/s]

type='makeup palette' keyword=['versatile shades', 'high-definition', 'blendability', 'beauty enhancement', 'various shades']


401it [06:07,  1.20it/s]

type='Foaming cleanser' keyword=['Manuka honey', 'foaming cleanser', 'hydration', 'radiant glow', 'gentle formula']


501it [07:43,  1.25it/s]

type='cream' keyword=['matte finish', 'shine control', 'smooth appearance', 'lightweight', 'pores']


601it [09:02,  1.10it/s]

type='body lotion' keyword=['brightening', 'hydration', 'skin texture', 'glutathione', 'vitamin C']


701it [10:32,  1.22it/s]

type='hydrating emulsion' keyword=['hydrating', 'sensitive skin', 'instant comfort', 'emulsion gel', 'lightweight']


801it [12:08,  1.05it/s]

type='sheen spray' keyword=['shine', 'moisture', 'braided hairstyles', 'nourishing', 'cosmetic']


901it [13:29,  1.29it/s]

type='skincare pack' keyword=['Coenzyme Q10', 'antioxidant properties', 'skin elasticity', 'hydration', 'nourishing']


1001it [14:46,  1.39it/s]

type='highlighter and contour' keyword=['highlighter', 'contour', 'facial features', 'sculpted', 'cosmetics']


1101it [16:07,  1.14it/s]

type='baby massage oil' keyword=['hydration', 'nourishment', 'gentle', 'lightweight', 'baby massage']


1201it [17:31,  1.22it/s]

type='facial oil' keyword=['facial oil', 'hydration', 'nourishment', 'skin texture', 'capsules']


1301it [18:56,  1.31it/s]

type='foundation' keyword=['oil control', 'SPF 20', 'smooth complexion', 'sun protection', 'enhance skin appearance']


1401it [20:41,  1.16it/s]

type='BB cream' keyword=['moisturizing', 'sun protection', 'skin tone evening', 'beautifying', 'lightweight coverage']


1501it [22:27,  2.89s/it]

type='Cleansing astringent' keyword=['whitening', 'cleansing', 'astringent', 'rose extract', 'pearl extract']


1601it [23:41,  1.42it/s]

type='shampoo' keyword=['cleansing', 'hair care', 'scalp care', 'dirt removal', 'oil removal']


1701it [25:12,  1.15it/s]

type='raw material for cosmetics' keyword=['acne treatment', 'raw material', 'skincare formulation', 'reduces acne lesions', 'prevents breakouts']


1801it [26:34,  1.12it/s]

type='makeup remover' keyword=['makeup remover', 'cleanse', 'impurities', 'skincare', 'GLAMGALS']


1901it [27:57,  1.23it/s]

type='hair gel' keyword=['firm hold', 'hair texture', 'long-lasting', 'styling', 'natural look']


2001it [29:21,  1.17it/s]

type='eye liner' keyword=['definition', 'color', 'makeup looks', 'shades']


2101it [30:45,  1.22it/s]

type='lipstick' keyword=['matte finish', 'long-lasting', 'vibrant color', 'smooth application', 'lip enhancement']


2201it [32:03,  1.31it/s]

type='exfoliator' keyword=['exfoliator', 'skincare', 'dead skin cells', 'skin renewal', 'texture']


2301it [33:44,  1.22it/s]

type='perfume wax' keyword=['fragrance', 'scented', 'wax melts', 'aroma', 'testing']


2401it [35:14,  1.47it/s]

type='cosmetic' keyword=['moisture', 'nourishment', 'hydration', 'skin texture', 'healthy appearance']


2501it [37:01,  1.36it/s]

type='concealer' keyword=['full coverage', 'blemishes', 'dark circles', 'creamy texture', 'long-lasting']


2601it [38:25,  1.21it/s]

type='cosmetics raw materials' keyword=['cosmetics', 'raw materials', 'formulation', 'emulsifiers', 'stabilizers']


2701it [44:07,  1.36it/s]

type='cosmetics' keyword=['natural', 'skin care', 'sun protection', 'cooling', 'beauty enhancer']


2801it [45:31,  1.03it/s]

type='moisturizer' keyword=['natural moisturizer', 'nourishing', 'hydrating', 'soothing', 'skin care']





In [100]:
joblib.dump(bldata_preprocessed_dict_openai_ext, 'bldata_preprocessed_dict_openai_ext.pkl')

['bldata_preprocessed_dict_openai_ext.pkl']

In [102]:
bldata_preprocessed_openai=pd.DataFrame.from_dict(bldata_preprocessed_dict_openai_ext, orient='index', columns=['product_description', 'type', 'keyword'])
bldata_preprocessed_openai.reset_index(inplace=True)
bldata_preprocessed_openai.rename(columns={'index':'product_name'}, inplace=True)
bldata_preprocessed_openai.to_csv('bldata_preprocessed_openai.csv', index=False)

display(bldata_preprocessed_openai.head())

Unnamed: 0,product_name,product_description,type,keyword,type2,keyword2
0,BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM),This product is called BODYSHOCK LOCAL REDUCER...,face cream,"[localized fat, cellulite, body contouring, sk...",face cream,"[localized fat, cellulite, body contouring, sk..."
1,COSMELAN 2 (FACE CREAM),"This product is called COSMELAN 2, a face crea...",face cream,"[pigmentation, skin tone, melasma, melanin]",face cream,"[pigmentation, skin tone, melasma, melanin, fa..."
2,ANTI STRESS FACE MASK 100ML (FACE CREAM),"This product is called ANTI STRESS FACE MASK, ...",face mask,"[hydration, soothing, stressed skin, redness, ...",face mask,"[hydration, soothing, stressed skin, redness, ..."
3,BLOT POWDER,"This product is called BLOT POWDER, a setting ...",setting powder,"[shine control, oil absorption, matte finish, ...",setting powder,"[shine control, oil absorption, matte finish, ..."
4,DERMACOLOR CAMOUFLAGE FLUID,This product is called DERMACOLOR CAMOUFLAGE F...,camouflage fluid,"[high coverage, skin imperfections, long-lasti...",camouflage fluid,"[high coverage, skin imperfections, long-lasti..."


In [130]:
bldata_preprocessed_openai[bldata_preprocessed_openai.columns[:-2]]

Unnamed: 0,product_name,product_description,type,keyword
0,BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM),This product is called BODYSHOCK LOCAL REDUCER...,face cream,"['localized fat', 'cellulite', 'body contourin..."
1,COSMELAN 2 (FACE CREAM),"This product is called COSMELAN 2, a face crea...",face cream,"['pigmentation', 'skin tone', 'melasma', 'mela..."
2,ANTI STRESS FACE MASK 100ML (FACE CREAM),"This product is called ANTI STRESS FACE MASK, ...",face mask,"['hydration', 'soothing', 'stressed skin', 're..."
3,BLOT POWDER,"This product is called BLOT POWDER, a setting ...",setting powder,"['shine control', 'oil absorption', 'matte fin..."
4,DERMACOLOR CAMOUFLAGE FLUID,This product is called DERMACOLOR CAMOUFLAGE F...,camouflage fluid,"['high coverage', 'skin imperfections', 'long-..."
...,...,...,...,...
2796,P-GINKGO LEAVES EXTRACT (COSMETIC PRODUCTS FOR...,This product is called P-GINKGO LEAVES EXTRACT...,cosmetic ingredient,"['antioxidant', 'skin health', 'circulation', ..."
2797,BAMBOO CHARCOAL BLOTTING TISSUE GRAY 160 PCS (...,This product is called BAMBOO CHARCOAL BLOTTIN...,Blotting tissue,"['oil absorption', 'shine control', 'bamboo ch..."
2798,FINISHED PRODUCTS- KITS LYSOFIX (NCV),"This product is called LYSOFIX, which is a kit...",skincare kit,"['skincare', 'hydration', 'rejuvenation', 'kit..."
2799,FINISHED PRODUCTS-COLOR ETHICS (NCV),This product is called FINISHED PRODUCTS-COLOR...,cosmetic products,"['ethical sourcing', 'cosmetic products', 'sus..."


In [131]:
buykorea_preprocessed_openai[buykorea_preprocessed_openai.columns[:-2]]

Unnamed: 0,product_name,description_summary,type,keyword
0,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is an innovat...,mask pack,"['skincare', 'essence', 'hydration', 'brighten..."
1,Exosome Exo-V Skin Boos,The Exosome Exo-V Skin Booster is a versatile ...,skincare product,"['Exosome Exo-V Skin Booster', 'plant-derived ..."
2,Oil-Free Ultra-Moisturizing Lotion with Birch Sap,The Oil-Free Ultra-Moisturizing Lotion with Bi...,lotion,"['oil-free', 'moisturizing', 'lightweight', 's..."
3,CHARNE - Shiny Brightening Cream,The CHARNE Shiny Brightening Cream is a dual-f...,Brightening Cream,"['cosmetic', 'skin brightness', 'reduce wrinkl..."
4,DEAR OHNEUL Singreen Donkey Toner Pad,The DEAR OHNEUL Singreen Donkey Toner Pad is a...,Toner Pad,"['hypoallergenic', 'skincare', 'dual-sided', '..."
5,PAXMOLY Jeju Aloe Vera 100 Soothing Gel,The PAXMOLY Jeju Aloe Vera 100 Soothing Gel is...,skincare product,"['soothing gel', 'snail mucin', 'hydration', '..."
6,CHOROK CHORONG Vegan Herb Non-Nano Jeju Pepper...,The CHOROK CHORONG Vegan Herb Non-Nano Jeju Pe...,sunscreen,"['Vegan', 'Herb', 'Non-Nano', 'UV protection',..."
7,vegreen Daily Moisture Cream,The Vegreen Daily Moisture Cream is a hydratin...,Moisture Cream,"['hydrating', 'Squalane', 'moisturizing', 'pro..."
8,Lulugine Elite Skin-Coat 3.0 Peeling Master Pad,The Lulugine Elite Skin-Coat 3.0 Peeling Maste...,Peeling Master Pad,"['exfoliation', 'skin soothing', 'coriander ex..."
9,t:aim Barrier Sunblock 40ml,The t:aim Barrier Sunblock is a high-protectio...,sunscreen,"['high-protection', 'SPF 50+', 'PA++++', 'orga..."


# 수출한 기업은 다르지만, 수출한 품목의 이름은 같은 경우가 있어 불일치 발생
(중복 제거됨)

In [45]:
# 같은 품목 중 다른 기업이 수출한 이력이 있는 경우, 다른 수출건으로 집계됨

bl_data_sampled.shape

(2932, 3)

In [46]:
# 제품 이름에 기반해, 제품 수입 확률이 높은 해외 바이어 매칭이므로
# 중복되는 제품 이름은 하나의 키워드 및 임베딩으로 통일

len(bldata_preprocessed_dict_openai)

2801

# 임베딩

In [47]:
from langchain_huggingface import HuggingFaceEmbeddings

# 오픈소스 임베딩 모델 다운로드
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [48]:
def text_embedding(text):
  embedding_text=embeddings.embed_query(text)
  return np.array(embedding_text).reshape(1,-1)

#  1. 키워드 10개로 이루어진 문장 임베딩

In [49]:
tqdm.pandas()

# 임베딩할 텍스트

buykorea_preprocessed_openai=pd.read_csv('./buykorea_preprocessed_openai_10.csv')
bldata_preprocessed_openai=pd.read_csv('./bldata_preprocessed_openai_10.csv')

buykorea_preprocessed_openai['buykorea_preprocessed']=buykorea_preprocessed_openai['keyword']
bldata_preprocessed_openai['bldata_preprocessed']=bldata_preprocessed_openai['keyword']
# 키워드로 구성된 문장을 임베딩
buykorea_preprocessed_openai['buykorea_embedding']=buykorea_preprocessed_openai['buykorea_preprocessed'].progress_apply(text_embedding)
bldata_preprocessed_openai['bldata_embedding']=bldata_preprocessed_openai['bldata_preprocessed'].progress_apply(text_embedding)

bldata_sample_len=bldata_preprocessed_openai.shape[0]

100%|██████████| 10/10 [00:02<00:00,  4.89it/s]
100%|██████████| 2801/2801 [06:55<00:00,  6.75it/s]


In [50]:
openai_similarity_df=pd.DataFrame(columns=['buykorea_product_name', 'buykorea_keyword', 'bldata_product_name', 'bldata_keyword', 'cosine_similarity'])

for index, row in buykorea_preprocessed_openai.iterrows():

  # 각 buykorea 상품 (10개) 에,
  # 2800개의 수출 기록 존재하는 제품 이름과의 유사도 계산
  # buykorea 상품별로 2800개의 유사도가 계산되어 저장된 데이터프레임 생성

  similarity_df=pd.DataFrame({'buykorea_product_name':[row['product_name']]*bldata_sample_len,
                               'buykorea_keyword': [row['buykorea_preprocessed']]*bldata_sample_len,
                               'bldata_product_name' : ["None"]*bldata_sample_len,
                               'bldata_keyword' : ["None"]*bldata_sample_len,
                               'cosine_similarity' : [0]*bldata_sample_len})

  buykorea_embedding=row['buykorea_embedding']
  row_count=0

  for index, row_bl in tqdm(bldata_preprocessed_openai.iterrows()):

    bldata_embedding=row_bl['bldata_embedding']

    similarity=cosine_similarity(buykorea_embedding, bldata_embedding)
    similarity_df.loc[row_count, 'bldata_product_name']=row_bl['product_name']
    similarity_df.loc[row_count, 'bldata_keyword']=row_bl['bldata_preprocessed']
    similarity_df.loc[row_count, 'cosine_similarity']=similarity

    row_count+=1

  # buykorea 제품 유사도 데이터프레임을 하나의 데이터프레임으로 병합
  openai_similarity_df=pd.concat([openai_similarity_df, similarity_df], axis=0)

display(openai_similarity_df.head())

2801it [00:05, 538.45it/s]
2801it [00:04, 653.41it/s]
2801it [00:05, 529.51it/s]
2801it [00:04, 643.66it/s]
2801it [00:04, 653.45it/s]
2801it [00:05, 526.91it/s]
2801it [00:04, 659.82it/s]
2801it [00:04, 657.18it/s]
2801it [00:05, 520.17it/s]
2801it [00:04, 661.71it/s]


Unnamed: 0,buykorea_product_name,buykorea_keyword,bldata_product_name,bldata_keyword,cosine_similarity
0,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydration', 'brighten...",BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM),"['localized fat', 'cellulite', 'body contourin...",0.562266
1,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydration', 'brighten...",COSMELAN 2 (FACE CREAM),"['pigmentation', 'skin tone', 'melasma', 'mela...",0.574345
2,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydration', 'brighten...",ANTI STRESS FACE MASK 100ML (FACE CREAM),"['hydration', 'soothing', 'stressed skin', 're...",0.569448
3,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydration', 'brighten...",BLOT POWDER,"['shine control', 'oil absorption', 'matte fin...",0.682449
4,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydration', 'brighten...",DERMACOLOR CAMOUFLAGE FLUID,"['high coverage', 'skin imperfections', 'long-...",0.423549


In [51]:
openai_similarity_df_sorted= openai_similarity_df.groupby(['buykorea_product_name'], group_keys=False, sort=False).apply(lambda x: x.sort_values('cosine_similarity', ascending=False))
openai_similarity_df_sorted.reset_index(drop=True, inplace=True)

In [52]:
# 각 buykorea_product_name 별로 cosine_similarity가 높은 10개, 낮은 10개를 'Highest'로, 나머지는 'Lowest'으로 표시하는 함수
def mark_top_bottom_10(df):
    # 상위 10개와 하위 10개는 'Highest', 나머지는 'Lowest'
    df['test_row'] = ['Highest'] * 10 + ['N'] * (len(df) - 20) + ['Lowest'] * 10
    return df

# 각 buykorea_product_name별로 그룹화하여 처리
# Apply the function and then reset the index to align with the original DataFrame
openai_similarity_df_sorted['test_row'] = openai_similarity_df_sorted.groupby('buykorea_product_name', group_keys=False).apply(mark_top_bottom_10).reset_index(drop=True)['test_row']


# 결과 확인
display(openai_similarity_df_sorted)

openai_similarity_df_sorted.to_csv('openai_similarity_df_sorted_v1.csv', index=False)

Unnamed: 0,buykorea_product_name,buykorea_keyword,bldata_product_name,bldata_keyword,cosine_similarity,test_row
0,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydration', 'brighten...",23445590 BODY LOTION (COSMETIC) / (NOT MEANT F...,"['moisturizing', 'nourishing', 'cosmetic', 'hy...",0.912510,Highest
1,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydration', 'brighten...",LESENTIA (MOISTURIZING CRAM FOR SKIN) (FOR PHO...,"['moisturizing', 'hydrate', 'nourish', 'skin t...",0.912250,Highest
2,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydration', 'brighten...",DERMABRITE CONCENTRATE-BATCH NO.IA20267 REG. N...,"['brightness', 'clarity', 'skincare', 'radiant...",0.907972,Highest
3,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydration', 'brighten...",TESTER-SCENTIO MILK PLUS WHITE AND BRIGHT SERU...,"['brightness', 'radiant', 'moisturizing', 'ski...",0.906547,Highest
4,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydration', 'brighten...",COSMETICS PRODUCT CODE 34400142 YEHWADAM PURE ...,"['brightening', 'skincare', 'hydration', 'nour...",0.904458,Highest
...,...,...,...,...,...,...
28005,t:aim Barrier Sunblock 40ml,"['high-protection', 'SPF 50+', 'PA++++', 'afte...",PERFUME SAMPLE ( F.O.C. FOR R & D PURPOSE ONLY),"['perfume', 'sample', 'research', 'development...",0.330480,Lowest
28006,t:aim Barrier Sunblock 40ml,"['high-protection', 'SPF 50+', 'PA++++', 'afte...",TERAPOL POLISHING PASTE 0.35KG(EQUIPMENT AND M...,"['polishing', 'paste', 'educational', 'surface...",0.326224,Lowest
28007,t:aim Barrier Sunblock 40ml,"['high-protection', 'SPF 50+', 'PA++++', 'afte...",DEPUFF & DEFINE EYE RESCUE (NOT FOR SALE CAPTI...,"['depuffing', 'defining', 'eye area', 'soothin...",0.323788,Lowest
28008,t:aim Barrier Sunblock 40ml,"['high-protection', 'SPF 50+', 'PA++++', 'afte...",65142967 LA FEMME PRADA WATER SPLASH 150ML(COS...,"['refreshing', 'elegance', 'femininity', 'upli...",0.298735,Lowest


# 2. 제품 유형만 이용해 유사도 순 나열

In [105]:
tqdm.pandas()

# 임베딩할 텍스트

buykorea_preprocessed_openai=pd.read_csv('./buykorea_preprocessed_openai.csv')
bldata_preprocessed_openai=pd.read_csv('./bldata_preprocessed_openai.csv')

buykorea_preprocessed_openai['buykorea_preprocessed']=buykorea_preprocessed_openai['type']
bldata_preprocessed_openai['bldata_preprocessed']=bldata_preprocessed_openai['type']
# 키워드로 구성된 문장을 임베딩
buykorea_preprocessed_openai['buykorea_embedding']=buykorea_preprocessed_openai['buykorea_preprocessed'].progress_apply(text_embedding)
bldata_preprocessed_openai['bldata_embedding']=bldata_preprocessed_openai['bldata_preprocessed'].progress_apply(text_embedding)

bldata_sample_len=bldata_preprocessed_openai.shape[0]

100%|██████████| 10/10 [00:01<00:00,  6.79it/s]
100%|██████████| 2801/2801 [04:02<00:00, 11.53it/s]


In [106]:
openai_similarity_df=pd.DataFrame(columns=['buykorea_product_name', 'buykorea_type', 'bldata_product_name', 'bldata_type', 'cosine_similarity'])

for index, row in buykorea_preprocessed_openai.iterrows():

  # 각 buykorea 상품 (10개) 에,
  # 2800개의 수출 기록 존재하는 제품 이름과의 유사도 계산
  # buykorea 상품별로 2800개의 유사도가 계산되어 저장된 데이터프레임 생성

  similarity_df=pd.DataFrame({'buykorea_product_name':[row['product_name']]*bldata_sample_len,
                               'buykorea_type': [row['buykorea_preprocessed']]*bldata_sample_len,
                               'bldata_product_name' : ["None"]*bldata_sample_len,
                               'bldata_type' : ["None"]*bldata_sample_len,
                               'cosine_similarity' : [0]*bldata_sample_len})

  buykorea_embedding=row['buykorea_embedding']
  row_count=0

  for index, row_bl in tqdm(bldata_preprocessed_openai.iterrows()):

    bldata_embedding=row_bl['bldata_embedding']

    similarity=cosine_similarity(buykorea_embedding, bldata_embedding)
    similarity_df.loc[row_count, 'bldata_product_name']=row_bl['product_name']
    similarity_df.loc[row_count, 'bldata_type']=row_bl['bldata_preprocessed']
    similarity_df.loc[row_count, 'cosine_similarity']=similarity

    row_count+=1

  # buykorea 제품 유사도 데이터프레임을 하나의 데이터프레임으로 병합
  openai_similarity_df=pd.concat([openai_similarity_df, similarity_df], axis=0)

display(openai_similarity_df.head())


2801it [00:05, 540.57it/s]
2801it [00:04, 565.62it/s]
2801it [00:05, 478.43it/s]
2801it [00:05, 508.12it/s]
2801it [00:04, 629.17it/s]
2801it [00:04, 610.57it/s]
2801it [00:05, 509.07it/s]
2801it [00:04, 617.96it/s]
2801it [00:04, 575.47it/s]
2801it [00:05, 516.97it/s]


Unnamed: 0,buykorea_product_name,buykorea_type,bldata_product_name,bldata_type,cosine_similarity
0,Doldori Premium Gold Essence,mask pack,BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM),face cream,0.443385
1,Doldori Premium Gold Essence,mask pack,COSMELAN 2 (FACE CREAM),face cream,0.443385
2,Doldori Premium Gold Essence,mask pack,ANTI STRESS FACE MASK 100ML (FACE CREAM),face mask,0.761262
3,Doldori Premium Gold Essence,mask pack,BLOT POWDER,setting powder,0.142408
4,Doldori Premium Gold Essence,mask pack,DERMACOLOR CAMOUFLAGE FLUID,camouflage fluid,0.390623


In [107]:
openai_similarity_df_sorted= openai_similarity_df.groupby(['buykorea_product_name'], group_keys=False, sort=False).apply(lambda x: x.sort_values('cosine_similarity', ascending=False))
openai_similarity_df_sorted.reset_index(drop=True, inplace=True)

# 각 buykorea_product_name 별로 cosine_similarity가 높은 10개, 낮은 10개를 'Highest'로, 나머지는 'Lowest'으로 표시하는 함수
def mark_top_bottom_10(df):
    # 상위 10개와 하위 10개는 'Highest', 나머지는 'Lowest'
    df['test_row'] = ['Highest'] * 10 + ['N'] * (len(df) - 20) + ['Lowest'] * 10
    return df

# 각 buykorea_product_name별로 그룹화하여 처리
# Apply the function and then reset the index to align with the original DataFrame
openai_similarity_df_sorted['test_row'] = openai_similarity_df_sorted.groupby('buykorea_product_name', group_keys=False).apply(mark_top_bottom_10).reset_index(drop=True)['test_row']


# 결과 확인
display(openai_similarity_df_sorted)

openai_similarity_df_sorted.to_csv('openai_similarity_df_sorted_v2.csv', index=False)

Unnamed: 0,buykorea_product_name,buykorea_type,bldata_product_name,bldata_type,cosine_similarity,test_row
0,Doldori Premium Gold Essence,mask pack,COSMETICS MASK PACK,mask pack,1.000000,Highest
1,Doldori Premium Gold Essence,mask pack,ASSORTD FACE PACK (RETURN TO ORIGIN) (EXPORTED...,face pack,0.785882,Highest
2,Doldori Premium Gold Essence,mask pack,AVOCADO HIGH NUTRITION MASK PACK (BRAND-STAROV...,skincare mask,0.764832,Highest
3,Doldori Premium Gold Essence,mask pack,(LABUTE )REVIVE THE SKIN VITAMIN MASK,skincare mask,0.764832,Highest
4,Doldori Premium Gold Essence,mask pack,HIBISCUS & ROSE GLOW MASK (RETURN TO ORIGIN) (...,skincare mask,0.764832,Highest
...,...,...,...,...,...,...
28005,t:aim Barrier Sunblock 40ml,sunscreen,HILARY RHODA HIGHLIGHTER & CONTOUR (NT. WT. 12...,highlighter and contour,0.045261,Lowest
28006,t:aim Barrier Sunblock 40ml,sunscreen,PEPPLUS LIFTING PACK (8 PACKS) ( 40 PCS) ( CDS...,lifting pack,0.041898,Lowest
28007,t:aim Barrier Sunblock 40ml,sunscreen,2.14756 YURAQ (SAMPLE-NCV-R&D PURPOSE) (1 X 25...,sample for research and development,0.033819,Lowest
28008,t:aim Barrier Sunblock 40ml,sunscreen,(N.C.V.) - MIBIOME SUSPENSION 50ML/EN (FOR R &...,Research formulation,0.014456,Lowest


# 3. 제품 유형 + 키워드 5개 (총 6개 키워드) 이용해 유사도 순 나열

In [108]:
tqdm.pandas()

# 임베딩할 텍스트

buykorea_preprocessed_openai=pd.read_csv('./buykorea_preprocessed_openai.csv')
bldata_preprocessed_openai=pd.read_csv('./bldata_preprocessed_openai.csv')

buykorea_preprocessed_openai['buykorea_preprocessed']=buykorea_preprocessed_openai['type']+","+buykorea_preprocessed_openai['keyword'].apply(lambda x : ",".join(x.strip('[]').split(',')))
bldata_preprocessed_openai['bldata_preprocessed']=bldata_preprocessed_openai['type']+","+bldata_preprocessed_openai['keyword'].apply(lambda x : ",".join(x.strip('[]').split(',')))
# 키워드로 구성된 문장을 임베딩
buykorea_preprocessed_openai['buykorea_embedding']=buykorea_preprocessed_openai['buykorea_preprocessed'].progress_apply(text_embedding)
bldata_preprocessed_openai['bldata_embedding']=bldata_preprocessed_openai['bldata_preprocessed'].progress_apply(text_embedding)

bldata_sample_len=bldata_preprocessed_openai.shape[0]

100%|██████████| 10/10 [00:02<00:00,  4.36it/s]
100%|██████████| 2801/2801 [07:33<00:00,  6.17it/s]


In [109]:
openai_similarity_df=pd.DataFrame(columns=['buykorea_product_name', 'buykorea_keyword', 'bldata_product_name', 'bldata_keyword', 'cosine_similarity'])

for index, row in buykorea_preprocessed_openai.iterrows():

  # 각 buykorea 상품 (10개) 에,
  # 2800개의 수출 기록 존재하는 제품 이름과의 유사도 계산
  # buykorea 상품별로 2800개의 유사도가 계산되어 저장된 데이터프레임 생성

  similarity_df=pd.DataFrame({'buykorea_product_name':[row['product_name']]*bldata_sample_len,
                               'buykorea_keyword': [row['buykorea_preprocessed']]*bldata_sample_len,
                               'bldata_product_name' : ["None"]*bldata_sample_len,
                               'bldata_keyword' : ["None"]*bldata_sample_len,
                               'cosine_similarity' : [0]*bldata_sample_len})

  buykorea_embedding=row['buykorea_embedding']
  row_count=0

  for index, row_bl in tqdm(bldata_preprocessed_openai.iterrows()):

    bldata_embedding=row_bl['bldata_embedding']

    similarity=cosine_similarity(buykorea_embedding, bldata_embedding)
    similarity_df.loc[row_count, 'bldata_product_name']=row_bl['product_name']
    similarity_df.loc[row_count, 'bldata_keyword']=row_bl['bldata_preprocessed']
    similarity_df.loc[row_count, 'cosine_similarity']=similarity

    row_count+=1

  # buykorea 제품 유사도 데이터프레임을 하나의 데이터프레임으로 병합
  openai_similarity_df=pd.concat([openai_similarity_df, similarity_df], axis=0)

display(openai_similarity_df.head())


2801it [00:04, 654.86it/s]
2801it [00:05, 525.97it/s]
2801it [00:04, 650.49it/s]
2801it [00:04, 655.87it/s]
2801it [00:05, 531.55it/s]
2801it [00:04, 650.00it/s]
2801it [00:04, 652.54it/s]
2801it [00:05, 526.53it/s]
2801it [00:04, 649.62it/s]
2801it [00:04, 628.05it/s]


Unnamed: 0,buykorea_product_name,buykorea_keyword,bldata_product_name,bldata_keyword,cosine_similarity
0,Doldori Premium Gold Essence,"mask pack,'skincare', 'essence', 'hydration', ...",BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM),"face cream,'localized fat', 'cellulite', 'body...",0.654006
1,Doldori Premium Gold Essence,"mask pack,'skincare', 'essence', 'hydration', ...",COSMELAN 2 (FACE CREAM),"face cream,'pigmentation', 'skin tone', 'melas...",0.710032
2,Doldori Premium Gold Essence,"mask pack,'skincare', 'essence', 'hydration', ...",ANTI STRESS FACE MASK 100ML (FACE CREAM),"face mask,'hydration', 'soothing', 'stressed s...",0.659665
3,Doldori Premium Gold Essence,"mask pack,'skincare', 'essence', 'hydration', ...",BLOT POWDER,"setting powder,'shine control', 'oil absorptio...",0.650969
4,Doldori Premium Gold Essence,"mask pack,'skincare', 'essence', 'hydration', ...",DERMACOLOR CAMOUFLAGE FLUID,"camouflage fluid,'high coverage', 'skin imperf...",0.454845


In [110]:
openai_similarity_df_sorted= openai_similarity_df.groupby(['buykorea_product_name'], group_keys=False, sort=False).apply(lambda x: x.sort_values('cosine_similarity', ascending=False))
openai_similarity_df_sorted.reset_index(drop=True, inplace=True)

# 각 buykorea_product_name 별로 cosine_similarity가 높은 10개, 낮은 10개를 'Highest'로, 나머지는 'Lowest'으로 표시하는 함수
def mark_top_bottom_10(df):
    # 상위 10개와 하위 10개는 'Highest', 나머지는 'Lowest'
    df['test_row'] = ['Highest'] * 10 + ['N'] * (len(df) - 20) + ['Lowest'] * 10
    return df

# 각 buykorea_product_name별로 그룹화하여 처리
# Apply the function and then reset the index to align with the original DataFrame
openai_similarity_df_sorted['test_row'] = openai_similarity_df_sorted.groupby('buykorea_product_name', group_keys=False).apply(mark_top_bottom_10).reset_index(drop=True)['test_row']


# 결과 확인
display(openai_similarity_df_sorted)

openai_similarity_df_sorted.to_csv('openai_similarity_df_sorted_v3.csv', index=False)

Unnamed: 0,buykorea_product_name,buykorea_keyword,bldata_product_name,bldata_keyword,cosine_similarity,test_row
0,Doldori Premium Gold Essence,"mask pack,'skincare', 'essence', 'hydration', ...",EB BRIGHTER MOISTURE MAS 100ML/3.4FLOZ - (100....,"moisturizing mask,'brightening', 'moisturizing...",0.923858,Highest
1,Doldori Premium Gold Essence,"mask pack,'skincare', 'essence', 'hydration', ...",(LABUTE )REVIVE THE SKIN VITAMIN MASK,"skincare mask,'nourishment', 'rejuvenation', '...",0.910223,Highest
2,Doldori Premium Gold Essence,"mask pack,'skincare', 'essence', 'hydration', ...",(N.C.V.) - NEUTROGENA BRIGHT BOOSTERMASK-21G S...,"skincare mask,'brightening', 'hydration', 'rad...",0.900369,Highest
3,Doldori Premium Gold Essence,"mask pack,'skincare', 'essence', 'hydration', ...",COSMETICS PRODUCT CODE 34400142 YEHWADAM PURE ...,"skincare kit,'brightening', 'skincare', 'hydra...",0.897600,Highest
4,Doldori Premium Gold Essence,"mask pack,'skincare', 'essence', 'hydration', ...",HIBISCUS & ROSE GLOW MASK (RETURN TO ORIGIN) (...,"skincare mask,'hibiscus', 'rose', 'glow', 'hyd...",0.891152,Highest
...,...,...,...,...,...,...
28005,t:aim Barrier Sunblock 40ml,"sunscreen,'high-protection', 'SPF 50+', 'PA+++...",WNW BROW PENCIL(0.2GM) (COSMETIC PRODUCTS) RC ...,"eyebrow pencil,'eyebrow', 'cosmetic', 'define'...",0.324040,Lowest
28006,t:aim Barrier Sunblock 40ml,"sunscreen,'high-protection', 'SPF 50+', 'PA+++...",CL L BR FINGER CLIPS - LARGE - BROWN 12PC/BAG,"finger clips,'brown', 'grooming', 'beauty', 's...",0.323467,Lowest
28007,t:aim Barrier Sunblock 40ml,"sunscreen,'high-protection', 'SPF 50+', 'PA+++...",COSMETICS MASCARA,"mascara,'mascara', 'enhance', 'eyelashes', 'de...",0.321659,Lowest
28008,t:aim Barrier Sunblock 40ml,"sunscreen,'high-protection', 'SPF 50+', 'PA+++...",COSMETICS 11155 U.BEVER SCALP ENERG SHAMP 258M,"shampoo,'scalp energizing', 'promotes healthy ...",0.270411,Lowest


# 4. 유형 단어 포함 여부로 필터 후, 키워드 5개 이용해 유사도 순 나열

In [111]:
tqdm.pandas()

# 임베딩할 텍스트

buykorea_preprocessed_openai=pd.read_csv('./buykorea_preprocessed_openai.csv')
bldata_preprocessed_openai=pd.read_csv('./bldata_preprocessed_openai.csv')

buykorea_preprocessed_openai['buykorea_preprocessed']=buykorea_preprocessed_openai['keyword'].apply(lambda x : ",".join(x.strip('[]').split(',')))
bldata_preprocessed_openai['bldata_preprocessed']=bldata_preprocessed_openai['keyword'].apply(lambda x : ",".join(x.strip('[]').split(',')))
# 키워드로 구성된 문장을 임베딩
buykorea_preprocessed_openai['buykorea_embedding']=buykorea_preprocessed_openai['buykorea_preprocessed'].progress_apply(text_embedding)
bldata_preprocessed_openai['bldata_embedding']=bldata_preprocessed_openai['bldata_preprocessed'].progress_apply(text_embedding)

bldata_sample_len=bldata_preprocessed_openai.shape[0]

100%|██████████| 10/10 [00:02<00:00,  4.45it/s]
100%|██████████| 2801/2801 [07:50<00:00,  5.96it/s]


In [112]:
import re

openai_similarity_df=pd.DataFrame(columns=['buykorea_product_name', 'buykorea_keyword', 'bldata_product_name', 'bldata_keyword', 'cosine_similarity'])

for index, row in buykorea_preprocessed_openai.iterrows():

  # 각 buykorea 상품 (10개) 에,
  # 2800개의 수출 기록 존재하는 제품 이름과의 유사도 계산
  # buykorea 상품별로 2800개의 유사도가 계산되어 저장된 데이터프레임 생성

  similarity_df=pd.DataFrame({'buykorea_product_name':[row['product_name']]*bldata_sample_len,
                               'buykorea_keyword': [row['buykorea_preprocessed']]*bldata_sample_len,
                               'bldata_product_name' : ["None"]*bldata_sample_len,
                               'bldata_keyword' : ["None"]*bldata_sample_len,
                               'cosine_similarity' : [0]*bldata_sample_len})

  buykorea_embedding=row['buykorea_embedding']
  buykorea_product_type=row['type'].split(' ')
  row_count=0

  for index, row_bl in tqdm(bldata_preprocessed_openai.iterrows()):
    if any(keyword in row_bl['type'].lower().split(' ') for keyword in set(buykorea_product_type)):
      bldata_embedding=row_bl['bldata_embedding']

      similarity=cosine_similarity(buykorea_embedding, bldata_embedding)
      similarity_df.loc[row_count, 'bldata_product_name']=row_bl['product_name']
      similarity_df.loc[row_count, 'bldata_keyword']=row_bl['bldata_preprocessed']
      similarity_df.loc[row_count, 'cosine_similarity']=similarity

    row_count+=1

  # buykorea 제품 유사도 데이터프레임을 하나의 데이터프레임으로 병합
  openai_similarity_df=pd.concat([openai_similarity_df, similarity_df], axis=0)

display(openai_similarity_df.head())



2801it [00:00, 5656.22it/s]
2801it [00:00, 6141.98it/s]
2801it [00:00, 6160.93it/s]
2801it [00:00, 14655.14it/s]
2801it [00:00, 16827.97it/s]
2801it [00:00, 6388.54it/s]
2801it [00:00, 10005.21it/s]
2801it [00:00, 10248.40it/s]
2801it [00:00, 10568.09it/s]
2801it [00:00, 7486.45it/s]


Unnamed: 0,buykorea_product_name,buykorea_keyword,bldata_product_name,bldata_keyword,cosine_similarity
0,Doldori Premium Gold Essence,"'skincare', 'essence', 'hydration', 'brighteni...",,,0.0
1,Doldori Premium Gold Essence,"'skincare', 'essence', 'hydration', 'brighteni...",,,0.0
2,Doldori Premium Gold Essence,"'skincare', 'essence', 'hydration', 'brighteni...",ANTI STRESS FACE MASK 100ML (FACE CREAM),"'hydration', 'soothing', 'stressed skin', 'red...",0.522552
3,Doldori Premium Gold Essence,"'skincare', 'essence', 'hydration', 'brighteni...",,,0.0
4,Doldori Premium Gold Essence,"'skincare', 'essence', 'hydration', 'brighteni...",,,0.0


In [113]:
openai_similarity_df_sorted=openai_similarity_df[openai_similarity_df['bldata_product_name']!='None']

openai_similarity_df_sorted= openai_similarity_df.groupby(['buykorea_product_name'], group_keys=False, sort=False).apply(lambda x: x.sort_values('cosine_similarity', ascending=False))
openai_similarity_df_sorted.reset_index(drop=True, inplace=True)

# 각 buykorea_product_name 별로 cosine_similarity가 높은 10개, 낮은 10개를 'Highest'로, 나머지는 'Lowest'으로 표시하는 함수
def mark_top_bottom_10(df):
    # 상위 10개와 하위 10개는 'Highest', 나머지는 'Lowest'
    df['test_row'] = ['Highest'] * 10 + ['N'] * (len(df) - 20) + ['Lowest'] * 10
    return df

# 각 buykorea_product_name별로 그룹화하여 처리
# Apply the function and then reset the index to align with the original DataFrame
openai_similarity_df_sorted['test_row'] = openai_similarity_df_sorted.groupby('buykorea_product_name', group_keys=False).apply(mark_top_bottom_10).reset_index(drop=True)['test_row']


# 결과 확인
display(openai_similarity_df_sorted)

openai_similarity_df_sorted.to_csv('openai_similarity_df_sorted_v4.csv', index=False)

Unnamed: 0,buykorea_product_name,buykorea_keyword,bldata_product_name,bldata_keyword,cosine_similarity,test_row
0,Doldori Premium Gold Essence,"'skincare', 'essence', 'hydration', 'brighteni...",EB BRIGHTER MOISTURE MAS 100ML/3.4FLOZ - (100....,"'brightening', 'moisturizing', 'skin hydration...",0.870156,Highest
1,Doldori Premium Gold Essence,"'skincare', 'essence', 'hydration', 'brighteni...",I AM LEMON MASK SHEET - BRIGHTENING ( PRODUCT ...,"'brightening', 'hydration', 'lemon extract', '...",0.858402,Highest
2,Doldori Premium Gold Essence,"'skincare', 'essence', 'hydration', 'brighteni...",HIBISCUS & ROSE GLOW MASK (RETURN TO ORIGIN) (...,"'hibiscus', 'rose', 'glow', 'hydration', 'skin...",0.835730,Highest
3,Doldori Premium Gold Essence,"'skincare', 'essence', 'hydration', 'brighteni...","ROSE HIP & MAIZE EXFOLIATING MASK, 250 ML X 30...","'exfoliation', 'radiant complexion', 'nourishi...",0.833835,Highest
4,Doldori Premium Gold Essence,"'skincare', 'essence', 'hydration', 'brighteni...",SU CLARIFYING MASK 150ML (150 ML) (270320030) ...,"'clarifying', 'purifying', 'skincare', 'comple...",0.831755,Highest
...,...,...,...,...,...,...
28005,t:aim Barrier Sunblock 40ml,"'high-protection', 'SPF 50+', 'PA++++', 'organ...",,,0.000000,Lowest
28006,t:aim Barrier Sunblock 40ml,"'high-protection', 'SPF 50+', 'PA++++', 'organ...",,,0.000000,Lowest
28007,t:aim Barrier Sunblock 40ml,"'high-protection', 'SPF 50+', 'PA++++', 'organ...",,,0.000000,Lowest
28008,t:aim Barrier Sunblock 40ml,"'high-protection', 'SPF 50+', 'PA++++', 'organ...",,,0.000000,Lowest


# 5. 유형 유사도 순 나열 후, 비슷한 유형 내에서 키워드 5개 이용해 유사도 순 나열

In [114]:
tqdm.pandas()

# 임베딩할 텍스트

buykorea_preprocessed_openai=pd.read_csv('./buykorea_preprocessed_openai.csv')
bldata_preprocessed_openai=pd.read_csv('./bldata_preprocessed_openai.csv')

buykorea_preprocessed_openai['buykorea_type_preprocessed']=buykorea_preprocessed_openai['type']
bldata_preprocessed_openai['bldata_type_preprocessed']=bldata_preprocessed_openai['type']
# 키워드로 구성된 문장을 임베딩
buykorea_preprocessed_openai['buykorea_type_embedding']=buykorea_preprocessed_openai['buykorea_type_preprocessed'].progress_apply(text_embedding)
bldata_preprocessed_openai['bldata_type_embedding']=bldata_preprocessed_openai['bldata_type_preprocessed'].progress_apply(text_embedding)

buykorea_preprocessed_openai['buykorea_keyword_preprocessed']=buykorea_preprocessed_openai['keyword'].apply(lambda x : ",".join(x.strip('[]').split(',')))
bldata_preprocessed_openai['bldata_keyword_preprocessed']=bldata_preprocessed_openai['keyword'].apply(lambda x : ",".join(x.strip('[]').split(',')))
# 키워드로 구성된 문장을 임베딩
buykorea_preprocessed_openai['buykorea_keyword_embedding']=buykorea_preprocessed_openai['buykorea_keyword_preprocessed'].progress_apply(text_embedding)
bldata_preprocessed_openai['bldata_keyword_embedding']=bldata_preprocessed_openai['bldata_keyword_preprocessed'].progress_apply(text_embedding)

bldata_sample_len=bldata_preprocessed_openai.shape[0]

100%|██████████| 10/10 [00:01<00:00,  8.93it/s]
100%|██████████| 2801/2801 [03:48<00:00, 12.26it/s]
100%|██████████| 10/10 [00:01<00:00,  5.44it/s]
100%|██████████| 2801/2801 [06:59<00:00,  6.68it/s]


In [115]:
openai_similarity_df=pd.DataFrame(columns=['buykorea_product_name', 'buykorea_type', 'buykorea_keyword', 'bldata_product_name', 'bldata_type', 'bldata_keyword', 'type_cosine_similarity', 'keyword_cosine_similarity'])

for index, row in buykorea_preprocessed_openai.iterrows():

  # 각 buykorea 상품 (10개) 에,
  # 2800개의 수출 기록 존재하는 제품 이름과의 유사도 계산
  # buykorea 상품별로 2800개의 유사도가 계산되어 저장된 데이터프레임 생성

  similarity_df=pd.DataFrame({'buykorea_product_name':[row['product_name']]*bldata_sample_len,
                              'buykorea_type' : [row['buykorea_type_preprocessed']]*bldata_sample_len,
                               'buykorea_keyword': [row['buykorea_keyword_preprocessed']]*bldata_sample_len,
                               'bldata_product_name' : ["None"]*bldata_sample_len,
                               'bldata_type' : ["None"]*bldata_sample_len,
                               'bldata_keyword' : ["None"]*bldata_sample_len,
                               'type_cosine_similarity' : [0]*bldata_sample_len,
                              'keyword_cosine_similarity ' : [0]*bldata_sample_len})

  buykorea_type_embedding=row['buykorea_type_embedding']
  buykorea_keyword_embedding=row['buykorea_keyword_embedding']
  row_count=0

  for index, row_bl in tqdm(bldata_preprocessed_openai.iterrows()):

    bldata_type_embedding=row_bl['bldata_type_embedding']
    bldata_keyword_embedding=row_bl['bldata_keyword_embedding']

    type_similarity=cosine_similarity(buykorea_type_embedding, bldata_type_embedding)
    keyword_similarity=cosine_similarity(buykorea_keyword_embedding, bldata_keyword_embedding)
    similarity_df.loc[row_count, 'bldata_product_name']=row_bl['product_name']
    similarity_df.loc[row_count, 'bldata_type']=row_bl['bldata_type_preprocessed']
    similarity_df.loc[row_count, 'bldata_keyword']=row_bl['bldata_keyword_preprocessed']
    similarity_df.loc[row_count, 'type_cosine_similarity']=type_similarity
    similarity_df.loc[row_count, 'keyword_cosine_similarity']=keyword_similarity

    row_count+=1

  # buykorea 제품 유사도 데이터프레임을 하나의 데이터프레임으로 병합
  openai_similarity_df=pd.concat([openai_similarity_df, similarity_df], axis=0)

display(openai_similarity_df.head())


2801it [00:08, 338.87it/s]
2801it [00:07, 387.99it/s]
2801it [00:08, 329.14it/s]
2801it [00:07, 368.80it/s]
2801it [00:08, 343.03it/s]
2801it [00:08, 336.03it/s]
2801it [00:07, 384.73it/s]
2801it [00:08, 332.46it/s]
2801it [00:07, 381.58it/s]
2801it [00:08, 330.38it/s]


Unnamed: 0,buykorea_product_name,buykorea_type,buykorea_keyword,bldata_product_name,bldata_type,bldata_keyword,type_cosine_similarity,keyword_cosine_similarity,keyword_cosine_similarity.1
0,Doldori Premium Gold Essence,mask pack,"'skincare', 'essence', 'hydration', 'brighteni...",BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM),face cream,"'localized fat', 'cellulite', 'body contouring...",0.443385,0.51543,0.0
1,Doldori Premium Gold Essence,mask pack,"'skincare', 'essence', 'hydration', 'brighteni...",COSMELAN 2 (FACE CREAM),face cream,"'pigmentation', 'skin tone', 'melasma', 'melanin'",0.443385,0.575196,0.0
2,Doldori Premium Gold Essence,mask pack,"'skincare', 'essence', 'hydration', 'brighteni...",ANTI STRESS FACE MASK 100ML (FACE CREAM),face mask,"'hydration', 'soothing', 'stressed skin', 'red...",0.761262,0.522552,0.0
3,Doldori Premium Gold Essence,mask pack,"'skincare', 'essence', 'hydration', 'brighteni...",BLOT POWDER,setting powder,"'shine control', 'oil absorption', 'matte fini...",0.142408,0.620937,0.0
4,Doldori Premium Gold Essence,mask pack,"'skincare', 'essence', 'hydration', 'brighteni...",DERMACOLOR CAMOUFLAGE FLUID,camouflage fluid,"'high coverage', 'skin imperfections', 'long-l...",0.390623,0.374129,0.0


In [117]:
openai_similarity_df_sorted= openai_similarity_df.groupby(['buykorea_product_name'], group_keys=False, sort=False).apply(lambda x: x.sort_values(by=['type_cosine_similarity','keyword_cosine_similarity'], ascending=False))
openai_similarity_df_sorted.reset_index(drop=True, inplace=True)

# 각 buykorea_product_name 별로 cosine_similarity가 높은 10개, 낮은 10개를 'Highest'로, 나머지는 'Lowest'으로 표시하는 함수
def mark_top_bottom_10(df):
    # 상위 10개와 하위 10개는 'Highest', 나머지는 'Lowest'
    df['test_row'] = ['Highest'] * 10 + ['N'] * (len(df) - 20) + ['Lowest'] * 10
    return df

# 각 buykorea_product_name별로 그룹화하여 처리
# Apply the function and then reset the index to align with the original DataFrame
openai_similarity_df_sorted['test_row'] = openai_similarity_df_sorted.groupby('buykorea_product_name', group_keys=False).apply(mark_top_bottom_10).reset_index(drop=True)['test_row']


# 결과 확인
display(openai_similarity_df_sorted)

openai_similarity_df_sorted.to_csv('openai_similarity_df_sorted_v5.csv', index=False)

Unnamed: 0,buykorea_product_name,buykorea_type,buykorea_keyword,bldata_product_name,bldata_type,bldata_keyword,type_cosine_similarity,keyword_cosine_similarity,keyword_cosine_similarity.1,test_row
0,Doldori Premium Gold Essence,mask pack,"'skincare', 'essence', 'hydration', 'brighteni...",COSMETICS MASK PACK,mask pack,"'hydration', 'nourishment', 'rejuvenation', 's...",1.000000,0.754996,0.0,Highest
1,Doldori Premium Gold Essence,mask pack,"'skincare', 'essence', 'hydration', 'brighteni...",ASSORTD FACE PACK (RETURN TO ORIGIN) (EXPORTED...,face pack,"'skin texture', 'hydration', 'skincare routine...",0.785882,0.625478,0.0,Highest
2,Doldori Premium Gold Essence,mask pack,"'skincare', 'essence', 'hydration', 'brighteni...",HIBISCUS & ROSE GLOW MASK (RETURN TO ORIGIN) (...,skincare mask,"'hibiscus', 'rose', 'glow', 'hydration', 'skin...",0.764832,0.835730,0.0,Highest
3,Doldori Premium Gold Essence,mask pack,"'skincare', 'essence', 'hydration', 'brighteni...",I AM TOMATO MASK - SKIN GLOW (PRODUCT CODE:- T...,skincare mask,"'skin glow', 'brightening', 'nourishing', 'rev...",0.764832,0.812760,0.0,Highest
4,Doldori Premium Gold Essence,mask pack,"'skincare', 'essence', 'hydration', 'brighteni...",(N.C.V.) - NEUTROGENA BRIGHT BOOSTERMASK-21G S...,skincare mask,"'brightening', 'hydration', 'radiance', 'skinc...",0.764832,0.811947,0.0,Highest
...,...,...,...,...,...,...,...,...,...,...
28005,t:aim Barrier Sunblock 40ml,sunscreen,"'high-protection', 'SPF 50+', 'PA++++', 'organ...",HILARY RHODA HIGHLIGHTER & CONTOUR (NT. WT. 12...,highlighter and contour,"'highlighter', 'contour', 'facial features', '...",0.045261,0.473259,0.0,Lowest
28006,t:aim Barrier Sunblock 40ml,sunscreen,"'high-protection', 'SPF 50+', 'PA++++', 'organ...",PEPPLUS LIFTING PACK (8 PACKS) ( 40 PCS) ( CDS...,lifting pack,"'lifting', 'firming', 'elasticity', 'fine line...",0.041898,0.491200,0.0,Lowest
28007,t:aim Barrier Sunblock 40ml,sunscreen,"'high-protection', 'SPF 50+', 'PA++++', 'organ...",2.14756 YURAQ (SAMPLE-NCV-R&D PURPOSE) (1 X 25...,sample for research and development,"'sample', 'research', 'development', 'skincare'",0.033819,0.504682,0.0,Lowest
28008,t:aim Barrier Sunblock 40ml,sunscreen,"'high-protection', 'SPF 50+', 'PA++++', 'organ...",(N.C.V.) - MIBIOME SUSPENSION 50ML/EN (FOR R &...,Research formulation,"'microbiome', 'skin care', 'research', 'formul...",0.014456,0.587574,0.0,Lowest


# 6. 전체 문장 임베딩

In [121]:
tqdm.pandas()

# 임베딩할 텍스트

buykorea_preprocessed_openai=pd.read_csv('./buykorea_preprocessed_openai.csv')
bldata_preprocessed_openai=pd.read_csv('./bldata_preprocessed_openai.csv')

buykorea_preprocessed_openai['buykorea_preprocessed']=buykorea_preprocessed_openai['description_summary']
bldata_preprocessed_openai['bldata_preprocessed']=bldata_preprocessed_openai['product_description']

# 키워드로 구성된 문장을 임베딩
buykorea_preprocessed_openai['buykorea_embedding']=buykorea_preprocessed_openai['buykorea_preprocessed'].progress_apply(text_embedding)
bldata_preprocessed_openai['bldata_embedding']=bldata_preprocessed_openai['bldata_preprocessed'].progress_apply(text_embedding)


bldata_sample_len=bldata_preprocessed_openai.shape[0]

100%|██████████| 10/10 [00:06<00:00,  1.57it/s]
100%|██████████| 2801/2801 [20:23<00:00,  2.29it/s]


In [122]:
openai_similarity_df=pd.DataFrame(columns=['buykorea_product_name', 'buykorea_description', 'bldata_product_name', 'bldata_description', 'cosine_similarity'])

for index, row in buykorea_preprocessed_openai.iterrows():

  # 각 buykorea 상품 (10개) 에,
  # 2800개의 수출 기록 존재하는 제품 이름과의 유사도 계산
  # buykorea 상품별로 2800개의 유사도가 계산되어 저장된 데이터프레임 생성

  similarity_df=pd.DataFrame({'buykorea_product_name':[row['product_name']]*bldata_sample_len,
                               'buykorea_description': [row['buykorea_preprocessed']]*bldata_sample_len,
                               'bldata_product_name' : ["None"]*bldata_sample_len,
                               'bldata_description' : ["None"]*bldata_sample_len,
                               'cosine_similarity' : [0]*bldata_sample_len})

  buykorea_embedding=row['buykorea_embedding']
  row_count=0

  for index, row_bl in tqdm(bldata_preprocessed_openai.iterrows()):

    bldata_embedding=row_bl['bldata_embedding']

    similarity=cosine_similarity(buykorea_embedding, bldata_embedding)
    similarity_df.loc[row_count, 'bldata_product_name']=row_bl['product_name']
    similarity_df.loc[row_count, 'bldata_description']=row_bl['bldata_preprocessed']
    similarity_df.loc[row_count, 'cosine_similarity']=similarity

    row_count+=1

  # buykorea 제품 유사도 데이터프레임을 하나의 데이터프레임으로 병합
  openai_similarity_df=pd.concat([openai_similarity_df, similarity_df], axis=0)

display(openai_similarity_df.head())


2801it [00:04, 572.18it/s]
2801it [00:04, 564.22it/s]
2801it [00:04, 643.42it/s]
2801it [00:05, 549.00it/s]
2801it [00:04, 598.39it/s]
2801it [00:04, 645.52it/s]
2801it [00:05, 531.20it/s]
2801it [00:04, 641.18it/s]
2801it [00:04, 639.29it/s]
2801it [00:05, 509.32it/s]


Unnamed: 0,buykorea_product_name,buykorea_description,bldata_product_name,bldata_description,cosine_similarity
0,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is an innovat...,BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM),This product is called BODYSHOCK LOCAL REDUCER...,0.434211
1,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is an innovat...,COSMELAN 2 (FACE CREAM),"This product is called COSMELAN 2, a face crea...",0.477261
2,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is an innovat...,ANTI STRESS FACE MASK 100ML (FACE CREAM),"This product is called ANTI STRESS FACE MASK, ...",0.586053
3,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is an innovat...,BLOT POWDER,"This product is called BLOT POWDER, a setting ...",0.483652
4,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is an innovat...,DERMACOLOR CAMOUFLAGE FLUID,This product is called DERMACOLOR CAMOUFLAGE F...,0.518189


In [124]:
openai_similarity_df_sorted= openai_similarity_df.groupby(['buykorea_product_name'], group_keys=False, sort=False).apply(lambda x: x.sort_values(by=['cosine_similarity'], ascending=False))
openai_similarity_df_sorted.reset_index(drop=True, inplace=True)

# 각 buykorea_product_name 별로 cosine_similarity가 높은 10개, 낮은 10개를 'Highest'로, 나머지는 'Lowest'으로 표시하는 함수
def mark_top_bottom_10(df):
    # 상위 10개와 하위 10개는 'Highest', 나머지는 'Lowest'
    df['test_row'] = ['Highest'] * 10 + ['N'] * (len(df) - 20) + ['Lowest'] * 10
    return df

# 각 buykorea_product_name별로 그룹화하여 처리
# Apply the function and then reset the index to align with the original DataFrame
openai_similarity_df_sorted['test_row'] = openai_similarity_df_sorted.groupby('buykorea_product_name', group_keys=False).apply(mark_top_bottom_10).reset_index(drop=True)['test_row']


# 결과 확인
display(openai_similarity_df_sorted)

openai_similarity_df_sorted.to_csv('openai_similarity_df_sorted_v6.csv', index=False)

Unnamed: 0,buykorea_product_name,buykorea_description,bldata_product_name,bldata_description,cosine_similarity,test_row
0,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is an innovat...,GOLD MASK 500 ML,"This product is called GOLD MASK, a skincare t...",0.700289,Highest
1,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is an innovat...,SPA BLK PEEL FACE MASK 50 V1NG (AVON PLANET SP...,"This product is called SPA BLK PEEL FACE MASK,...",0.693363,Highest
2,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is an innovat...,URBAN COLOR PRO WHITE ADVANCE ANTI WRINKLE & N...,This product is called URBAN COLOR PRO WHITE A...,0.685175,Highest
3,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is an innovat...,I AM SEAWEEDS MASK SHEET - PURIFYING (PRODUCT ...,This product is called I AM SEAWEEDS MASK SHEE...,0.682498,Highest
4,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is an innovat...,COSMETIC PRODUCTS V22 DNA MOISTURIZING MASK 250ML,This product is called COSMETIC PRODUCTS V22 D...,0.681528,Highest
...,...,...,...,...,...,...
28005,t:aim Barrier Sunblock 40ml,The t:aim Barrier Sunblock is a high-protectio...,PARSLEY (DRIED LEAVES) (FREE SAMPLE),"This product is called PARSLEY (DRIED LEAVES),...",0.171900,Lowest
28006,t:aim Barrier Sunblock 40ml,The t:aim Barrier Sunblock is a high-protectio...,GLITTER POWDER (FOR COSMETIC RAW MATERIAL),"This product is called GLITTER POWDER, which i...",0.171787,Lowest
28007,t:aim Barrier Sunblock 40ml,The t:aim Barrier Sunblock is a high-protectio...,EYE SHADOW PALLETTE (RC NO.- RC/COS-000867 DT....,"This product is called EYE SHADOW PALETTE, whi...",0.168097,Lowest
28008,t:aim Barrier Sunblock 40ml,The t:aim Barrier Sunblock is a high-protectio...,PARSLEY CURLY (DRIED LEAVES) (FREE SAMPLE),"This product is called PARSLEY CURLY, which re...",0.167452,Lowest
