In [1]:
!pip install pydantic langchain-teddynote langchain_community langchain_huggingface langchain_openai

Collecting langchain-teddynote
  Downloading langchain_teddynote-0.3.30-py3-none-any.whl.metadata (708 bytes)
Collecting langchain_community
  Downloading langchain_community-0.3.7-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.2.9-py3-none-any.whl.metadata (2.6 kB)
Collecting langgraph (from langchain-teddynote)
  Downloading langgraph-0.2.53-py3-none-any.whl.metadata (15 kB)
Collecting kiwipiepy (from langchain-teddynote)
  Downloading kiwipiepy-0.20.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rank-bm25 (from langchain-teddynote)
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pinecone-client[grpc] (from langchain-teddynote)
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-text (from langchain-teddynote)
  Download

In [2]:
from google.colab import output

# Colab-only helper that clears the output area; presumably used here to hide
# the pip install log from the previous cell — confirm.
output.clear()

In [3]:
import os

from google.colab import userdata

# Optional LangSmith tracing configuration (currently disabled).
# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# os.environ["LANGCHAIN_PROJECT"] = "LangChain 실습"
# Provide the LangChain API key
# os.environ["LANGCHAIN_API_KEY"] = userdata.get('langsmith_api_key')

In [4]:
import zipfile

# Data processing and analysis
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm
import seaborn as sns
from matplotlib import pyplot as plt
import datetime

from sklearn.metrics.pairwise import cosine_similarity

# Object persistence (used below to save/load intermediate results)
import joblib

# Silence every warning for the rest of the session (this also hides deprecation notices)
import warnings
warnings.filterwarnings('ignore')



# 데이터 불러오기

In [5]:
# BL sample data; the recorded preview shows the columns HSCD, SUPLY_CO_NAME and PRDT_DC_VAL.
bl_data_sampled = pd.read_csv(filepath_or_buffer='bl_sample_data.csv')

# Preview the first five rows.
display(bl_data_sampled.head(n=5))

Unnamed: 0,HSCD,SUPLY_CO_NAME,PRDT_DC_VAL
0,330499,"MESOESTETIC,. S.L.",BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM)
1,330499,"MESOESTETIC,. S.L.",COSMELAN 2 (FACE CREAM)
2,330499,"MESOESTETIC,. S.L.",ANTI STRESS FACE MASK 100ML (FACE CREAM)
3,330499,KRYOLAN GMBH CHEMISCHE FABRIK,BLOT POWDER
4,330499,KRYOLAN GMBH CHEMISCHE FABRIK,DERMACOLOR CAMOUFLAGE FLUID


In [6]:
# buyKOREA listings; the recorded preview shows NO, PID, CORPNAME, PRD_NAME, PRD and URL.
bk_data_sampled = pd.read_excel(io='BK_330499_수정.xlsx')

# Preview the first five rows.
display(bk_data_sampled.head(n=5))

Unnamed: 0,NO,PID,CORPNAME,PRD_NAME,PRD,URL
0,1,3723376,"CELLVIO COSMETIC CO.,LTD.",Doldori Premium Gold Essence,Our patented mask pack redefines skincare with...,https://buykorea.org/ec/prd/selectGoodsDetail....
1,2,3722918,Brade,Exosome Exo-V Skin Boos,The EXO-V Exosome Skin Booster is used in Kore...,https://buykorea.org/ec/prd/selectGoodsDetail....
2,3,3722956,"BIOVAIKOREA Co.,LTD",Oil-Free Ultra-Moisturizing Lotion with Birch Sap,DETAILS\nBENEFITS\n\n\n• Soothing irritated sk...,https://buykorea.org/ec/prd/selectGoodsDetail....
3,4,3722734,Hit Company,CHARNE - Shiny Brightening Cream,Charne Shiny Brightening Cream\n\nVarious natu...,https://buykorea.org/ec/prd/selectGoodsDetail....
4,5,3717513,"Wooanjoo Co.,Ltd",DEAR OHNEUL Singreen Donkey Toner Pad,Product Features:\n\n－ Hypoallergenic: Dermato...,https://buykorea.org/ec/prd/selectGoodsDetail....


# LangSmith, LLM API KEY 설정

In [7]:
from langchain_teddynote import logging

# logging.langsmith("KOTRA_BL_data_matching")

In [8]:
from langchain_community.chat_models import ChatPerplexity
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate

from langchain_core.output_parsers import CommaSeparatedListOutputParser
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field


# Copy the OpenAI key from Colab's secret store (userdata) into the process environment;
# the key itself is never written into the notebook.
os.environ["OPENAI_API_KEY"] = userdata.get('KOTRA2_openai_api_key')


# API 사용 rate 제한 설정


In [11]:
from langchain_core.rate_limiters import InMemoryRateLimiter

# Client-side throttle handed to the chat model below.
#   requests_per_second=8     -> sustained request rate
#                                (original note: "maximum query per day - 10000 (tier 1)")
#   check_every_n_seconds=0.1 -> wake up every 100 ms to check whether a request is allowed
#   max_bucket_size=10        -> controls the maximum burst size
rate_limiter_openai = InMemoryRateLimiter(requests_per_second=8, check_every_n_seconds=0.1, max_bucket_size=10)

# 사용 모델 설정

In [12]:
# gpt-4o-mini at temperature 0, throttled by the rate limiter defined above.
chat_openai = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    rate_limiter=rate_limiter_openai,
)

# OpenAI API 이용 결과 (model : chatgpt 4o-mini)

# buyKOREA 설명 처리 프롬프트문

---------------------

+ 업무 배경 설명 (제품명, 상품 설명에 대한 데이터임을 인식)
+ HSCODE 4자리 설명을 추가함 (6자리보다 더 광범위한 개념 인식)
+ 제품명과 상품 정보에 기반한 제품 설명 생성
+ 지시 내용별 예시 제공 (one-shot)

In [13]:
# Number of keywords requested from the model; also interpolated into the BL prompt.
keyword_num = 10


# Structured result of the extraction step: a product type plus a keyword list.
# Documented with comments on purpose: a class docstring would be added to the
# model's JSON schema and therefore to the format instructions sent to the LLM.
class DescriptionSummary(BaseModel):
    # Short label for the kind of product.
    type: str = Field(description="product type")
    # Keywords describing the product's strengths and purpose.
    keyword: list[str] = Field(
        description=f"{keyword_num} keywords which focus on product's strong point and purpose"
    )


# Turns the model's JSON reply into a DescriptionSummary and supplies the format instructions.
parser = PydanticOutputParser(pydantic_object=DescriptionSummary)

In [34]:
# Prompt that condenses a buyKOREA listing (description + product name) into a short summary.
# Template variables: {prdt_desc}, {prdt_name}.
# The one-shot example uses the same two fields the real input supplies
# (PRDT_DESC / PRDT_NAME); it previously showed an undeclared PRDT_COM company field.
summary_prompt_buykorea = PromptTemplate.from_template(
    """You are given data about product's description(PRDT_DESC) and product name(PRDT_NAME).
    All products fall under HS Code 330499, which is a subcategory of 3304.
    The description for 3304 is as follows:
    33.04 - Beauty or make-up preparations and preparations for the care of the skin (other than medicaments), including sunscreen or sun tan preparations; manicure or pedicure preparations.

    Using this information, complete the following tasks:
    Provide a simple summary for description.

    Example :
    PRDT_DESC : 'FIORESE Pure Hydra Water Cream contains Centella Asiatica and Hyaluronic Acid, effectively delivering hydration and providing instant soothing benefits.
    With Niacinamide and Adenosine, it is a dual-functional cosmetic for brightening and wrinkle care. This cream helps create healthy, smooth, and radiant skin.
    The lightweight, hydrating formula absorbs quickly, delivering moisture deep into the skin with a fresh, non-sticky finish. Suitable for all skin types, especially sensitive, dull, and uneven skin.

    For Sensitive Skin: Gentle, irritation-free formula, suitable even for sensitive skin.

    FIORESE Brand: Fiorese is dedicated to clean beauty, using ingredients sourced from nature. 100% cruelty-free.'

    PRDT_NAME : FIORESE Pure Hydra Water Cream

    Output Example : The FIORESE Pure Hydra Water Cream is a lightweight, hydrating cream containing Centella Asiatica and Hyaluronic Acid, providing deep moisture and soothing benefits.
    It also includes Niacinamide and Adenosine, making it a dual-functional cosmetic for brightening and wrinkle care.
    The formula absorbs quickly, leaving a fresh, non-sticky finish, and is suitable for all skin types, especially sensitive, dull, and uneven skin.
    The product is gentle and irritation-free, ideal for sensitive skin. Fiorese is committed to clean beauty with cruelty-free, nature-sourced ingredients.

    PRODUCT DESCRIPTION(PRDT_DESC) : {prdt_desc}
    PRODUCT NAME(PRDT_NAME) : {prdt_name}
    """
)


# prompt -> chat model; .invoke() returns a message whose .content is the summary text.
summary_chain_buykorea = summary_prompt_buykorea | chat_openai

In [35]:
# Summarise every buyKOREA listing with the LLM.
# Results are keyed by product name, so a repeated PRD_NAME overwrites the earlier entry.
buykorea_preprocessed_dict_openai = {}

for product_name, product_desc in zip(bk_data_sampled['PRD_NAME'], bk_data_sampled['PRD']):
    response_buykorea = summary_chain_buykorea.invoke(
        {"prdt_desc": product_desc, "prdt_name": product_name}
    )
    print(response_buykorea.content)
    # Stored as a one-element list so the extraction step can add type and keywords.
    buykorea_preprocessed_dict_openai[product_name] = [response_buykorea.content]

The Doldori Premium Gold Essence is a unique and innovative mask pack designed for an enhanced skincare experience. Its patented, hygienic design offers a new application method that combines practicality with elegance, making it stand out in the market. Available for export and sold in aesthetic clinics, department store pop-ups, and online shopping malls in Korea, this product meets the high demand for effective and luxurious skincare solutions.

The mask pack is meticulously crafted, with the entire manufacturing process handled in-house to ensure quality. It features a specially formulated essence enriched with high-quality ingredients that hydrate, brighten, and revitalize the skin. The mask sheet is designed for excellent adhesion, maximizing comfort and allowing for deep absorption of the essence.

With a focus on detail and quality, the Doldori Premium Gold Essence delivers visible skincare benefits while providing an indulgent experience, making it a luxurious addition to any 

- Strictly exclude adjectives, adverbs, and prepositions.

In [36]:
# Prompt asking for product type + keywords. The parser's JSON format instructions
# are bound up front, so callers only supply `input`.
extract_prompt_buykorea = PromptTemplate.from_template(
    """Extract product type and keywords from input, based on following rules:
    - Strictly exclude any information about capacity or volume.

    INPUT : {input}
    FORMAT : {format}
    """
).partial(format=parser.get_format_instructions())

# prompt -> chat model -> parser (yields a DescriptionSummary instance)
extract_chain_buykorea = extract_prompt_buykorea | chat_openai | parser

In [37]:
# Extract product type and keywords from each stored summary.
# Each dict value is a list whose first element is the summary text.
for key, value in buykorea_preprocessed_dict_openai.items():
    # Send the summary string itself, not the surrounding list: formatting the list
    # into the prompt would insert its Python repr (brackets, quotes, escaped newlines).
    summary_text = value[0]
    response_buykorea = extract_chain_buykorea.invoke({"input": summary_text})
    print(response_buykorea)
    # Rebuild the row instead of appending so re-running this cell stays idempotent:
    # the row always has exactly [summary, type, keyword]. Reassigning an existing key
    # does not change the dict's size, so it is safe while iterating.
    buykorea_preprocessed_dict_openai[key] = [
        summary_text,
        response_buykorea.type,
        response_buykorea.keyword,
    ]

type='mask pack' keyword=['skincare', 'essence', 'hydrating', 'brightening', 'revitalizing', 'luxurious', 'adhesion', 'deep absorption', 'quality', 'indulgent']
type='Skin Treatment' keyword=['Exosome', 'Skin Booster', 'Plant-derived', 'Growth Factors', 'Skin Regeneration', 'Elasticity', 'Wrinkle Reduction', 'Moisture Retention', 'Anti-aging', 'Scalp Nourishment']
type='Lotion' keyword=['Oil-Free', 'Ultra-Moisturizing', 'lightweight', 'daily moisturizer', 'all skin types', 'oily skin', 'combination skin', 'natural ingredients', 'soothe irritated skin', 'fragrance-free']
type='cosmetic cream' keyword=['brightening', 'skin whitening', 'wrinkle improvement', 'natural ingredients', 'antioxidant', 'skin health', 'firmness', 'elasticity', 'radiant skin', 'youthful skin']
type='Toner Pad' keyword=['hypoallergenic', 'skincare', 'sensitive skin', 'dual-sided pads', 'deep cleansing', 'gentle hydration', 'green apple extract', 'calm skin', 'reduce pore size', 'eco-friendly']
type='skincare produc

In [38]:
# Spot-check one entry: prints the stored list for this product name.
print(buykorea_preprocessed_dict_openai['Doldori Premium Gold Essence'])

['The Doldori Premium Gold Essence is a unique and innovative mask pack designed for an enhanced skincare experience. Its patented, hygienic design offers a new application method that combines practicality with elegance, making it stand out in the market. Available for export and sold in aesthetic clinics, department store pop-ups, and online shopping malls in Korea, this product meets the high demand for effective and luxurious skincare solutions.\n\nThe mask pack is meticulously crafted, with the entire manufacturing process handled in-house to ensure quality. It features a specially formulated essence enriched with high-quality ingredients that hydrate, brighten, and revitalize the skin. The mask sheet is designed for excellent adhesion, maximizing comfort and allowing for deep absorption of the essence.\n\nWith a focus on detail and quality, the Doldori Premium Gold Essence delivers visible skincare benefits while providing an indulgent experience, making it a luxurious addition t

In [39]:
# One row per product: the dict key becomes `product_name`, the three list items become columns.
buykorea_preprocessed_openai = (
    pd.DataFrame.from_dict(
        buykorea_preprocessed_dict_openai,
        orient='index',
        columns=['description_summary', 'type', 'keyword'],
    )
    .reset_index()
    .rename(columns={'index': 'product_name'})
)

# Persist the result without the positional index.
buykorea_preprocessed_openai.to_csv('buykorea_preprocessed_openai_10.csv', index=False)

display(buykorea_preprocessed_openai.head())

Unnamed: 0,product_name,description_summary,type,keyword
0,Doldori Premium Gold Essence,The Doldori Premium Gold Essence is a unique a...,mask pack,"[skincare, essence, hydrating, brightening, re..."
1,Exosome Exo-V Skin Boos,The Exosome Exo-V Skin Booster is a versatile ...,Skin Treatment,"[Exosome, Skin Booster, Plant-derived, Growth ..."
2,Oil-Free Ultra-Moisturizing Lotion with Birch Sap,The Oil-Free Ultra-Moisturizing Lotion with Bi...,Lotion,"[Oil-Free, Ultra-Moisturizing, lightweight, da..."
3,CHARNE - Shiny Brightening Cream,The CHARNE Shiny Brightening Cream is a dual-f...,cosmetic cream,"[brightening, skin whitening, wrinkle improvem..."
4,DEAR OHNEUL Singreen Donkey Toner Pad,The DEAR OHNEUL Singreen Donkey Toner Pad is a...,Toner Pad,"[hypoallergenic, skincare, sensitive skin, dua..."


# BL 데이터 이름 처리 프롬프트문

----------------------------

+ 업무 배경 설명 (수입자, 공급자, 상품 설명이 존재함을 인식)
+ HSCODE 4자리 설명을 추가함 (6자리보다 더 광범위한 개념 인식)
+ 공급자와 상품 이름 정보에 기반한 제품 설명 생성
+ 인터넷 검색으로 추가 정보 검색 지시
+ 의미가 확실치 않은 경우 생성 제한
+ 지시 내용별 예시 제공 (one-shot)

In [20]:
# Prompt that asks the model to explain a BL product line from its description and supplier,
# then list keywords, the product type and the cleaned product name.
# Template variables: {prdt_desc}, {suply_name}; {keyword_num} is bound below.
# The misspelled field label "SUPPLEYER" in the text sent to the model is corrected to "SUPPLIER".
generate_prompt_bldata = PromptTemplate.from_template(
    """You are given data where IMP_CO_NAME imports products (PRDT_DC_VAL) from SUPLY_CO_NAME.
    All products fall under HS Code 330499, which is a subcategory of 3304.
    The description for 3304 is as follows:
    33.04 - Beauty or make-up preparations and preparations for the care of the skin (other than medicaments), including sunscreen or sun tan preparations; manicure or pedicure preparations.

    Using this information, complete the following tasks:
    Provide a simple explanation for PRDT_DC_VAL using both SUPLY_CO_NAME and PRDT_DC_VAL.
    Use publicly available product descriptions from websites to help.
    If the meaning is unclear, do not force an explanation.

    Example:
    PRDT_DC_VAL: COSMELAN 2 (FACE CREAM)
    SUPLY_CO_NAME: MESOESTETIC, S.L.
    Output Example: This product is called COSMELAN 2, a face cream designed to reduce pigmentation and improve skin tone. It is often used for treating melasma and other skin discolorations by inhibiting melanin production.
    Extract up to {keyword_num} key keywords that describe the product's main features.
    Output Example: pigmentation, skin tone, melasma, melanin, face cream
    Identify the type of product.
    Output Example: face cream
    Extract only the product name from PRDT_DC_VAL, excluding any volume, symbols, special characters, or colors.
    Output Example: COSMELAN 2

    PRODUCT DESCRIPTION(PRDT_DC_VAL) : {prdt_desc}
    SUPPLIER NAME(SUPLY_CO_NAME) : {suply_name}
    """
)

# Bind the keyword budget from the module-level `keyword_num`.
generate_prompt_bldata= generate_prompt_bldata.partial(keyword_num=keyword_num)

# prompt -> chat model; .invoke() returns a message whose .content is free text.
generate_chain_bldata = generate_prompt_bldata | chat_openai

In [21]:
# Generate an LLM explanation for every BL row, keyed by the raw product description.
# Rows sharing the same PRDT_DC_VAL overwrite each other (the last one wins).
bldata_preprocessed_dict_openai={}

# iterrows() is a generator with no length, so pass total= explicitly; otherwise tqdm
# can only print "Nit" with no percentage or ETA for this long-running loop.
for index, row in tqdm(bl_data_sampled.iterrows(), total=len(bl_data_sampled)):
  response_bldata = generate_chain_bldata.invoke({"prdt_desc": row['PRDT_DC_VAL'], "suply_name":row['SUPLY_CO_NAME']})
  # Print a sample whenever the row's index label is a multiple of 100.
  if index%100==0:
    print(f"\nProduct Name : {row['PRDT_DC_VAL']} \n")
    print(response_bldata.content, '\n')
  # Stored as a one-element list, mirroring the buyKOREA dictionary layout.
  bldata_preprocessed_dict_openai[row['PRDT_DC_VAL']]=[response_bldata.content]

1it [00:02,  2.74s/it]


Product Name : BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM) 

This product is called BODYSHOCK LOCAL REDUCER, a face cream designed to target localized fat and improve the appearance of the skin. It is often used in body contouring treatments to help reduce the appearance of cellulite and promote smoother skin texture.

**Keywords:** localized fat, body contouring, cellulite, skin texture, face cream, reduction, smoothness, treatment, appearance, improvement

**Type of product:** face cream

**Product name:** BODYSHOCK LOCAL REDUCER 



101it [03:23,  1.92s/it]


Product Name : AVEENO BABY DERMEXA MOISTURING CREAM 206G(1 PAC X 12 PCS) 

This product is called AVEENO BABY DERMEXA MOISTURIZING CREAM, a moisturizing cream specifically formulated for babies to help relieve dry, itchy skin. It is designed to provide long-lasting hydration and is often used to soothe and protect sensitive skin, making it ideal for babies with eczema or other skin conditions.

**Keywords:** moisturizing, cream, baby, dry skin, itchy skin, hydration, soothe, protect, eczema, sensitive skin

**Type of product:** moisturizing cream

**Product name:** AVEENO BABY DERMEXA MOISTURIZING CREAM 



201it [07:21,  2.26s/it]


Product Name : TYA BRAND-5035 MAKEUP KIT-15g 

This product is called TYA BRAND-5035, a makeup kit that includes a variety of cosmetic items designed for enhancing beauty. The kit typically contains essential makeup products such as foundation, eyeshadow, blush, and lip color, allowing users to create different looks for various occasions.

**Key Keywords:** makeup kit, cosmetics, foundation, eyeshadow, blush, lip color, beauty, enhance, variety, occasions

**Type of Product:** makeup kit

**Product Name:** TYA BRAND-5035 



301it [10:52,  3.02s/it]


Product Name : FACIAL MASK (UNBRANDED COSMETIC BEAUTY PRODUCTS) 

This product is called a Facial Mask, which is a cosmetic treatment designed to improve the appearance and health of the skin. Facial masks can hydrate, cleanse, exfoliate, or provide other benefits depending on their formulation. They are typically applied to the face for a specific period and then removed, leaving the skin feeling refreshed and rejuvenated.

**Keywords:** facial mask, cosmetic treatment, hydrate, cleanse, exfoliate, skin health, rejuvenate, beauty, skincare, unbranded

**Type of product:** facial mask

**Product name:** FACIAL MASK 



401it [14:30,  1.80s/it]


Product Name : BELLO YUGARD 30G (.15% RETINOL CREAM- 30 GRAM TUBES) 

This product is called BELLO YUGARD, a retinol cream designed to improve skin texture and reduce the appearance of fine lines and wrinkles. Retinol is a derivative of vitamin A that promotes cell turnover and enhances skin radiance, making it a popular choice for anti-aging skincare routines.

**Keywords:** retinol, cream, skin texture, fine lines, wrinkles, vitamin A, anti-aging, radiance, skincare, beauty

**Type of product:** retinol cream

**Product name:** BELLO YUGARD 



501it [18:07,  1.99s/it]


Product Name : POLYAMIDE-8 - 80142 (COSMETIC RAW MATERIAL SAMPLE FOR TESTING PURPOSE ONLY) 

This product is called POLYAMIDE-8, which is a cosmetic raw material used primarily for testing purposes. It is often utilized in the formulation of various beauty and skincare products due to its properties that enhance texture and stability. As a raw material, it is not intended for direct consumer use but serves as an essential component in the development of cosmetics.

**Keywords:** cosmetic, raw material, testing, formulation, texture, stability, beauty, skincare, properties, component

**Type of product:** cosmetic raw material

**Product name:** POLYAMIDE-8 



601it [21:41,  1.78s/it]


Product Name : MASKARA (UNBRANDED) 

This product is called MASKARA, which is a type of cosmetic used to enhance the eyelashes by making them appear longer, thicker, and darker. It is typically applied with a brush and is a staple in many makeup routines for achieving a more defined and dramatic eye look.

**Key Keywords:** mascara, enhance, eyelashes, longer, thicker, darker, cosmetic, makeup, defined, dramatic

**Type of Product:** mascara

**Product Name:** MASKARA 



701it [25:00,  1.84s/it]


Product Name : FE ALOVERA JUICE 100ML (RETURN TO ORIGIN) (EXP.VIDE S/BILL NO.9172066 & DT-24/12/19) 

This product is called FE ALOVERA JUICE, a natural juice made from the leaves of the aloe vera plant. It is known for its hydrating and soothing properties, often used to promote skin health and digestion. Aloe vera juice is popular for its potential benefits in improving skin hydration, aiding digestion, and providing essential nutrients.

**Keywords:** aloe vera, juice, hydration, skin health, digestion, natural, nutrients, soothing, wellness, organic

**Type of product:** aloe vera juice

**Product name:** FE ALOVERA JUICE 



801it [28:03,  1.70s/it]


Product Name : CC CREAM (UNBRANDED COSMETIC PRODUCTS) RC NO.COS-471/COS-685/COS-686 5 

This product is called CC CREAM, an unbranded cosmetic product designed to provide coverage and improve the appearance of the skin. CC creams, or color-correcting creams, typically offer a lightweight formula that helps to even out skin tone, reduce redness, and provide hydration while often containing SPF for sun protection.

**Keywords:** coverage, skin tone, redness, hydration, SPF, lightweight, cosmetic, unbranded, color-correcting, cream

**Type of product:** CC cream

**Product name:** CC CREAM 



901it [31:30,  2.00s/it]


Product Name : 2.14756 YURAQ (SAMPLE-NCV-R&D PURPOSE) (1 X 250 G) 

This product is called YURAQ, which is a sample intended for research and development purposes. It is likely a formulation related to beauty or skincare, given its classification under HS Code 330499. However, specific details about its intended use or benefits are not provided in the description.

**Keywords:** sample, research, development, formulation, beauty, skincare, NCV, purpose, 250g

**Type of product:** sample formulation

**Product name:** YURAQ 



1001it [34:35,  1.91s/it]


Product Name : MIRACLE MASK DAMAGE REPAIR 200 ML (20 PCS) 

This product is called MIRACLE MASK DAMAGE REPAIR, a hair treatment designed to restore and repair damaged hair. It typically contains nourishing ingredients that help to strengthen hair, reduce breakage, and improve overall hair health. This product is often used by individuals looking to revitalize their hair after exposure to heat styling, chemical treatments, or environmental stressors.

**Keywords:** damage repair, hair treatment, nourishing ingredients, strengthen, reduce breakage, improve health, revitalizing, heat styling, chemical treatments, environmental stressors

**Type of product:** hair treatment

**Product name:** MIRACLE MASK DAMAGE REPAIR 



1101it [37:48,  2.05s/it]


Product Name : 10001Rose (Rosa Damascena) Water l Organic BG-BIO-07 

This product is called Rose Water, specifically derived from Rosa Damascena. It is an organic floral water known for its soothing and hydrating properties, often used in skincare routines to refresh the skin, balance pH levels, and provide a natural fragrance. Rose water is also popular for its anti-inflammatory benefits, making it suitable for sensitive skin.

**Keywords:** rose water, Rosa Damascena, organic, soothing, hydrating, skincare, refresh, pH balance, fragrance, anti-inflammatory

**Type of product:** rose water

**Product name:** Rose Water 



1201it [40:58,  2.12s/it]


Product Name : ILUMA INTENSE BLEACHING SERUM 1OZ(COSMATIC FOR SKIN CARE) 

This product is called ILUMA INTENSE BLEACHING SERUM, a cosmetic serum designed for skin care that aims to lighten and brighten the skin. It is often used to reduce the appearance of dark spots, hyperpigmentation, and uneven skin tone, providing a more radiant complexion.

**Keywords:** bleaching, serum, skin care, dark spots, hyperpigmentation, uneven skin tone, radiant, cosmetic, brightening, intense

**Type of product:** serum

**Product name:** ILUMA INTENSE BLEACHING SERUM 



1301it [44:31,  2.25s/it]


Product Name : Cosmetics-Essence soft touch mousse make-up 04 matt ivory 

This product is called Cosmetics-Essence Soft Touch Mousse Make-up 04 Matt Ivory, a lightweight mousse foundation that provides a soft, matte finish for a natural look. It is designed to even out skin tone while offering a comfortable wear throughout the day. The mousse texture allows for easy application and blending, making it suitable for various skin types.

**Keywords:** lightweight, mousse, foundation, matte finish, natural look, even skin tone, comfortable wear, easy application, blending, skin types

**Type of product:** foundation

**Product name:** Cosmetics-Essence Soft Touch Mousse Make-up 04 



1401it [48:18,  1.93s/it]


Product Name : SERUM - 909248 6 3 JARS/16 SACHET (COSMETIC RAW MATERIAL FOR TESTING PURPOSE ONLY) 

This product is referred to as "SERUM - 909248," which is a cosmetic raw material intended for testing purposes only. It is supplied by M/S. L'OREAL CHINA, a well-known company in the beauty and cosmetics industry. This serum may be used in various formulations to evaluate its effectiveness and compatibility with other cosmetic ingredients.

**Key Keywords:** serum, cosmetic, raw material, testing, formulation, L'OREAL, beauty, skincare, ingredients, evaluation

**Type of Product:** serum

**Product Name:** SERUM 909248 



1501it [52:00,  1.95s/it]


Product Name : CMC SKETCH EYELINER 0.6ML(COSMETIC PRODUCT)(RC/COS-001893) 

This product is called CMC SKETCH EYELINER, a cosmetic product designed for precise application to enhance the eyes. It typically features a fine tip for creating both thin and thick lines, allowing for versatile makeup looks. Eyeliners like this one are often used to define the eyes, making them appear larger and more expressive.

**Keywords:** eyeliner, cosmetic, precise application, fine tip, makeup, define eyes, versatile, enhance, beauty, charm

**Type of product:** eyeliner

**Product name:** CMC SKETCH EYELINER 



1601it [55:24,  2.01s/it]


Product Name : ST. IVES AVOCADO&COCONUT OIL CRM ( PACKING 4 X3X45G) (MATERIAL CODE:67775077) (CDSCO REGN: RC/COS-001649) 1 

This product is called ST. IVES AVOCADO & COCONUT OIL CRM, a cream that combines the nourishing properties of avocado and coconut oil to hydrate and care for the skin. It is designed to provide moisture and improve skin texture, making it suitable for daily use to maintain healthy skin.

**Keywords:** avocado, coconut oil, cream, hydration, moisture, skin care, texture, daily use, nourishing, healthy skin

**Type of product:** cream

**Product name:** ST. IVES AVOCADO & COCONUT OIL CRM 



1701it [58:48,  2.66s/it]


Product Name : FACE MASK 

This product is called FACE MASK, which is a skincare treatment designed to hydrate, nourish, or cleanse the skin. Face masks can come in various forms, such as sheet masks, clay masks, or cream masks, and are typically used to enhance the overall appearance and health of the skin.

**Keywords:** hydration, nourishment, cleanse, skincare, treatment, sheet mask, clay mask, cream mask, appearance, health

**Type of product:** face mask

**Product name:** FACE MASK 



1801it [1:02:16,  1.72s/it]


Product Name : LIGHT WONDER - 7 MEDIUM (EU) (FFDNX40X7R45) (COSMETICS) 40 ml 

This product is called LIGHT WONDER, a medium coverage foundation designed to provide a natural, radiant finish while enhancing the skin's appearance. It is formulated to give a lightweight feel and is suitable for daily wear, helping to even out skin tone and provide a healthy glow.

**Key Keywords:** lightweight, medium coverage, radiant finish, skin enhancement, natural look, daily wear, even skin tone, glow, foundation, cosmetics

**Type of Product:** foundation

**Product Name:** LIGHT WONDER 



1901it [1:05:55,  1.90s/it]


Product Name : CBRN50006 - VISBLWHITE MOIST CREAM 1.7 OZ/50ML [PERFUMERY / COSMETICS PRODUCTS] (BRAND: ELIZABETH ARDEN) 

This product is called VISBLWHITE MOIST CREAM, a moisturizing cream designed to enhance skin brightness and provide hydration. It is part of the Elizabeth Arden brand, known for its high-quality skincare and cosmetic products. This cream aims to improve skin texture and radiance, making it suitable for daily use in skincare routines.

**Keywords:** moisturizing, skin brightness, hydration, skincare, daily use, texture, radiance, cream, Elizabeth Arden, cosmetics

**Type of product:** moisturizing cream

**Product name:** VISBLWHITE MOIST CREAM 



2001it [1:09:26,  2.03s/it]


Product Name : RV MICRO ESS FRE WTR 130ML AS (RV MICRO ESS FRE WTR 130ML AS) (130 GM EACH) 

This product is called RV MICRO ESS FRE WTR, a refreshing water essence designed to hydrate and revitalize the skin. It is often used as a part of a skincare routine to provide moisture and enhance the skin's overall appearance, making it feel fresh and rejuvenated.

**Key Keywords:** hydration, revitalizing, refreshing, essence, skincare, moisture, rejuvenated, skin, beauty, care

**Type of Product:** refreshing water essence

**Product Name:** RV MICRO ESS FRE WTR 



2101it [1:13:03,  2.10s/it]


Product Name : COSMETICS DR.JART ALL THANKS TO CERA SET-SEPHORA-EU-BZ SET0360O1 

This product is called "COSMETICS DR.JART ALL THANKS TO CERA SET," which is a skincare set that includes various products designed to hydrate and nourish the skin. Dr. Jart is known for its innovative formulations, and this set likely features products that utilize ceramides to strengthen the skin barrier and improve moisture retention.

**Key Keywords:** skincare, hydration, ceramides, nourish, barrier, moisture, set, Dr. Jart, cosmetics, beauty

**Type of Product:** skincare set

**Product Name:** COSMETICS DR.JART ALL THANKS TO CERA SET 



2201it [1:16:16,  1.66s/it]


Product Name : COSMETICS -LHS BASILLIME 500ML 

This product is called BASILLIME, a cosmetic product designed for skin care. It typically contains ingredients that help to hydrate and nourish the skin, making it suitable for various skin types. The 500ml size indicates it is a larger bottle, likely intended for regular use or for professional settings.

**Keywords:** cosmetics, skin care, hydrate, nourish, 500ml, beauty, moisturizing, treatment, professional, daily use

**Type of product:** cosmetic

**Product name:** BASILLIME 



2301it [1:19:25,  1.69s/it]


Product Name : COSMETICS- COLLAGEN DRY SKIN MASK 250ML 

This product is called COLLAGEN DRY SKIN MASK, a cosmetic treatment designed to hydrate and rejuvenate dry skin. It typically contains collagen, which helps to improve skin elasticity and moisture levels, making it ideal for individuals with dry or dehydrated skin.

**Keywords:** collagen, dry skin, hydration, rejuvenation, elasticity, moisture, cosmetic, treatment, mask, skincare

**Type of product:** mask

**Product name:** COLLAGEN DRY SKIN MASK 



2401it [1:22:23,  1.48s/it]


Product Name : CHINESE BRAND NYN MAKEUP KIT 80114-37.4 GM 

This product is called the CHINESE BRAND NYN MAKEUP KIT, which includes a variety of makeup items designed for enhancing beauty. The kit typically contains essential makeup products such as foundation, eyeshadow, blush, and lip color, making it a versatile choice for both everyday use and special occasions.

**Keywords:** makeup kit, beauty, foundation, eyeshadow, blush, lip color, versatile, everyday use, special occasions, cosmetics

**Type of product:** makeup kit

**Product name:** CHINESE BRAND NYN MAKEUP KIT 



2501it [1:25:26,  1.78s/it]


Product Name : 7771SY1210 EIGHT HOUR LIP PROTECTANT TIN (COSMETICS) 

This product is called EIGHT HOUR LIP PROTECTANT TIN, a cosmetic designed to provide long-lasting hydration and protection for the lips. It is known for its soothing properties and is often used to relieve dry, chapped lips while offering a barrier against environmental elements.

**Keywords:** hydration, protection, lips, soothing, dry, chapped, barrier, cosmetics, long-lasting, moisture

**Type of product:** lip protectant

**Product name:** EIGHT HOUR LIP PROTECTANT TIN 



2601it [1:28:37,  2.05s/it]


Product Name : DARK CIRCLE CREAM (BULK PACKING) (K.I.C.E.P.A CERTIFICATE NO. K001-20-0064786 DT.22.01.2020 

This product is called DARK CIRCLE CREAM, which is designed to reduce the appearance of dark circles under the eyes. It typically contains ingredients that hydrate the skin, improve circulation, and brighten the under-eye area, helping to diminish the signs of fatigue and aging.

**Keywords:** dark circles, hydration, brightening, under-eye, fatigue, aging, cream, skin care, circulation, bulk packing

**Type of product:** eye cream

**Product name:** DARK CIRCLE CREAM 



2701it [1:31:46,  1.80s/it]


Product Name : COSMETIC PRODUCTS: SKIN CARE PRODUCT ( SAKURA WHITE BOOSTER ) #ZZ936210202 ( N.C.V. SAMPLE FOR R & D PURPOSE ONLY) 

This product is called SAKURA WHITE BOOSTER, a skin care product designed to enhance skin brightness and promote a more radiant complexion. It is often formulated with ingredients that help to improve skin tone and texture, providing a boost of hydration and nourishment.

**Keywords:** skin care, brightness, radiant complexion, improve skin tone, hydration, nourishment, booster, Sakura, cosmetic products, skincare product

**Type of product:** skin care product

**Product name:** SAKURA WHITE BOOSTER 



2801it [1:34:46,  1.68s/it]


Product Name : SWISS IMAGE BRAND - SI EN SOOTH.CLEAN.MILK 200ML 

This product is called SWISS IMAGE BRAND - SI EN SOOTH.CLEAN.MILK, a soothing cleansing milk designed to gently remove makeup and impurities from the skin while maintaining its natural moisture balance. It is often used as part of a skincare routine to cleanse and prepare the skin for further treatment.

**Keywords:** soothing, cleansing, milk, makeup removal, impurities, moisture balance, skincare, gentle, face, routine

**Type of product:** cleansing milk

**Product name:** SWISS IMAGE BRAND - SI EN SOOTH.CLEAN.MILK 



2901it [1:37:38,  2.24s/it]


Product Name : FACE CARE PRODUCTS OTHER THAN FACE MASK - NS00903 NASHI ARGAN - THE BALM - MOISTURIZING AFTER SHAVE, 100 ML 

This product is called NASHI ARGAN - THE BALM, a moisturizing after shave designed to hydrate and soothe the skin after shaving. It is formulated with argan oil, known for its nourishing properties, making it ideal for post-shave care to prevent irritation and dryness.

**Keywords:** moisturizing, after shave, argan oil, hydrate, soothe, skin care, balm, post-shave, irritation, dryness

**Type of product:** after shave balm

**Product name:** NASHI ARGAN - THE BALM 



2932it [1:39:05,  2.03s/it]


In [22]:
# Serialize the dict of generated product descriptions to disk with joblib so the
# long-running generation loop does not have to be repeated.
joblib.dump(bldata_preprocessed_dict_openai, 'bldata_preprocessed_dict_openai_10.pkl')

['bldata_preprocessed_dict_openai_10.pkl']

In [23]:
# Reload the saved dict into a separate variable that the extraction step below extends.
# NOTE: joblib.load unpickles the file; only load files produced by this notebook.
bldata_preprocessed_dict_openai_ext=joblib.load('bldata_preprocessed_dict_openai_10.pkl')

In [24]:
# Show the formatting instructions (including the JSON schema) produced by the output parser.
parser.get_format_instructions()

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"type": {"description": "product type", "title": "Type", "type": "string"}, "keyword": {"description": "10 keywords which focus on product\'s strong point and purpose", "items": {"type": "string"}, "title": "Keyword", "type": "array"}}, "required": ["type", "keyword"]}\n```'

In [25]:
# Prompt asking the model for a product type and keywords for one product description.
# The instruction / INPUT / FORMAT section must appear exactly once: the previous
# template contained the whole section twice (the first "FORMAT :" was followed by the
# instructions again), which sent the input twice and garbled the format specification.
# NOTE(review): the FORMAT example lists five keywords while the parser schema printed
# above describes 10 keywords - confirm which count is intended for this run.
extract_prompt_bldata = PromptTemplate.from_template(
    """Extract product type and keywords from input, based on following rules:
    - Strictly exclude any information about capacity or volume.

    INPUT : {input}
    FORMAT : {{ "type": "<product type>", "keyword": ["<keyword1>", "<keyword2>", "<keyword3>", "<keyword4>", "<keyword5>"] }}
    """
)

# extract_prompt_bldata= extract_prompt_bldata.partial(format=parser.get_format_instructions())

# prompt -> chat model -> structured-output parser
extract_chain_bldata = extract_prompt_bldata | chat_openai | parser

In [26]:
# Add the LLM-extracted product type and keyword list to every product entry.
# Each entry is rebuilt as [description, type, keyword] instead of being appended to,
# so re-running this cell cannot grow the lists beyond the three columns the DataFrame
# construction below expects. Replacing the value of an existing key while iterating
# is safe because the set of keys does not change.
# NOTE(review): assumes value[0] is the generated product description - confirm.
for i, (key, value) in tqdm(enumerate(bldata_preprocessed_dict_openai_ext.items()),
                            total=len(bldata_preprocessed_dict_openai_ext)):
  response_bldata = extract_chain_bldata.invoke({"input": value[0]})
  # Print every 100th response as a spot check.
  if i%100==0:
    print(response_bldata)
  bldata_preprocessed_dict_openai_ext[key] = [value[0], response_bldata.type, response_bldata.keyword]

1it [00:01,  1.26s/it]

type='face cream' keyword=['localized fat', 'body contouring', 'cellulite', 'skin texture', 'reduction', 'smoothness', 'treatment', 'appearance', 'improvement']


101it [02:25,  1.11s/it]

type='highlighter palette' keyword=['highlighter', 'palette', 'luminous', 'glow', 'complexion', 'shades', 'enhance', 'cheekbones', 'brow bones', 'sculpted']


201it [04:35,  1.20s/it]

type='after waxing lotion' keyword=['menthol', 'after waxing', 'soothing', 'cooling', 'skin care', 'hydration', 'irritation relief', 'cosmetics', 'lotion', 'RICA']


301it [06:45,  2.23s/it]

type='makeup palette' keyword=['makeup palette', 'versatile', 'colors', 'eyes', 'cheeks', 'lips', 'HD', 'professional', 'beauty', 'cosmetics']


401it [08:45,  1.34s/it]

type='foaming cleanser' keyword=['cleanser', 'foaming', 'Manuka honey', 'hydration', 'glow', 'impurities', 'makeup removal', 'gentle', 'nourishing', 'daily use']


501it [10:50,  1.23s/it]

type='cream' keyword=['matte finish', 'shine control', 'oiliness', 'smooth', 'flawless', 'cosmetic', 'pores', 'makeup', 'NFC']


601it [12:58,  1.23s/it]

type='body lotion' keyword=['glutathione', 'vitamin C', 'hydration', 'brightening', 'skin tone', 'radiant', 'moisturizing', 'even complexion', 'skin benefits']


701it [14:59,  1.20s/it]

type='hydrating emulsion gel' keyword=['hydrating', 'emulsion', 'sensitive skin', 'comfort', 'soothe', 'protect', 'instant', 'gel', 'skincare', 'Biologique Recherche']


801it [17:01,  1.02it/s]

type='hair sheen spray' keyword=['sheen', 'moisture', 'braids', 'shine', 'nourishment', 'hair care', 'vibrant', 'healthy', 'cosmetic', 'styling']


901it [19:05,  1.36s/it]

type='skincare product' keyword=['Coenzyme Q10', 'antioxidant', 'nourish', 'rejuvenate', 'skin elasticity', 'hydration', 'skincare', 'beauty', 'care pack', 'skin protection']


1001it [21:13,  1.14s/it]

type='highlighter and contour' keyword=['highlighter', 'contour', 'facial features', 'dimension', 'sculpted', 'cheekbones', 'jawline', 'cosmetic', 'beauty', 'makeup']


1101it [23:11,  1.12s/it]

type='massage oil' keyword=['baby', 'hydration', 'nourishment', 'gentle', 'lightweight', 'daily use', 'soft skin', 'bonding', 'delicate skin']


1201it [25:19,  1.30s/it]

type='facial oil' keyword=['facial oil', 'capsules', 'nourish', 'hydrate', 'skin', 'essential oils', 'nutrients', 'healthy', 'radiant', 'complexion']


1301it [27:18,  1.14s/it]

type='foundation' keyword=['oil control', 'SPF 20', 'smooth complexion', 'even skin tone', 'daily use', 'sun protection', 'skin appearance', 'magic colour']


1401it [29:18,  1.01it/s]

type='BB cream' keyword=['beautifying', 'moisturizer', 'skin tone', 'hydration', 'lightweight', 'natural glow', 'cosmetic', 'skin care', 'sun protection']


1501it [31:30,  1.51s/it]

type='cleansing astringent' keyword=['whitening', 'cleansing', 'astringent', 'rose', 'pearl', 'brightening', 'skin', 'refreshing', 'facial care', 'daily use']


1601it [33:31,  1.55s/it]

type='shampoo' keyword=['hair care', 'cleanse', 'scalp', 'dirt', 'oil', 'product buildup', 'fresh', 'clean', 'sample', 'testing']


1701it [35:55,  1.25s/it]

type='raw material for cosmetics' keyword=['acne', 'raw material', 'cosmetics', 'lesions', 'breakouts', 'skincare', 'formulation', 'active ingredients', 'treatment', 'prevention']


1801it [37:59,  1.16s/it]

type='makeup remover' keyword=['makeup remover', 'cleanse', 'impurities', 'skincare', 'skin', 'cosmetic', 'effective', 'preparation', 'gentle', 'removal']


1901it [40:03,  1.19s/it]

type='cosmetics' keyword=['firming', 'smoothing', 'skin texture', 'hydration', 'youthful', 'beauty routine', 'appearance', 'formulation', 'effect']


2001it [42:13,  1.18s/it]

type='eye liner' keyword=['eye liner', 'enhance', 'definition', 'color', 'makeup', 'shades', 'subtle', 'dramatic', 'cosmetics', 'PFI6068CC']


2101it [44:15,  1.03it/s]

type='lipstick' keyword=['matte', 'long-lasting', 'color', 'smooth application', 'comfortable', 'cosmetics', 'beauty', 'enhance', 'Athena']


2201it [46:17,  1.11s/it]

type='exfoliator' keyword=['exfoliator', 'skincare', 'dead skin cells', 'skin renewal', 'texture', 'radiance', 'clogged pores', 'clearer skin', 'cosmetics']


2301it [48:11,  1.14s/it]

type='perfume wax' keyword=['perfume', 'wax', 'fragrance', 'cosmetic', 'sample', 'testing', 'scent', 'formulation', 'personal care', 'candles']


2401it [50:20,  1.20s/it]

type='cosmetic' keyword=['hydration', 'moisture', 'nourishment', 'skin texture', 'healthy appearance']


2501it [52:32,  1.75s/it]

type='concealer' keyword=['concealer', 'full coverage', 'blemishes', 'dark circles', 'imperfections', 'creamy texture', 'blends seamlessly', 'long-lasting', 'makeup', 'HUDA BEAUTY']


2601it [54:42,  1.31s/it]

type='cosmetics raw materials' keyword=['cosmetics', 'raw materials', 'emulsifiers', 'preservatives', 'formulation', 'beauty', 'skincare', 'texture', 'stability', 'effectiveness']


2701it [56:37,  1.03it/s]

type='cosmetic' keyword=['thanakha', 'natural', 'skin care', 'cooling', 'beauty', 'sun protection', 'glow', 'oiliness', 'texture']


2801it [58:32,  1.25s/it]

type='moisturizer' keyword=['organic', 'raw', 'shea butter', 'vitamins', 'fatty acids', 'hydrating', 'nourishing', 'dry skin', 'elasticity']





In [31]:
# Turn the {product name: [description, type, keyword]} dict into a table with the
# product name as a regular column, then save it as CSV.
bldata_preprocessed_openai = (
    pd.DataFrame.from_dict(
        bldata_preprocessed_dict_openai_ext,
        orient='index',
        columns=['product_description', 'type', 'keyword'],
    )
    .reset_index()
    .rename(columns={'index': 'product_name'})
)
bldata_preprocessed_openai.to_csv('bldata_preprocessed_openai_10.csv', index=False)

display(bldata_preprocessed_openai.head())

Unnamed: 0,product_name,product_description,type,keyword
0,BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM),This product is called BODYSHOCK LOCAL REDUCER...,face cream,"[localized fat, body contouring, cellulite, sk..."
1,COSMELAN 2 (FACE CREAM),"This product is called COSMELAN 2, a face crea...",face cream,"[pigmentation, skin tone, melasma, melanin, fa..."
2,ANTI STRESS FACE MASK 100ML (FACE CREAM),"This product is called ANTI STRESS FACE MASK, ...",face cream,"[hydration, relief, stressed skin, soothing, c..."
3,BLOT POWDER,"This product is called BLOT POWDER, a setting ...",setting powder,"[blotting, powder, shine control, oil absorpti..."
4,DERMACOLOR CAMOUFLAGE FLUID,This product is called DERMACOLOR CAMOUFLAGE F...,camouflage fluid,"[camouflage, high coverage, skin imperfections..."


# 수출한 기업은 다르지만, 수출한 품목의 이름은 같은 경우가 있어 불일치 발생
(중복 제거됨)

In [None]:
# When different companies exported the same item, each shipment is counted as a
# separate export record, so the raw sample has more rows than distinct product names.

bl_data_sampled.shape

(2932, 3)

In [None]:
# Buyers likely to import a product are matched on the product name, so duplicated
# product names are collapsed into a single keyword set and embedding.

len(bldata_preprocessed_dict_openai)

2801

# 임베딩

In [32]:
from langchain_huggingface import HuggingFaceEmbeddings

# Download / load the open-source embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [33]:
def text_embedding(text):
  """Embed `text` with the notebook-level `embeddings` model.

  The vector returned by `embeddings.embed_query` is converted to a NumPy array and
  reshaped to a single row (shape `(1, n)`), the 2-D layout that
  `cosine_similarity` is called with later in the notebook.
  """
  return np.array(embeddings.embed_query(text)).reshape(1, -1)

#  1. 키워드 10개로 이루어진 문장 임베딩

In [45]:
# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Source tables for experiment 1 (10-keyword extraction results)

buykorea_preprocessed_openai = pd.read_csv('./buykorea_preprocessed_openai_10.csv')
bldata_preprocessed_openai = pd.read_csv('./bldata_preprocessed_openai_10.csv')

# Text to embed: the keyword column exactly as stored in the CSV
buykorea_preprocessed_openai['buykorea_preprocessed'] = buykorea_preprocessed_openai['keyword']
bldata_preprocessed_openai['bldata_preprocessed'] = bldata_preprocessed_openai['keyword']

# Embed the keyword text of every row (BuyKorea first, then B/L data)
buykorea_preprocessed_openai['buykorea_embedding'] = (
    buykorea_preprocessed_openai['buykorea_preprocessed'].progress_apply(text_embedding)
)
bldata_preprocessed_openai['bldata_embedding'] = (
    bldata_preprocessed_openai['bldata_preprocessed'].progress_apply(text_embedding)
)

# Number of B/L products each BuyKorea product is compared against
bldata_sample_len = len(bldata_preprocessed_openai)

100%|██████████| 10/10 [00:04<00:00,  2.41it/s]
100%|██████████| 2801/2801 [11:45<00:00,  3.97it/s]


In [46]:
# Cosine similarity between every BuyKorea product and every B/L product (keyword text).
#
# The previous version filled a placeholder frame cell by cell with `.loc`, stored the
# (1, 1) array returned by `cosine_similarity` instead of a scalar, reused `index` for
# both loops, and concatenated onto a growing frame inside the loop. Here the B/L
# embeddings are stacked once, each BuyKorea product is scored against all of them in a
# single call, and the per-product frames are concatenated once at the end.
similarity_columns = ['buykorea_product_name', 'buykorea_keyword', 'bldata_product_name', 'bldata_keyword', 'cosine_similarity']

# NOTE(review): assumes every stored embedding is a (1, dim) row as produced by
# text_embedding, so stacking yields one (n_products, dim) matrix - confirm.
bldata_embedding_matrix = np.vstack(bldata_preprocessed_openai['bldata_embedding'].to_list())
bldata_product_names = bldata_preprocessed_openai['product_name'].to_numpy()
bldata_texts = bldata_preprocessed_openai['bldata_preprocessed'].to_numpy()

similarity_frames = []
for _, row in tqdm(buykorea_preprocessed_openai.iterrows(), total=len(buykorea_preprocessed_openai)):
  # (1, dim) x (n, dim) -> (1, n); keep the single row of scores as a 1-D float array
  similarities = cosine_similarity(row['buykorea_embedding'], bldata_embedding_matrix)[0]

  # One frame per BuyKorea product; scalar values are broadcast over all B/L rows
  similarity_frames.append(pd.DataFrame({'buykorea_product_name': row['product_name'],
                                         'buykorea_keyword': row['buykorea_preprocessed'],
                                         'bldata_product_name': bldata_product_names,
                                         'bldata_keyword': bldata_texts,
                                         'cosine_similarity': similarities}))

# Merge the per-product frames; each keeps its own 0..n-1 index, as before
if similarity_frames:
  openai_similarity_df = pd.concat(similarity_frames, axis=0)
else:
  openai_similarity_df = pd.DataFrame(columns=similarity_columns)

display(openai_similarity_df.head())

2801it [00:05, 494.44it/s]
2801it [00:04, 580.81it/s]
2801it [00:04, 582.36it/s]
2801it [00:05, 496.18it/s]
2801it [00:04, 580.32it/s]
2801it [00:05, 507.13it/s]
2801it [00:04, 565.07it/s]
2801it [00:04, 585.69it/s]
2801it [00:05, 468.15it/s]
2801it [00:04, 567.53it/s]


Unnamed: 0,buykorea_product_name,buykorea_keyword,bldata_product_name,bldata_keyword,cosine_similarity
0,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydrating', 'brighten...",BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM),"['localized fat', 'body contouring', 'cellulit...",0.53945
1,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydrating', 'brighten...",COSMELAN 2 (FACE CREAM),"['pigmentation', 'skin tone', 'melasma', 'mela...",0.640061
2,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydrating', 'brighten...",ANTI STRESS FACE MASK 100ML (FACE CREAM),"['hydration', 'relief', 'stressed skin', 'soot...",0.583049
3,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydrating', 'brighten...",BLOT POWDER,"['blotting', 'powder', 'shine control', 'oil a...",0.712573
4,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydrating', 'brighten...",DERMACOLOR CAMOUFLAGE FLUID,"['camouflage', 'high coverage', 'skin imperfec...",0.657734


In [47]:
# Order the candidates of each BuyKorea product by descending cosine similarity.
# sort=False keeps the products in their order of first appearance; the index is
# renumbered from 0 afterwards.
openai_similarity_df_sorted = (
    openai_similarity_df
    .groupby(['buykorea_product_name'], group_keys=False, sort=False)
    .apply(lambda group: group.sort_values('cosine_similarity', ascending=False))
    .reset_index(drop=True)
)

In [48]:
# Label the rows of one buykorea_product_name group by position:
# first 10 rows -> 'Highest', last 10 rows -> 'Lowest', everything in between -> 'N'.
def mark_top_bottom_10(df):
    """Add a positional `test_row` label column to `df` and return `df`.

    The labels depend only on row position, so the caller is expected to pass rows
    already ordered by descending cosine similarity. The frame is modified in place.

    Groups with fewer than 20 rows used to raise a length-mismatch ValueError; now the
    first `min(10, n)` rows are 'Highest' and up to 10 of the remaining rows, taken from
    the end, are 'Lowest'. For 20 or more rows the result is unchanged.
    """
    n_rows = len(df)
    n_highest = min(10, n_rows)
    n_lowest = min(10, n_rows - n_highest)
    n_middle = n_rows - n_highest - n_lowest
    df['test_row'] = ['Highest'] * n_highest + ['N'] * n_middle + ['Lowest'] * n_lowest
    return df

# Label each product's rows group by group, then copy the resulting label column back
# onto the sorted frame (the index is reset so it lines up with the sorted frame).
labelled_groups = (
    openai_similarity_df_sorted
    .groupby('buykorea_product_name', group_keys=False)
    .apply(mark_top_bottom_10)
)
openai_similarity_df_sorted['test_row'] = labelled_groups.reset_index(drop=True)['test_row']


# Inspect the result
display(openai_similarity_df_sorted)

openai_similarity_df_sorted.to_csv('openai_similarity_df_sorted_v1.csv', index=False)

Unnamed: 0,buykorea_product_name,buykorea_keyword,bldata_product_name,bldata_keyword,cosine_similarity,test_row
0,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydrating', 'brighten...",RV MICRO ESS FRE WTR 22ML AS V2 (RV MICRO ESS ...,"['hydration', 'refresh', 'lightweight', 'essen...",0.944300,Highest
1,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydrating', 'brighten...",82265354 OLAY WHITE RADIANCE TONE PERFECTING H...,"['radiance', 'tone', 'hydrating', 'essence', '...",0.928231,Highest
2,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydrating', 'brighten...",JADE BODY CREAM - (1KG) - (PRODUCT CODE - OP29...,"['moisturizing', 'nourishing', 'skin texture',...",0.928179,Highest
3,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydrating', 'brighten...",VITAL SS SUPER SERUM 50ML (VITAL SS SUPER SERU...,"['vitality', 'hydration', 'skin texture', 'rad...",0.924322,Highest
4,Doldori Premium Gold Essence,"['skincare', 'essence', 'hydrating', 'brighten...",SKINCARE CREAM (FOC),"['skincare', 'cream', 'nourish', 'hydrate', 'm...",0.920471,Highest
...,...,...,...,...,...,...
28005,t:aim Barrier Sunblock 40ml,"['high-protection', 'SPF50+', 'PA++++', 'UV pr...",COSMETICS EYEBROW PENCILS,"['eyebrow', 'define', 'enhance', 'shape', 'sha...",0.323613,Lowest
28006,t:aim Barrier Sunblock 40ml,"['high-protection', 'SPF50+', 'PA++++', 'UV pr...",TERAPOL POLISHING PASTE 0.35KG(EQUIPMENT AND M...,"['polishing', 'paste', 'educational', 'surface...",0.311732,Lowest
28007,t:aim Barrier Sunblock 40ml,"['high-protection', 'SPF50+', 'PA++++', 'UV pr...",Other face makeup products - 1944-2DSP - PROFU...,"['eyebrows', 'versatile', 'shades', 'tools', '...",0.311130,Lowest
28008,t:aim Barrier Sunblock 40ml,"['high-protection', 'SPF50+', 'PA++++', 'UV pr...",HINDU RELIGIOUS MATERIALS FOR POOJA PURPOSE-PO...,"['Hindu', 'religious', 'pooja', 'materials', '...",0.306676,Lowest


# 2. 제품 유형만 이용해 유사도 순 나열

In [None]:
# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Source tables for experiment 2 (product type only)

buykorea_preprocessed_openai = pd.read_csv('./buykorea_preprocessed_openai.csv')
bldata_preprocessed_openai = pd.read_csv('./bldata_preprocessed_openai.csv')

# Text to embed: the product type on its own
buykorea_preprocessed_openai['buykorea_preprocessed'] = buykorea_preprocessed_openai['type']
bldata_preprocessed_openai['bldata_preprocessed'] = bldata_preprocessed_openai['type']

# Embed the type text of every row (BuyKorea first, then B/L data)
buykorea_preprocessed_openai['buykorea_embedding'] = (
    buykorea_preprocessed_openai['buykorea_preprocessed'].progress_apply(text_embedding)
)
bldata_preprocessed_openai['bldata_embedding'] = (
    bldata_preprocessed_openai['bldata_preprocessed'].progress_apply(text_embedding)
)

# Number of B/L products each BuyKorea product is compared against
bldata_sample_len = len(bldata_preprocessed_openai)

100%|██████████| 10/10 [00:00<00:00, 10.11it/s]
100%|██████████| 2801/2801 [04:20<00:00, 10.77it/s]


In [None]:
# Cosine similarity between every BuyKorea product and every B/L product (type text).
#
# The previous version filled a placeholder frame cell by cell with `.loc`, stored the
# (1, 1) array returned by `cosine_similarity` instead of a scalar, reused `index` for
# both loops, and concatenated onto a growing frame inside the loop. Here the B/L
# embeddings are stacked once, each BuyKorea product is scored against all of them in a
# single call, and the per-product frames are concatenated once at the end.
similarity_columns = ['buykorea_product_name', 'buykorea_type', 'bldata_product_name', 'bldata_type', 'cosine_similarity']

# NOTE(review): assumes every stored embedding is a (1, dim) row as produced by
# text_embedding, so stacking yields one (n_products, dim) matrix - confirm.
bldata_embedding_matrix = np.vstack(bldata_preprocessed_openai['bldata_embedding'].to_list())
bldata_product_names = bldata_preprocessed_openai['product_name'].to_numpy()
bldata_texts = bldata_preprocessed_openai['bldata_preprocessed'].to_numpy()

similarity_frames = []
for _, row in tqdm(buykorea_preprocessed_openai.iterrows(), total=len(buykorea_preprocessed_openai)):
  # (1, dim) x (n, dim) -> (1, n); keep the single row of scores as a 1-D float array
  similarities = cosine_similarity(row['buykorea_embedding'], bldata_embedding_matrix)[0]

  # One frame per BuyKorea product; scalar values are broadcast over all B/L rows
  similarity_frames.append(pd.DataFrame({'buykorea_product_name': row['product_name'],
                                         'buykorea_type': row['buykorea_preprocessed'],
                                         'bldata_product_name': bldata_product_names,
                                         'bldata_type': bldata_texts,
                                         'cosine_similarity': similarities}))

# Merge the per-product frames; each keeps its own 0..n-1 index, as before
if similarity_frames:
  openai_similarity_df = pd.concat(similarity_frames, axis=0)
else:
  openai_similarity_df = pd.DataFrame(columns=similarity_columns)

display(openai_similarity_df.head())


2801it [00:05, 507.90it/s]
2801it [00:04, 568.80it/s]
2801it [00:04, 600.25it/s]
2801it [00:05, 500.14it/s]
2801it [00:04, 591.45it/s]
2801it [00:04, 565.15it/s]
2801it [00:05, 492.79it/s]
2801it [00:04, 595.83it/s]
2801it [00:05, 496.83it/s]
2801it [00:04, 577.05it/s]


Unnamed: 0,buykorea_product_name,buykorea_type,bldata_product_name,bldata_type,cosine_similarity
0,Doldori Premium Gold Essence,mask pack,BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM),face cream,0.443385
1,Doldori Premium Gold Essence,mask pack,COSMELAN 2 (FACE CREAM),face cream,0.443385
2,Doldori Premium Gold Essence,mask pack,ANTI STRESS FACE MASK 100ML (FACE CREAM),face cream,0.443385
3,Doldori Premium Gold Essence,mask pack,BLOT POWDER,setting powder,0.142408
4,Doldori Premium Gold Essence,mask pack,DERMACOLOR CAMOUFLAGE FLUID,camouflage fluid,0.390623


In [None]:
# Order the candidates of each BuyKorea product by descending cosine similarity.
# sort=False keeps the products in their order of first appearance; the index is
# renumbered from 0 afterwards.
openai_similarity_df_sorted = (
    openai_similarity_df
    .groupby(['buykorea_product_name'], group_keys=False, sort=False)
    .apply(lambda group: group.sort_values('cosine_similarity', ascending=False))
    .reset_index(drop=True)
)

# Label the rows of one buykorea_product_name group by position:
# first 10 rows -> 'Highest', last 10 rows -> 'Lowest', everything in between -> 'N'.
def mark_top_bottom_10(df):
    """Add a positional `test_row` label column to `df` and return `df`.

    The labels depend only on row position, so the caller is expected to pass rows
    already ordered by descending cosine similarity. The frame is modified in place.

    Groups with fewer than 20 rows used to raise a length-mismatch ValueError; now the
    first `min(10, n)` rows are 'Highest' and up to 10 of the remaining rows, taken from
    the end, are 'Lowest'. For 20 or more rows the result is unchanged.
    """
    n_rows = len(df)
    n_highest = min(10, n_rows)
    n_lowest = min(10, n_rows - n_highest)
    n_middle = n_rows - n_highest - n_lowest
    df['test_row'] = ['Highest'] * n_highest + ['N'] * n_middle + ['Lowest'] * n_lowest
    return df

# Label each product's rows group by group, then copy the resulting label column back
# onto the sorted frame (the index is reset so it lines up with the sorted frame).
labelled_groups = (
    openai_similarity_df_sorted
    .groupby('buykorea_product_name', group_keys=False)
    .apply(mark_top_bottom_10)
)
openai_similarity_df_sorted['test_row'] = labelled_groups.reset_index(drop=True)['test_row']


# Inspect the result
display(openai_similarity_df_sorted)

openai_similarity_df_sorted.to_csv('openai_similarity_df_sorted_v2.csv', index=False)

Unnamed: 0,buykorea_product_name,buykorea_type,bldata_product_name,bldata_type,cosine_similarity,test_row
0,Doldori Premium Gold Essence,mask pack,AVOCADO HIGH NUTRITION MASK PACK (BRAND-STAROV...,mask pack,1.000000,Highest
1,Doldori Premium Gold Essence,mask pack,COSMETICS MASK PACK,mask pack,1.000000,Highest
2,Doldori Premium Gold Essence,mask pack,ASSORTD FACE PACK (RETURN TO ORIGIN) (EXPORTED...,face pack,0.785882,Highest
3,Doldori Premium Gold Essence,mask pack,SS0000016020SS APC MASK 13G TUBE,skincare mask,0.764832,Highest
4,Doldori Premium Gold Essence,mask pack,HIBISCUS & ROSE GLOW MASK (RETURN TO ORIGIN) (...,skincare mask,0.764832,Highest
...,...,...,...,...,...,...
28005,t:aim Barrier Sunblock 40ml,sunscreen,HILARY RHODA HIGHLIGHTER & CONTOUR (NT. WT. 12...,highlighter and contour,0.045261,Lowest
28006,t:aim Barrier Sunblock 40ml,sunscreen,PEPPLUS LIFTING PACK (8 PACKS) ( 40 PCS) ( CDS...,lifting pack,0.041898,Lowest
28007,t:aim Barrier Sunblock 40ml,sunscreen,LYON BEAUTY USA - PRIMA VISTA MULTI COLOR BASE...,multi-color base,0.038602,Lowest
28008,t:aim Barrier Sunblock 40ml,sunscreen,2.14725 YURAQ (SAMPLE-NCV-R&D PURPOSE) (1 X 25...,sample for research and development,0.033819,Lowest


# 3. 제품 유형 + 키워드 5개 (총 6개 키워드) 이용해 유사도 순 나열

In [None]:
# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Source tables for experiment 3 (product type + keywords)

buykorea_preprocessed_openai = pd.read_csv('./buykorea_preprocessed_openai.csv')
bldata_preprocessed_openai = pd.read_csv('./bldata_preprocessed_openai.csv')

# Text to embed: "<type>,<keyword text>". Only the surrounding square brackets of the
# stored keyword string are removed; the text between them is kept exactly as it is
# (splitting on ',' and re-joining with ',' leaves a string unchanged).
buykorea_preprocessed_openai['buykorea_preprocessed'] = (
    buykorea_preprocessed_openai['type']
    + ","
    + buykorea_preprocessed_openai['keyword'].apply(lambda keywords: keywords.strip('[]'))
)
bldata_preprocessed_openai['bldata_preprocessed'] = (
    bldata_preprocessed_openai['type']
    + ","
    + bldata_preprocessed_openai['keyword'].apply(lambda keywords: keywords.strip('[]'))
)

# Embed the combined text of every row (BuyKorea first, then B/L data)
buykorea_preprocessed_openai['buykorea_embedding'] = (
    buykorea_preprocessed_openai['buykorea_preprocessed'].progress_apply(text_embedding)
)
bldata_preprocessed_openai['bldata_embedding'] = (
    bldata_preprocessed_openai['bldata_preprocessed'].progress_apply(text_embedding)
)

# Number of B/L products each BuyKorea product is compared against
bldata_sample_len = len(bldata_preprocessed_openai)

100%|██████████| 10/10 [00:01<00:00,  5.50it/s]
100%|██████████| 2801/2801 [08:44<00:00,  5.34it/s]


In [None]:
# Cosine similarity between every BuyKorea product and every B/L product
# (type + keyword text).
#
# The previous version filled a placeholder frame cell by cell with `.loc`, stored the
# (1, 1) array returned by `cosine_similarity` instead of a scalar, reused `index` for
# both loops, and concatenated onto a growing frame inside the loop. Here the B/L
# embeddings are stacked once, each BuyKorea product is scored against all of them in a
# single call, and the per-product frames are concatenated once at the end.
similarity_columns = ['buykorea_product_name', 'buykorea_keyword', 'bldata_product_name', 'bldata_keyword', 'cosine_similarity']

# NOTE(review): assumes every stored embedding is a (1, dim) row as produced by
# text_embedding, so stacking yields one (n_products, dim) matrix - confirm.
bldata_embedding_matrix = np.vstack(bldata_preprocessed_openai['bldata_embedding'].to_list())
bldata_product_names = bldata_preprocessed_openai['product_name'].to_numpy()
bldata_texts = bldata_preprocessed_openai['bldata_preprocessed'].to_numpy()

similarity_frames = []
for _, row in tqdm(buykorea_preprocessed_openai.iterrows(), total=len(buykorea_preprocessed_openai)):
  # (1, dim) x (n, dim) -> (1, n); keep the single row of scores as a 1-D float array
  similarities = cosine_similarity(row['buykorea_embedding'], bldata_embedding_matrix)[0]

  # One frame per BuyKorea product; scalar values are broadcast over all B/L rows
  similarity_frames.append(pd.DataFrame({'buykorea_product_name': row['product_name'],
                                         'buykorea_keyword': row['buykorea_preprocessed'],
                                         'bldata_product_name': bldata_product_names,
                                         'bldata_keyword': bldata_texts,
                                         'cosine_similarity': similarities}))

# Merge the per-product frames; each keeps its own 0..n-1 index, as before
if similarity_frames:
  openai_similarity_df = pd.concat(similarity_frames, axis=0)
else:
  openai_similarity_df = pd.DataFrame(columns=similarity_columns)

display(openai_similarity_df.head())


2801it [00:05, 530.75it/s]
2801it [00:04, 582.21it/s]
2801it [00:05, 482.78it/s]
2801it [00:04, 588.38it/s]
2801it [00:05, 552.71it/s]
2801it [00:05, 510.13it/s]
2801it [00:04, 575.73it/s]
2801it [00:05, 482.47it/s]
2801it [00:04, 577.53it/s]
2801it [00:04, 582.27it/s]


Unnamed: 0,buykorea_product_name,buykorea_keyword,bldata_product_name,bldata_keyword,cosine_similarity
0,Doldori Premium Gold Essence,"mask pack,'luxurious skincare', 'hygienic desi...",BODYSHOCK LOCAL REDUCER 200ML (FACE CREAM),"face cream,'localized fat', 'cellulite', 'body...",0.57374
1,Doldori Premium Gold Essence,"mask pack,'luxurious skincare', 'hygienic desi...",COSMELAN 2 (FACE CREAM),"face cream,'pigmentation', 'skin tone', 'melas...",0.474314
2,Doldori Premium Gold Essence,"mask pack,'luxurious skincare', 'hygienic desi...",ANTI STRESS FACE MASK 100ML (FACE CREAM),"face cream,'hydration', 'soothing', 'skin reli...",0.667383
3,Doldori Premium Gold Essence,"mask pack,'luxurious skincare', 'hygienic desi...",BLOT POWDER,"setting powder,'shine control', 'oil absorptio...",0.645117
4,Doldori Premium Gold Essence,"mask pack,'luxurious skincare', 'hygienic desi...",DERMACOLOR CAMOUFLAGE FLUID,"camouflage fluid,'high coverage', 'skin imperf...",0.60376


In [None]:
# Order the candidates of each BuyKorea product by descending cosine similarity.
# sort=False keeps the products in their order of first appearance; the index is
# renumbered from 0 afterwards.
openai_similarity_df_sorted = (
    openai_similarity_df
    .groupby(['buykorea_product_name'], group_keys=False, sort=False)
    .apply(lambda group: group.sort_values('cosine_similarity', ascending=False))
    .reset_index(drop=True)
)

# Label the rows of one buykorea_product_name group by position:
# first 10 rows -> 'Highest', last 10 rows -> 'Lowest', everything in between -> 'N'.
def mark_top_bottom_10(df):
    """Add a positional `test_row` label column to `df` and return `df`.

    The labels depend only on row position, so the caller is expected to pass rows
    already ordered by descending cosine similarity. The frame is modified in place.

    Groups with fewer than 20 rows used to raise a length-mismatch ValueError; now the
    first `min(10, n)` rows are 'Highest' and up to 10 of the remaining rows, taken from
    the end, are 'Lowest'. For 20 or more rows the result is unchanged.
    """
    n_rows = len(df)
    n_highest = min(10, n_rows)
    n_lowest = min(10, n_rows - n_highest)
    n_middle = n_rows - n_highest - n_lowest
    df['test_row'] = ['Highest'] * n_highest + ['N'] * n_middle + ['Lowest'] * n_lowest
    return df

# Label each product's rows group by group, then copy the resulting label column back
# onto the sorted frame (the index is reset so it lines up with the sorted frame).
labelled_groups = (
    openai_similarity_df_sorted
    .groupby('buykorea_product_name', group_keys=False)
    .apply(mark_top_bottom_10)
)
openai_similarity_df_sorted['test_row'] = labelled_groups.reset_index(drop=True)['test_row']


# Inspect the result
display(openai_similarity_df_sorted)

openai_similarity_df_sorted.to_csv('openai_similarity_df_sorted_v3.csv', index=False)

Unnamed: 0,buykorea_product_name,buykorea_keyword,bldata_product_name,bldata_keyword,cosine_similarity,test_row
0,Doldori Premium Gold Essence,"mask pack,'luxurious skincare', 'hygienic desi...",COSMETICS MASK,"mask,'hydration', 'nourishment', 'rejuvenation...",0.850780,Highest
1,Doldori Premium Gold Essence,"mask pack,'luxurious skincare', 'hygienic desi...",SS0000014020 SS APC MASK 30G TRAVEL SIZE FILLE...,"facial mask,'skin care', 'hydration', 'easy ap...",0.830218,Highest
2,Doldori Premium Gold Essence,"mask pack,'luxurious skincare', 'hygienic desi...",BELIF THE TRUE CREAM MOISTURIZING BOMB 25ML (C...,"moisturizer,'intense moisture', 'herbal ingred...",0.823652,Highest
3,Doldori Premium Gold Essence,"mask pack,'luxurious skincare', 'hygienic desi...",OAP RV CRYSTAL MASK 1PC AS (OAP RV CRYSTAL MAS...,"skincare mask,'hydration', 'rejuvenation', 'sk...",0.819121,Highest
4,Doldori Premium Gold Essence,"mask pack,'luxurious skincare', 'hygienic desi...",COSMETICS DR.JART CRYO RUBBER WITH MOISTURIZIN...,"moisturizing mask,'moisturizing', 'hyaluronic ...",0.816542,Highest
...,...,...,...,...,...,...
28005,t:aim Barrier Sunblock 40ml,"sunscreen,'high-protection', 'SPF50+', 'UV pro...",COLOR ICON EYESHADOW GLITTER SINGLE(1.4gm)(COS...,"eyeshadow,'eyeshadow', 'glitter', 'cosmetic', ...",0.305137,Lowest
28006,t:aim Barrier Sunblock 40ml,"sunscreen,'high-protection', 'SPF50+', 'UV pro...",EYEBROW PENCIL,"eyebrow pencil,'define', 'enhance', 'shape', '...",0.303874,Lowest
28007,t:aim Barrier Sunblock 40ml,"sunscreen,'high-protection', 'SPF50+', 'UV pro...",COSMETICS- SUGAR Arch Arrival 3 in 1 Brow Shap...,"brow shaper,'brow shaper', 'define', 'fill', '...",0.281756,Lowest
28008,t:aim Barrier Sunblock 40ml,"sunscreen,'high-protection', 'SPF50+', 'UV pro...",POLYESTER GLITTER,"glitter,'glitter', 'cosmetic', 'sparkle', 'mak...",0.261830,Lowest


# 4. 유형 단어 포함 여부로 필터 후, 키워드 5개 이용해 유사도 순 나열

In [None]:
# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Source tables for experiment 4 (type-word filter, then keyword similarity)

buykorea_preprocessed_openai = pd.read_csv('./buykorea_preprocessed_openai.csv')
bldata_preprocessed_openai = pd.read_csv('./bldata_preprocessed_openai.csv')

# Text to embed: the stored keyword string without its surrounding square brackets.
# The text between the brackets is kept exactly as it is (splitting on ',' and
# re-joining with ',' leaves a string unchanged).
buykorea_preprocessed_openai['buykorea_preprocessed'] = (
    buykorea_preprocessed_openai['keyword'].apply(lambda keywords: keywords.strip('[]'))
)
bldata_preprocessed_openai['bldata_preprocessed'] = (
    bldata_preprocessed_openai['keyword'].apply(lambda keywords: keywords.strip('[]'))
)

# Embed the keyword text of every row (BuyKorea first, then B/L data)
buykorea_preprocessed_openai['buykorea_embedding'] = (
    buykorea_preprocessed_openai['buykorea_preprocessed'].progress_apply(text_embedding)
)
bldata_preprocessed_openai['bldata_embedding'] = (
    bldata_preprocessed_openai['bldata_preprocessed'].progress_apply(text_embedding)
)

# Number of B/L products each BuyKorea product is compared against
bldata_sample_len = len(bldata_preprocessed_openai)

In [None]:
# Keyword similarity restricted to B/L products whose type shares a word with the
# BuyKorea product's type.
#
# Fixes over the previous version:
# - `row_bl['type']` is a plain str, so `row_bl['type'].str.contains(..., Regex=True)`
#   raised AttributeError (and `Regex` is not a valid keyword). The filter is now
#   computed on the whole `type` column at once.
# - Type words are matched as literal substrings instead of being pasted unescaped into
#   a regular expression. The extra "whole phrase" alternative was redundant, because a
#   type containing the whole phrase also contains each of its words.
# - Similarities are stored as scalars, and the frames are concatenated once.
# As before, B/L rows that fail the filter keep the "None" placeholders and a similarity
# of 0, so every BuyKorea product still yields one row per B/L product.
similarity_columns = ['buykorea_product_name', 'buykorea_keyword', 'bldata_product_name', 'bldata_keyword', 'cosine_similarity']

# NOTE(review): assumes every stored embedding is a (1, dim) row as produced by
# text_embedding, so stacking yields one (n_products, dim) matrix - confirm.
bldata_embedding_matrix = np.vstack(bldata_preprocessed_openai['bldata_embedding'].to_list())
bldata_types = bldata_preprocessed_openai['type']
bldata_product_names = bldata_preprocessed_openai['product_name'].to_numpy()
bldata_texts = bldata_preprocessed_openai['bldata_preprocessed'].to_numpy()
bldata_count = len(bldata_preprocessed_openai)

similarity_frames = []
for _, row in tqdm(buykorea_preprocessed_openai.iterrows(), total=len(buykorea_preprocessed_openai)):
  # A B/L row passes when its type contains at least one word of the BuyKorea type
  # (case-sensitive; missing types never match). split() drops empty words, which
  # would otherwise match every row.
  type_match = np.zeros(bldata_count, dtype=bool)
  for type_word in row['type'].split():
    type_match |= bldata_types.str.contains(type_word, regex=False, na=False).to_numpy(dtype=bool)

  # Score only the rows that passed the filter; the rest stay at 0
  similarities = np.zeros(bldata_count)
  if type_match.any():
    similarities[type_match] = cosine_similarity(row['buykorea_embedding'], bldata_embedding_matrix[type_match])[0]

  similarity_frames.append(pd.DataFrame({'buykorea_product_name': row['product_name'],
                                         'buykorea_keyword': row['buykorea_preprocessed'],
                                         'bldata_product_name': np.where(type_match, bldata_product_names, "None"),
                                         'bldata_keyword': np.where(type_match, bldata_texts, "None"),
                                         'cosine_similarity': similarities}))

# Merge the per-product frames; each keeps its own 0..n-1 index
if similarity_frames:
  openai_similarity_df = pd.concat(similarity_frames, axis=0)
else:
  openai_similarity_df = pd.DataFrame(columns=similarity_columns)

display(openai_similarity_df.head())


In [None]:
# Order the candidates of each BuyKorea product by descending cosine similarity.
# sort=False keeps the products in their order of first appearance; the index is
# renumbered from 0 afterwards.
openai_similarity_df_sorted = (
    openai_similarity_df
    .groupby(['buykorea_product_name'], group_keys=False, sort=False)
    .apply(lambda group: group.sort_values('cosine_similarity', ascending=False))
    .reset_index(drop=True)
)

# Label the rows of one buykorea_product_name group by position:
# first 10 rows -> 'Highest', last 10 rows -> 'Lowest', everything in between -> 'N'.
def mark_top_bottom_10(df):
    """Add a positional `test_row` label column to `df` and return `df`.

    The labels depend only on row position, so the caller is expected to pass rows
    already ordered by descending cosine similarity. The frame is modified in place.

    Groups with fewer than 20 rows used to raise a length-mismatch ValueError; now the
    first `min(10, n)` rows are 'Highest' and up to 10 of the remaining rows, taken from
    the end, are 'Lowest'. For 20 or more rows the result is unchanged.
    """
    n_rows = len(df)
    n_highest = min(10, n_rows)
    n_lowest = min(10, n_rows - n_highest)
    n_middle = n_rows - n_highest - n_lowest
    df['test_row'] = ['Highest'] * n_highest + ['N'] * n_middle + ['Lowest'] * n_lowest
    return df

# Label the rows group by group (one group per buykorea_product_name), then
# renumber the result from 0 so it lines up with the sorted frame's index.
openai_similarity_df_sorted['test_row'] = (
    openai_similarity_df_sorted
    .groupby('buykorea_product_name', group_keys=False)
    .apply(mark_top_bottom_10)
    .reset_index(drop=True)
    ['test_row']
)


# Inspect the result
display(openai_similarity_df_sorted)

# Persist the labelled ranking (row index is not written)
openai_similarity_df_sorted.to_csv('openai_similarity_df_sorted_v4.csv', index=False)

# 5. 유형 유사도 순 나열 후, 비슷한 유형 내에서 키워드 5개 이용해 유사도 순 나열

In [None]:
tqdm.pandas()  # registers Series.progress_apply


def _strip_list_brackets(raw):
  """Remove leading/trailing '[' and ']' characters from a keyword string."""
  return raw.strip('[]')


# Tables holding the text to embed
buykorea_preprocessed_openai=pd.read_csv('./buykorea_preprocessed_openai.csv')
bldata_preprocessed_openai=pd.read_csv('./bldata_preprocessed_openai.csv')

# Product type: used verbatim as the text to embed
buykorea_preprocessed_openai['buykorea_type_preprocessed']=buykorea_preprocessed_openai['type']
bldata_preprocessed_openai['bldata_type_preprocessed']=bldata_preprocessed_openai['type']

# Embed the product-type text
buykorea_preprocessed_openai['buykorea_type_embedding']=(
    buykorea_preprocessed_openai['buykorea_type_preprocessed'].progress_apply(text_embedding)
)
bldata_preprocessed_openai['bldata_type_embedding']=(
    bldata_preprocessed_openai['bldata_type_preprocessed'].progress_apply(text_embedding)
)

# Keywords: only the surrounding list brackets are dropped, the
# comma-separated content is kept unchanged
buykorea_preprocessed_openai['buykorea_keyword_preprocessed']=buykorea_preprocessed_openai['keyword'].apply(_strip_list_brackets)
bldata_preprocessed_openai['bldata_keyword_preprocessed']=bldata_preprocessed_openai['keyword'].apply(_strip_list_brackets)

# Embed the keyword sentence
buykorea_preprocessed_openai['buykorea_keyword_embedding']=(
    buykorea_preprocessed_openai['buykorea_keyword_preprocessed'].progress_apply(text_embedding)
)
bldata_preprocessed_openai['bldata_keyword_embedding']=(
    bldata_preprocessed_openai['bldata_keyword_preprocessed'].progress_apply(text_embedding)
)

# Number of B/L rows; sizes the per-product similarity frames
bldata_sample_len=bldata_preprocessed_openai.shape[0]

100%|██████████| 10/10 [00:00<00:00, 10.46it/s]
100%|██████████| 2801/2801 [04:19<00:00, 10.78it/s]
100%|██████████| 10/10 [00:02<00:00,  3.95it/s]
  6%|▌         | 174/2801 [00:30<07:11,  6.08it/s]

In [None]:
# Score every BuyKorea product against every B/L export record twice: once on
# the product-type embedding and once on the keyword embedding.
similarity_columns=['buykorea_product_name', 'buykorea_type', 'buykorea_keyword', 'bldata_product_name', 'bldata_type', 'bldata_keyword', 'type_cosine_similarity', 'keyword_cosine_similarity']
similarity_frames=[]

for _, row in buykorea_preprocessed_openai.iterrows():

  # For each BuyKorea product (10 of them), compute the similarity against the
  # ~2800 product names that have an export record. One pre-filled frame per
  # BuyKorea product holds those scores.
  similarity_df=pd.DataFrame({'buykorea_product_name':[row['product_name']]*bldata_sample_len,
                               'buykorea_type' : [row['buykorea_type_preprocessed']]*bldata_sample_len,
                               'buykorea_keyword': [row['buykorea_keyword_preprocessed']]*bldata_sample_len,
                               'bldata_product_name' : ["None"]*bldata_sample_len,
                               'bldata_type' : ["None"]*bldata_sample_len,
                               'bldata_keyword' : ["None"]*bldata_sample_len,
                               # float placeholders: these columns receive float similarities
                               'type_cosine_similarity' : [0.0]*bldata_sample_len,
                               # key must match the name written in the loop below exactly;
                               # a trailing space here would create a second, stray column
                               'keyword_cosine_similarity' : [0.0]*bldata_sample_len})

  buykorea_type_embedding=row['buykorea_type_embedding']
  buykorea_keyword_embedding=row['buykorea_keyword_embedding']

  # NOTE(review): if `cosine_similarity` is still scikit-learn's pairwise
  # function imported at the top of the notebook, it expects 2-D inputs and
  # returns a 2-D array — confirm the shape `text_embedding` produces.
  for row_count, (_, row_bl) in enumerate(tqdm(bldata_preprocessed_openai.iterrows(), total=len(bldata_preprocessed_openai))):

    bldata_type_embedding=row_bl['bldata_type_embedding']
    bldata_keyword_embedding=row_bl['bldata_keyword_embedding']

    type_similarity=cosine_similarity(buykorea_type_embedding, bldata_type_embedding)
    keyword_similarity=cosine_similarity(buykorea_keyword_embedding, bldata_keyword_embedding)
    similarity_df.loc[row_count, 'bldata_product_name']=row_bl['product_name']
    similarity_df.loc[row_count, 'bldata_type']=row_bl['bldata_type_preprocessed']
    similarity_df.loc[row_count, 'bldata_keyword']=row_bl['bldata_keyword_preprocessed']
    similarity_df.loc[row_count, 'type_cosine_similarity']=type_similarity
    similarity_df.loc[row_count, 'keyword_cosine_similarity']=keyword_similarity

  # Collect the per-product frames; they are merged once after the loop
  # instead of re-copying the accumulated frame on every iteration.
  similarity_frames.append(similarity_df)

# Merge the per-product similarity frames into a single frame
openai_similarity_df=pd.concat(similarity_frames, axis=0) if similarity_frames else pd.DataFrame(columns=similarity_columns)

display(openai_similarity_df.head())


In [None]:
# Within each BuyKorea product (groups kept in first-seen order) rank the rows
# by type similarity first and keyword similarity second, both descending,
# then renumber the rows from 0.
# `sort_values` takes its sort keys through `by`; it has no `subset` keyword.
openai_similarity_df_sorted = (
    openai_similarity_df
    .groupby(['buykorea_product_name'], group_keys=False, sort=False)
    .apply(lambda group: group.sort_values(by=['type_cosine_similarity', 'keyword_cosine_similarity'], ascending=False))
    .reset_index(drop=True)
)

# For each buykorea_product_name group: label the first 10 rows 'Highest', the
# last 10 rows 'Lowest', and every row in between 'N'.
def mark_top_bottom_10(df, n=10):
    """Add a 'test_row' column labelling the head and tail of an ordered frame.

    The labels are positional: the function does not sort, so the caller is
    expected to pass rows already ordered from most to least similar.

    Args:
        df: frame to label; the 'test_row' column is written onto it in place.
        n: number of rows labelled at each end (default 10).

    Returns:
        The same frame with 'test_row' set to 'Highest' for the first `n`
        rows, 'Lowest' for the last `n` rows and 'N' for the rest. Frames
        shorter than `2 * n` rows are labelled 'Highest' first and the
        remainder 'Lowest', instead of failing with a length mismatch.
    """
    n = max(n, 0)
    total = len(df)
    top = min(n, total)
    bottom = min(n, total - top)
    df['test_row'] = ['Highest'] * top + ['N'] * (total - top - bottom) + ['Lowest'] * bottom
    return df

# Label the rows group by group (one group per buykorea_product_name), then
# renumber the result from 0 so it lines up with the sorted frame's index.
openai_similarity_df_sorted['test_row'] = (
    openai_similarity_df_sorted
    .groupby('buykorea_product_name', group_keys=False)
    .apply(mark_top_bottom_10)
    .reset_index(drop=True)
    ['test_row']
)


# Inspect the result
display(openai_similarity_df_sorted)

# Persist the labelled ranking (row index is not written)
openai_similarity_df_sorted.to_csv('openai_similarity_df_sorted_v5.csv', index=False)

In [None]:
# Show the value of `buykorea_preprocessed_openai_10` as this cell's output.
# NOTE(review): this name is not assigned in any of the cells visible here —
# confirm an earlier cell creates it, otherwise this cell raises NameError on
# a fresh kernel.
buykorea_preprocessed_openai_10