In [2]:
import spacy
from typing import List

# Load the small English spaCy pipeline once at module level; extract_key_features
# reuses this shared `nlp` object on every call instead of reloading the model.
# NOTE(review): the model is only downloaded in a later cell (In [3]); on a fresh
# environment that download has to run before this line — confirm the cell order.
nlp = spacy.load("en_core_web_sm")

def extract_key_features(description: str, max_features: int = 10) -> List[str]:
    """
    Extract the most salient feature phrases from a piece of text.

    Two kinds of candidates are collected from the parsed text:
      * noun chunks that contain at least one NOUN/PROPN token and none of
        the filtered words ("which", "your", "and");
      * "<adjective> <noun>" pairs built from ``amod`` dependencies.

    Candidates are ranked by word count (longer first), then by position in
    the text (earlier first), de-duplicated case-insensitively and truncated
    to ``max_features``.

    Args:
        description (str): Main text description.
        max_features (int): Maximum number of features to return
            (default 10).

    Returns:
        list: The selected feature phrases, most important first. Empty when
            ``description`` is not a non-blank string or when
            ``max_features`` is not positive.
    """
    # Missing values (None, or NaN coming out of a DataFrame row) and blank
    # text have no features; return before handing them to the pipeline.
    if not isinstance(description, str) or not description.strip():
        return []
    # A non-positive limit means "no features"; without this guard the loop
    # below would still emit one item before checking the limit.
    if max_features <= 0:
        return []

    # Words that disqualify a noun chunk; this set can be extended.
    unwanted = {"which", "your", "and"}

    doc = nlp(description)
    candidates = []  # (phrase, word_count, token_position)

    # Collect noun chunks.
    for chunk in doc.noun_chunks:
        # Compare whole tokens rather than substrings: a substring test would
        # also reject chunks such as "Handmade knife" or "Android stand" just
        # because they contain the letters "and".
        if any(token.text.lower() in unwanted for token in chunk):
            continue
        if not any(token.pos_ in ("NOUN", "PROPN") for token in chunk):
            continue
        chunk_text = chunk.text.strip()
        # Only keep the chunk if it occurs verbatim in the original text.
        if chunk_text in description:
            candidates.append((chunk_text, len(chunk_text.split()), chunk.start))

    # Collect adjective + noun pairs (adjectival modifiers of a noun).
    for token in doc:
        if token.dep_ == "amod" and token.head.pos_ == "NOUN":
            feature_phrase = f"{token.text} {token.head.text}"
            # The pair is rebuilt from two tokens, so only keep it if that
            # exact phrase occurs in the original text.
            if feature_phrase in description:
                candidates.append((feature_phrase, len(feature_phrase.split()), token.i))

    # Rank by importance:
    #   1. phrase length in words (longer = more important)
    #   2. position in the text (earlier = more important)
    candidates.sort(key=lambda item: (-item[1], item[2]))

    # Keep the first occurrence of each phrase (case-insensitive) up to the limit.
    feature_list = []
    seen = set()
    for feature, _, _ in candidates:
        key = feature.lower()
        if key in seen:
            continue
        seen.add(key)
        feature_list.append(feature)
        if len(feature_list) >= max_features:
            break

    return feature_list

def process_description(description: str) -> dict:
    """
    Run feature extraction on a product description and wrap the result.

    Args:
        description (str): Product description text.

    Returns:
        dict: A one-entry mapping whose "features" key holds the list
            returned by ``extract_key_features`` for this description.
    """
    return dict(features=extract_key_features(description))

# Smoke-test the extractor on a hand-written product blurb.
description = (
    "Samsung Galaxy S23 Ultra 5G with stunning 6.8-inch AMOLED display, "
    "fast charging, 12GB RAM, and incredible camera quality for your photos."
)
result = process_description(description)

# The printed label is Vietnamese for "Features (max 10):".
print("Đặc điểm (tối đa 10):", result["features"])

Đặc điểm (tối đa 10): ['stunning 6.8-inch AMOLED display', 'incredible camera quality', 'fast charging', '12GB RAM', '5G']


In [3]:
# Download the en_core_web_sm model that spacy.load("en_core_web_sm") requires.
# NOTE(review): `!python` runs whichever interpreter the shell resolves, which may
# not be the kernel's environment — verify. This cell also sits after the cell that
# loads the model, so on a fresh environment it has to be executed first.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 1.7 MB/s eta 0:00:08
     ---- ----------------------------------- 1.3/12.8 MB 1.8 MB/s eta 0:00:07
     ---- ----------------------------------- 1.6/12.8 MB 1.9 MB/s eta 0:00:06
     ------- -------------------------------- 2.4/12.8 MB 2.1 MB/s eta 0:00:05
     --------- ------------------------------ 2.9/12.8 MB 2.3 MB/s eta 0:00:05
     ------------ --------------------------- 3.9/12.8 MB 2.7 MB/s eta 0:00:04
     ---------------- ----------------------- 5.2/12.8 MB 3.1 MB/s eta 0:00:03
     --------------------- ------------------ 6.8/12.8 


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
# Read the product dataset from a CSV located relative to the working directory.
# NOTE(review): the loaded frame shows an "Unnamed: 0" column, which usually means
# the file was written with its index — confirm whether it should be read with
# index_col=0 instead.
df = pd.read_csv("description_data_with_subcategory.csv")

In [5]:
# Preview the loaded frame (pandas truncates the middle rows in the display).
df

Unnamed: 0,TITLE,DESCRIPTION,main_category,sub_category
0,Drill America DWCTL Series High-Speed Steel En...,"DRILL AMERICA 7/8"" HS 2FSE LONG ENDMILL (DWCTL...",home & kitchen,Other
1,Whats Your Kick® Chess Lover Inspired Printed ...,This mug is made up of fine quality ceramic ma...,home & kitchen,Other
2,TravisLappy Keyboard for Dell Inspiron 3541 35...,Travislappy keyboard for dell inspiron 3541 35...,music,Other
3,Korky 528PRO Pro Grade Max Performance Fill Va...,FILL VALVES,appliances,Other
4,Kristin Ess Weightless Shine Leave-In Conditioner,Kristin Ess Weightless Shine Leave-In Conditioner,beauty & health,Other
...,...,...,...,...
199995,Fariox Plastic Wall Mounted Cosmetic Organizer...,"Could be set up in kitchen, bathroom, living r...",home & kitchen,Data Storage
199996,MADSABRE 12.5 in Handmade Forged Optimal Full ...,"""Features:<br> 100% Quality Assurance&nbsp;<br...",Other,Outdoor Recreation
199997,MNTC Lord Vishnu Beautiful Paper Poster (Paper...,Mntc beautiful paper print poster (size 12 inc...,home & kitchen,Other
199998,Just Love Womens Solid Jacket 4501-NEW-CRL-S,<b> STAY COMFY AND WARM WITH SCRUB JACKETS DES...,women's clothing,Other


In [6]:
def _build_prompt(row) -> str:
    """Assemble the prompt text for one DataFrame row.

    The prompt lists the row's title, main category and sub category, followed
    by the comma-separated features that ``process_description`` extracts from
    the title.

    Args:
        row: One row of the frame, as passed by ``DataFrame.apply(axis=1)``;
            must provide 'TITLE', 'main_category' and 'sub_category'.

    Returns:
        str: The four-line prompt string.
    """
    title = row["TITLE"]
    header = (
        f"User input: {title}\n"
        f"Main category: {row['main_category']}\n"
        f"Sub category: {row['sub_category']}\n"
    )
    # Features are extracted from the title, not from the DESCRIPTION column.
    extracted = process_description(title)["features"]
    return header + f"Features: {', '.join(extracted)}"


# Build one prompt per row and store it in a new "prompt" column.
df["prompt"] = df.apply(_build_prompt, axis=1)

In [7]:
# Display the frame again to check that the "prompt" column was added.
df

Unnamed: 0,TITLE,DESCRIPTION,main_category,sub_category,prompt
0,Drill America DWCTL Series High-Speed Steel En...,"DRILL AMERICA 7/8"" HS 2FSE LONG ENDMILL (DWCTL...",home & kitchen,Other,User input: Drill America DWCTL Series High-Sp...
1,Whats Your Kick® Chess Lover Inspired Printed ...,This mug is made up of fine quality ceramic ma...,home & kitchen,Other,User input: Whats Your Kick® Chess Lover Inspi...
2,TravisLappy Keyboard for Dell Inspiron 3541 35...,Travislappy keyboard for dell inspiron 3541 35...,music,Other,User input: TravisLappy Keyboard for Dell Insp...
3,Korky 528PRO Pro Grade Max Performance Fill Va...,FILL VALVES,appliances,Other,User input: Korky 528PRO Pro Grade Max Perform...
4,Kristin Ess Weightless Shine Leave-In Conditioner,Kristin Ess Weightless Shine Leave-In Conditioner,beauty & health,Other,User input: Kristin Ess Weightless Shine Leave...
...,...,...,...,...,...
199995,Fariox Plastic Wall Mounted Cosmetic Organizer...,"Could be set up in kitchen, bathroom, living r...",home & kitchen,Data Storage,User input: Fariox Plastic Wall Mounted Cosmet...
199996,MADSABRE 12.5 in Handmade Forged Optimal Full ...,"""Features:<br> 100% Quality Assurance&nbsp;<br...",Other,Outdoor Recreation,User input: MADSABRE 12.5 in Handmade Forged O...
199997,MNTC Lord Vishnu Beautiful Paper Poster (Paper...,Mntc beautiful paper print poster (size 12 inc...,home & kitchen,Other,User input: MNTC Lord Vishnu Beautiful Paper P...
199998,Just Love Womens Solid Jacket 4501-NEW-CRL-S,<b> STAY COMFY AND WARM WITH SCRUB JACKETS DES...,women's clothing,Other,User input: Just Love Womens Solid Jacket 4501...


In [13]:
# Show the full prompt generated for the row labelled 5 (single label-based lookup).
df.loc[5, "prompt"]

'User input: Stunned Superb Owl Pattern Polycarbonate Matte Finish Designer, Premium Mobile Phone Back case Cover for Huawei Mate 20 Pro\nMain category: tv, audio & cameras\nSub category: Other\nFeatures: Stunned Superb Owl Pattern Polycarbonate Matte Finish Designer, Premium Mobile Phone Back case Cover, Huawei Mate, Pro'

In [9]:
# Persist the frame, including the new "prompt" column, to a new CSV file.
# index=False leaves the DataFrame index out of the file; the existing
# "Unnamed: 0" column is a regular column and is still written.
df.to_csv("description_data_with_subcategory_and_prompt.csv", index=False)