# 珠宝产品类型分析

本 notebook 用于分析 articles.csv 数据集中的 product_type_name 字段，找出与珠宝相关的产品类型。

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# 加载数据
data_path = Path('../data/articles.csv')
df = pd.read_csv(data_path)

# 显示基本信息
print(f"数据集形状: {df.shape}")
print(f"\n列名: {df.columns.tolist()}")
print(f"\n前5行数据:")
df.head()

数据集形状: (105542, 25)

列名: ['article_id', 'product_code', 'prod_name', 'product_type_no', 'product_type_name', 'product_group_name', 'graphical_appearance_no', 'graphical_appearance_name', 'colour_group_code', 'colour_group_name', 'perceived_colour_value_id', 'perceived_colour_value_name', 'perceived_colour_master_id', 'perceived_colour_master_name', 'department_no', 'department_name', 'index_code', 'index_name', 'index_group_no', 'index_group_name', 'section_no', 'section_name', 'garment_group_no', 'garment_group_name', 'detail_desc']

前5行数据:


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [3]:
# 获取所有唯一的 product_type_name
unique_product_types = df['product_type_name'].unique()
print(f"总共有 {len(unique_product_types)} 种不同的产品类型")
print("\n所有产品类型:")
for i, pt in enumerate(sorted(unique_product_types), 1):
    print(f"{i}. {pt}")

总共有 131 种不同的产品类型

所有产品类型:
1. Accessories set
2. Alice band
3. Baby Bib
4. Backpack
5. Bag
6. Ballerinas
7. Beanie
8. Belt
9. Bikini top
10. Blanket
11. Blazer
12. Blouse
13. Bodysuit
14. Bootie
15. Boots
16. Bra
17. Bra extender
18. Bracelet
19. Braces
20. Bucket hat
21. Bumbag
22. Cap
23. Cap/peaked
24. Cardigan
25. Chem. cosmetics
26. Clothing mist
27. Coat
28. Costumes
29. Cross-body bag
30. Cushion
31. Dog Wear
32. Dog wear
33. Dress
34. Dungarees
35. Earring
36. Earrings
37. Eyeglasses
38. Felt hat
39. Fine cosmetics
40. Flat shoe
41. Flat shoes
42. Flip flop
43. Garment Set
44. Giftbox
45. Gloves
46. Hair clip
47. Hair string
48. Hair ties
49. Hair/alice band
50. Hairband
51. Hat/beanie
52. Hat/brim
53. Headband
54. Heeled sandals
55. Heels
56. Hoodie
57. Jacket
58. Jumpsuit/Playsuit
59. Keychain
60. Kids Underwear top
61. Leg warmers
62. Leggings/Tights
63. Long John
64. Marker pen
65. Mobile case
66. Moccasins
67. Necklace
68. Night gown
69. Nipple covers
70. Other accessories


In [4]:
# 统计每个产品类型的数量
product_type_counts = df['product_type_name'].value_counts().sort_values(ascending=False)
print("产品类型数量统计 (前20):")
print(product_type_counts.head(20))

产品类型数量统计 (前20):
product_type_name
Trousers            11169
Dress               10362
Sweater              9302
T-shirt              7904
Top                  4155
Blouse               3979
Jacket               3940
Shorts               3939
Shirt                3405
Vest top             2991
Underwear bottom     2748
Skirt                2696
Hoodie               2356
Bra                  2212
Socks                1889
Leggings/Tights      1878
Sneakers             1621
Cardigan             1550
Hat/beanie           1349
Garment Set          1320
Name: count, dtype: int64


In [5]:
# 定义珠宝相关的关键词
jewelry_keywords = [
    'necklace', 'necklaces', 'neck', 'chain', 'pendant',
    'earring', 'earrings', 'ear', 'stud', 'hoop',
    'bracelet', 'bracelets', 'bangle', 'bangles',
    'ring', 'rings',
    'brooch', 'brooches', 'pin', 'pins',
    'jewelry', 'jewellery',
    'watch', 'watches',
    'charm', 'charms',
    'anklet', 'anklets',
    'cuff', 'cuffs',
    'tiara', 'crown',
    'gem', 'gemstone', 'crystal', 'pearl',
    'gold', 'silver', 'platinum', 'diamond',
    'bead', 'beads'
]

# 筛选与珠宝相关的产品类型
jewelry_related = []
for product_type in unique_product_types:
    product_type_lower = product_type.lower()
    for keyword in jewelry_keywords:
        if keyword in product_type_lower:
            jewelry_related.append(product_type)
            break

jewelry_related = sorted(set(jewelry_related))
print(f"找到 {len(jewelry_related)} 种与珠宝相关的产品类型:")
for i, pt in enumerate(jewelry_related, 1):
    print(f"{i}. {pt}")

找到 21 种与珠宝相关的产品类型:
1. Bracelet
2. Dog Wear
3. Dog wear
4. Earring
5. Earrings
6. Hair string
7. Keychain
8. Kids Underwear top
9. Necklace
10. Ring
11. Sleeping sack
12. Swimwear bottom
13. Swimwear set
14. Swimwear top
15. Underwear Tights
16. Underwear body
17. Underwear bottom
18. Underwear corset
19. Underwear set
20. Watch
21. Wireless earphone case


In [6]:
# 查看 Accessories 类别的产品类型
accessories_df = df[df['product_group_name'] == 'Accessories']
accessories_product_types = accessories_df['product_type_name'].unique()
print(f"\nAccessories 类别下的产品类型 ({len(accessories_product_types)} 种):")
for i, pt in enumerate(sorted(accessories_product_types), 1):
    count = len(accessories_df[accessories_df['product_type_name'] == pt])
    print(f"{i}. {pt} (数量: {count})")


Accessories 类别下的产品类型 (38 种):
1. Accessories set (数量: 7)
2. Alice band (数量: 6)
3. Baby Bib (数量: 3)
4. Bag (数量: 1280)
5. Beanie (数量: 56)
6. Belt (数量: 458)
7. Bracelet (数量: 180)
8. Braces (数量: 3)
9. Bucket hat (数量: 7)
10. Cap (数量: 13)
11. Cap/peaked (数量: 573)
12. Dog Wear (数量: 20)
13. Earring (数量: 1159)
14. Earrings (数量: 11)
15. Eyeglasses (数量: 2)
16. Felt hat (数量: 10)
17. Giftbox (数量: 15)
18. Gloves (数量: 367)
19. Hair clip (数量: 244)
20. Hair string (数量: 238)
21. Hair ties (数量: 24)
22. Hair/alice band (数量: 854)
23. Hairband (数量: 2)
24. Hat/beanie (数量: 1349)
25. Hat/brim (数量: 396)
26. Headband (数量: 1)
27. Necklace (数量: 581)
28. Other accessories (数量: 1034)
29. Ring (数量: 240)
30. Scarf (数量: 1013)
31. Soft Toys (数量: 46)
32. Straw hat (数量: 6)
33. Sunglasses (数量: 621)
34. Tie (数量: 141)
35. Umbrella (数量: 26)
36. Wallet (数量: 77)
37. Watch (数量: 73)
38. Waterbottle (数量: 22)


In [7]:
# 查看所有产品组别
product_groups = df['product_group_name'].unique()
print("所有产品组别:")
for i, pg in enumerate(sorted(product_groups), 1):
    count = len(df[df['product_group_name'] == pg])
    print(f"{i}. {pg} (数量: {count})")

所有产品组别:
1. Accessories (数量: 11158)
2. Bags (数量: 25)
3. Cosmetic (数量: 49)
4. Fun (数量: 2)
5. Furniture (数量: 13)
6. Garment Full body (数量: 13292)
7. Garment Lower body (数量: 19812)
8. Garment Upper body (数量: 42741)
9. Garment and Shoe care (数量: 9)
10. Interior textile (数量: 3)
11. Items (数量: 17)
12. Nightwear (数量: 1899)
13. Shoes (数量: 5283)
14. Socks & Tights (数量: 2442)
15. Stationery (数量: 5)
16. Swimwear (数量: 3127)
17. Underwear (数量: 5490)
18. Underwear/nightwear (数量: 54)
19. Unknown (数量: 121)


In [8]:
# 检查是否有任何产品类型包含珠宝相关的金属材质关键词
metal_keywords = ['gold', 'silver', 'platinum', 'metal', 'metallic']

print("检查包含金属材质关键词的产品类型:")
for product_type in sorted(unique_product_types):
    product_type_lower = product_type.lower()
    for keyword in metal_keywords:
        if keyword in product_type_lower:
            count = len(df[df['product_type_name'] == product_type])
            print(f"  - {product_type} (数量: {count})")
            break

检查包含金属材质关键词的产品类型:


In [9]:
# 检查 colour_master_name 中是否有珠宝相关的材质
colour_masters = df['perceived_colour_master_name'].unique()
print(f"\n所有色彩主类别 ({len(colour_masters)} 种):")
for i, cm in enumerate(sorted(colour_masters), 1):
    count = len(df[df['perceived_colour_master_name'] == cm])
    print(f"{i}. {cm} (数量: {count})")


所有色彩主类别 (20 种):
1. Beige (数量: 5657)
2. Black (数量: 22585)
3. Blue (数量: 18469)
4. Bluish Green (数量: 3)
5. Brown (数量: 2269)
6. Green (数量: 3526)
7. Grey (数量: 8924)
8. Khaki green (数量: 3181)
9. Lilac Purple (数量: 1100)
10. Metal (数量: 2180)
11. Mole (数量: 1223)
12. Orange (数量: 2734)
13. Pink (数量: 9403)
14. Red (数量: 5878)
15. Turquoise (数量: 1829)
16. Unknown (数量: 685)
17. White (数量: 12665)
18. Yellow (数量: 3121)
19. Yellowish Green (数量: 5)
20. undefined (数量: 105)


In [10]:
# 查看包含 'Metal' 色彩的产品
metal_color_df = df[df['perceived_colour_master_name'] == 'Metal']
print(f"\n包含 'Metal' 色彩的产品数量: {len(metal_color_df)}")
print("\n这些产品的产品类型分布:")
metal_product_types = metal_color_df['product_type_name'].value_counts()
print(metal_product_types)


包含 'Metal' 色彩的产品数量: 2180

这些产品的产品类型分布:
product_type_name
Earring              766
Necklace             442
Ring                 188
Other accessories     99
Sandals               89
Bracelet              83
Sunglasses            74
Hair clip             61
Hair/alice band       49
Ballerinas            44
Sneakers              34
Belt                  32
Boots                 22
Dress                 21
Bag                   21
Skirt                 15
Heeled sandals        14
Hair string           11
Vest top              10
Earrings              10
Watch                  9
Other shoe             7
Trousers               7
Jacket                 7
Wedge                  6
Top                    6
Wallet                 5
Slippers               5
Blouse                 4
Leggings/Tights        4
Flat shoe              4
Sweater                3
Bra                    2
Cap/peaked             2
Blazer                 2
Flip flop              2
Outdoor Waistcoat      1
Socks            

In [11]:
# 查看 graphical_appearance_name 中是否有珠宝相关的材质
graphical_appearances = df['graphical_appearance_name'].unique()
print(f"\n所有外观类型 ({len(graphical_appearances)} 种):")
for i, ga in enumerate(sorted(graphical_appearances), 1):
    count = len(df[df['graphical_appearance_name'] == ga])
    print(f"{i}. {ga} (数量: {count})")


所有外观类型 (30 种):
1. All over pattern (数量: 17165)
2. Application/3D (数量: 1341)
3. Argyle (数量: 15)
4. Chambray (数量: 322)
5. Check (数量: 2178)
6. Colour blocking (数量: 1830)
7. Contrast (数量: 376)
8. Denim (数量: 4842)
9. Dot (数量: 681)
10. Embroidery (数量: 1165)
11. Front print (数量: 3215)
12. Glittering/Metallic (数量: 958)
13. Hologram (数量: 8)
14. Jacquard (数量: 830)
15. Lace (数量: 1513)
16. Melange (数量: 5938)
17. Mesh (数量: 86)
18. Metallic (数量: 346)
19. Mixed solid/pattern (数量: 1132)
20. Neps (数量: 66)
21. Other pattern (数量: 515)
22. Other structure (数量: 1502)
23. Placement print (数量: 3098)
24. Sequin (数量: 806)
25. Slub (数量: 153)
26. Solid (数量: 49747)
27. Stripe (数量: 4990)
28. Transparent (数量: 86)
29. Treatment (数量: 586)
30. Unknown (数量: 52)


In [12]:
# 查看包含 'Metallic' 外观的产品
metallic_df = df[df['graphical_appearance_name'] == 'Metallic']
print(f"\n包含 'Metallic' 外观的产品数量: {len(metallic_df)}")
print("\n这些产品的产品类型分布:")
metallic_product_types = metallic_df['product_type_name'].value_counts()
print(metallic_product_types)


包含 'Metallic' 外观的产品数量: 346

这些产品的产品类型分布:
product_type_name
Sunglasses                  33
Bag                         21
Dress                       19
Other accessories           18
Belt                        18
Hair clip                   17
Sweater                     17
Sandals                     14
Hair/alice band             12
Swimwear bottom             12
Bikini top                  12
Bracelet                    12
Jacket                      12
Earring                     11
Necklace                     9
Ring                         9
Skirt                        9
Trousers                     8
Sneakers                     7
Vest top                     7
Ballerinas                   6
Boots                        5
Swimsuit                     4
Hair string                  4
Top                          4
Leggings/Tights              4
Cardigan                     3
Flat shoe                    3
T-shirt                      3
Watch                        3
Heeled san

In [13]:
# 查看所有 Accessories 类别的产品详情
print("\nAccessories 类别的产品详情:")
accessories_details = accessories_df[['prod_name', 'product_type_name', 'detail_desc']].drop_duplicates()
print(accessories_details.to_string(index=False))


Accessories 类别的产品详情:
                     prod_name product_type_name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 detail_desc
                       2p Claw         Hair clip                                                                                                                                                                                                                                                     

In [14]:
# 总结分析结果
print("=" * 80)
print("珠宝产品类型分析总结")
print("=" * 80)

print(f"\n1. 数据集总产品类型数: {len(unique_product_types)}")
print(f"2. Accessories 类别产品类型数: {len(accessories_product_types)}")
print(f"3. 包含珠宝关键词的产品类型数: {len(jewelry_related)}")

if len(jewelry_related) > 0:
    print(f"\n找到的珠宝相关产品类型:")
    for pt in jewelry_related:
        count = len(df[df['product_type_name'] == pt])
        print(f"  - {pt} (数量: {count})")
else:
    print(f"\n未找到明确的珠宝相关产品类型。")
    print(f"\nAccessories 类别可能包含一些珠宝相关的产品:")
    for pt in sorted(accessories_product_types):
        count = len(accessories_df[accessories_df['product_type_name'] == pt])
        print(f"  - {pt} (数量: {count})")

print(f"\n包含 'Metal' 色彩的产品数: {len(metal_color_df)}")
print(f"包含 'Metallic' 外观的产品数: {len(metallic_df)}")

珠宝产品类型分析总结

1. 数据集总产品类型数: 131
2. Accessories 类别产品类型数: 38
3. 包含珠宝关键词的产品类型数: 21

找到的珠宝相关产品类型:
  - Bracelet (数量: 180)
  - Dog Wear (数量: 20)
  - Dog wear (数量: 7)
  - Earring (数量: 1159)
  - Earrings (数量: 11)
  - Hair string (数量: 238)
  - Keychain (数量: 1)
  - Kids Underwear top (数量: 96)
  - Necklace (数量: 581)
  - Ring (数量: 240)
  - Sleeping sack (数量: 48)
  - Swimwear bottom (数量: 1307)
  - Swimwear set (数量: 192)
  - Swimwear top (数量: 50)
  - Underwear Tights (数量: 546)
  - Underwear body (数量: 174)
  - Underwear bottom (数量: 2748)
  - Underwear corset (数量: 7)
  - Underwear set (数量: 47)
  - Watch (数量: 73)
  - Wireless earphone case (数量: 2)

包含 'Metal' 色彩的产品数: 2180
包含 'Metallic' 外观的产品数: 346
