# articles_hm 전처리
`articles_hm.csv`(상품 메타데이터)를 분석/시각화에 바로 쓸 수 있는 형태로 정리한다.

In [1]:
import pandas as pd
# Load the H&M article (product) metadata and preview the first rows.
articles_path = "h&m_dataset/articles_hm.csv"
articles = pd.read_csv(articles_path)
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [2]:
# Print an overall summary of the table (columns, non-null counts, dtypes).
articles.info()

<class 'pandas.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype
---  ------                        --------------   -----
 0   article_id                    105542 non-null  int64
 1   product_code                  105542 non-null  int64
 2   prod_name                     105542 non-null  str  
 3   product_type_no               105542 non-null  int64
 4   product_type_name             105542 non-null  str  
 5   product_group_name            105542 non-null  str  
 6   graphical_appearance_no       105542 non-null  int64
 7   graphical_appearance_name     105542 non-null  str  
 8   colour_group_code             105542 non-null  int64
 9   colour_group_name             105542 non-null  str  
 10  perceived_colour_value_id     105542 non-null  int64
 11  perceived_colour_value_name   105542 non-null  str  
 12  perceived_colour_master_id    105542 non-null  int64
 13  perceived_colour_master_n

In [3]:
# Check the table size as (rows, columns).
articles.shape

(105542, 25)

In [4]:
# Rank columns by their number of missing values and show the top three.
missing_per_column = articles.isna().sum()
na_a = missing_per_column.sort_values(ascending=False)
na_a.head(3)

detail_desc     416
product_code      0
prod_name         0
dtype: int64

# 전처리 작업전 컬럼 분석

- article_id                    # 상품을 식별하는 ID
- product_code                  # 상품 코드
- prod_name                     # 상품 이름

- 상품 분류
    - product_type_no	            # 상품 유형 코드
    - product_type_name	            # 상품 유형 명 - “T-shirt”, “Dress”
    - product_group_name            # 상품 대분류 - “Garment Upper body”

- 상품군
    - garment_group_no	            # 의류 그룹 코드
    - garment_group_name	        # 의류 그룹 이름 - “Jersey Basic”

- 카테고리(상위)
    - index_group_no                # 판매 인덱스 그룹 코드
    - index_group_name              # 성별·연령대 기반 그룹 - “Ladieswear”, “Menswear”

- 타깃 관련
    - index_code                    # 중분류 코드
    - index_name                    # 중분류 명 (예: Lingeries/Tights)

- 판매 섹션
    - section_no                    # 섹션 코드
    - section_name                  # 실제 판매 섹션 - “Womens Everyday Basics”

- 부서
    - department_no                 # 내부 부서 코드
    - department_name               # 부서 이름 - 보조적 상품 분류

- 디자인
    - graphical_appearance_no       # 외형 정보 코드 
    - graphical_appearance_name     # 외형 스타일 - “Solid”, “All over pattern”

- 색상
    - colour_group_code             # 색상 그룹 코드
    - colour_group_name             # 상품 색상 명 - “Black”, “White”

- 명도
    - perceived_colour_value_id     # 색 밝기/톤 코드 - 명도 분류
    - perceived_colour_value_name   # 밝기 느낌 - “Dark”, “Light”

- 색 계열
    - perceived_colour_master_id    # 색 계열 코드
    - perceived_colour_master_name  # 색 계열 이름 - “Black”, “Blue”



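## 제거할 컬럼
- *_no, *_code
    - 가독성이 매우 낮고 name 컬럼들과 중복됨 
    - (참고) 아래 구현에서는 코드 값을 버리지 않고 `이름 (코드)` 형태의 *_info 파생 컬럼으로 합쳐서 보존함
- department_*
    - 부서 분류까지 갈 필요성 X
    - (참고) 아래 구현에서는 삭제하지 않고 department_info로 합쳐 최종 결과에 포함함
- prod_name   # 생각 보류 
    - 변수로 사용하기 애매하지만 설명용으로 사용 가능 예상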

## 반드시 필요하다고 판단된 컬럼
- article_id
    - 거래 데이터와 조인하는 키
- product_group_name
    - 상품 분류, 가장 큰 분류
- garment_group_name
    - 의류 대분류, 구매 패턴 비교에 필요
- index_group_name
    - 타겟 그룹(Ladies/Mens 등), 고객군 해석에 필요하다 판단
- section_name
    - 실제 매장/서비스 단위, 실무 해석 필요하다 판단

In [5]:
# Work on a copy so the original `articles` frame is preserved.
art = articles.copy()

### 필요없는 컬럼 제거

In [6]:
# Drop the free-text description column.
# detail_desc is unstructured text with many missing values and does not fit
# the goal of this analysis (pattern / colour / shape / type / line).
art_drop = art.drop(columns=['detail_desc'])

# Per-column missing-value counts after the drop.
missing_after_drop = art_drop.isna().sum()
missing_after_drop

article_id                      0
product_code                    0
prod_name                       0
product_type_no                 0
product_type_name               0
product_group_name              0
graphical_appearance_no         0
graphical_appearance_name       0
colour_group_code               0
colour_group_name               0
perceived_colour_value_id       0
perceived_colour_value_name     0
perceived_colour_master_id      0
perceived_colour_master_name    0
department_no                   0
department_name                 0
index_code                      0
index_name                      0
index_group_no                  0
index_group_name                0
section_no                      0
section_name                    0
garment_group_no                0
garment_group_name              0
dtype: int64

### 제품 색상 코드 컬럼 정리

제품 색상을 나타내는 컬럼은 
- 'colour_group_code', 'colour_group_name', 
- 'perceived_colour_value_id', 'perceived_colour_value_name', 
- 'perceived_colour_master_id', 'perceived_colour_master_name'  로 총 6개이다. <br>
데이터를 단순화하기 위해 'perceived_colour_master_id', 'perceived_colour_master_name' 2개 컬럼을 대표로 사용하고 나머지 4개 컬럼은 삭제한다.

In [7]:
# Tidy up the redundant / hard-to-read colour columns.
# The project uses the "master" colour family as its colour representation.
# perceived_colour_value_* describes lightness/tone and is excluded from the
# scope of this analysis for now.
colour_group_cols = ["colour_group_code", "colour_group_name"]
colour_value_cols = ["perceived_colour_value_id", "perceived_colour_value_name"]
drop_cols = colour_group_cols + colour_value_cols
art_drop = art_drop.drop(columns=drop_cols)

art_drop.columns

Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'perceived_colour_master_id',
       'perceived_colour_master_name', 'department_no', 'department_name',
       'index_code', 'index_name', 'index_group_no', 'index_group_name',
       'section_no', 'section_name', 'garment_group_no', 'garment_group_name'],
      dtype='str')

In [8]:
# Missing-value count per remaining column, largest first.
art_drop.isna().sum().sort_values(ascending=False)

article_id                      0
product_code                    0
prod_name                       0
product_type_no                 0
product_type_name               0
product_group_name              0
graphical_appearance_no         0
graphical_appearance_name       0
perceived_colour_master_id      0
perceived_colour_master_name    0
department_no                   0
department_name                 0
index_code                      0
index_name                      0
index_group_no                  0
index_group_name                0
section_no                      0
section_name                    0
garment_group_no                0
garment_group_name              0
dtype: int64

### 컬럼 고유값 확인

#### product type

In [9]:
# Frequency of each (product_type_no, product_type_name) pair.
product_type_pairs = art_drop[['product_type_no', 'product_type_name']]
product_type_pairs.value_counts()

product_type_no  product_type_name
272              Trousers             11169
265              Dress                10362
252              Sweater               9302
255              T-shirt               7904
254              Top                   4155
                                      ...  
366              Towel                    1
493              Wood balls               1
464              Pre-walkers              1
468              Bumbag                   1
483              Clothing mist            1
Name: count, Length: 132, dtype: int64

In [10]:
# Verify that every product_type_no maps to a single product_type_name.
# Only the no -> name direction is checked; codes with more than one name
# would be listed in the printed Series.
grouped_names = art_drop.groupby('product_type_no')['product_type_name']
check = grouped_names.nunique()
print(check.loc[check.gt(1)])

Series([], Name: product_type_name, dtype: int64)


product_type_no 하나당 product_type_name이 하나씩만 대응됨 (no → name 방향만 확인했으며, name → no 방향의 유일성은 검증하지 않음)<br> 
→ 두 컬럼을 하나로 합치는 파생 컬럼 생성

In [11]:
# Build the representative product-type column: "<type name> (<type no>)".
type_no_text = art_drop['product_type_no'].astype(str)
art_drop['product_type_info'] = art_drop['product_type_name'] + " (" + type_no_text + ")"

# Show the two source columns next to the derived one.
preview_cols = ["product_type_no", "product_type_name", "product_type_info"]
art_drop[preview_cols].head()

Unnamed: 0,product_type_no,product_type_name,product_type_info
0,253,Vest top,Vest top (253)
1,253,Vest top,Vest top (253)
2,253,Vest top,Vest top (253)
3,306,Bra,Bra (306)
4,306,Bra,Bra (306)


#### graphical_appearance

In [12]:
# Ten most frequent (graphical_appearance_no, graphical_appearance_name) pairs
# (pattern / appearance).
appearance_pairs = art_drop[['graphical_appearance_no', 'graphical_appearance_name']]
appearance_pairs.value_counts().head(10)

graphical_appearance_no  graphical_appearance_name
1010016                  Solid                        49747
1010001                  All over pattern             17165
1010010                  Melange                       5938
1010017                  Stripe                        4990
1010023                  Denim                         4842
1010008                  Front print                   3215
1010014                  Placement print               3098
1010004                  Check                         2178
1010005                  Colour blocking               1830
1010021                  Lace                          1513
Name: count, dtype: int64

In [13]:
# Verify that every graphical_appearance_no maps to a single
# graphical_appearance_name (no -> name direction only); offending codes
# would be listed in the printed Series.
grouped_names = art_drop.groupby('graphical_appearance_no')['graphical_appearance_name']
check2 = grouped_names.nunique()
print(check2.loc[check2.gt(1)])

Series([], Name: graphical_appearance_name, dtype: int64)


graphical_appearance_no 하나당 graphical_appearance_name이 하나씩만 대응됨 (no → name 방향만 확인했으며, name → no 방향의 유일성은 검증하지 않음)<br>
 → 두 컬럼을 하나로 합치는 파생 컬럼 생성

In [14]:
# Build the representative pattern/appearance column:
# "<appearance name> (<appearance no>)".
appearance_no_text = art_drop['graphical_appearance_no'].astype(str)
art_drop['graphical_appearance_info'] = (
    art_drop['graphical_appearance_name'] + " (" + appearance_no_text + ")"
)

# Show the two source columns next to the derived one.
preview_cols = [
    "graphical_appearance_no",
    "graphical_appearance_name",
    "graphical_appearance_info",
]
art_drop[preview_cols].head()

Unnamed: 0,graphical_appearance_no,graphical_appearance_name,graphical_appearance_info
0,1010016,Solid,Solid (1010016)
1,1010016,Solid,Solid (1010016)
2,1010017,Stripe,Stripe (1010017)
3,1010016,Solid,Solid (1010016)
4,1010016,Solid,Solid (1010016)


#### perceived_colour_master

In [15]:
# Most frequent (perceived_colour_master_id, perceived_colour_master_name)
# pairs (colour family).
colour_master_pairs = art_drop[['perceived_colour_master_id', 'perceived_colour_master_name']]
colour_master_pairs.value_counts().head()

perceived_colour_master_id  perceived_colour_master_name
5                           Black                           22585
2                           Blue                            18469
9                           White                           12665
4                           Pink                             9403
12                          Grey                             8924
Name: count, dtype: int64

In [16]:
# (Validation) every perceived_colour_master_id should map to a single
# perceived_colour_master_name (id -> name direction only); offending ids
# would be listed in the printed Series.
grouped_names = art_drop.groupby('perceived_colour_master_id')['perceived_colour_master_name']
check3 = grouped_names.nunique()
print(check3.loc[check3.gt(1)])

Series([], Name: perceived_colour_master_name, dtype: int64)


perceived_colour_master_id 하나당 perceived_colour_master_name이 하나씩만 대응됨 (id → name 방향만 확인했으며, name → id 방향의 유일성은 검증하지 않음)<br> 
→ 두 컬럼을 하나로 합치는 파생 컬럼 생성

In [17]:
# Build the representative colour-family column: "<master name> (<master id>)".
master_id_text = art_drop['perceived_colour_master_id'].astype(str)
art_drop['perceived_colour_master_info'] = (
    art_drop['perceived_colour_master_name'] + " (" + master_id_text + ")"
)

# Show the two source columns next to the derived one.
preview_cols = [
    "perceived_colour_master_id",
    "perceived_colour_master_name",
    "perceived_colour_master_info",
]
art_drop[preview_cols].head()

Unnamed: 0,perceived_colour_master_id,perceived_colour_master_name,perceived_colour_master_info
0,5,Black,Black (5)
1,9,White,White (9)
2,9,White,White (9)
3,5,Black,Black (5)
4,9,White,White (9)


#### department

In [18]:
# Most frequent (department_no, department_name) pairs (department).
department_pairs = art_drop[['department_no', 'department_name']]
department_pairs.value_counts().head()

department_no  department_name        
7616           Kids Girl Jersey Fancy     2032
1338           Expressive Lingerie        1921
8716           Young Girl Jersey Fancy    1874
4242           Swimwear                   1839
7648           Kids Boy Jersey Fancy      1488
Name: count, dtype: int64

In [19]:
# (Validation) every department_no should map to a single department_name
# (no -> name direction only); offending codes would be listed in the
# printed Series.
grouped_names = art_drop.groupby('department_no')['department_name']
check4 = grouped_names.nunique()
print(check4.loc[check4.gt(1)])

Series([], Name: department_name, dtype: int64)


department_no 하나당 department_name이 하나씩만 대응됨 (no → name 방향만 확인했으며, name → no 방향의 유일성은 검증하지 않음)<br>
→ 두 컬럼을 하나로 합치는 파생 컬럼 생성

In [20]:
# Build the representative department column: "<department name> (<department no>)".
department_no_text = art_drop['department_no'].astype(str)
art_drop['department_info'] = art_drop['department_name'] + " (" + department_no_text + ")"

# Show the two source columns next to the derived one.
preview_cols = ["department_no", "department_name", "department_info"]
art_drop[preview_cols].head()

Unnamed: 0,department_no,department_name,department_info
0,1676,Jersey Basic,Jersey Basic (1676)
1,1676,Jersey Basic,Jersey Basic (1676)
2,1676,Jersey Basic,Jersey Basic (1676)
3,1339,Clean Lingerie,Clean Lingerie (1339)
4,1339,Clean Lingerie,Clean Lingerie (1339)


#### index

In [21]:
# Frequency of each (index_code, index_name) pair.
index_pairs = art_drop[['index_code', 'index_name']]
index_pairs.value_counts()

index_code  index_name                    
A           Ladieswear                        26001
D           Divided                           15149
F           Menswear                          12553
H           Children Sizes 92-140             12007
I           Children Sizes 134-170             9214
G           Baby Sizes 50-98                   8875
C           Ladies Accessories                 6961
B           Lingeries/Tights                   6775
J           Children Accessories, Swimwear     4615
S           Sport                              3392
Name: count, dtype: int64

In [22]:
# (Validation) every index_code should map to a single index_name
# (code -> name direction only); offending codes would be listed in the
# printed Series.
grouped_names = art_drop.groupby('index_code')['index_name']
check5 = grouped_names.nunique()
print(check5.loc[check5.gt(1)])

Series([], Name: index_name, dtype: int64)


art_drop.

index_code 하나당 index_name이 하나씩만 대응됨 (code → name 방향만 확인했으며, name → code 방향의 유일성은 검증하지 않음)<br> 
→ 두 컬럼을 하나로 합치는 파생 컬럼 생성

In [23]:
# Build the representative index column: "<index name> (<index code>)".
index_code_text = art_drop['index_code'].astype(str)
art_drop['index_info'] = art_drop['index_name'] + " (" + index_code_text + ")"

# Show the two source columns next to the derived one.
preview_cols = ["index_code", "index_name", "index_info"]
art_drop[preview_cols].head()


Unnamed: 0,index_code,index_name,index_info
0,A,Ladieswear,Ladieswear (A)
1,A,Ladieswear,Ladieswear (A)
2,A,Ladieswear,Ladieswear (A)
3,B,Lingeries/Tights,Lingeries/Tights (B)
4,B,Lingeries/Tights,Lingeries/Tights (B)


#### index_group

In [24]:
# Frequency of each (index_group_no, index_group_name) pair
# (target groups such as Ladies / Mens).
index_group_pairs = art_drop[['index_group_no', 'index_group_name']]
index_group_pairs.value_counts()

index_group_no  index_group_name
1               Ladieswear          39737
4               Baby/Children       34711
2               Divided             15149
3               Menswear            12553
26              Sport                3392
Name: count, dtype: int64

In [25]:
# (Validation) every index_group_no should map to a single index_group_name
# (no -> name direction only); offending codes would be listed in the
# printed Series.
# Stored as check6 rather than reusing check5, so the index_code result is
# not overwritten and the check, check2, ... check8 numbering stays consistent.
check6 = art_drop.groupby('index_group_no')['index_group_name'].nunique()
print(check6[check6 > 1])

Series([], Name: index_group_name, dtype: int64)


index_group_no 하나당 index_group_name이 하나씩만 대응됨 (no → name 방향만 확인했으며, name → no 방향의 유일성은 검증하지 않음)<br>
→ 두 컬럼을 하나로 합치는 파생 컬럼 생성

In [26]:
# Build the representative index-group column: "<group name> (<group no>)".
index_group_no_text = art_drop['index_group_no'].astype(str)
art_drop['index_group_info'] = art_drop['index_group_name'] + " (" + index_group_no_text + ")"

# Show the two source columns next to the derived one.
preview_cols = ["index_group_no", "index_group_name", "index_group_info"]
art_drop[preview_cols].head()

Unnamed: 0,index_group_no,index_group_name,index_group_info
0,1,Ladieswear,Ladieswear (1)
1,1,Ladieswear,Ladieswear (1)
2,1,Ladieswear,Ladieswear (1)
3,1,Ladieswear,Ladieswear (1)
4,1,Ladieswear,Ladieswear (1)


#### section

In [27]:
# Frequency of each (section_no, section_name) pair (line / section level).
section_pairs = art_drop[['section_no', 'section_name']]
section_pairs.value_counts()

section_no  section_name                  
15          Womens Everyday Collection        7295
53          Divided Collection                7124
44          Baby Essentials & Complements     4932
76          Kids Girl                         4469
77          Young Girl                        3899
61          Womens Lingerie                   3598
79          Girls Underwear & Basics          3490
11          Womens Tailoring                  3376
46          Kids Boy                          3328
66          Womens Small accessories          3270
6           Womens Casual                     2725
45          Kids Outerwear                    2665
18          Womens Trend                      2622
50          Divided Projects                  2364
47          Young Boy                         2352
2           H&M+                              2337
26          Men Underwear                     2322
8           Mama                              2266
42          Kids & Baby Shoes          

In [28]:
# (Validation) every section_no should map to a single section_name
# (no -> name direction only); offending codes would be listed in the
# printed Series.
grouped_names = art_drop.groupby('section_no')['section_name']
check7 = grouped_names.nunique()
print(check7.loc[check7.gt(1)])

Series([], Name: section_name, dtype: int64)


section_no 하나당 section_name이 하나씩만 대응됨 (no → name 방향만 확인했으며, name → no 방향의 유일성은 검증하지 않음)<br>
→ 두 컬럼을 하나로 합치는 파생 컬럼 생성

In [29]:
# Build the representative section column: "<section name> (<section no>)".
section_no_text = art_drop['section_no'].astype(str)
art_drop['section_info'] = (
    art_drop['section_name'] + " (" + section_no_text + ")"
)
art_drop.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,perceived_colour_master_id,perceived_colour_master_name,...,section_name,garment_group_no,garment_group_name,product_type_info,graphical_appearance_info,perceived_colour_master_info,department_info,index_info,index_group_info,section_info
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,5,Black,...,Womens Everyday Basics,1002,Jersey Basic,Vest top (253),Solid (1010016),Black (5),Jersey Basic (1676),Ladieswear (A),Ladieswear (1),Womens Everyday Basics (16)
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,White,...,Womens Everyday Basics,1002,Jersey Basic,Vest top (253),Solid (1010016),White (9),Jersey Basic (1676),Ladieswear (A),Ladieswear (1),Womens Everyday Basics (16)
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,9,White,...,Womens Everyday Basics,1002,Jersey Basic,Vest top (253),Stripe (1010017),White (9),Jersey Basic (1676),Ladieswear (A),Ladieswear (1),Womens Everyday Basics (16)
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,5,Black,...,Womens Lingerie,1017,"Under-, Nightwear",Bra (306),Solid (1010016),Black (5),Clean Lingerie (1339),Lingeries/Tights (B),Ladieswear (1),Womens Lingerie (61)
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,White,...,Womens Lingerie,1017,"Under-, Nightwear",Bra (306),Solid (1010016),White (9),Clean Lingerie (1339),Lingeries/Tights (B),Ladieswear (1),Womens Lingerie (61)


#### garment_group

In [30]:
# Frequency of each (garment_group_no, garment_group_name) pair
# (garment top-level category).
garment_group_pairs = art_drop[['garment_group_no', 'garment_group_name']]
garment_group_pairs.value_counts()

garment_group_no  garment_group_name           
1005              Jersey Fancy                     21445
1019              Accessories                      11519
1002              Jersey Basic                      8126
1003              Knitwear                          7490
1017              Under-, Nightwear                 7441
1009              Trousers                          6727
1010              Blouses                           5838
1020              Shoes                             5145
1013              Dresses Ladies                    4874
1007              Outdoor                           4501
1001              Unknown                           3873
1016              Trousers Denim                    3100
1018              Swimwear                          2787
1021              Socks and Tights                  2272
1011              Shirts                            2116
1006              Woven/Jersey/Knitted mix Baby     1965
1025              Shorts                

In [31]:
# (Validation) every garment_group_no should map to a single
# garment_group_name (no -> name direction only); offending codes would be
# listed in the printed Series.
grouped_names = art_drop.groupby('garment_group_no')['garment_group_name']
check8 = grouped_names.nunique()
print(check8.loc[check8.gt(1)])

Series([], Name: garment_group_name, dtype: int64)


garment_group_no 하나당 garment_group_name이 하나씩만 대응됨 (no → name 방향만 확인했으며, name → no 방향의 유일성은 검증하지 않음)<br>
→ 두 컬럼을 하나로 합치는 파생 컬럼 생성

In [32]:
# Build the representative garment-group column: "<group name> (<group no>)".
garment_group_no_text = art_drop['garment_group_no'].astype(str)
art_drop['garment_group_info'] = (
    art_drop['garment_group_name'] + " (" + garment_group_no_text + ")"
)

# Show the two source columns next to the derived one.
preview_cols = ["garment_group_no", "garment_group_name", "garment_group_info"]
art_drop[preview_cols].head()

Unnamed: 0,garment_group_no,garment_group_name,garment_group_info
0,1002,Jersey Basic,Jersey Basic (1002)
1,1002,Jersey Basic,Jersey Basic (1002)
2,1002,Jersey Basic,Jersey Basic (1002)
3,1017,"Under-, Nightwear","Under-, Nightwear (1017)"
4,1017,"Under-, Nightwear","Under-, Nightwear (1017)"


### 요약한 컬럼을 활용해 df 생성

In [33]:
# Keep only the columns that make up the final output table.
final_columns = [
    "article_id", "product_code", "prod_name",
    "product_type_info", "product_group_name",
    "graphical_appearance_info", "perceived_colour_master_info",
    "department_info",
    "index_info", "index_group_info",
    "section_info", "garment_group_info",
]
# Copy so that later changes to new_art do not touch art_drop.
new_art = art_drop[final_columns].copy()

new_art.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_info,product_group_name,graphical_appearance_info,perceived_colour_master_info,department_info,index_info,index_group_info,section_info,garment_group_info
0,108775015,108775,Strap top,Vest top (253),Garment Upper body,Solid (1010016),Black (5),Jersey Basic (1676),Ladieswear (A),Ladieswear (1),Womens Everyday Basics (16),Jersey Basic (1002)
1,108775044,108775,Strap top,Vest top (253),Garment Upper body,Solid (1010016),White (9),Jersey Basic (1676),Ladieswear (A),Ladieswear (1),Womens Everyday Basics (16),Jersey Basic (1002)
2,108775051,108775,Strap top (1),Vest top (253),Garment Upper body,Stripe (1010017),White (9),Jersey Basic (1676),Ladieswear (A),Ladieswear (1),Womens Everyday Basics (16),Jersey Basic (1002)
3,110065001,110065,OP T-shirt (Idro),Bra (306),Underwear,Solid (1010016),Black (5),Clean Lingerie (1339),Lingeries/Tights (B),Ladieswear (1),Womens Lingerie (61),"Under-, Nightwear (1017)"
4,110065002,110065,OP T-shirt (Idro),Bra (306),Underwear,Solid (1010016),White (9),Clean Lingerie (1339),Lingeries/Tights (B),Ladieswear (1),Womens Lingerie (61),"Under-, Nightwear (1017)"


## 모든 컬럼명을 단어별 첫 글자 대문자(Title Case)로 변경

In [34]:
# Title-case every column name, e.g. "article_id" -> "Article_Id".
new_art.columns = list(map(str.title, new_art.columns))
new_art.columns

Index(['Article_Id', 'Product_Code', 'Prod_Name', 'Product_Type_Info',
       'Product_Group_Name', 'Graphical_Appearance_Info',
       'Perceived_Colour_Master_Info', 'Department_Info', 'Index_Info',
       'Index_Group_Info', 'Section_Info', 'Garment_Group_Info'],
      dtype='str')

## 최종점검

In [35]:
# Final check: preview the first rows of the processed table.
new_art.head()

Unnamed: 0,Article_Id,Product_Code,Prod_Name,Product_Type_Info,Product_Group_Name,Graphical_Appearance_Info,Perceived_Colour_Master_Info,Department_Info,Index_Info,Index_Group_Info,Section_Info,Garment_Group_Info
0,108775015,108775,Strap top,Vest top (253),Garment Upper body,Solid (1010016),Black (5),Jersey Basic (1676),Ladieswear (A),Ladieswear (1),Womens Everyday Basics (16),Jersey Basic (1002)
1,108775044,108775,Strap top,Vest top (253),Garment Upper body,Solid (1010016),White (9),Jersey Basic (1676),Ladieswear (A),Ladieswear (1),Womens Everyday Basics (16),Jersey Basic (1002)
2,108775051,108775,Strap top (1),Vest top (253),Garment Upper body,Stripe (1010017),White (9),Jersey Basic (1676),Ladieswear (A),Ladieswear (1),Womens Everyday Basics (16),Jersey Basic (1002)
3,110065001,110065,OP T-shirt (Idro),Bra (306),Underwear,Solid (1010016),Black (5),Clean Lingerie (1339),Lingeries/Tights (B),Ladieswear (1),Womens Lingerie (61),"Under-, Nightwear (1017)"
4,110065002,110065,OP T-shirt (Idro),Bra (306),Underwear,Solid (1010016),White (9),Clean Lingerie (1339),Lingeries/Tights (B),Ladieswear (1),Womens Lingerie (61),"Under-, Nightwear (1017)"


In [36]:
# Final check: column list, non-null counts and dtypes of the processed table.
new_art.info()

<class 'pandas.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 12 columns):
 #   Column                        Non-Null Count   Dtype
---  ------                        --------------   -----
 0   Article_Id                    105542 non-null  int64
 1   Product_Code                  105542 non-null  int64
 2   Prod_Name                     105542 non-null  str  
 3   Product_Type_Info             105542 non-null  str  
 4   Product_Group_Name            105542 non-null  str  
 5   Graphical_Appearance_Info     105542 non-null  str  
 6   Perceived_Colour_Master_Info  105542 non-null  str  
 7   Department_Info               105542 non-null  str  
 8   Index_Info                    105542 non-null  str  
 9   Index_Group_Info              105542 non-null  str  
 10  Section_Info                  105542 non-null  str  
 11  Garment_Group_Info            105542 non-null  str  
dtypes: int64(2), str(10)
memory usage: 9.7 MB


In [37]:
# Final check: size of the processed table as (rows, columns).
new_art.shape

(105542, 12)

In [38]:
# Final check: rank columns by missing-value count and show the top three.
missing_per_column = new_art.isna().sum()
na_a = missing_per_column.sort_values(ascending=False)
na_a.head(3)

Article_Id      0
Product_Code    0
Prod_Name       0
dtype: int64

In [39]:
# Persist the processed article table as CSV, without the row index.
# The "utf-8-sig" codec prepends a UTF-8 BOM to the file.
from pathlib import Path

output_path = Path("data/articles_processed.csv")
# to_csv does not create missing directories, so make sure "data/" exists
# before writing (it is a different folder from the input "h&m_dataset/").
output_path.parent.mkdir(parents=True, exist_ok=True)
new_art.to_csv(output_path, index=False, encoding="utf-8-sig")