# customer_hm
`customer_hm.csv`(고객 메타데이터)를 분석용으로 바로 merge 가능한 형태로 정리한다.

In [1]:
import pandas as pd
# Load the H&M customer metadata table and preview the first rows.
# NOTE(review): the info() output below reports exactly 1,048,575 rows, which
# equals a spreadsheet's maximum number of data rows under a header line — the
# CSV may have been truncated by a spreadsheet export; confirm against the
# original dataset before relying on row counts.
customer = pd.read_csv("h&m_dataset/customer_hm.csv")
customer.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0,0,ACTIVE,NONE,49
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0,0,ACTIVE,NONE,25
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0,0,ACTIVE,NONE,24
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0,0,ACTIVE,NONE,54
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1,1,ACTIVE,Regularly,52


In [2]:
# Inspect column dtypes, non-null counts and memory usage of the raw table.
customer.info()

<class 'pandas.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 6 columns):
 #   Column                  Non-Null Count    Dtype
---  ------                  --------------    -----
 0   customer_id             1048575 non-null  str  
 1   FN                      1048575 non-null  int64
 2   Active                  1048575 non-null  int64
 3   club_member_status      1048575 non-null  str  
 4   fashion_news_frequency  1048574 non-null  str  
 5   age                     1048575 non-null  int64
dtypes: int64(3), str(3)
memory usage: 48.0 MB


In [3]:
# Check the (rows, columns) size of the raw table.
customer.shape

(1048575, 6)

In [4]:
# Count missing values per column, sorted so the columns with the most
# missing values come first; show the top three.
na_c = customer.isna().sum().sort_values(ascending=False)
na_c.head(3)

fashion_news_frequency    1
customer_id               0
FN                        0
dtype: int64

# 전처리 작업
- customer_id                   # 식별자 (조인 키)
- FN                            # 패션 뉴스 수신 여부
- Active                        # 활성 고객 여부
- club_member_status            # 클럽 가입 상태 (신규 / 활성 / 탈퇴)
- fashion_news_frequency        # 뉴스 알림 주기
- age                           # 연령대 분석


### 결측치 여부
- fashion_news_frequency에 결측치가 1개 존재한다.
    - 별도의 카테고리로 구성

### 형변환
- FN, Active의 값은 0/1로 구성되어 있다. 불리언(참/거짓)으로 변환

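### 중복여부
- 이 노트북에서는 `customer_id`의 중복 여부를 별도로 확인하지 않았다. 조인 키로 사용하기 전에 `customer["customer_id"].duplicated().sum()`으로 확인이 필요하다.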

### 이상치 여부
데이터 내에서 age의 범위는 16~99로 정상 범주에 속한다고 볼 수 있다. <br>
튀는 값 자체는 존재하나 값들이 상식선 안에 있기 때문에 대부분 유지하고, 80세 이상만 아래 'age 이상치 분석'에서 비중을 확인한 뒤 제외한다.

In [5]:
# Work on a copy so the raw `customer` frame stays untouched.
cust = customer.copy()

## fashion_news_frequency 결측치 처리 (아래 'fashion_news_frequency 처리' 절에서 'None'으로 대체)

## age 이상치 분석

In [6]:
# Number of rows whose age is 80 or above.
is_80_plus = cust["age"] >= 80
count_80 = int(is_80_plus.sum())
print(f"80세 이상 데이터 건수: {count_80}건")

80세 이상 데이터 건수: 1147건


In [7]:
# Share of the 80+ rows in the whole table, expressed as a percentage.
total_count = cust.shape[0]
percentage = count_80 / total_count * 100

print(f"80세 이상 비중: {percentage:.4f}%")

80세 이상 비중: 0.1094%


Age의 Min: 16, Max: 99 <br>
→ 80세 이상 데이터는 전체 데이터의 약 0.1% 수준이고, 너무 고령이어서 분석에서 유의미한 인사이트를 도출하기 어렵다고 판단해 제거

In [8]:
# Exclude elderly customers (drop ages 80 and above).
# .copy() makes customer_drop an independent frame, so the column assignments
# performed on it later in this notebook are unambiguous writes rather than
# writes to a filtered selection of `cust` (which raises SettingWithCopyWarning
# on pandas versions that do not use Copy-on-Write).
customer_drop = cust[cust['age'] < 80].copy()

In [9]:
# Summary statistics of the numeric columns after the age filter (3 decimals).
customer_drop.describe().round(3)

Unnamed: 0,FN,Active,age
count,1047428.0,1047428.0,1047428.0
mean,0.355,0.346,36.318
std,0.479,0.476,14.233
min,0.0,0.0,16.0
25%,0.0,0.0,24.0
50%,0.0,0.0,32.0
75%,1.0,1.0,49.0
max,1.0,1.0,79.0


In [10]:
# Re-check dtypes and non-null counts after the age filter.
customer_drop.info()

<class 'pandas.DataFrame'>
Index: 1047428 entries, 0 to 1048574
Data columns (total 6 columns):
 #   Column                  Non-Null Count    Dtype
---  ------                  --------------    -----
 0   customer_id             1047428 non-null  str  
 1   FN                      1047428 non-null  int64
 2   Active                  1047428 non-null  int64
 3   club_member_status      1047428 non-null  str  
 4   fashion_news_frequency  1047427 non-null  str  
 5   age                     1047428 non-null  int64
dtypes: int64(3), str(3)
memory usage: 55.9 MB


In [11]:
# Missing-value counts per column after the age filter, largest first.
na_c = customer_drop.isna().sum().sort_values(ascending=False)
na_c.head(3)

fashion_news_frequency    1
customer_id               0
FN                        0
dtype: int64

### 나이대 구성

In [12]:
# Derive an age-band label column ("10s", "20s", ...) from the decade of `age`.
decade_start = customer_drop["age"] // 10 * 10
customer_drop["age_group"] = decade_start.astype(int).astype(str) + "s"

# Collapse every band from age 60 upward into the single label "60s+".
is_60_plus = customer_drop["age"] >= 60
customer_drop.loc[is_60_plus, "age_group"] = "60s+"

# Confirm the derived column has no missing values.
customer_drop[["age_group"]].isna().sum()

age_group    0
dtype: int64

In [13]:
# Row count per age band, ordered by band label.
print(customer_drop['age_group'].value_counts().sort_index())

age_group
10s      55256
20s     409196
30s     181112
40s     157445
50s     174715
60s+     69704
Name: count, dtype: int64


### 각 컬럼의 고유값 확인

In [14]:
# Distinct values of FN and how often each occurs.
customer_drop['FN'].value_counts()

FN
0    675069
1    372359
Name: count, dtype: int64

In [15]:
# Distinct values of Active and how often each occurs.
customer_drop['Active'].value_counts()

Active
0    684911
1    362517
Name: count, dtype: int64

In [16]:
# Distinct values of club_member_status and how often each occurs.
customer_drop['club_member_status'].value_counts()

club_member_status
ACTIVE        981598
PRE-CREATE     65471
LEFT CLUB        359
Name: count, dtype: int64

In [17]:
# Distinct values of fashion_news_frequency and how often each occurs
# (value_counts() leaves missing values out by default).
customer_drop['fashion_news_frequency'].value_counts()

fashion_news_frequency
NONE         674002
Regularly    372767
Monthly         658
Name: count, dtype: int64

#### 컬럼 고유값 형식이 동일하게 되도록 변경 (ex. ACTIVE → Active)

In [18]:
# Start the cleaning stage from an independent copy of the filtered frame.
customer_clean = customer_drop.copy()

In [19]:
# Normalize label casing (e.g. "ACTIVE" -> "Active", "LEFT CLUB" -> "Left Club").
# Read from customer_clean itself rather than customer_drop, so the cell only
# depends on the frame it writes to and gives the same result when re-run.
customer_clean['club_member_status'] = customer_clean['club_member_status'].str.title()
customer_clean['club_member_status'].value_counts()

club_member_status
Active        981598
Pre-Create     65471
Left Club        359
Name: count, dtype: int64

In [20]:
# Normalize label casing (e.g. "NONE" -> "None").
# Read from customer_clean itself rather than customer_drop: sourcing from
# customer_drop would, on a re-run, overwrite any later corrections made to
# this column with the raw values again.
customer_clean['fashion_news_frequency'] = customer_clean['fashion_news_frequency'].str.capitalize()
customer_clean['fashion_news_frequency'].value_counts()

fashion_news_frequency
None         674002
Regularly    372767
Monthly         658
Name: count, dtype: int64

### FN, Active 불리언 형으로 형변환

In [21]:
# Convert the 0/1 flag columns FN and Active to pandas' nullable "boolean"
# dtype; .map() leaves any value other than 0/1 as missing.
flag_to_bool = {1: True, 0: False}
for flag_col in ("FN", "Active"):
    converted = customer_clean[flag_col].map(flag_to_bool)
    customer_clean[flag_col] = converted.astype("boolean")

customer_clean[["FN", "Active"]].info()

<class 'pandas.DataFrame'>
Index: 1047428 entries, 0 to 1048574
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   FN      1047428 non-null  boolean
 1   Active  1047428 non-null  boolean
dtypes: boolean(2)
memory usage: 12.0 MB


### FN 값과 fashion_news_frequency 간 관계 파악

In [22]:
# Frequency label counts, for comparison against the FN counts.
customer_clean['fashion_news_frequency'].value_counts()

fashion_news_frequency
None         674002
Regularly    372767
Monthly         658
Name: count, dtype: int64

In [23]:
# FN (True/False) counts, for comparison against the frequency label counts.
customer_clean['FN'].value_counts()

FN
False    675069
True     372359
Name: count, dtype: Int64

- FN
    - False:     675069
    - True:      372359  


- fashion_news_frequency
    - None:         674002
    - Regularly:    372767
    - Monthly:         658


→ FN에서 0 개수와 fashion_news_frequency에서 None 개수가 맞지 않음

차이가 발생하게 된 이유 추측: <br>
FN(뉴스 구독)은 0으로 해놓았으나, 뉴스 수신 빈도는 Regularly 또는 Monthly로 설정  

<br>

추가적으로, 전체 개수로는 FN 0인 행(675,069)이 fashion_news_frequency None인 행(674,002)보다 많지만, 반대로 FN 1인데 fashion_news_frequency가 None인 행도 존재한다 == FN 1인데 주기 설정을 아직 안 해서 뉴스를 못 받는 경우로 추정


### fashion_news_frequency 처리
- 결측치(1개)는 `None`으로 처리한다.
- `FN`이 False(뉴스 미구독)인데 `fashion_news_frequency`가 Regularly/Monthly 등으로 들어간 케이스가 있어 정합성을 맞춘다.
    - `FN == False`이면 `fashion_news_frequency = 'None'`으로 강제한다.
    - `FN == True`인데 `fashion_news_frequency == 'None'`이면 FN도 False로 통일

In [24]:
# fashion_news_frequency: fill the missing value with "None" and normalize casing.
frequency = customer_clean["fashion_news_frequency"].fillna("None")
customer_clean["fashion_news_frequency"] = frequency.astype(str).str.capitalize()

# Consistency fix 1: not subscribed (FN is False; a missing FN counts as False)
# -> the frequency is always forced to "None".
mask_fn_false = ~customer_clean["FN"].fillna(False)
customer_clean.loc[mask_fn_false, "fashion_news_frequency"] = "None"

# Consistency fix 2: subscribed (FN is True) but the frequency is "None"
# -> FN is set to False as well.
has_no_frequency = customer_clean["fashion_news_frequency"] == "None"
mask_mismatch = customer_clean["FN"].fillna(False) & has_no_frequency
customer_clean.loc[mask_mismatch, "FN"] = False

customer_clean["fashion_news_frequency"].value_counts(dropna=False)

fashion_news_frequency
None         675694
Regularly    371084
Monthly         650
Name: count, dtype: int64

In [25]:
# FN counts after the consistency fix.
customer_clean['FN'].value_counts()

FN
False    675694
True     371734
Name: count, dtype: Int64

### 정리한 컬럼을 활용해 df 생성

In [26]:
# Final output frame: a copy of the cleaned table. All columns are kept —
# no column subset is selected here.
new_customer = customer_clean.copy()
new_customer.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,age_group
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,False,False,Active,,49,40s
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,False,False,Active,,25,20s
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,False,False,Active,,24,20s
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,False,False,Active,,54,50s
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,True,True,Active,Regularly,52,50s


## 모든 컬럼명을 대상으로 첫 글자 대문자로 변경

In [27]:
# Title-case every column name (e.g. "customer_id" -> "Customer_Id", "FN" -> "Fn").
new_customer.columns = list(map(str.title, new_customer.columns))
new_customer.columns

Index(['Customer_Id', 'Fn', 'Active', 'Club_Member_Status',
       'Fashion_News_Frequency', 'Age', 'Age_Group'],
      dtype='str')

## 최종점검

In [28]:
# Final check: preview the first rows with the renamed columns.
new_customer.head()

Unnamed: 0,Customer_Id,Fn,Active,Club_Member_Status,Fashion_News_Frequency,Age,Age_Group
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,False,False,Active,,49,40s
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,False,False,Active,,25,20s
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,False,False,Active,,24,20s
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,False,False,Active,,54,50s
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,True,True,Active,Regularly,52,50s


In [29]:
# Final check: dtypes and non-null counts of the output frame.
new_customer.info()

<class 'pandas.DataFrame'>
Index: 1047428 entries, 0 to 1048574
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Customer_Id             1047428 non-null  str    
 1   Fn                      1047428 non-null  boolean
 2   Active                  1047428 non-null  boolean
 3   Club_Member_Status      1047428 non-null  str    
 4   Fashion_News_Frequency  1047428 non-null  str    
 5   Age                     1047428 non-null  int64  
 6   Age_Group               1047428 non-null  str    
dtypes: boolean(2), int64(1), str(4)
memory usage: 51.9 MB


- Customer_Id               : 고객 식별키
- Fn                        : 패션 뉴스 수신 여부
- Active                    : 활성 고객 여부
- Club_Member_Status        : 클럽 가입 상태 (신규 / 활성 / 탈퇴)
- Fashion_News_Frequency    : 뉴스 알림 주기
- Age                       : 고객의 나이
- Age_Group                 : 고객의 나이대 분류

In [30]:
# Final check: (rows, columns) size of the output frame.
new_customer.shape

(1047428, 7)

In [31]:
# Final check: missing-value counts per column, largest first (top three).
na_a = new_customer.isna().sum().sort_values(ascending=False)
na_a.head(3)

Customer_Id    0
Fn             0
Active         0
dtype: int64

In [32]:
# Write the cleaned table to CSV without the index, encoded as UTF-8 with BOM.
# NOTE(review): Fashion_News_Frequency holds the literal string "None"; pandas'
# read_csv is documented to parse "None" as a missing value by default, so
# readers of this file may need keep_default_na=False (or an explicit
# na_values list) to get the label back — confirm on read-back.
new_customer.to_csv("data/customer_processed.csv", index=False, encoding="utf-8-sig")