# customer_hm

In [50]:
from pathlib import Path

import pandas as pd
# Load the raw customer table from the H&M dataset folder and preview the first rows.
CUSTOMER_CSV = "../../main/h&m_dataset/customer_hm.csv"
customer = pd.read_csv(CUSTOMER_CSV)
customer.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0,0,ACTIVE,NONE,49
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0,0,ACTIVE,NONE,25
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0,0,ACTIVE,NONE,24
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0,0,ACTIVE,NONE,54
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1,1,ACTIVE,Regularly,52


## Prepare

In [51]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
customer.info()

<class 'pandas.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 6 columns):
 #   Column                  Non-Null Count    Dtype
---  ------                  --------------    -----
 0   customer_id             1048575 non-null  str  
 1   FN                      1048575 non-null  int64
 2   Active                  1048575 non-null  int64
 3   club_member_status      1048575 non-null  str  
 4   fashion_news_frequency  1048574 non-null  str  
 5   age                     1048575 non-null  int64
dtypes: int64(3), str(3)
memory usage: 48.0 MB


In [52]:
# (rows, columns) of the raw frame.
customer.shape

(1048575, 6)

In [53]:
# Count missing values per column, then list the three columns with the most.
missing_per_column = customer.isna().sum()
na_c = missing_per_column.sort_values(ascending=False)
na_c.head(3)

fashion_news_frequency    1
customer_id               0
FN                        0
dtype: int64

In [54]:
# Smallest and largest age in the raw data (taken from the describe() summary).
age_summary = customer["age"].describe()
age_summary.loc[["min", "max"]]

min    16.0
max    99.0
Name: age, dtype: float64

In [55]:
# Distribution of the FN flag; dropna=False also counts missing values.
customer["FN"].value_counts(dropna=False)

FN
0    675765
1    372810
Name: count, dtype: int64

In [56]:
# Distribution of the Active flag; dropna=False also counts missing values.
customer["Active"].value_counts(dropna=False)

Active
0    685629
1    362946
Name: count, dtype: int64

In [57]:
# Distribution of club_member_status; dropna=False also counts missing values.
customer["club_member_status"].value_counts(dropna=False)

club_member_status
ACTIVE        982635
PRE-CREATE     65581
LEFT CLUB        359
Name: count, dtype: int64

In [58]:
# Distribution of fashion_news_frequency in the raw data; dropna=False also counts missing values.
customer["fashion_news_frequency"].value_counts(dropna=False)

fashion_news_frequency
NONE         674698
Regularly    373218
Monthly         658
NaN               1
Name: count, dtype: int64

In [59]:
# The row count matched the transactions table, so duplicated customers were
# suspected; this check confirms there are no duplicates on customer_id.
# NOTE(review): 1,048,575 rows equals Excel's per-sheet row cap minus the header
# row - confirm the source CSV was not truncated by an Excel round-trip.
duplicated_id_mask = customer["customer_id"].duplicated()
duplicated_id_mask.sum()

np.int64(0)

## 고객 1명당 1행으로 구성된 고객 특성 테이블
- 직접적인 매출 정보가 없기 때문에 거래 데이터와의 결합이 필요

## Prepare 결과
- customer_id                   # 식별자 (조인 키)
- FN                            # 패션 뉴스 수신 여부
- Active                        # 활성 고객 여부
- club_member_status            # 클럽 가입 상태 (신규 / 활성 / 탈퇴)
- fashion_news_frequency        # 뉴스 알림 주기
- age                           # 연령대 분석



### 결측치 여부
- `fashion_news_frequency`에 결측치가 1개 존재한다.
    - 삭제하지 않고 전처리 단계에서 `None` 범주로 채운다.

### 형변환
- `FN`, `Active`의 값은 0/1로 구성되어 있으므로 불리언(참/거짓) 타입으로 변환한다.

### 중복여부
- `customer_id` 기준으로 중복된 행은 없다(0건).

### 이상치 여부
데이터 내에서 `age`의 범위는 16~99로 정상 범주에 속한다고 볼 수 있다. <br>
극단에 가까운 값은 존재하지만, 아직 전처리 초기 단계이고 값들이 상식선 안에 있으므로 그대로 유지한다.

# 데이터 전처리

In [60]:
# Work on an independent (deep) copy so the raw `customer` frame stays untouched.
cust = customer.copy(deep=True)
cust.info()

<class 'pandas.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 6 columns):
 #   Column                  Non-Null Count    Dtype
---  ------                  --------------    -----
 0   customer_id             1048575 non-null  str  
 1   FN                      1048575 non-null  int64
 2   Active                  1048575 non-null  int64
 3   club_member_status      1048575 non-null  str  
 4   fashion_news_frequency  1048574 non-null  str  
 5   age                     1048575 non-null  int64
dtypes: int64(3), str(3)
memory usage: 48.0 MB


In [61]:
# Print the 1/0 distribution of each flag column before it is converted to boolean.
for flag_header, flag_col in (("FN 1/0비율", "FN"), ("\nActive 1/0비율", "Active")):
    print(flag_header)
    print(cust[flag_col].value_counts(dropna=False))

FN 1/0비율
FN
0    675765
1    372810
Name: count, dtype: int64

Active 1/0비율
Active
0    685629
1    362946
Name: count, dtype: int64


In [62]:
# Convert the 0/1 flag columns FN and Active to pandas' nullable "boolean" dtype.
# .map() leaves any value outside the mapping as missing instead of coercing it.
flag_to_bool = {1: True, 0: False}
for flag_col in ("FN", "Active"):
    cust[flag_col] = cust[flag_col].map(flag_to_bool).astype("boolean")

cust[["FN", "Active"]].info()

<class 'pandas.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   FN      1048575 non-null  boolean
 1   Active  1048575 non-null  boolean
dtypes: boolean(2)
memory usage: 4.0 MB


In [63]:
# Remove age outliers: keep rows whose age lies in [10, 100] (both ends inclusive).
# The raw data spans 16-99, so on this dataset the filter drops nothing; it is a
# guard for future reloads of the source file.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in later cells never write into a slice of the previous frame
# (which triggers SettingWithCopyWarning on pandas without Copy-on-Write).
cust = cust[cust["age"].between(10, 100)].copy()

In [64]:
# Derive an age-band column: floor the age to its decade (e.g. 49 -> 40),
# then render it as a label such as "40s".
decade_floor = cust["age"] // 10 * 10
cust["age_group"] = decade_floor.astype(int).astype(str) + "s"

# Sanity check: count missing values in the derived column.
cust[["age_group"]].isna().sum()

age_group    0
dtype: int64

In [65]:
# Number of customers per age band; dropna=False also counts missing values.
cust["age_group"].value_counts(dropna=False)

age_group
20s    409196
30s    181112
50s    174715
40s    157445
60s     56124
10s     55256
70s     13580
80s      1075
90s        72
Name: count, dtype: int64

In [66]:
# Distribution of fashion_news_frequency on the working copy; dropna=False also counts missing values.
cust["fashion_news_frequency"].value_counts(dropna=False)

fashion_news_frequency
NONE         674698
Regularly    373218
Monthly         658
NaN               1
Name: count, dtype: int64

## fashion_news_frequency 정리
- 결측치(1개)는 `None`으로 처리한다.
- `FN`이 False(뉴스 미구독)인데 `fashion_news_frequency`가 Regularly/Monthly 등으로 들어간 케이스가 있어 정합성을 맞춘다.
    - `FN == False`이면 `fashion_news_frequency = 'None'`으로 강제한다.


In [67]:
# Normalise fashion_news_frequency: missing values become "None" and casing is
# unified with capitalize() (e.g. "NONE" -> "None").
freq_normalised = cust["fashion_news_frequency"].fillna("None").astype(str).str.capitalize()
cust["fashion_news_frequency"] = freq_normalised

# Consistency rule 1: not subscribed (FN is False; a missing FN counts as False)
# -> the frequency is forced to "None".
mask_fn_false = cust["FN"].fillna(False).eq(False)
cust.loc[mask_fn_false, "fashion_news_frequency"] = "None"

# Consistency rule 2: subscribed (FN is True) but the frequency is "None"
# -> align FN to False as well.
mask_mismatch = cust["FN"].eq(True) & cust["fashion_news_frequency"].eq("None")
cust.loc[mask_mismatch, "FN"] = False

cust["fashion_news_frequency"].value_counts(dropna=False)


fashion_news_frequency
None         676391
Regularly    371534
Monthly         650
Name: count, dtype: int64

In [68]:
# Final schema check of the processed frame before it is written to disk.
cust.info()

<class 'pandas.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1048575 non-null  str    
 1   FN                      1048575 non-null  boolean
 2   Active                  1048575 non-null  boolean
 3   club_member_status      1048575 non-null  str    
 4   fashion_news_frequency  1048575 non-null  str    
 5   age                     1048575 non-null  int64  
 6   age_group               1048575 non-null  str    
dtypes: boolean(2), int64(1), str(4)
memory usage: 44.0 MB


# 파일저장

In [69]:
# Persist the processed customer table without the index column.
# encoding="utf-8-sig" prepends a UTF-8 byte-order mark to the file.
# NOTE(review): fashion_news_frequency holds the literal string "None", which is
# one of pandas.read_csv's default NA markers - downstream loaders probably need
# keep_default_na=False (or explicit na_values) to get it back as text; confirm.
out_path = Path("data/cust_processed.csv")
# to_csv does not create missing directories, so make sure the target folder exists.
out_path.parent.mkdir(parents=True, exist_ok=True)
cust.to_csv(out_path, index=False, encoding="utf-8-sig")

test