In [118]:
import pandas as pd
import numpy as np

data = {
    'User_ID': [1, 2, 3, 4, 1, 5, 6, 4, 7, 8], 
    'Name': [' Kevin', 'Tom', 'Alice', 'Mike ', ' Kevin', 'Sarah', 'Bob', 'Mike ', 'Jenny', 'Dan'],
    'Age': [25, np.nan, 30, 200, 25, 28, np.nan, 200, 15, 45],
    'Total_Spend': ['$1,000', '0', '$500', '$10,000', '$1,000', '0', np.nan, '$10,000', '$200', 'error'],
    'Membership': ['VIP', 'Basic', np.nan, 'VIP', 'VIP', np.nan, 'Basic', 'VIP', 'Basic', 'VIP'],
    'Join_Date': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-01', '2024-01-05', '2024-01-06', '2024-01-04', '2024-01-07', '20240108']
}
#User_ID : PK이다. "1,4"번 중복 문제
#Name : 공백 문제
#Age : nan존재, 200 이상치 존재
#Total_Spend : 문자열, nan, 에러
#Membership : nan
#Join_Data : 날짜 형식 통일

df = pd.DataFrame(data)
print(df)

   User_ID    Name    Age Total_Spend Membership   Join_Date
0        1   Kevin   25.0      $1,000        VIP  2024-01-01
1        2     Tom    NaN           0      Basic  2024-01-02
2        3   Alice   30.0        $500        NaN  2024-01-03
3        4   Mike   200.0     $10,000        VIP  2024-01-04
4        1   Kevin   25.0      $1,000        VIP  2024-01-01
5        5   Sarah   28.0           0        NaN  2024-01-05
6        6     Bob    NaN         NaN      Basic  2024-01-06
7        4   Mike   200.0     $10,000        VIP  2024-01-04
8        7   Jenny   15.0        $200      Basic  2024-01-07
9        8     Dan   45.0       error        VIP    20240108


In [119]:
#1. 데이터 파악 : info(), describe()
print("===info 파악하기===")
print(df.info(),'\n\n')
#info 결과 : 
    #nan값 파악
print(df.isnull().sum()) #Age(2), Total_Spend(1), Membership(2) 결측치 처리 필요

    #정규화 필요
    #total_spend, join_date 값 숫자로 변경

===info 파악하기===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   User_ID      10 non-null     int64  
 1   Name         10 non-null     object 
 2   Age          8 non-null      float64
 3   Total_Spend  9 non-null      object 
 4   Membership   8 non-null      object 
 5   Join_Date    10 non-null     object 
dtypes: float64(1), int64(1), object(4)
memory usage: 612.0+ bytes
None 


User_ID        0
Name           0
Age            2
Total_Spend    1
Membership     2
Join_Date      0
dtype: int64


In [120]:
#2. 결측치 처리
    #1) Age : 중앙값으로 처리
df['Age'] = df['Age'].fillna(df['Age'].median())
print(df['Age'],'\n\n')

    #2) Total_Spend : 정규화 이후 진행
    #3) Membership : Basic 처리 (도메인 지식 : 등급이 없는건 신규유저(Basic)이다.)
df['Membership'] = df['Membership'].fillna('Basic')
print(df['Membership'])

0     25.0
1     29.0
2     30.0
3    200.0
4     25.0
5     28.0
6     29.0
7    200.0
8     15.0
9     45.0
Name: Age, dtype: float64 


0      VIP
1    Basic
2    Basic
3      VIP
4      VIP
5    Basic
6    Basic
7      VIP
8    Basic
9      VIP
Name: Membership, dtype: object


In [121]:
#3. 정규화
    #Name : 공백제거
df['Name'] = df['Name'].str.strip()
# print(df['Name'],'\n\n')

    #Total_Spend : "$", "," 제거
df['Total_Spend'] = df['Total_Spend'].str.replace('$','')
df['Total_Spend'] = df['Total_Spend'].str.replace(',','')

    #Total_Spend : 이상한 문자값 NaN처리
df['Total_Spend'] = pd.to_numeric(df['Total_Spend'], errors = 'coerce')

    #Total_Spend 결측치 처리 : NaN = 구매기록 없음으로 보고 0으로 처리
df['Total_Spend'] = df['Total_Spend'].fillna(0)
# print(df['Total_Spend'], '\n\n')

    #Join_Data : 날짜 객체로 바꾸기
    #(8자리 날짜 살려보기)
df['Join_Date_8'] = pd.to_datetime(
    df['Join_Date'],
    errors = 'coerce'
)
mask = df['Join_Date_8'].isna() #8자리였던거
df.loc[mask, 'Join_Date_8'] = pd.to_datetime(
    df.loc[mask, 'Join_Date'], #자리로 찾아서 na값을 날짜로 바꿈
    format='%Y%m%d',
    errors='coerce'
)
df['Join_Date'] = df['Join_Date_8']
df = df.drop(columns='Join_Date_8')
print(df['Join_Date'])

0   2024-01-01
1   2024-01-02
2   2024-01-03
3   2024-01-04
4   2024-01-01
5   2024-01-05
6   2024-01-06
7   2024-01-04
8   2024-01-07
9   2024-01-08
Name: Join_Date, dtype: datetime64[ns]


In [122]:
#4. 중복 제거
print(df)
    #ID : 중복, 값도 같네
df = df.drop_duplicates(
    subset=['User_ID'],
    keep='last'
)

   User_ID   Name    Age  Total_Spend Membership  Join_Date
0        1  Kevin   25.0       1000.0        VIP 2024-01-01
1        2    Tom   29.0          0.0      Basic 2024-01-02
2        3  Alice   30.0        500.0      Basic 2024-01-03
3        4   Mike  200.0      10000.0        VIP 2024-01-04
4        1  Kevin   25.0       1000.0        VIP 2024-01-01
5        5  Sarah   28.0          0.0      Basic 2024-01-05
6        6    Bob   29.0          0.0      Basic 2024-01-06
7        4   Mike  200.0      10000.0        VIP 2024-01-04
8        7  Jenny   15.0        200.0      Basic 2024-01-07
9        8    Dan   45.0          0.0        VIP 2024-01-08


In [None]:
#5. 이상치처리
    #Age가 200살 넘음 -> 처리기준이 애매해 지움. 그 기준이 뭘까
df = df[df['Age']<=100]
    #Age만 NaN으로 남겨둘 수도 있다 (행 유지)
    #df.loc[df['Age']>100, 'Age'] = np.nan
print(df)

   User_ID   Name   Age  Total_Spend Membership  Join_Date
1        2    Tom  29.0          0.0      Basic 2024-01-02
2        3  Alice  30.0        500.0      Basic 2024-01-03
4        1  Kevin  25.0       1000.0        VIP 2024-01-01
5        5  Sarah  28.0          0.0      Basic 2024-01-05
6        6    Bob  29.0          0.0      Basic 2024-01-06
8        7  Jenny  15.0        200.0      Basic 2024-01-07
9        8    Dan  45.0          0.0        VIP 2024-01-08
