In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')


In [6]:
# Load ./customer.csv (decoded as UTF-8) into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='./customer.csv', encoding='utf-8')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 8:26,3.39,17850.0,United Kingdom


In [7]:
# Report the number of rows in the freshly loaded frame (thousands-separated),
# as the baseline for the filtering steps that follow.
print(f"원본 데이터: {len(df):,}건")

원본 데이터: 541,909건


In [8]:
# Discard rows with a missing CustomerID; .copy() yields a frame independent of df.
df_clean = df.dropna(subset=['CustomerID']).copy()
# Report how many rows remain.
print(f"CustomerID 있는 데이터: {len(df_clean):,}건")

CustomerID 있는 데이터: 406,829건


In [9]:
# Retain only rows whose Quantity is strictly greater than zero.
df_clean = df_clean.loc[df_clean['Quantity'].gt(0)]
# Report how many rows remain.
print(f"반품제외수량: {len(df_clean):,}건")

반품제외수량: 397,924건


In [10]:
# Keep only rows with a strictly positive UnitPrice.
# .copy() detaches the result from the frame it was sliced from, so the column
# assignments made on df_clean afterwards write to an independent DataFrame
# instead of a slice (pandas flags the latter with SettingWithCopyWarning, which
# this notebook silences via warnings.filterwarnings('ignore')).
df_clean = df_clean[df_clean['UnitPrice'] > 0].copy()
# Report how many rows remain.
print(f"반품제외단가: {len(df_clean):,}건")

반품제외단가: 397,884건


In [13]:
# Parse InvoiceDate into datetimes; errors='coerce' turns values that cannot be
# parsed into NaT instead of raising.
df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'], errors='coerce')
# Derive a monthly period column and a date-only column from the parsed timestamp.
df_clean['InvoiceYearMonth'] = df_clean['InvoiceDate'].dt.to_period('M')
df_clean['InvoiceDate_only'] = df_clean['InvoiceDate'].dt.date

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 397884 entries, 0 to 541908
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   InvoiceNo         397884 non-null  object        
 1   StockCode         397884 non-null  object        
 2   Description       397884 non-null  object        
 3   Quantity          397884 non-null  int64         
 4   InvoiceDate       397884 non-null  datetime64[ns]
 5   UnitPrice         397884 non-null  float64       
 6   CustomerID        397884 non-null  float64       
 7   Country           397884 non-null  object        
 8   InvoiceYearMonth  397884 non-null  period[M]     
 9   InvoiceDate_only  397884 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5), period[M](1)
memory usage: 33.4+ MB
None


In [14]:
# Line-level revenue: element-wise product of Quantity and UnitPrice.
df_clean['Revenue'] = df_clean['Quantity'].mul(df_clean['UnitPrice'])

In [15]:
# Summarise the cleaned frame: row count, number of distinct CustomerID values,
# and the earliest and latest InvoiceDate.
print(f"\n최종 데이터: {len(df_clean):,}건")
print(f"고유 고객 수: {df_clean['CustomerID'].nunique():,}명")
print(f"기간: {df_clean['InvoiceDate'].min()} ~ {df_clean['InvoiceDate'].max()}")



최종 데이터: 397,884건
고유 고객 수: 4,338명
기간: 2010-12-01 08:26:00 ~ 2011-12-09 12:50:00


In [16]:
# Write the cleaned frame to CSV, omitting the index column.
df_clean.to_csv(path_or_buf='./online_retail_clean.csv', index=False)
# Confirm completion and echo the output path.
print("\n✅ 전처리 완료: ./online_retail_clean.csv")


✅ 전처리 완료: ./online_retail_clean.csv


# 1. 데이터 전처리

- 원본 데이터: 541,909건   
- CustomerID 있는 데이터: 406,829건   
- 반품제외수량 (Quantity > 0): 397,924건   
- 반품제외단가 (UnitPrice > 0): 397,884건   

---

- 최종 데이터: 397,884건   
- 고유 고객 수: 4,338명   
- 기간: 2010-12-01 08:26:00 ~ 2011-12-09 12:50:00   
