In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

from yellowbrick.cluster import KElbowVisualizer
# k 값 참고: distance map 라이브러리 import 
from yellowbrick.cluster import intercluster_distance

# k 값 참고: 실루엣 계수 확인을 위한 라이브러리 import 
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Data directory. Defaults to the original author's local path, but can be
# overridden with the ECOMMERCE_DATA_DIR environment variable so the notebook
# runs on other machines without editing this cell.
data_path = os.environ.get('ECOMMERCE_DATA_DIR', '/Users/jun/GitStudy/Data_4/Data/eCommerce3')
# The working directory is changed on purpose: later cells read CSVs by bare file name.
os.chdir(data_path)
# Keep only orders that have both an approval and a delivery timestamp.
orders = pd.read_csv('orders.csv').dropna(subset=['order_approved_at', 'order_delivered_timestamp'])

In [None]:
# Parse the order timestamp columns; values that cannot be parsed become NaT.
for date_column in (
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_timestamp',
    'order_estimated_delivery_date',
):
    orders[date_column] = pd.to_datetime(orders[date_column], errors='coerce')

# Date sanity check: an order whose timeline runs backwards
# (purchased after approval, or approved after delivery) is treated as a suspected outlier.
purchased_after_approval = orders['order_purchase_timestamp'] > orders['order_approved_at']
approved_after_delivery = orders['order_approved_at'] > orders['order_delivered_timestamp']
Check_date_outliers = orders[purchased_after_approval | approved_after_delivery]

# Unique ids of the suspicious orders, as a plain Python list.
out_ids = Check_date_outliers['order_id'].unique().tolist()

In [None]:
data_cleaned_df = pd.read_csv('capstone_data_cleaned.csv')

# order_item_id and customer_zip_code_prefix: convert from numbers to strings.
for id_column in ('order_item_id', 'customer_zip_code_prefix'):
    data_cleaned_df[id_column] = data_cleaned_df[id_column].astype(str)

# Parse the timestamp columns.
for timestamp_column in (
    'order_purchase_timestamp',
    'order_delivered_timestamp',
    'order_approved_at',
    'order_estimated_delivery_date',
):
    data_cleaned_df[timestamp_column] = pd.to_datetime(data_cleaned_df[timestamp_column])

# Payment per row: item price plus shipping charges.
data_cleaned_df['total_payment'] = data_cleaned_df['price'].add(data_cleaned_df['shipping_charges'])

# Product volume: height * length * width.
data_cleaned_df['volume'] = (
    data_cleaned_df['product_height_cm']
    .mul(data_cleaned_df['product_length_cm'])
    .mul(data_cleaned_df['product_width_cm'])
)

# Drop the columns that are not used downstream.
columns_to_remove = [
    'order_estimated_delivery_date',
    'shipping_charges',
    'price',
    'payment_value',
    'customer_city',
    'order_approved_at',
]

retail_df = data_cleaned_df.drop(columns=columns_to_remove)

In [None]:
# Coarse product groups: each list holds the raw `product_category_name` values
# that are folded into one high-level category.
# NOTE(review): spellings such as "costruction_*", "fashio_female_clothing" and
# "home_confort" are kept verbatim; presumably they match the raw data labels.
# Verify against the dataset before "fixing" them.
electronics = [
    'audio',
    'computers_accessories',
    'electronics',
    'telephony',
    'tablets_printing_image',
    'computers',
    'cine_photo',
    'dvds_blu_ray',
    'fixed_telephony',
    'consoles_games',
]

food = [
    'food',
    'drinks',
    'food_drink',
    'la_cuisine',
]

toys = ['toys']

home_appliances = [
    'home_appliances',
    'home_appliances_2',
]

furniture = [
    'housewares',
    'furniture_decor',
    'bed_bath_table',
    'kitchen_dining_laundry_garden_furniture',
    'furniture_living_room',
    'furniture_bedroom',
    'furniture_mattress_and_upholstery',
    'home_confort',
    'home_comfort_2',
    'office_furniture',
]

construction = [
    'costruction_tools_tools',
    'construction_tools_lights',
    'construction_tools_safety',
    'home_construction',
    'construction_tools_construction',
]

fashion_beauty = [
    'fashion_bags_accessories',
    'fashion_shoes',
    'fashion_male_clothing',
    'watches_gifts',
    'fashio_female_clothing',
    'fashion_childrens_clothes',
    'fashion_underwear_beach',
    'fashion_sport',
    'cool_stuff',
    'health_beauty',
    'perfumery',
    'luggage_accessories',
    'sports_leisure',
]

baby_products = [
    'baby',
    'diapers_and_hygiene',
]

arts_hobbies = [
    'art',
    'arts_and_craftmanship',
    'music',
    'musical_instruments',
    'books_general_interest',
    'books_technical',
    'books_imported',
    'christmas_supplies',
    'stationery',
    'party_supplies',
    'garden_tools',
    'flowers',
    'costruction_tools_garden',
]

industry = [
    'industry_commerce_and_business',
    'agro_industry_and_commerce',
    'market_place',
]

security = [
    'signaling_and_security',
    'security_and_services',
]

# Explicit examples of the catch-all group.
others = [
    'pet_shop',
    'auto',
]

In [None]:
def categorize_product(row):
    """Map a raw product category name to its high-level category label.

    Args:
        row: A single raw category value (one cell of the category column,
            despite the parameter name).

    Returns:
        The label of the first group list that contains ``row``, checked in
        the order listed below, or ``'others'`` when no list contains it.
    """
    # Checked top to bottom; the first matching group wins.
    ordered_groups = (
        ('electronics', electronics),
        ('food', food),
        ('toys', toys),
        ('home_appliances', home_appliances),
        ('furniture', furniture),
        ('construction', construction),
        ('fashion_beauty', fashion_beauty),
        ('baby_products', baby_products),
        ('arts_hobbies', arts_hobbies),
        ('industry', industry),
        ('security', security),
    )
    for label, members in ordered_groups:
        if row in members:
            return label
    return 'others'

# Replace the raw `product_category_name` column with the coarse `category`
# label, then renumber the rows from zero.
retail_df = (
    retail_df
    .assign(category=retail_df['product_category_name'].apply(categorize_product))
    .drop(columns='product_category_name')
    .reset_index(drop=True)
)

In [None]:
# Canonical ordering of payment types (sorted unique values) and each type's rank in it.
payment_list = sorted(retail_df['payment_type'].unique())
payment_rank = {payment: rank for rank, payment in enumerate(payment_list)}

# One row per order: the distinct payment types used, joined with '/' in canonical order.
result = (
    retail_df
    .groupby('order_id')['payment_type']
    .apply(lambda types: '/'.join(sorted(set(types), key=payment_rank.__getitem__)))
    .reset_index()
)

# Collapse the remaining columns to one row per order.
# NOTE(review): `.first()` keeps the first non-null value per column, so for an order
# with several rows only one row's price/weight/volume/category survives. Confirm
# that this is intended rather than a per-order sum.
retail_cleaned = retail_df.drop(columns='payment_type')
retail_grouped = retail_cleaned.groupby('order_id').first().reset_index()

# Attach the per-order attributes to the combined payment-type table.
merged_df = pd.merge(result, retail_grouped, how='left', on='order_id')


In [None]:
# Delivery lead time: whole hours (floored) from purchase to delivery.
merged_df['delivery_hours'] = (
    merged_df['order_delivered_timestamp'] - merged_df['order_purchase_timestamp']
).dt.total_seconds() // 3600
merged_df = merged_df.drop(columns='order_delivered_timestamp')

# Recency: days between each order and the most recent purchase in the data
# (1 for an order placed on the latest day).
# Series.max() is used instead of the builtin max(): the builtin walks the Series
# element by element in Python, and because comparisons against NaT are always
# False its result depends on where a NaT sits. Series.max() skips NaT.
max_date = retail_df['order_purchase_timestamp'].max()  # most recent purchase
merged_df['Diff_days'] = (max_date - merged_df['order_purchase_timestamp']).dt.days + 1
merged_df = merged_df.drop(columns='order_purchase_timestamp')

# Keep only orders from the last 365 days. `.copy()` makes the result an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
merged_df = merged_df[merged_df['Diff_days'] <= 365].copy()

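# Loyal: within 3 months (frequent / active customers).
# Potential: 3-6 months (customers who may be re-activated).
# At Risk: 6-9 months (customers at risk of churning).
# Lost: 9-12 months (customers who have almost churned).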

def diff_type(n):
    """Classify a recency value (in days) into a customer segment.

    The value is bucketed into 91-day windows via ``(n - 1) // 91``:
    bucket below 1 -> ``'loyal'``, below 2 -> ``'potential'``,
    below 3 -> ``'at_risk'``, anything else -> ``'lost'``.

    Args:
        n: Days since the most recent purchase, where 1 means the latest day.

    Returns:
        One of ``'loyal'``, ``'potential'``, ``'at_risk'``, ``'lost'``.
    """
    window = (n - 1) // 91
    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, segment in ((1, 'loyal'), (2, 'potential'), (3, 'at_risk')):
        if window < upper_bound:
            return segment
    return 'lost'

# Label every order with its recency segment.
merged_df['Diff_type'] = merged_df['Diff_days'].map(diff_type)


In [None]:
# Length/height/width were already combined into `volume`, so the raw dimensions
# are dropped; orders flagged as date outliers (`out_ids`) are then removed.
merged_df2 = (
    merged_df
    .drop(columns=['product_length_cm', 'product_height_cm', 'product_width_cm'])
    .loc[lambda frame: ~frame['order_id'].isin(out_ids)]
)

In [None]:
# Columns used for clustering, and the categorical subset that gets one-hot encoded.
feature_names = ['payment_type', 'product_weight_g', 'total_payment', 'volume', 'category', 'Diff_days']
columns_to_encode = ['payment_type', 'category']

merged_df_f = pd.DataFrame(merged_df2, columns=feature_names)

# Fit the one-hot encoder on the categorical columns and produce a dense matrix.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
categorical_part = merged_df_f[columns_to_encode]
encoder.fit(categorical_part)
encoded_data = encoder.transform(categorical_part)

# Name the encoded columns after their source column and category value.
encoded_columns = encoder.get_feature_names_out(columns_to_encode)
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)

# `encoded_df` has a fresh default index, so the remaining (non-encoded) columns
# are re-indexed from zero before the two parts are placed side by side.
remaining_part = merged_df_f.drop(columns=columns_to_encode).reset_index(drop=True)
merged_result = pd.concat([remaining_part, encoded_df], axis=1)

In [None]:
# Split the feature table: one-hot columns are left untouched, everything else is standardised.
one_hot_part = merged_result[encoded_columns]
columns_to_scale = merged_result.columns.difference(encoded_columns)
data_to_scale = merged_result.loc[:, columns_to_scale]

# Fit the scaler on the non-encoded columns and apply it to the same data.
scaler = StandardScaler()
scaler.fit(data_to_scale)
scaled_data = scaler.transform(data_to_scale)

# Wrap the scaled array back into a frame that shares `merged_result`'s index.
scaled_df = pd.DataFrame(data=scaled_data, index=merged_result.index, columns=columns_to_scale)

# Final model input: scaled columns followed by the one-hot columns.
final_result = pd.concat([scaled_df, one_hot_part], axis=1)

In [None]:
from sklearn.decomposition import PCA

# Trial PCA fit with six components, used to judge how many components to keep.
pca = PCA(n_components=6).fit(final_result)

In [None]:
# Run PCA: project the scaled feature table onto the principal components.
pca_scores = pca.fit_transform(final_result)

# Column labels (PC1, PC2, ...) are derived from the actual number of components,
# so they stay correct if `n_components` is changed where `pca` is created.
pca_df = pd.DataFrame(
    data=pca_scores,
    columns=[f'PC{i}' for i in range(1, pca_scores.shape[1] + 1)],
)

In [None]:
# Cluster on the principal-component columns only.
# This cell writes a `Cluster` column into `pca_df` at the end. Dropping that column
# here (if it is already present) keeps the cell re-runnable: without it, a second
# run would feed the previous labels into KMeans as a feature and produce a
# duplicate `Cluster` column in `kmeans_df`.
pc_features = pca_df.drop(columns='Cluster', errors='ignore')

# Four clusters (n_clusters=4), random initial centroids, fixed seed for reproducibility.
kmeans = KMeans(n_clusters=4, random_state=42, init='random')

# Fit k-means on the PCA scores.
kmeans.fit(pc_features)

# Cluster label assigned to each row.
labels = kmeans.labels_

# PCA scores plus the assigned cluster as the last column.
kmeans_df = pc_features.assign(Cluster=labels)

# Also store the labels on the PCA frame itself.
pca_df['Cluster'] = labels


In [None]:
# Number of rows assigned to each cluster (non-null PC1 values per cluster).
cluster_sizes = kmeans_df.groupby(['Cluster'])['PC1'].count()
cluster_sizes.reset_index()

In [None]:
# Cluster centroids from the fitted model, labelled PC1..PC6 and printed as a table.
cluster_centers = kmeans.cluster_centers_
centers_table = pd.DataFrame(data=cluster_centers, columns=[f'PC{k}' for k in range(1, 7)])
print(centers_table)