# **匯入套件**

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm # 進度條
from collections import Counter
import pickle
import gzip


# **讀取資料**

*   **原始檔案保留欄位**： 用戶ID、項目ID、交易日期
*   **新增欄位**：購物籃ID(同用戶同天交易的項目視為一個購物籃)、項目新索引
*   **計算原始欄位唯一值數量**：用戶、項目和交易日期
*   **轉換日期格式**： 11/1/2000 --> 2000-11-01
*   **排序**：先按用戶ID排序，對於每位用戶，則按照交易日期排序

處理前：
*   Customer length = 32266
*   Product length = 23812
*   Transaction length = 120




In [2]:
df = pd.read_csv('raw_data/ta_feng_all_months_merged.csv')
df.head(20)

Unnamed: 0,TRANSACTION_DT,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,ASSET,SALES_PRICE
0,11/1/2000,1104905,45-49,115,110411,4710199010372,2,24,30
1,11/1/2000,418683,45-49,115,120107,4710857472535,1,48,46
2,11/1/2000,1057331,35-39,115,100407,4710043654103,2,142,166
3,11/1/2000,1849332,45-49,Others,120108,4710126092129,1,32,38
4,11/1/2000,1981995,50-54,115,100205,4710176021445,1,14,18
5,11/1/2000,1741797,35-39,115,110122,78895770025,1,54,75
6,11/1/2000,308359,60-64,115,110507,4710192225520,1,85,105
7,11/1/2000,1607000,35-39,221,520503,4712936888817,1,45,68
8,11/1/2000,1057331,35-39,115,320203,4715398106864,2,70,78
9,11/1/2000,236645,35-39,Unknown,120110,4710126091870,1,43,53


In [3]:
# 僅留用戶ID、項目ID、交易日期
ta_feng_df = df.loc[:,["CUSTOMER_ID","PRODUCT_ID","TRANSACTION_DT"]]

# 新增兩個欄位：購物籃ID、每個項目新的索引
ta_feng_df.loc[:,["CART_ID","new_item_id"]] = ""

ta_feng_df

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,new_item_id
0,1104905,4710199010372,11/1/2000,,
1,418683,4710857472535,11/1/2000,,
2,1057331,4710043654103,11/1/2000,,
3,1849332,4710126092129,11/1/2000,,
4,1981995,4710176021445,11/1/2000,,
...,...,...,...,...,...
817736,312790,4713317035042,2/28/2001,,
817737,57486,4710731060124,2/28/2001,,
817738,733526,4716340052307,2/28/2001,,
817739,173704,4714276145315,2/28/2001,,


In [4]:
# 檢查資料集是否有空值跟重複的資料
ta_feng_df.info()
ta_feng_df.duplicated().any()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 817741 entries, 0 to 817740
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   CUSTOMER_ID     817741 non-null  int64 
 1   PRODUCT_ID      817741 non-null  int64 
 2   TRANSACTION_DT  817741 non-null  object
 3   CART_ID         817741 non-null  object
 4   new_item_id     817741 non-null  object
dtypes: int64(2), object(3)
memory usage: 31.2+ MB


False

In [5]:
# 檢查原始資料集各欄位的數量

CUSTOMER_ID_u = ta_feng_df["CUSTOMER_ID"].unique()
PRODUCT_ID_u = ta_feng_df["PRODUCT_ID"].unique()
TRANSACTION_DT_u = ta_feng_df["TRANSACTION_DT"].unique()

print(" --- Raw Data Information --- ")
print("Customer length =",len(CUSTOMER_ID_u))
print("Product length =",len(PRODUCT_ID_u))
print("Transaction length =",len(TRANSACTION_DT_u))

 --- Raw Data Information --- 
Customer length = 32266
Product length = 23812
Transaction length = 120


In [6]:
# 轉換日期格式：datetime64
ta_feng_df['TRANSACTION_DT'] = ta_feng_df['TRANSACTION_DT'].astype('datetime64[ns]')

# 排序：先按 CUSTOMER_ID 排序，對於相同的 CUSTOMER_ID ，按照 TRANSACTION_DT 的值排序
ta_feng_df = ta_feng_df.sort_values(["CUSTOMER_ID","TRANSACTION_DT"])

ta_feng_df.head(20)

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,new_item_id
89853,1069,9556439880610,2000-11-13,,
90164,1069,4710176008699,2000-11-13,,
542122,1069,4710320224661,2001-01-21,,
542544,1069,4710022101208,2001-01-21,,
542959,1069,4712603661644,2001-01-21,,
686679,1069,4710088620156,2001-02-03,,
687898,1069,4710176008699,2001-02-03,,
688980,1069,22000167620,2001-02-03,,
689424,1069,4710011402026,2001-02-03,,
691872,1069,4903101125107,2001-02-03,,


# 處理

1.   刪除最不頻繁的項目，使剩下的項目保留了所有交易中95%以上的購買量
2.   刪除項目數量小於3的購物籃 （先不加）
3.   刪除購物籃筆數少於 3 的用戶資料
4.   重新計算商品編號
5.   新增購物籃編號


處理後：
*   Customer length = 13972
*   Product length = 12085
*   CART length = 93901
*   Transaction length = 120

## 處理1: 刪除最不頻繁的項目，使剩下的項目保留了所有交易中95%以上的購買量

In [7]:
# 計算每個項目的交易總量
product_purchase_counts = ta_feng_df['PRODUCT_ID'].value_counts()

# 計算交易總量的累積百分比
cumulative_percentage = product_purchase_counts.cumsum() / product_purchase_counts.sum()

# 找出滿足至少 95% 交易量的商品
valid_products = cumulative_percentage[cumulative_percentage <= 0.95].index

# 僅保留這些商品
ta_feng_df = ta_feng_df[ta_feng_df.PRODUCT_ID.isin(valid_products)]

In [8]:
#剩餘項目數量
PRODUCT_ID_u = ta_feng_df["PRODUCT_ID"].unique()

print(" --- Process 1 --- ")
print("Product length =",len(PRODUCT_ID_u))

 --- Process 1 --- 
Product length = 12085


## 處理2：刪除項目數量小於3的購物籃

In [9]:
print(" --- Process 2 & 3 --- ")

# 計算每個購物籃（CUSTOMER_ID 和 TRANSACTION_DT 組合）的商品數量
#basket_item_count = ta_feng_df.groupby(['CUSTOMER_ID', 'TRANSACTION_DT']).size()

# 篩選出商品數量大於或等於 3 的購物籃
#valid_baskets = basket_item_count[basket_item_count >= 3].index

# 保留這些有效的購物籃在原始數據中
#ta_feng_df = ta_feng_df.set_index(['CUSTOMER_ID', 'TRANSACTION_DT']).loc[valid_baskets].reset_index()

 --- Process 2 & 3 --- 


## 處理3: 刪除購物籃筆數少於 3 的用戶資料

In [10]:
# 計算每個用戶的購物籃數量
customer_basket_count = ta_feng_df.groupby('CUSTOMER_ID')['TRANSACTION_DT'].nunique()

# 篩選出購物籃數量少於 3 的用戶
customers_to_remove = customer_basket_count[customer_basket_count < 3].index

# 刪除這些用戶
ta_feng_df = ta_feng_df[~ta_feng_df['CUSTOMER_ID'].isin(customers_to_remove)]

ta_feng_df.head(20)

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,new_item_id
89853,1069,9556439880610,2000-11-13,,
90164,1069,4710176008699,2000-11-13,,
542122,1069,4710320224661,2001-01-21,,
542544,1069,4710022101208,2001-01-21,,
542959,1069,4712603661644,2001-01-21,,
686679,1069,4710088620156,2001-02-03,,
687898,1069,4710176008699,2001-02-03,,
688980,1069,22000167620,2001-02-03,,
689424,1069,4710011402026,2001-02-03,,
691872,1069,4903101125107,2001-02-03,,


In [11]:
# 顯示剩餘
CUSTOMER_ID_u = ta_feng_df["CUSTOMER_ID"].unique()
PRODUCT_ID_u = ta_feng_df["PRODUCT_ID"].unique()

print("Product length =",len(PRODUCT_ID_u))
print("Customer length =",len(CUSTOMER_ID_u))

Product length = 12085
Customer length = 13972


## 處理4: 重新計算商品編號

In [12]:
# 檢查當前的項目是否在字典中，如果不在則代表是一個新商品編號
itemid_dict = {}
new_id = 0

def give_item_id(x):
    if x not in itemid_dict.keys():
        if itemid_dict:
            new_id = max(itemid_dict.values()) + 1
        else:
            new_id = 0
        itemid_dict[x] = new_id

    return itemid_dict[x]

In [13]:
tqdm.pandas(desc = 'apply')
new_itemID_series = ta_feng_df["PRODUCT_ID"].progress_apply( give_item_id )

#填入對應新編號
ta_feng_df["new_item_id"] = new_itemID_series
ta_feng_df

apply:   0%|          | 0/588025 [00:00<?, ?it/s]

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,new_item_id
89853,1069,9556439880610,2000-11-13,,0
90164,1069,4710176008699,2000-11-13,,1
542122,1069,4710320224661,2001-01-21,,2
542544,1069,4710022101208,2001-01-21,,3
542959,1069,4712603661644,2001-01-21,,4
...,...,...,...,...,...
528162,20002000,20513184,2001-01-20,,1379
531237,20002000,4714541091071,2001-01-20,,7079
532062,20002000,4710018008634,2001-01-20,,2660
661318,20002000,4710085120680,2001-02-05,,309


## 處理5: 新增購物籃編號

In [14]:
# 使用同個客戶、同一購買日期進行分群
df_gp = ta_feng_df.groupby(["CUSTOMER_ID","TRANSACTION_DT"]).groups
df_gp

{(1069, 2000-11-13 00:00:00): [89853, 90164], (1069, 2001-01-21 00:00:00): [542122, 542544, 542959], (1069, 2001-02-03 00:00:00): [686679, 687898, 688980, 689424, 691872], (1069, 2001-02-10 00:00:00): [643871], (1113, 2000-11-12 00:00:00): [76074, 80078], (1113, 2000-11-26 00:00:00): [186918, 188130, 188131], (1113, 2000-11-27 00:00:00): [190132, 193725, 194153, 194573, 194970, 194974], (1113, 2001-01-06 00:00:00): [457372, 458108, 458469, 460077, 461655, 462278], (1823, 2000-11-02 00:00:00): [2001, 2053, 2077, 2256, 2316, 2420, 2488, 2592, 2621], (1823, 2000-11-06 00:00:00): [41999, 44235], (1823, 2001-01-24 00:00:00): [593057, 593444, 593453], (3667, 2000-12-08 00:00:00): [242202], (3667, 2000-12-26 00:00:00): [361469], (3667, 2001-02-01 00:00:00): [709422, 709846, 709859, 711431], (3667, 2001-02-08 00:00:00): [656394, 660699], (5241, 2000-12-16 00:00:00): [330329, 330745, 331143, 333833, 335022, 337296], (5241, 2001-01-07 00:00:00): [465517, 465849, 466674, 467096, 468381, 469098, 4

In [15]:
# 將每個購物籃的索引添加到 df_gp_list 列表中

# k = 每個組合的用戶ID和交易日期，v 是對應的行索引列表
df_gp_list = []
for k,v in tqdm( df_gp.items() ):
    df_gp_list.append( list(pd.Series(v) ) )

  0%|          | 0/93901 [00:00<?, ?it/s]

In [16]:
# 分配購物籃 ID
cart_id_list = [] # 用於儲存每行數據對應的購物籃ID

for items_list in tqdm(df_gp_list):
    cart_id = df_gp_list.index(items_list) # 找到當前購物籃索引列表在 df_gp_list 中的位置(即為購物籃ID）
    for item in items_list: # 當前購物籃的每個行索引
        cart_id_list.append(cart_id) # 添加購物籃ID，每個行索引都對應相同的購物籃ID

  0%|          | 0/93901 [00:00<?, ?it/s]

In [17]:
# 更新數據框 ta_feng_df
ta_feng_df["CART_ID"] = cart_id_list
ta_feng_df.head(20)

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,new_item_id
89853,1069,9556439880610,2000-11-13,0,0
90164,1069,4710176008699,2000-11-13,0,1
542122,1069,4710320224661,2001-01-21,1,2
542544,1069,4710022101208,2001-01-21,1,3
542959,1069,4712603661644,2001-01-21,1,4
686679,1069,4710088620156,2001-02-03,2,5
687898,1069,4710176008699,2001-02-03,2,1
688980,1069,22000167620,2001-02-03,2,6
689424,1069,4710011402026,2001-02-03,2,7
691872,1069,4903101125107,2001-02-03,2,8


In [18]:
# 顯示最終數量
print(" ----- Final Data  -----")

CUSTOMER_ID_u = ta_feng_df["CUSTOMER_ID"].unique()
PRODUCT_ID_u = ta_feng_df["PRODUCT_ID"].unique()
CART_ID_u = ta_feng_df["CART_ID"].unique()
TRANSACTION_DT_u = ta_feng_df["TRANSACTION_DT"].unique()

print("Customer length =",len(CUSTOMER_ID_u))
print("Product length =",len(PRODUCT_ID_u))
print("CART length =",len(CART_ID_u))
print("Transaction length =",len(TRANSACTION_DT_u))

 ----- Final Data  -----
Customer length = 13972
Product length = 12085
CART length = 93901
Transaction length = 120


In [19]:
# 輸出整理後的檔案
ta_feng_clean = "cleaned_dataset/TaFeng_clean.csv"
ta_feng_df.to_csv(ta_feng_clean,sep=",",index=False,header=True)

print(ta_feng_df.head(15))

        CUSTOMER_ID     PRODUCT_ID TRANSACTION_DT  CART_ID  new_item_id
89853          1069  9556439880610     2000-11-13        0            0
90164          1069  4710176008699     2000-11-13        0            1
542122         1069  4710320224661     2001-01-21        1            2
542544         1069  4710022101208     2001-01-21        1            3
542959         1069  4712603661644     2001-01-21        1            4
686679         1069  4710088620156     2001-02-03        2            5
687898         1069  4710176008699     2001-02-03        2            1
688980         1069    22000167620     2001-02-03        2            6
689424         1069  4710011402026     2001-02-03        2            7
691872         1069  4903101125107     2001-02-03        2            8
643871         1069  4712162000038     2001-02-10        3            9
76074          1113  4902105011621     2000-11-12        4           10
80078          1113  4711271000014     2000-11-12        4      

# 處理成 TIFUKNN 可用形式

*   刪除 'PRODUCT_ID' 欄位
*   重新命名 'CART_ID' 為 'ORDER_NUMBER'，'new_item_id' 為 'MATERIAL_NUMBER'
*   將 ta_feng_clean.csv 分成 ta_feng_history.csv 與 ta_feng_future.csv
  *   ta_feng_history.csv：每個用戶的歷史購物記錄
  *   ta_feng_future.csv：每個用戶的最後一次購物紀錄



In [20]:
# 讀取 CSV 檔案
file_path = 'cleaned_dataset/TaFeng_clean.csv'  # 請將 '您的檔案路徑' 替換為實際檔案所在的路徑
data = pd.read_csv(file_path)

# 刪除 'PRODUCT_ID' 欄位
data.drop('PRODUCT_ID', axis=1, inplace=True)

# 重新命名 'CART_ID' 為 'ORDER_NUMBER'，'new_item_id' 為 'MATERIAL_NUMBER'
data.rename(columns={'CART_ID': 'ORDER_NUMBER', 'new_item_id': 'MATERIAL_NUMBER'}, inplace=True)

In [21]:
# 找到每位用戶的最後一個購物籃
last_order_numbers = data.groupby('CUSTOMER_ID')['ORDER_NUMBER'].max().reset_index()

# 將這些購物籃數據保存到 Future CSV 檔
last_orders_data = pd.merge(data, last_order_numbers, on=['CUSTOMER_ID', 'ORDER_NUMBER'])
future_data_path = 'cleaned_dataset/TaFeng_future.csv'
last_orders_data.to_csv(future_data_path, index=False)

# 將刪除最後一個購物籃的數據保存到 History CSV 檔
data = data[~data.set_index(['CUSTOMER_ID', 'ORDER_NUMBER']).index.isin(last_orders_data.set_index(['CUSTOMER_ID', 'ORDER_NUMBER']).index)]
history_data_path = 'cleaned_dataset/TaFeng_history.csv'
data.to_csv(history_data_path, index=False)


# 顯示修改後的數據
print(data.head(15))

    CUSTOMER_ID TRANSACTION_DT  ORDER_NUMBER  MATERIAL_NUMBER
0          1069     2000-11-13             0                0
1          1069     2000-11-13             0                1
2          1069     2001-01-21             1                2
3          1069     2001-01-21             1                3
4          1069     2001-01-21             1                4
5          1069     2001-02-03             2                5
6          1069     2001-02-03             2                1
7          1069     2001-02-03             2                6
8          1069     2001-02-03             2                7
9          1069     2001-02-03             2                8
11         1113     2000-11-12             4               10
12         1113     2000-11-12             4               11
13         1113     2000-11-26             5               10
14         1113     2000-11-26             5               12
15         1113     2000-11-26             5               13
