In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the DataCo supply-chain CSV (read with ISO-8859-1 encoding), parse the
# order timestamp and order the rows from oldest to newest.
# Alternative input (disabled): example_DataCoSupplyChainDataset.csv.
dataset_path = 'dataset/DataCoSupplyChainDataset.csv'
df = (
    pd.read_csv(dataset_path, encoding='ISO-8859-1')
    .assign(**{
        'order date (DateOrders)': lambda frame: pd.to_datetime(frame['order date (DateOrders)'])
    })
    .sort_values(by='order date (DateOrders)', ascending=True)
)

# Optional filter (disabled): keep only completed orders.
# df = df[df['Order Status'] == 'COMPLETE']

# Quick look at the result: first rows, row count and the column names.
display(df.head())
print(f"Total number of orders: {len(df.index)}")
print(df.columns.tolist())

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
3026,PAYMENT,3,4,68.25,227.5,Advance shipping,0,24,Women's Apparel,Chicago,...,,502,24,,http://images.acmesports.sports/Nike+Men%27s+D...,Nike Men's Dri-FIT Victory Golf Polo,50.0,0,1/4/2015 0:21,Standard Class
8619,CASH,5,4,33.59,159.940002,Late delivery,1,46,Indoor/Outdoor Games,San Antonio,...,,1014,46,,http://images.acmesports.sports/O%27Brien+Men%...,O'Brien Men's Neoprene Life Vest,49.98,0,1/6/2015 1:03,Standard Class
8618,DEBIT,6,4,9.38,82.970001,Late delivery,1,46,Indoor/Outdoor Games,Caguas,...,,1014,46,,http://images.acmesports.sports/O%27Brien+Men%...,O'Brien Men's Neoprene Life Vest,49.98,0,1/7/2015 1:24,Standard Class
33128,TRANSFER,4,4,16.879999,149.380005,Shipping on time,0,17,Cleats,Caguas,...,,365,17,,http://images.acmesports.sports/Perfect+Fitnes...,Perfect Fitness Perfect Rip Deck,59.990002,0,1/5/2015 2:27,Standard Class
17686,TRANSFER,4,4,1.49,49.5,Shipping on time,0,24,Women's Apparel,Caguas,...,,502,24,,http://images.acmesports.sports/Nike+Men%27s+D...,Nike Men's Dri-FIT Victory Golf Polo,50.0,0,1/5/2015 2:27,Standard Class


Total number of orders: 74223
['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)', 'Benefit per order', 'Sales per customer', 'Delivery Status', 'Late_delivery_risk', 'Category Id', 'Category Name', 'Customer City', 'Customer Country', 'Customer Email', 'Customer Fname', 'Customer Id', 'Customer Lname', 'Customer Password', 'Customer Segment', 'Customer State', 'Customer Street', 'Customer Zipcode', 'Department Id', 'Department Name', 'Latitude', 'Longitude', 'Market', 'Order City', 'Order Country', 'Order Customer Id', 'order date (DateOrders)', 'Order Id', 'Order Item Cardprod Id', 'Order Item Discount', 'Order Item Discount Rate', 'Order Item Id', 'Order Item Product Price', 'Order Item Profit Ratio', 'Order Item Quantity', 'Sales', 'Order Item Total', 'Order Profit Per Order', 'Order Region', 'Order State', 'Order Status', 'Order Zipcode', 'Product Card Id', 'Product Category Id', 'Product Description', 'Product Image', 'Product Name', 'Product Price', 'Product Sta

In [17]:
# Feature columns fed to the buyer-side and seller-side preprocessing.
# An earlier variant (disabled) also listed 'Category Name' in both lists and
# 'Department Name' in the seller list.

# Order-item columns that appear, in this order, in both feature lists.
_shared_order_features = [
    'Order Item Product Price',
    'Order Item Discount Rate',
    'Order Item Profit Ratio',
    'Order Profit Per Order',
]

buyer_features = [
    'Customer Country',
    'Customer Segment',
    *_shared_order_features,
    'Customer Id',
]

seller_features = [
    'Order Region',
    'Market',
    'Shipping Mode',
    'Days for shipping (real)',
    'Late_delivery_risk',
    *_shared_order_features,
]

# Time-series aggregation helper
def create_time_series(df, group_cols, target_col='Order Item Quantity', freq='D',
                       date_col='order date (DateOrders)'):
    """Aggregate ``df`` into one row per group and time bucket.

    Rows are grouped by ``group_cols`` together with ``date_col`` bucketed at
    ``freq``. Within each group ``target_col`` is summed and every remaining
    column keeps its first non-null value (``GroupBy.first`` semantics).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; ``date_col`` must be datetime-like for ``pd.Grouper``.
    group_cols : iterable of str
        Columns that define a series. Any iterable is accepted and the
        caller's object is not modified.
    target_col : str
        Column to sum per bucket.
    freq : str
        Pandas offset alias for the bucket width (``'D'`` = daily).
    date_col : str
        Name of the timestamp column.

    Returns
    -------
    pandas.DataFrame
        Columns are ``group_cols``, ``date_col``, ``target_col`` and then the
        remaining columns in their original order, sorted by ``date_col``.
    """
    group_cols = list(group_cols)
    excluded = {date_col, target_col, *group_cols}
    features = [col for col in df.columns if col not in excluded]

    # Build the grouping once and reuse it for both aggregations; both results
    # share the same index, so they can be joined column-wise without a merge.
    grouped = df.groupby(group_cols + [pd.Grouper(key=date_col, freq=freq)])
    totals = grouped[target_col].sum()
    if features:
        ts_data = pd.concat([totals, grouped[features].first()], axis=1)
    else:
        ts_data = totals.to_frame()
    ts_data = ts_data.reset_index()

    # A stable sort keeps rows that share a date in their group-key order, so
    # the output order is reproducible from run to run.
    return ts_data.sort_values(date_col, ascending=True, kind='mergesort')

# Build the aggregated series for buyers and sellers and write each one to
# dataset/<side>_<level columns joined by underscores>.csv
buyer_levels = [['Customer Country']]
seller_levels = [['Order Region']]

for prefix, levels in (('buyer', buyer_levels), ('seller', seller_levels)):
    for level in levels:
        ts_data = create_time_series(df, level)
        # Spaces inside column names become underscores in the file name.
        suffix = "_".join(level).replace(" ", "_")
        ts_data.to_csv(f'dataset/{prefix}_{suffix}.csv', index=False)



In [18]:
# Encoding and normalisation
def preprocess_data(file_path, features):
    """One-hot encode and standardise the feature columns of an aggregated CSV.

    Reads ``file_path``, sorts it by 'order date (DateOrders)', one-hot encodes
    the non-numeric columns in ``features`` and standard-scales the numeric
    ones. The result is written next to the input with '.csv' replaced by
    '_processed.csv'; it holds the date, 'Order Item Quantity', the scaled
    columns and the encoded columns, in that order.

    Parameters
    ----------
    file_path : str
        Path of the CSV to process.
    features : list of str
        Columns to transform; each must exist in the CSV.

    Returns
    -------
    int
        Number of encoded columns + number of numeric columns + 1
        (the +1 accounts for 'Order Item Quantity').
    """
    date_col = 'order date (DateOrders)'
    target_col = 'Order Item Quantity'

    df = pd.read_csv(file_path)
    df[date_col] = pd.to_datetime(df[date_col])
    # Reset the index after sorting: the transformed arrays below are positional,
    # and pd.concat(axis=1) aligns on index labels, so a stale post-sort index
    # would attach features to the wrong rows.
    df = df.sort_values(date_col, ascending=True).reset_index(drop=True)

    # Classify by "is numeric" rather than "is object" so that string/category
    # dtypes other than object are still treated as categorical.
    num_cols = [col for col in features if pd.api.types.is_numeric_dtype(df[col])]
    cat_cols = [col for col in features if col not in num_cols]

    # One-hot encode the categorical features
    if cat_cols:
        try:
            # scikit-learn >= 1.2 spelling of the dense-output switch
            enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # older scikit-learn only knows the `sparse` keyword
            enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoded = enc.fit_transform(df[cat_cols])
        encoded_cols = [f"{col}_{val}" for col, vals in zip(cat_cols, enc.categories_) for val in vals]
        df_encoded = pd.DataFrame(encoded, columns=encoded_cols, index=df.index)
    else:
        # Defined in both branches so the return statement never hits a NameError.
        encoded_cols = []
        df_encoded = pd.DataFrame(index=df.index)

    # Standardise the numeric features
    if num_cols:
        scaled = StandardScaler().fit_transform(df[num_cols])
        df_scaled = pd.DataFrame(scaled, columns=num_cols, index=df.index)
    else:
        df_scaled = pd.DataFrame(index=df.index)

    # Combine and persist
    df_processed = pd.concat([df[[date_col, target_col]], df_scaled, df_encoded], axis=1)
    df_processed.to_csv(file_path.replace('.csv', '_processed.csv'), index=False)
    return len(encoded_cols) + len(num_cols) + 1  # +1 for Order Item Quantity



In [7]:
import os

folder_path = "dataset"

# Walk the folder tree and replace spaces in file names with underscores.
for root, dirs, files in os.walk(folder_path):
    for filename in files:
        new_filename = filename.replace(" ", "_")

        # Nothing to do when the name contains no spaces.
        if new_filename == filename:
            continue

        old_path = os.path.join(root, filename)
        new_path = os.path.join(root, new_filename)

        # os.rename silently replaces an existing destination on POSIX and
        # raises on Windows; skip instead so no file is overwritten.
        if os.path.exists(new_path):
            print(f"Bỏ qua (đã tồn tại): {new_path}")
            continue

        os.rename(old_path, new_path)
        print(f"Đổi: {old_path}  -->  {new_path}")

print("Hoàn thành đổi tên file!")


Đổi: dataset/buyer_Customer Country_Customer State_processed.csv  -->  dataset/buyer_Customer_Country_Customer_State_processed.csv
Đổi: dataset/seller_Order Country_processed.csv  -->  dataset/seller_Order_Country_processed.csv
Đổi: dataset/buyer_Customer Country_Customer State_Customer City_processed.csv  -->  dataset/buyer_Customer_Country_Customer_State_Customer_City_processed.csv
Đổi: dataset/seller_Order Country_Order State_Order City_processed.csv  -->  dataset/seller_Order_Country_Order_State_Order_City_processed.csv
Đổi: dataset/seller_Order Country_Order State_processed.csv  -->  dataset/seller_Order_Country_Order_State_processed.csv
Đổi: dataset/seller_Order Country.csv  -->  dataset/seller_Order_Country.csv
Đổi: dataset/seller_Order Country_Order State_Order City.csv  -->  dataset/seller_Order_Country_Order_State_Order_City.csv
Đổi: dataset/buyer_Customer Country_Customer State_Customer City.csv  -->  dataset/buyer_Customer_Country_Customer_State_Customer_City.csv
Đổi: datas

In [None]:
# Preprocess every aggregated CSV and collect the encoder input widths
buyer_files = [
    'dataset/buyer_' + "_".join(level).replace(" ", "_") + '.csv'
    for level in buyer_levels
]
seller_files = [
    'dataset/seller_' + "_".join(level).replace(" ", "_") + '.csv'
    for level in seller_levels
]

# Each call writes a *_processed.csv next to its input and returns the number
# of model input columns for that file.
buyer_enc_in = []
for csv_path in buyer_files:
    buyer_enc_in.append(preprocess_data(csv_path, buyer_features))

seller_enc_in = []
for csv_path in seller_files:
    seller_enc_in.append(preprocess_data(csv_path, seller_features))

print(f"Buyer encoded input dimensions: {buyer_enc_in}")
print(f"Seller encoded input dimensions: {seller_enc_in}")

# Output recorded from a previous run:
# Buyer encoded input dimensions: [11]
# Seller encoded input dimensions: [39]



Buyer encoded input dimensions: [11]
Seller encoded input dimensions: [39]
