In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the CSV file.
# Alternative absolute-path dataset locations, currently disabled:
# dataset_path = '/home/u1/Desktop/Gra_pr/QTransformer/dataset/Dataco_dataset/example_DataCoSupplyChainDataset.csv'
# dataset_path = '/home/u1/Desktop/Gra_pr/QTransformer/dataset/Dataco_dataset/DataCoSupplyChainDataset.csv'
dataset_path = 'dataset/DataCoSupplyChainDataset.csv'
df = pd.read_csv(dataset_path, encoding='ISO-8859-1')

# Convert the order timestamp column to datetime so it can be bucketed by
# pd.Grouper in create_time_series.
df['order date (DateOrders)'] = pd.to_datetime(df['order date (DateOrders)'])

# Keep only completed orders (filter currently disabled).
# df = df[df['Order Status'] == 'COMPLETE']
display(df.head())
print(f"Total number of orders: {len(df)}")

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,4,4,-39.84,170.970001,Shipping on time,0,17,Cleats,Aurora,...,,365,17,,http://images.acmesports.sports/Perfect+Fitnes...,Perfect Fitness Perfect Rip Deck,59.990002,0,5/13/2015 18:44,Standard Class
1,DEBIT,4,4,54.709999,170.970001,Shipping on time,0,17,Cleats,Crown Point,...,,365,17,,http://images.acmesports.sports/Perfect+Fitnes...,Perfect Fitness Perfect Rip Deck,59.990002,0,3/15/2017 18:51,Standard Class
2,DEBIT,4,4,62.060001,170.970001,Shipping on time,0,17,Cleats,Irwin,...,,365,17,,http://images.acmesports.sports/Perfect+Fitnes...,Perfect Fitness Perfect Rip Deck,59.990002,0,5/29/2017 14:21,Standard Class
3,DEBIT,5,4,59.84,170.970001,Late delivery,1,17,Cleats,San Antonio,...,,365,17,,http://images.acmesports.sports/Perfect+Fitnes...,Perfect Fitness Perfect Rip Deck,59.990002,0,4/27/2017 20:11,Standard Class
4,DEBIT,4,4,62.060001,170.970001,Shipping on time,0,17,Cleats,Augusta,...,,365,17,,http://images.acmesports.sports/Perfect+Fitnes...,Perfect Fitness Perfect Rip Deck,59.990002,0,5/13/2017 0:43,Standard Class


Total number of orders: 51709


In [17]:
# Feature columns.
# These two lists are the `features` argument passed to preprocess_data for
# the buyer-side and seller-side files respectively.
buyer_features = ['Customer Country', 'Customer State', 'Customer City', 'Customer Segment',
                  'Category Name', 'Product Name', 'Order Item Product Price', 'Order Item Discount Rate',
                  'Order Item Profit Ratio', 'Order Profit Per Order', 'Customer Id']

seller_features = ['Order Country', 'Order State', 'Order City', 'Order Region', 'Market',
                   'Category Name', 'Product Name', 'Shipping Mode', 'Days for shipping (real)',
                   'Late_delivery_risk', 'Department Name', 'Latitude', 'Longitude',
                   'Order Item Product Price', 'Order Item Discount Rate', 'Order Item Profit Ratio',
                   'Order Profit Per Order']

# Time-series aggregation helper
def create_time_series(df, group_cols, target_col='Order Item Quantity', freq='D'):
    """Aggregate `df` into one row per (group, time bucket).

    Rows are grouped by `group_cols` together with the
    'order date (DateOrders)' column bucketed at frequency `freq`.

    Args:
        df: DataFrame holding `group_cols`, `target_col` and a datetime
            'order date (DateOrders)' column.
        group_cols: list of column names to group by.
        target_col: column summed within each group.
        freq: pandas offset alias used for the time buckets.

    Returns:
        DataFrame with the group columns, the bucketed order date, the summed
        `target_col`, and every remaining column of `df` reduced with
        GroupBy.first (first entry per group, skipping NA by default).
    """
    date_col = 'order date (DateOrders)'
    keys = group_cols + [pd.Grouper(key=date_col, freq=freq)]
    grouped = df.groupby(keys)

    # Per-group total of the target column.
    totals = grouped[target_col].sum().reset_index()

    # Every column that is neither a key nor the target is carried along
    # as the first value seen in its group.
    reserved = [date_col, target_col] + group_cols
    features = [name for name in df.columns if name not in reserved]
    firsts = grouped[features].first().reset_index()

    return totals.merge(firsts, on=group_cols + [date_col], how='left')

# Build the aggregated datasets for buyers and sellers.
# Each entry is one grouping level: country, country + state,
# country + state + city.
buyer_levels = [
    ['Customer Country'],
    ['Customer Country', 'Customer State'],
    ['Customer Country', 'Customer State', 'Customer City'],
]
seller_levels = [
    ['Order Country'],
    ['Order Country', 'Order State'],
    ['Order Country', 'Order State', 'Order City'],
]

# One CSV per buyer grouping level, named after the joined column names.
for level in buyer_levels:
    csv_path = f'dataset/buyer_{"_".join(level)}.csv'
    ts_data = create_time_series(df, level)
    ts_data.to_csv(csv_path, index=False)

# One CSV per seller grouping level, named after the joined column names.
for level in seller_levels:
    csv_path = f'dataset/seller_{"_".join(level)}.csv'
    ts_data = create_time_series(df, level)
    ts_data.to_csv(csv_path, index=False)



In [None]:
# Encoding and scaling
def preprocess_data(file_path, features):
    """One-hot encode and standardize the feature columns of a CSV file.

    Reads `file_path`, splits `features` into numeric and non-numeric columns,
    one-hot encodes the non-numeric ones, standard-scales the numeric ones and
    writes the result (order date, 'Order Item Quantity', scaled columns,
    encoded columns) to a sibling file whose name has '.csv' replaced by
    '_processed.csv'.

    Args:
        file_path: path of the CSV to read; it must contain the columns
            'order date (DateOrders)', 'Order Item Quantity' and every name
            listed in `features`.
        features: list of column names to encode or scale.

    Returns:
        Number of one-hot columns + number of numeric columns + 1
        (the extra one accounts for 'Order Item Quantity').

    Raises:
        KeyError: if a required column is missing from the file.
    """
    df = pd.read_csv(file_path)
    df['order date (DateOrders)'] = pd.to_datetime(df['order date (DateOrders)'])

    # Classify by "is numeric" rather than "is object" so that string columns
    # stored with a dedicated string dtype are still treated as categorical.
    num_cols = [col for col in features if pd.api.types.is_numeric_dtype(df[col])]
    cat_cols = [col for col in features if col not in num_cols]

    # One-hot encode the categorical features.
    # `encoded_cols` is initialised up front so the return statement is valid
    # even when there is nothing to encode.
    encoded_cols = []
    if cat_cols:
        try:
            # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
            enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoded = enc.fit_transform(df[cat_cols])
        encoded_cols = [f"{col}_{val}" for col, vals in zip(cat_cols, enc.categories_) for val in vals]
        df_encoded = pd.DataFrame(encoded, columns=encoded_cols, index=df.index)
    else:
        df_encoded = pd.DataFrame(index=df.index)

    # Standardize the numeric features.
    if num_cols:
        scaled = StandardScaler().fit_transform(df[num_cols])
        df_scaled = pd.DataFrame(scaled, columns=num_cols, index=df.index)
    else:
        df_scaled = pd.DataFrame(index=df.index)

    # Combine date, target, scaled and encoded columns side by side.
    df_processed = pd.concat([df[['order date (DateOrders)', 'Order Item Quantity']], df_scaled, df_encoded], axis=1)
    df_processed.to_csv(file_path.replace('.csv', '_processed.csv'), index=False)
    return len(encoded_cols) + len(num_cols) + 1  # +1 for Order Item Quantity

# Process all files.
# The paths are rebuilt with the same naming pattern used when the per-level
# CSVs were written.
buyer_files = [f'dataset/buyer_{"_".join(level)}.csv' for level in buyer_levels]
seller_files = [f'dataset/seller_{"_".join(level)}.csv' for level in seller_levels]

# Each list holds one preprocess_data return value (a column count) per file.
buyer_enc_in = [preprocess_data(f, buyer_features) for f in buyer_files]
seller_enc_in = [preprocess_data(f, seller_features) for f in seller_files]