In [12]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.preprocessing import MinMaxScaler
import pickle
import os
import torch

In [22]:
# Đảm bảo thư mục utils tồn tại để lưu scaler
if not os.path.exists('utils'):
    os.makedirs('utils')

In [13]:
# Cell 2: Định nghĩa hàm reduce_mem_usage
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics: 
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [14]:
# Bước 1: Tải dữ liệu
def load_data():
    sales = pd.read_csv('C:/Users/Ho Hau/Downloads/M5/data/raw/sales_train_validation.csv')
    prices = pd.read_csv('C:/Users/Ho Hau/Downloads/M5/data/raw/sell_prices.csv')
    calendar = pd.read_csv('C:/Users/Ho Hau/Downloads/M5/data/raw/calendar.csv')
    
    sales = reduce_mem_usage(sales)
    prices = reduce_mem_usage(prices)
    calendar = reduce_mem_usage(calendar)
    
    return sales, prices, calendar

In [15]:
# Bước 2: Lọc dữ liệu cho bang California và 3 năm gần nhất (2013-2016)
def filter_data(sales, calendar):
    sales_ca = sales[sales['state_id'] == 'CA']
    
    start_date = '2013-01-01'
    end_date = '2016-05-22'
    calendar['date'] = pd.to_datetime(calendar['date'])
    calendar_filtered = calendar[(calendar['date'] >= start_date) & (calendar['date'] <= end_date)]
    
    day_columns = [col for col in sales.columns if col.startswith('d_')]
    day_to_date = calendar.set_index('d')['date'].to_dict()
    selected_days = [d for d in day_columns if day_to_date.get(d, pd.Timestamp('1900-01-01')) >= pd.Timestamp(start_date)]
    
    sales_columns = ['item_id', 'store_id', 'cat_id', 'state_id'] + selected_days
    sales_ca = sales_ca[sales_columns]
    
    print("Data filtered successfully!")
    print(f"sales_ca shape: {sales_ca.shape}")
    print(f"calendar_filtered shape: {calendar_filtered.shape}")
    print(f"Number of selected days: {len(selected_days)}")
    
    return sales_ca, calendar_filtered, selected_days

In [16]:
# Bước 3: Chọn top 100 sản phẩm có doanh số cao nhất
def select_top_products(sales_ca, selected_days):
    sales_sums = sales_ca[selected_days].sum(axis=1)
    sales_ca['total_sales'] = sales_sums
    top_products = sales_ca.nlargest(100, 'total_sales')['item_id'].values
    sales_top = sales_ca[sales_ca['item_id'].isin(top_products)]
    sales_top = sales_top.drop(columns=['total_sales'])
    # In thông tin chi tiết của top 100 sản phẩm
    print("Top 100 sản phẩm bán chạy nhất:")
    top_sales_details = sales_ca[sales_ca['item_id'].isin(top_products)][['item_id', 'total_sales']].drop_duplicates()
    for index, row in top_sales_details.iterrows():
        print(f"Item ID: {row['item_id']}, Tổng doanh số: {row['total_sales']}")
    print("Top products selected successfully!")
    print(f"sales_top shape: {sales_top.shape}")
    print(f"Number of top products: {len(top_products)}")
    
    return sales_top, top_products

In [27]:
# Bước 4: Kỹ thuật hóa đặc trưng
def engineer_features(sales_top, prices, calendar_filtered, selected_days):
    id_columns = ['item_id', 'store_id', 'cat_id', 'state_id']
    sales_melted = pd.melt(
        sales_top,
        id_vars=id_columns,
        value_vars=selected_days,
        var_name='d',
        value_name='sales'
    )
    
    sales_melted = sales_melted.merge(
        calendar_filtered[['d', 'date', 'wm_yr_wk', 'weekday', 'snap_CA', 'event_name_1', 'event_name_2']],
        on='d',
        how='left'
    )
    
    sales_melted = sales_melted.merge(
        prices[prices['store_id'].str.contains('CA')][['store_id', 'item_id', 'wm_yr_wk', 'sell_price']],
        on=['store_id', 'item_id', 'wm_yr_wk'],
        how='left'
    )
    
    # Xử lý NaN trong sell_price
    original_len = len(sales_melted)
    sales_melted['sell_price'] = sales_melted.groupby('item_id')['sell_price'].transform(lambda x: x.fillna(x.mean()))
    sales_melted = sales_melted.dropna(subset=['sell_price'])
    print(f"Dropped {original_len - len(sales_melted)} rows due to NaN in sell_price")
    
    # Thêm đặc trưng day_of_week
    sales_melted['day_of_week'] = sales_melted['weekday'].map({
        'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
        'Friday': 4, 'Saturday': 5, 'Sunday': 6
    })
    
    # Thêm đặc trưng is_holiday
    sales_melted['is_holiday'] = sales_melted[['event_name_1', 'event_name_2']].notnull().any(axis=1).astype(int)
    
    # Thêm đặc trưng month và day_of_month
    sales_melted['month'] = sales_melted['date'].dt.month
    sales_melted['day_of_month'] = sales_melted['date'].dt.day
    
    # Thêm các đặc trưng lag và rolling mean
    sales_melted = sales_melted.sort_values(['item_id', 'store_id', 'date'])
    for lag in [7, 14, 28]:
        sales_melted[f'sales_lag_{lag}'] = sales_melted.groupby(['item_id', 'store_id'])['sales'].shift(lag)
    
    for window in [7, 14]:
        sales_melted[f'sales_roll_mean_{window}'] = sales_melted.groupby(['item_id', 'store_id'])['sales'].shift(1).rolling(window=window).mean()
    
    # Xử lý NaN do lag và rolling mean
    sales_melted = sales_melted.fillna(0)
    
    # Danh sách các đặc trưng
    features = ['sales', 'sell_price', 'day_of_week', 'snap_CA', 'is_holiday', 'month', 'day_of_month',
                'sales_lag_7', 'sales_lag_14', 'sales_lag_28', 'sales_roll_mean_7', 'sales_roll_mean_14']
    sales_melted = sales_melted[['item_id', 'store_id', 'date'] + features]
    
    print("Feature engineering completed!")
    print(f"sales_melted shape: {sales_melted.shape}")
    
    return sales_melted, features

In [28]:
# Bước 5: Chuẩn hóa và lưu vào SQLite
def save_to_sqlite(sales_melted, features):
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(sales_melted[features])
    sales_melted[features] = scaled_data
    
    conn = sqlite3.connect('historical_data.db')
    sales_melted.to_sql('historical_data', conn, if_exists='replace', index=False)
    conn.close()
    
    with open('utils/scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    
    print("Data preprocessing completed and saved to SQLite.")

In [29]:
# Hàm chính
def main():
    sales, prices, calendar = load_data()
    sales_ca, calendar_filtered, selected_days = filter_data(sales, calendar)
    sales_top, top_products = select_top_products(sales_ca, selected_days)
    sales_melted, features = engineer_features(sales_top, prices, calendar_filtered, selected_days)
    save_to_sqlite(sales_melted, features)

if __name__ == "__main__":
    main()

Mem. usage decreased to 95.00 Mb (78.7% reduction)
Mem. usage decreased to 130.48 Mb (37.5% reduction)
Mem. usage decreased to  0.12 Mb (41.9% reduction)
Data filtered successfully!
sales_ca shape: (12196, 1214)
calendar_filtered shape: (1238, 14)
Number of selected days: 1210
Top products selected successfully!
sales_top shape: (252, 1214)
Number of top products: 100
Dropped 0 rows due to NaN in sell_price
Feature engineering completed!
sales_melted shape: (304920, 15)
Data preprocessing completed and saved to SQLite.
