In [26]:
# Cell 1: Import tất cả các thư viện cần thiết
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

warnings.filterwarnings('ignore')

In [27]:
# Cell 2: Định nghĩa hàm reduce_mem_usage
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the narrowest dtype that covers their value range.

    Columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    inspected; every other column is left untouched. Integer columns are cast
    to the first of int8/int16/int32/int64 whose limits (inclusive) contain the
    column's min and max. Float columns are cast to float16 or float32 when the
    range fits, otherwise to float64.

    Note: the float check looks only at the value range, not at precision.
    float16 keeps roughly 3 significant decimal digits, so downcast values may
    be rounded.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the final memory usage and the percentage saved.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range fits neither narrower type (or min/max is NaN/inf).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [28]:
# Cell 3: Đọc dữ liệu
def read_data(input_dir='C:/Users/Ho Hau/Downloads/M5/data/raw/'):
    """Load the three raw CSV files and downcast each with ``reduce_mem_usage``.

    Args:
        input_dir: Directory containing ``sell_prices.csv``, ``calendar.csv``
            and ``sales_train_validation.csv``. Defaults to the original
            hard-coded location; pass another path to run elsewhere.

    Returns:
        Tuple ``(sell_prices_df, calendar_df, sales_train_validation_df)``.
    """
    def _load(file_name, label):
        # Read one CSV, shrink its dtypes, and report its shape.
        frame = reduce_mem_usage(pd.read_csv(os.path.join(input_dir, file_name)))
        print('{} has {} rows and {} columns'.format(label, frame.shape[0], frame.shape[1]))
        return frame

    sell_prices_df = _load('sell_prices.csv', 'Sell prices')
    calendar_df = _load('calendar.csv', 'Calendar')
    sales_train_validation_df = _load('sales_train_validation.csv', 'Sales train validation')

    return sell_prices_df, calendar_df, sales_train_validation_df

sell_prices_df, calendar_df, sales_train_validation_df = read_data()

Mem. usage decreased to 130.48 Mb (37.5% reduction)
Sell prices has 6841121 rows and 4 columns
Mem. usage decreased to  0.12 Mb (41.9% reduction)
Calendar has 1969 rows and 14 columns
Mem. usage decreased to 95.00 Mb (78.7% reduction)
Sales train validation has 30490 rows and 1919 columns


In [29]:
# Cell 4: Keep only the top 10 best-selling rows (bug fixed)
top_k = 10
d_cols = [name for name in sales_train_validation_df.columns if name.startswith('d_')]

# Per-row total over every 'd_*' column, stored on a copy so the raw frame stays unchanged.
sum_series = sales_train_validation_df.copy()
sum_series['total'] = sales_train_validation_df[d_cols].sum(axis=1)

# item_id/store_id of the top_k rows with the largest totals.
top_series = sum_series.nlargest(top_k, 'total')[['item_id', 'store_id']]

# Inner-join to keep only those rows, then reshape wide -> long:
# each 'd_*' column becomes a row with the column name in 'd' and its value in 'sales'.
top_sales_wide = sales_train_validation_df.merge(
    top_series, on=['item_id', 'store_id'], how='inner'
)
sales_long = top_sales_wide.melt(
    id_vars=['item_id', 'store_id'],
    value_vars=d_cols,
    var_name='d',
    value_name='sales',
)

In [30]:
# Cell 5: Combine the data sources
# Left-join calendar rows on the 'd' label, then sell_prices rows on
# store_id / item_id / wm_yr_wk; unmatched rows keep NaN in the joined columns.
df = (
    sales_long
    .merge(calendar_df, how='left', on='d')
    .merge(sell_prices_df, how='left', on=['store_id', 'item_id', 'wm_yr_wk'])
)


In [31]:
# Cell 6: Feature engineering
# Calendar parts derived from the parsed 'date' column.
df['date'] = pd.to_datetime(df['date'])
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['weekday'] = df['date'].dt.weekday
try:
    df['week'] = df['date'].dt.isocalendar().week
except AttributeError:
    # Only the "accessor has no isocalendar()" case (older pandas) should fall
    # back to the legacy attribute; any other error must surface.
    df['week'] = df['date'].dt.week

# One-hot encode the first event name; rows without an event get the label 'None'.
df['event'] = df['event_name_1'].fillna('None')
df = pd.get_dummies(df, columns=['event'], prefix='evt')

# SNAP flags: treat missing as 0 and store as integers.
for col in ['snap_CA', 'snap_TX', 'snap_WI']:
    df[col] = df[col].fillna(0).astype(int)

In [32]:
# Add lag and rolling-mean features, computed separately within each item/store group
series_keys = ['item_id', 'store_id']

# sales_lag_N: the 'sales' value N rows earlier within the same group.
for lag in (7, 14, 28):
    lagged_sales = df.groupby(series_keys)['sales'].shift(lag)
    df[f'sales_lag_{lag}'] = lagged_sales

# rolling_mean_7: mean of the 7 previous rows; shift(1) excludes the current row.
sales_by_series = df.groupby(series_keys)['sales']
df['rolling_mean_7'] = sales_by_series.transform(
    lambda x: x.shift(1).rolling(7).mean()
)

In [33]:
# Handle NaN: replace missing values with 0 in the price, sales, lag and rolling columns
zero_fill_cols = ['sell_price', 'sales', 'sales_lag_7', 'sales_lag_14', 'sales_lag_28', 'rolling_mean_7']
df = df.fillna(dict.fromkeys(zero_fill_cols, 0))

In [34]:
# Cell 7: Scale the numeric columns and encode item/store identifiers
# (`os` and `pickle` are imported in the notebook's top import cell.)

# Make sure the utils/ directory exists; exist_ok avoids a separate existence check.
os.makedirs('utils', exist_ok=True)

# Min-max scale the listed columns in place, using a scaler fitted on the full frame.
scaler = MinMaxScaler()
numeric_cols = ['sell_price', 'sales', 'sales_lag_7', 'sales_lag_14', 'sales_lag_28', 'rolling_mean_7']
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Save the fitted scaler to utils/scaler.pkl
with open('utils/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("Scaler saved to 'utils/scaler.pkl'")

# Integer codes for item_id / store_id; the fitted encoders stay available as globals.
item_enc = LabelEncoder().fit(df['item_id'])
store_enc = LabelEncoder().fit(df['store_id'])
df['item_idx'] = item_enc.transform(df['item_id'])
df['store_idx'] = store_enc.transform(df['store_id'])

Scaler saved to 'utils/scaler.pkl'


In [35]:
# Cell 8: Save the processed data
# Write the frame to CSV without the index column.
df.to_csv(path_or_buf='processed_data.csv', index=False)
print('Processed data saved to processed_data.csv')

Processed data saved to processed_data.csv
