In [4]:
import sys
import os
import logging

# Emit INFO-level records; the cells below report their progress via logging.info.
logging.basicConfig(level=logging.INFO)

# Put the parent of the current working directory on the import path so that
# the `src` package can be imported. The membership check keeps the cell
# idempotent: re-running it no longer stacks duplicate entries in sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
from src.data.load_data import load_all_months

In [5]:
def clean_chunk(df, valid_events=None):
    """Clean one chunk of raw event rows and return it as a new DataFrame.

    The input frame is left untouched; every step works on derived Series
    and the result is assembled with ``assign``.

    Cleaning rules:

    * ``event_time`` is parsed to timezone-aware UTC datetimes; rows whose
      value cannot be parsed are dropped.
    * Rows with a missing ``user_id`` are dropped.
    * ``price`` is converted to numeric. ``purchase`` rows whose price is
      missing or non-numeric are dropped; any other missing price becomes
      ``0.0``.
    * If ``valid_events`` is given, only rows whose ``event_type`` is one of
      those values are kept.
    * ``category_code`` is split at its first dot into ``main_category``
      (text before the dot) and ``sub_category`` (text after it). A missing
      code yields ``"unknown"`` for both; a code with no dot, or nothing
      after the dot, is repeated whole as ``sub_category``.
    * Missing ``brand`` / ``user_session`` become ``"unknown"`` and both
      columns are cast to ``str``.
    * ``category_id``, ``category_code`` and ``product_id`` are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw chunk. Must contain the columns ``event_time``, ``event_type``,
        ``product_id``, ``category_id``, ``category_code``, ``brand``,
        ``price``, ``user_id`` and ``user_session``.
    valid_events : iterable of str, optional
        Allowed ``event_type`` values. ``None`` (the default) keeps every
        event type.

    Returns
    -------
    pandas.DataFrame
        Cleaned rows with a fresh ``0..n-1`` index. ``main_category`` and
        ``sub_category`` are appended after the surviving input columns.

    Raises
    ------
    KeyError
        If one of the required columns is absent from ``df``.
    """
    event_time = pd.to_datetime(df["event_time"], utc=True, errors="coerce")
    price = pd.to_numeric(df["price"], errors="coerce")

    # Row selection must be decided on the *unfilled* price: once missing
    # prices are replaced by 0 an unpriced purchase can no longer be told
    # apart from a free one.
    unpriced_purchase = (df["event_type"] == "purchase") & price.isna()
    keep = event_time.notna() & df["user_id"].notna() & ~unpriced_purchase
    if valid_events is not None:
        keep &= df["event_type"].isin(list(valid_events))

    kept = df.loc[keep]

    # category_code -> main_category / sub_category, split at the first dot.
    category_code = kept["category_code"].fillna("unknown").astype(str)
    code_parts = category_code.str.split(".", n=1)
    remainder = code_parts.str[1]
    # No dot (remainder is NaN) or nothing after it (""): fall back to the
    # full code, which also maps "unknown" to "unknown".
    sub_category = remainder.mask(
        remainder.isna() | (remainder == ""), category_code)

    # assign() builds a new frame, so nothing is written into a filtered
    # view of the caller's data (no SettingWithCopyWarning, no side effects).
    cleaned = kept.drop(
        columns=["category_id", "category_code", "product_id"]
    ).assign(
        event_time=event_time[keep],
        brand=kept["brand"].fillna("unknown").astype(str),
        price=price[keep].fillna(0).astype(float),
        user_session=kept["user_session"].fillna("unknown").astype(str),
        main_category=code_parts.str[0],
        sub_category=sub_category,
    )

    return cleaned.reset_index(drop=True)

In [None]:
data_raw_path = "../.data/raw"
data_interim_path = "../.data/interim"
# Forwarded to load_all_months as `chunck_size`; presumably the number of rows
# per yielded chunk -- TODO confirm against src/data/load_data.py.
CHUNK_SIZE = 30_000_000

# Create the output folder up front: without it the first to_parquet call
# fails only after a whole chunk has already been loaded and cleaned.
os.makedirs(data_interim_path, exist_ok=True)

# NOTE(review): the keyword is spelled `chunck_size` (sic) in this call;
# it is kept as-is because it has to match the loader's own signature.
chunks = load_all_months(data_raw_path, chunck_size=CHUNK_SIZE)

for i, chunk in enumerate(chunks):
    # Build the destination once so the file written and the path logged
    # can never drift apart.
    output_path = f"{data_interim_path}/cleaned_chunk_{i}.parquet"

    logging.info("Start cleaning chunk_%s", i)
    df_clean = clean_chunk(chunk)
    logging.info("chunk_%s cleaned successfully.", i)

    df_clean.to_parquet(output_path, index=False)
    logging.info("cleaned_chunk_%s extracted successfully to %s.", i, output_path)

logging.info("All operations are successfully processed.")

INFO:root:
Starting file: 2019-Dec.csv
INFO:root:Reading file in chunks: ../.data/raw\2019-Dec.csv
INFO:root:
Start cleaning chunk_0
INFO:root:chunk_0 cleaned successfully.
INFO:root:cleaned_chunk_0 extracted successfully to ../.data/interim/cleaned_chunk_0.parquet.
INFO:root:
Start cleaning chunk_1
INFO:root:chunk_1 cleaned successfully.
INFO:root:cleaned_chunk_1 extracted successfully to ../.data/interim/cleaned_chunk_1.parquet.
INFO:root:
Start cleaning chunk_2
INFO:root:chunk_2 cleaned successfully.
INFO:root:cleaned_chunk_2 extracted successfully to ../.data/interim/cleaned_chunk_2.parquet.
INFO:root:
Starting file: 2019-Nov.csv
INFO:root:Reading file in chunks: ../.data/raw\2019-Nov.csv
INFO:root:
Start cleaning chunk_3
INFO:root:chunk_3 cleaned successfully.
INFO:root:cleaned_chunk_3 extracted successfully to ../.data/interim/cleaned_chunk_3.parquet.
INFO:root:
Start cleaning chunk_4
INFO:root:chunk_4 cleaned successfully.
INFO:root:cleaned_chunk_4 extracted successfully to ../.