# 초반 세팅 부분(데이터 타입 최적화)

## 데이터 불러와서 미리 살펴본 데이터의 형태에 맞게 데이터 타입 변환
그리고 parquet 파일로 저장

In [None]:
import os
from glob import glob

import pandas as pd
from tqdm.auto import tqdm

def data_set_open_dictionary(directory):
    """Load every file matched by a glob pattern into a dict of DataFrames.

    Args:
        directory: Glob pattern selecting the files to read, e.g.
            "../dataset/Looker Ecommerce BigQuery Dataset/*.csv".

    Returns:
        A dict mapping each file's base name, with a trailing ".csv"
        extension removed (matched case-insensitively), to the DataFrame
        returned by ``pd.read_csv`` for that file.
    """
    file_dict = {}

    for file_path in tqdm(glob(directory)):
        base_name = os.path.basename(os.path.normpath(file_path))
        stem, extension = os.path.splitext(base_name)
        # Strip only a trailing ".csv": str.replace would also delete a
        # ".csv" in the middle of a name and would leave ".CSV" untouched.
        file_name = stem if extension.lower() == ".csv" else base_name
        file_dict[file_name] = pd.read_csv(file_path)

    return file_dict

def data_set_display(file_dict):
    """Print each entry's key, its ``info()`` summary and its first 3 rows.

    Args:
        file_dict: Mapping of name -> DataFrame.
    """
    for table_name in tqdm(file_dict):
        table = file_dict[table_name]
        print(table_name)
        table.info()
        display(table.head(3))

In [None]:
# Per-table conversion plan: for every table name, the columns to convert,
# grouped by the target type group.
typechange_dict = {
    "distribution_centers": {
        "int32_cols": [],
        "int16_cols": [],
        "int8_cols": ["id"],
        "float32_cols": ["latitude", "longitude"],
        "cate_cols": ["name"],
    },
    "events": {
        "int32_cols": ["id", "user_id"],
        "int16_cols": [],
        "int8_cols": ["sequence_number"],
        "float32_cols": [],
        "cate_cols": [
            "city",
            "state",
            "browser",
            "traffic_source",
            "event_type",
        ],
    },
    "inventory_items": {
        "int32_cols": ["id"],
        "int16_cols": ["product_id"],
        "int8_cols": ["product_distribution_center_id"],
        "float32_cols": [],
        "cate_cols": [
            "product_category",
            "product_brand",
            "product_department",
        ],
    },
    "orders": {
        "int32_cols": ["order_id", "user_id"],
        "int16_cols": [],
        "int8_cols": ["num_of_item"],
        "float32_cols": [],
        "cate_cols": ["status", "gender"],
    },
    "order_items": {
        "int32_cols": ["id", "order_id", "user_id", "inventory_item_id"],
        "int16_cols": ["product_id"],
        "int8_cols": [],
        "float32_cols": ["sale_price"],
        "cate_cols": ["status"],
    },
    "products": {
        "int32_cols": [],
        "int16_cols": ["id"],
        "int8_cols": ["distribution_center_id"],
        "float32_cols": ["cost", "retail_price"],
        "cate_cols": ["category", "brand", "department"],
    },
    "users": {
        "int32_cols": ["id"],
        "int16_cols": [],
        "int8_cols": ["age"],
        "float32_cols": ["latitude", "longitude"],
        "cate_cols": [
            "first_name",
            "last_name",
            "gender",
            "state",
            "city",
            "country",
            "traffic_source",
        ],
    },
}
# Largest magnitude (or, for "cate_cols", largest number of distinct values)
# allowed per column group. Keys mirror the group names of typechange_dict.
type_value = {
    "int32_cols": 2_147_483_647,
    "int16_cols": 32_767,
    "int8_cols": 127,
    "float32_cols": 3.4e38,
    "cate_cols": 10_000,
}

In [None]:
# Glob pattern selecting every CSV file in the dataset folder.
directory = "../dataset/Looker Ecommerce BigQuery Dataset/*.csv"

# Read each matching CSV into a DataFrame, keyed by file name.
file_dict = data_set_open_dictionary(directory)

In [None]:
for df_name, df in tqdm(file_dict.items()):
    memory_usage_before = df.memory_usage().sum()
    df. info()
    # datetime 바꾸기
    for col_datetime in df.columns[df.columns.str.endswith('_at')]:
        df[col_datetime] = pd.to_datetime(df[col_datetime], yearfirst=True, format="mixed", utc=True)
        
    df.info()
    for type_name, cols in typechange_dict[df_name].items():
        if type_name == "int32_cols":
            for col in cols:
                assert abs(df[col].max()) < 2_147_483_647 
                df[col] = df[col].astype(pd.Int32Dtype())
                # df.info()
        elif type_name == "int16_cols":
            for col in cols:
                assert abs(df[col].max()) < 2_147_483_647 
                df[col] = df[col].astype(pd.Int16Dtype())
                # df.info()
        elif type_name == "int8_cols":
            for col in cols:
                assert abs(df[col].max()) < 127
                df[col] = df[col].astype(pd.Int8Dtype())
                # df.info()
        elif type_name == "float16_cols":
            for col in cols:
                assert abs(df[col].max()) < 3.4e38
                df[col] = df[col].astype(pd.Float32Dtype())
                # df.info()
        elif type_name == "cate_cols":
            for col in cols:
                assert df[col].nunique() < 10_000
                df[col] = df[col].astype("category")
                # df.info()
    df.info()
    memory_usage_after = df.memory_usage().sum()
    reduction_ratio = 1 - (memory_usage_after / memory_usage_before)

    print(f"Memory Usage: {memory_usage_before:,} -> {memory_usage_after:,} ({reduction_ratio*100:.2f}% reduced)")
    
    df.to_parquet(f"../dataset/Looker Ecommerce BigQuery Dataset/Optimization/{df_name}_optimized.parquet")
    # df.to_csv(f"../dataset/Looker Ecommerce BigQuery Dataset/Optimization/{df_name}_optimized.csv")

## 데이터 정보 확인해보기
데이터 종류 몇가지 있는지, 크기 몇부터 몇까지인지, 날짜 몇부터 몇까지 있는지

In [None]:
def type_check(df):
    for col, data_type in df.dtypes.items():
        if data_type in ("object", "category"):
            ser_target = df[col].value_counts()
            print(f"{col}({data_type}): {len(ser_target):,}가지")
        elif data_type in ("float", "int", "int64","Int8", "Int16", "Int32"):
            max_value = df[col].max()
            min_value = df[col].min()
            if (((df[col].dropna() % 1) != 0).sum() == 0) or (data_type == "int"):
                target_data_type = "int"
            else:
                target_data_type = "float"
            print(f"{col}({target_data_type}): {min_value:,.2f} ~ {max_value:,.2f}")
        elif data_type in ("datetime64[ns]", "datetime64[ns, UTC]"):
            max_date = df[col].max()
            min_date = df[col].min()
            print(f"{col}({data_type}): {min_date} ~ {max_date}")
        else:
            raise Exception("New data type:", data_type)

In [None]:
# Summarize every DataFrame in file_dict: the per-column overview from
# type_check, followed by pandas' own info() output.
for i, df in file_dict.items():
    print(i)  # key of file_dict identifying the current DataFrame
    type_check(df)
    df.info()
    print("---------------------------")  # visual separator between entries

# 컬럼명 변경

In [None]:
# df.to_parquet(f"../dataset/Looker Ecommerce BigQuery Dataset/Optimization/{df_name}_optimized.parquet")