# Here’s what I did
- Overall EDA of the data:
    - Extract column names by file
    - Check the shape of each dataset
    - Define terminology for this data
    
- Result: see the output of each cell below

In [1]:
import pandas as pd
import glob
import json

In [2]:
# Match only CSV files: a bare "*" would also pick up archives, OS metadata
# files or sub-directories in the download folder, and every match is later
# handed to pd.read_csv.
src_path = "../downloads/olist/*.csv"
# glob gives no ordering guarantee, so sort for a stable, reproducible listing.
csv_paths = sorted(glob.glob(src_path))
csv_paths

['../downloads/olist/olist_customers_dataset.csv',
 '../downloads/olist/olist_geolocation_dataset.csv',
 '../downloads/olist/olist_order_items_dataset.csv',
 '../downloads/olist/olist_order_payments_dataset.csv',
 '../downloads/olist/olist_order_reviews_dataset.csv',
 '../downloads/olist/olist_orders_dataset.csv',
 '../downloads/olist/olist_products_dataset.csv',
 '../downloads/olist/olist_sellers_dataset.csv',
 '../downloads/olist/product_category_name_translation.csv']

In [3]:
# Read every CSV once and keep the resulting frames keyed by their source path.
df_by_path = {path: pd.read_csv(path) for path in csv_paths}

# One line per file: the path left-aligned in a 60-character field, followed
# by the frame's (rows, columns) shape.
for csv_path, df in df_by_path.items():
    print(f"{csv_path:<60} -> {df.shape}")

../downloads/olist/olist_customers_dataset.csv               -> (99441, 5)
../downloads/olist/olist_geolocation_dataset.csv             -> (1000163, 5)
../downloads/olist/olist_order_items_dataset.csv             -> (112650, 7)
../downloads/olist/olist_order_payments_dataset.csv          -> (103886, 5)
../downloads/olist/olist_order_reviews_dataset.csv           -> (99224, 7)
../downloads/olist/olist_orders_dataset.csv                  -> (99441, 8)
../downloads/olist/olist_products_dataset.csv                -> (32951, 9)
../downloads/olist/olist_sellers_dataset.csv                 -> (3095, 4)
../downloads/olist/product_category_name_translation.csv     -> (71, 2)


In [4]:
# For each file, print its path followed by the number of missing values in
# every column, with a blank line separating the files.
for csv_path, df in df_by_path.items():
    null_counts = df.isna().sum()
    print(csv_path, null_counts, sep='\n', end='\n\n')

../downloads/olist/olist_customers_dataset.csv
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

../downloads/olist/olist_geolocation_dataset.csv
geolocation_zip_code_prefix    0
geolocation_lat                0
geolocation_lng                0
geolocation_city               0
geolocation_state              0
dtype: int64

../downloads/olist/olist_order_items_dataset.csv
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64

../downloads/olist/olist_order_payments_dataset.csv
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

../downloads/olist/olist_order_reviews_dataset.csv
review_id                      0
order_id                       0
review_sco

In [5]:
# For each file, print its path followed by the dtype pandas assigned to every
# column, with a blank line separating the files.
for csv_path, df in df_by_path.items():
    report = "\n".join([csv_path, str(df.dtypes)])
    print(report, end='\n\n')

../downloads/olist/olist_customers_dataset.csv
customer_id                 object
customer_unique_id          object
customer_zip_code_prefix     int64
customer_city               object
customer_state              object
dtype: object

../downloads/olist/olist_geolocation_dataset.csv
geolocation_zip_code_prefix      int64
geolocation_lat                float64
geolocation_lng                float64
geolocation_city                object
geolocation_state               object
dtype: object

../downloads/olist/olist_order_items_dataset.csv
order_id                object
order_item_id            int64
product_id              object
seller_id               object
shipping_limit_date     object
price                  float64
freight_value          float64
dtype: object

../downloads/olist/olist_order_payments_dataset.csv
order_id                 object
payment_sequential        int64
payment_type             object
payment_installments      int64
payment_value           float64
dtype: obje

In [6]:
# Alphabetically sorted column names of each file, keyed by the file's path.
columns_by_csv = {
    path: sorted(frame.columns.to_list())
    for path, frame in df_by_path.items()
}

# Pretty-print the mapping as indented JSON.
print(json.dumps(columns_by_csv, indent=4))

{
    "../downloads/olist/olist_customers_dataset.csv": [
        "customer_city",
        "customer_id",
        "customer_state",
        "customer_unique_id",
        "customer_zip_code_prefix"
    ],
    "../downloads/olist/olist_geolocation_dataset.csv": [
        "geolocation_city",
        "geolocation_lat",
        "geolocation_lng",
        "geolocation_state",
        "geolocation_zip_code_prefix"
    ],
    "../downloads/olist/olist_order_items_dataset.csv": [
        "freight_value",
        "order_id",
        "order_item_id",
        "price",
        "product_id",
        "seller_id",
        "shipping_limit_date"
    ],
    "../downloads/olist/olist_order_payments_dataset.csv": [
        "order_id",
        "payment_installments",
        "payment_sequential",
        "payment_type",
        "payment_value"
    ],
    "../downloads/olist/olist_order_reviews_dataset.csv": [
        "order_id",
        "review_answer_timestamp",
        "review_comment_message",
        "r

In [7]:
# Union of the column names found in every file, de-duplicated and sorted
# alphabetically.
all_unique_columns = sorted(
    {name for names in columns_by_csv.values() for name in names}
)

### 용어 정리 표

| **카테고리**       | **컬럼명**                     | **설명**                                                                 |
|--------------------|--------------------------------|--------------------------------------------------------------------------|
| **Customer**       | customer_city                 | 고객이 거주하는 도시                                                     |
|                    | customer_id                   | 주문마다 새로 부여되는 고객 ID(orders 데이터와 연결하는 키)               |
|                    | customer_state                | 고객이 거주하는 주(state)                                                |
|                    | customer_unique_id            | 실제 고객 한 명을 가리키는 고유 식별자(재구매 고객 식별에 사용)           |
|                    | customer_zip_code_prefix      | 고객의 우편번호 접두사                                                   |
| **Freight**        | freight_value                 | 배송비(운송 비용)                                                        |
| **Geolocation**    | geolocation_city              | 위치 정보에 해당하는 도시                                               |
|                    | geolocation_lat               | 위도(latitude)                                                          |
|                    | geolocation_lng               | 경도(longitude)                                                         |
|                    | geolocation_state             | 위치 정보에 해당하는 주(state)                                          |
|                    | geolocation_zip_code_prefix   | 위치 정보에 해당하는 우편번호 접두사                                     |
| **Order**          | order_approved_at             | 주문이 승인된 날짜 및 시간                                               |
|                    | order_delivered_carrier_date  | 주문이 배송사에 전달된 날짜                                              |
|                    | order_delivered_customer_date | 주문이 고객에게 배송 완료된 날짜                                         |
|                    | order_estimated_delivery_date | 주문의 예상 배송 날짜                                                    |
|                    | order_id                      | 주문을 식별하기 위한 고유 ID                                             |
|                    | order_item_id                 | 같은 주문 안에서 상품 항목에 매겨지는 순번(1부터 증가)                    |
|                    | order_purchase_timestamp      | 주문이 생성된 날짜 및 시간                                               |
|                    | order_status                  | 주문 상태(예: delivered, shipped, canceled 등)                           |
| **Payment**        | payment_installments          | 할부 횟수                                                                |
|                    | payment_sequential            | 결제 순서(결제와 관련된 순차적 번호)                                     |
|                    | payment_type                  | 결제 유형(예: credit_card, boleto 등)                                    |
|                    | payment_value                 | 결제 금액                                                                |
| **Product**        | price                         | 상품 가격                                                                |
|                    | product_category_name         | 상품 카테고리 이름(포르투갈어 원문)                                      |
|                    | product_category_name_english | 상품 카테고리 이름(영어)                                                 |
|                    | product_description_lenght    | 상품 설명의 길이(문자 수)                                                |
|                    | product_height_cm             | 상품 높이(cm)                                                            |
|                    | product_id                    | 상품을 식별하기 위한 고유 ID                                             |
|                    | product_length_cm             | 상품 길이(cm)                                                            |
|                    | product_name_lenght           | 상품 이름의 길이(문자 수)                                                |
|                    | product_photos_qty            | 상품 사진 개수                                                           |
|                    | product_weight_g              | 상품 무게(g)                                                             |
|                    | product_width_cm              | 상품 너비(cm)                                                            |
| **Review**         | review_answer_timestamp       | 고객이 만족도 설문(리뷰)에 응답한 날짜 및 시간                            |
|                    | review_comment_message        | 리뷰 코멘트 내용                                                         |
|                    | review_comment_title          | 리뷰 제목                                                                |
|                    | review_creation_date          | 만족도 설문(리뷰 요청)이 고객에게 발송된 날짜                             |
|                    | review_id                     | 리뷰를 식별하기 위한 고유 ID                                              |
|                    | review_score                  | 리뷰 점수(예: 1~5점)                                                     |
| **Seller**         | seller_city                   | 판매자가 위치한 도시                                                     |
|                    | seller_id                     | 판매자를 식별하기 위한 고유 ID                                            |
|                    | seller_state                  | 판매자가 위치한 주(state)                                                |
|                    | seller_zip_code_prefix        | 판매자의 우편번호 접두사                                                 |
| **Shipping**       | shipping_limit_date           | 판매자가 상품을 배송사에 전달해야 하는 기한                              |

In [8]:
# Inverted index: for every distinct column name, the list of files that
# contain it. Keys follow the order of all_unique_columns; each list follows
# the iteration order of columns_by_csv.
column_by_csvs = {
    column: [path for path, cols in columns_by_csv.items() if column in cols]
    for column in all_unique_columns
}

print(json.dumps(column_by_csvs, indent=4))

{
    "customer_city": [
        "../downloads/olist/olist_customers_dataset.csv"
    ],
    "customer_id": [
        "../downloads/olist/olist_customers_dataset.csv",
        "../downloads/olist/olist_orders_dataset.csv"
    ],
    "customer_state": [
        "../downloads/olist/olist_customers_dataset.csv"
    ],
    "customer_unique_id": [
        "../downloads/olist/olist_customers_dataset.csv"
    ],
    "customer_zip_code_prefix": [
        "../downloads/olist/olist_customers_dataset.csv"
    ],
    "freight_value": [
        "../downloads/olist/olist_order_items_dataset.csv"
    ],
    "geolocation_city": [
        "../downloads/olist/olist_geolocation_dataset.csv"
    ],
    "geolocation_lat": [
        "../downloads/olist/olist_geolocation_dataset.csv"
    ],
    "geolocation_lng": [
        "../downloads/olist/olist_geolocation_dataset.csv"
    ],
    "geolocation_state": [
        "../downloads/olist/olist_geolocation_dataset.csv"
    ],
    "geolocation_zip_code_prefix": [


In [9]:
# Keep only the columns that occur in more than one file; these are the
# candidate join keys between the datasets.
columns_shared_across_csvs = {
    column: paths
    for column, paths in column_by_csvs.items()
    if len(paths) > 1
}
print(json.dumps(columns_shared_across_csvs, indent=4))

{
    "customer_id": [
        "../downloads/olist/olist_customers_dataset.csv",
        "../downloads/olist/olist_orders_dataset.csv"
    ],
    "order_id": [
        "../downloads/olist/olist_order_items_dataset.csv",
        "../downloads/olist/olist_order_payments_dataset.csv",
        "../downloads/olist/olist_order_reviews_dataset.csv",
        "../downloads/olist/olist_orders_dataset.csv"
    ],
    "product_category_name": [
        "../downloads/olist/olist_products_dataset.csv",
        "../downloads/olist/product_category_name_translation.csv"
    ],
    "product_id": [
        "../downloads/olist/olist_order_items_dataset.csv",
        "../downloads/olist/olist_products_dataset.csv"
    ],
    "seller_id": [
        "../downloads/olist/olist_order_items_dataset.csv",
        "../downloads/olist/olist_sellers_dataset.csv"
    ]
}
