In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the Olist CSV tables that the cells below join together. Every file sits
# in the same dataset folder, so the folder is written once and reused.
olist_dir = 'data/brazilian-e-commerce-dataset'
df_products = pd.read_csv(olist_dir + '/olist_products_dataset.csv')
df_orders = pd.read_csv(olist_dir + '/olist_orders_dataset.csv')
df_reviews = pd.read_csv(olist_dir + '/olist_order_reviews_dataset.csv')
df_order_items = pd.read_csv(olist_dir + '/olist_order_items_dataset.csv')
df_customer = pd.read_csv(olist_dir + '/olist_customers_dataset.csv')

In [None]:
# Start the combined table from the orders and attach the customer columns by
# looking up each order's customer_id (left join: every order row is kept).
customer_lookup = df_customer.set_index("customer_id")
df_final = df_orders.join(customer_lookup, on="customer_id", how="left")

In [None]:
# Attach the review columns to each order row via order_id (left join).
# If the review table holds several rows for the same order_id, the order row
# is repeated once per matching review row.
reviews_by_order = df_reviews.set_index("order_id")
df_final = df_final.join(reviews_by_order, on="order_id", how="left")

In [None]:
# Attach the order-item columns via order_id (left join). An order with several
# item rows produces one output row per item; an order without any match keeps
# a single row whose item columns are null.
items_by_order = df_order_items.set_index("order_id")
df_final = df_final.join(items_by_order, on="order_id", how="left")

In [None]:
# Attach the product attributes to every row by looking up its product_id
# (left join: rows without a matching product keep null product columns).
products_by_id = df_products.set_index("product_id")
df_final = df_final.join(products_by_id, on="product_id", how="left")

In [None]:
# Replace every null left over from the joins (and from the source files) with
# an empty string, so the later JSON exports contain no NaN values.
df_final = df_final.fillna('')

In [None]:
# Keep only the first 20 joined rows as a small sample for the flat export.
first_rows = df_final.head(20)
# Fill null values with an empty string; fillna returns a new frame, so the
# sample can be changed without touching df_final.
df_to_json = first_rows.fillna('')



In [None]:
import json

# One plain dict per sampled row, keyed by column name.
dict_list = df_to_json.to_dict(orient='records')

# Save the list of dicts in the layout expected by `_bulk` (presumably the
# Elasticsearch Bulk API - confirm): for every document, an action line
# followed by the document itself, each terminated by a newline.
with open('data_bulk.json', 'w') as bulk_file:
    for row_number, record in enumerate(dict_list):
        # The action names the target index and uses the row position as _id.
        action_line = json.dumps({"index": {"_index": "salles", "_id": row_number}})
        document_line = json.dumps(record)
        bulk_file.write(action_line + '\n' + document_line + '\n')

In [None]:
import json

# Nested export: fold the flat joined table into customers -> orders -> items
# and write one document per order (with its review and item list) as
# action/document line pairs in the `_bulk` layout.
#
# df_final carries one row per combination of order, matching review row and
# matching item row, so several rows can describe the same order.
customers = {}

# (customer_id, order_id, order_item_id) triples that were already appended.
# When an order matched more than one review row in the join, each of its items
# appears once per review row; without this guard the same item would be
# appended to "items" several times. Relies on nulls having been replaced
# beforehand (NaN keys would never compare equal).
seen_items = set()

# Walk every row of the joined DataFrame.
for _, row in df_final.iterrows():
    customer_id = row['customer_id']
    order_id = row['order_id']
    product_id = row['product_id']

    # First row for this customer: record the customer-level attributes.
    if customer_id not in customers:
        customers[customer_id] = {
            "customer_id": customer_id,
            "customer_zip_code_prefix": row["customer_zip_code_prefix"],
            "customer_city": row["customer_city"],
            "customer_unique_id": row["customer_unique_id"],
            "state": row["customer_state"],
            "orders": {}  # maps order_id -> order details
        }
    orders = customers[customer_id]["orders"]

    # First row for this order: record the order-level fields and the review
    # found on this row. Later rows of the same order do not replace it, so
    # only the first review row encountered is kept.
    if order_id not in orders:
        orders[order_id] = {
            "order_status": row["order_status"],
            "purchase_timestamp": row["order_purchase_timestamp"],
            "approved_at": row["order_approved_at"],
            "delivered_carrier_date": row["order_delivered_carrier_date"],
            "delivered_customer_date": row["order_delivered_customer_date"],
            "estimated_delivery_date": row["order_estimated_delivery_date"],
            "items": [],
            "review": {
                "review_id": row["review_id"],
                "review_score": row["review_score"],
                "review_comment_title": row["review_comment_title"],
                "review_comment_message": row["review_comment_message"],
                "review_creation_date": row["review_creation_date"],
                "review_answer_timestamp": row["review_answer_timestamp"],
            },
        }

    # Skip rows that only repeat an item already stored for this order.
    item_key = (customer_id, order_id, row["order_item_id"])
    if item_key in seen_items:
        continue
    seen_items.add(item_key)

    # Add the item (and its product attributes) to the order.
    orders[order_id]["items"].append({
        "product_category_name": row["product_category_name"],
        "product_name_lenght": row["product_name_lenght"],
        "product_description_lenght": row["product_description_lenght"],
        "product_photos_qty": row["product_photos_qty"],
        "product_weight_g": row["product_weight_g"],
        "product_length_cm": row["product_length_cm"],
        "product_height_cm": row["product_height_cm"],
        "product_width_cm": row["product_width_cm"],
        "order_item_id": row["order_item_id"],
        "product_id": product_id,
        "seller_id": row["seller_id"],
        "shipping_limit_date": row["shipping_limit_date"],
        "price": row["price"],
        "freight_value": row["freight_value"]
    })


# The nested structure built above.
data = {
    "customers": customers,
}

# Write one action line plus one document line per order. Only the order
# payload is written; the customer-level fields collected above are not part
# of the emitted documents.
with open('data_non_structured.json', 'w') as f:
    for customer_data in data["customers"].values():
        for order_id, order_data in customer_data["orders"].items():
            # The action names the target index and uses the order id as _id.
            action = {
                "index": {
                    "_index": "salles",  # index name
                    "_id": order_id  # order id doubles as the document id
                }
            }
            f.write(json.dumps(action) + '\n')
            f.write(json.dumps(order_data) + '\n')