# Generador de datos sintéticos para actualizaciones de la DB (deltas)

## Instalación de librerías necesarias

In [0]:
%pip install sdv
%pip install sqlalchemy
%pip install --upgrade pandas
%pip install boto3

## Importación de librerías necesarias

In [0]:
import pandas as pd
import numpy as np
import requests
from sqlalchemy import create_engine
from sdv.tabular import GaussianCopula
import random
import boto3
import zipfile
import os
import zlib

## Descarga del modelo ya entrenado

In [0]:
# Download the pre-trained generator from the project's S3 bucket.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get('https://proyecto-grupal-henry.s3.amazonaws.com/generador_deltas.pkl', timeout=60)
# Fail loudly on HTTP errors so an error page is never written to disk as the model file.
r.raise_for_status()
with open('./generador_deltas.pkl', 'wb') as f:
    f.write(r.content)

## Generación de datos sintéticos con el modelo

In [0]:
# NOTE(review): loading a .pkl unpickles it, which can execute arbitrary code;
# only load files from a source you trust.
# `load` is called on the class directly, so no throwaway empty model is built first.
loaded = GaussianCopula.load('generador_deltas.pkl')

# Draw the synthetic rows that the following cells clean up and split into tables.
deltas = loaded.sample(num_rows=100000)

## Preparación de los datos sintéticos para que sean coherentes con las fechas y no haya órdenes duplicadas

In [0]:
# Lowers the risk of duplicates by making pseudo-random changes to the order_id values.
def random_id_replacer(string):
    """Return one pseudo-random character drawn from ``0-9`` and ``a-z``.

    The one-argument signature lets this be used as a replacement callback
    (for example with ``Series.str.replace(..., regex=True)``).

    Args:
        string: The value supplied by the caller; it is ignored.

    Returns:
        A single-character string from ``'0123456789abcdefghijklmnopqrstuvwxyz'``.
    """
    # A single choice is enough: choosing again from a one-character string
    # always returns that same character.
    return random.choice('0123456789abcdefghijklmnopqrstuvwxyz')

# Replace every lowercase letter of each order_id with a random character.
# The callback is `random_id_replacer` (the previous `rand` was never defined),
# and the result is assigned back; `str.replace` returns a new Series and
# does not modify `deltas` in place.
deltas['order_id'] = deltas['order_id'].str.replace('[a-z]', random_id_replacer, regex=True)

In [0]:
# Alphabet used to build replacement order_ids: digits plus all lowercase letters.
caracteres = '0123456789abcdefghijklmnopqrstuvwxyz'


def pachin(_valor=None):
    """Return a new pseudo-random 32-character id built from ``caracteres``.

    Args:
        _valor: Ignored; accepted so the function can be passed to ``Series.apply``.

    Returns:
        A 32-character string.
    """
    return ''.join(random.choice(caracteres) for _ in range(32))


# Give every row its own freshly generated id. `apply` needs a callable that is
# invoked once per row; a single pre-built string cannot be applied.
deltas['order_id'] = deltas['order_id'].apply(pachin)

# Safety net: if two generated ids ever collide, keep only the first row.
deltas = deltas.drop_duplicates('order_id', keep='first')
# Keep only the rows whose dates are mutually consistent:
# purchase precedes approval, carrier hand-over and estimated delivery;
# approval precedes carrier hand-over and estimated delivery;
# customer delivery is not earlier than carrier hand-over.
compra = deltas['order_purchase_timestamp']
aprobacion = deltas['order_approved_at']
entrega_transportista = deltas['order_delivered_carrier_date']
entrega_cliente = deltas['order_delivered_customer_date']
entrega_estimada = deltas['order_estimated_delivery_date']

coherentes = (
    compra.lt(aprobacion)
    & compra.lt(entrega_transportista)
    & compra.lt(entrega_estimada)
    & aprobacion.lt(entrega_transportista)
    & aprobacion.lt(entrega_estimada)
    & entrega_cliente.ge(entrega_transportista)
)
delta = deltas.loc[coherentes]

## Generación de las tablas deltas finales desde el delta unificado

In [0]:
# Split the unified, filtered delta into one frame per destination table.
# Each frame is built as select -> rename, so it is a new object rather than a
# view on `delta`.

delta_orders = delta[['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', "order_delivered_carrier_date","order_delivered_customer_date","order_estimated_delivery_date"]].rename(
    columns={'order_purchase_timestamp':'purchase_timestamp', 'order_delivered_carrier_date':'delivered_carrier_date', 'order_delivered_customer_date':'delivered_customer_date', 'order_estimated_delivery_date':'estimated_delivery_date'}
)

delta_order_items = delta[["order_id","order_item_id","product_id","seller_id","shipping_limit_date","price","freight_value"]].rename(
    columns={'order_item_id':'product_quantity'}
)

delta_order_payments = delta[["order_id","payment_sequential","payment_type","payment_installments","payment_value"]].drop(
    columns='payment_sequential'
)

delta_order_reviews = delta[["review_id","order_id","review_score","review_comment_title","review_comment_message","review_creation_date","review_answer_timestamp"]].rename(
    columns={'review_comment_message':'comment_message', 'review_creation_date':'creation_date', 'review_comment_title':'comment_title', 'review_answer_timestamp':'answer_timestamp'}
)

delta_products = delta[["product_id","product_category_name","product_name_lenght","product_description_lenght","product_photos_qty","product_weight_g","product_length_cm","product_height_cm","product_width_cm"]].rename(
    columns={'product_description_lenght':'description_length', 'product_name_lenght':'name_length', 'product_weight_g':'weight_g', 'product_length_cm':'length_cm', 'product_height_cm':'height_cm', 'product_width_cm':'width_cm', 'product_photos_qty':'photos_quantity'}
)

delta_sellers = delta[["seller_id","seller_zip_code_prefix","seller_city","seller_state"]].rename(
    columns={'seller_zip_code_prefix':'zip_code_prefix', 'seller_city':'city', 'seller_state':'state'}
)

delta_customers = delta[["customer_id","customer_unique_id","customer_zip_code_prefix","customer_city","customer_state"]].rename(
    columns={'customer_zip_code_prefix':'zip_code_prefix', 'customer_city':'city', 'customer_state':'state'}
)

# Fill missing review texts with placeholders. The result is assigned back
# instead of calling `df[col].fillna(..., inplace=True)`: that chained in-place
# form acts on a temporary column object, warns on recent pandas, and leaves the
# frame unchanged once Copy-on-Write is enabled.
delta_order_reviews = delta_order_reviews.fillna({'comment_message': 'sem_comentarios', 'comment_title': 'sem_titulo'})

## Guardado de los deltas en archivos .csv y luego todos juntos en un archivo .zip

In [0]:
# Map each output file name to the frame it stores; insertion order is the
# order in which the files are written and later added to the archive.
tablas_por_archivo = {
    'delta_orders.csv': delta_orders,
    'delta_products.csv': delta_products,
    'delta_sellers.csv': delta_sellers,
    'delta_order_reviews.csv': delta_order_reviews,
    'delta_order_payments.csv': delta_order_payments,
    'delta_customers.csv': delta_customers,
    'delta_order_items.csv': delta_order_items,
}

# Write one CSV per table with pandas' default options (the index is included).
for nombre_archivo, tabla in tablas_por_archivo.items():
    tabla.to_csv(nombre_archivo)

lista_csvs = list(tablas_por_archivo)

# Bundle all CSVs into a single DEFLATE-compressed archive.
with zipfile.ZipFile('deltas.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zipF:
    for nombre_archivo in lista_csvs:
        zipF.write(nombre_archivo)

In [0]:
# Sanity check: print the os.stat() result for the generated archive
# (its st_size field is the file size in bytes).
print(os.stat('deltas.zip'))

## Guardado del archivo comprimido con los deltas en el DataLake

In [0]:
# Credentials are deliberately NOT written in the notebook. With no arguments,
# boto3 resolves them through its default provider chain: the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables, the shared
# credentials file, or an attached IAM role.
# NOTE(review): the access key that used to be hardcoded here has been exposed
# in the notebook and should be revoked and rotated.
session = boto3.Session()

# Create the S3 resource from the session.
s3 = session.resource('s3')

# Upload the archive to the data lake bucket under the same key as the local file name.
s3.Bucket('proyecto-grupal-henry').upload_file("deltas.zip", "deltas.zip")