In [8]:
import io
import csv

import psycopg2
from getpass import getpass

from sqlalchemy import create_engine, text
from tqdm.auto import tqdm

import pandas as pd
import numpy as np

In [2]:
db_params = dict(user="jacobgdt@look-inna-book-postgres-hasura-pg-server",
                 password=getpass('Password: '),
                 host="look-inna-book-postgres-hasura-pg-server.postgres.database.azure.com",
                 port="5432",
                 database="look-inna-book"
                )

Password: ········


In [3]:
conn_str = 'postgresql://{user}:{password}@{host}:{port}/{database}'
engine = create_engine(conn_str.format(**db_params))

engine.table_names()

['final."Book"',
 'Book',
 'PaymentInfo',
 'ShippingAddress',
 'Order',
 'Store',
 'Offer',
 'author',
 'book',
 'authorbook',
 'purchaseditem',
 'paymentmethod',
 'purchase',
 'shipment',
 'residentialaddress',
 'cardholder',
 'customer',
 'household',
 'phonenumber',
 'publisher']

## Create Schema

In [11]:
# This doesn't work for whatever reason.
# Can just run in pgAdmin or Hasura.

with open('../SQL/DDL.sql') as f:
    schema = f.read()
    
engine.execute(text(schema))

<sqlalchemy.engine.result.ResultProxy at 0x11ce88050>

## Inserting Book Data

In [12]:
df = pd.read_csv('./amazon_books.csv') \
        .drop(columns=['filename', 'category_id']) \
        .rename(columns={
            'ASIN': 'asin',
            'image_url': 'coverURL'
        })

random_field = lambda low, high: np.random.randint(low, high, (len(df),))
df = df.assign(
    asin=lambda df: df.asin.apply(lambda asin: asin[:10]),
    publisherID=np.nan,
    pages=random_field(100, 1000),
    price=(np.random.rand() * random_field(2,50)).round(2),
    inventory=random_field(12, 100)
)

dump = df.drop(columns="author")
dump.head()

df.head()

Unnamed: 0,asin,coverURL,title,author,category,publisherID,pages,price,inventory
0,761183272,http://ecx.images-amazon.com/images/I/61Y5cOdH...,Mom's Family Wall Calendar 2016,Sandra Boynton,Calendars,,586,39.09,38
1,1623439671,http://ecx.images-amazon.com/images/I/61t-hrSw...,Doug the Pug 2016 Wall Calendar,Doug the Pug,Calendars,,365,11.97,36
2,B00O80WC6I,http://ecx.images-amazon.com/images/I/41X-KQqs...,"Moleskine 2016 Weekly Notebook, 12M, Large, Bl...",Moleskine,Calendars,,112,23.93,79
3,761182187,http://ecx.images-amazon.com/images/I/61j-4gxJ...,365 Cats Color Page-A-Day Calendar 2016,Workman Publishing,Calendars,,278,6.38,43
4,1578052084,http://ecx.images-amazon.com/images/I/51Ry4Tsq...,Sierra Club Engagement Calendar 2016,Sierra Club,Calendars,,901,29.52,68


In [13]:
# https://stackoverflow.com/a/55495065/6766123
def psql_insert_copy(table, conn, keys, data_iter):
    # gets a DBAPI connection that can provide a cursor
    data_iter = tqdm(data_iter)
    dbapi_conn = conn.connection
    
    with dbapi_conn.cursor() as cur:
        s_buf = io.StringIO()
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)

        columns = ', '.join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = '{}.{}'.format(table.schema, table.name)
        else:
            table_name = table.name

        sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)

In [15]:
dump.columns = [c.lower() for c in dump.columns]
dump.to_sql('book', engine, schema="final", if_exists="append", index=False, method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Inserting Author Data

In [None]:
# delete all rows; can't use replace because of dependencies
engine.execute('truncate final."author";') 

seq_key = engine.execute("""SELECT pg_get_serial_sequence('final."author"', 'authorid');""").fetchall()[0][0]
reset_pk = "ALTER SEQUENCE {} RESTART WITH 1".format(seq_key)
engine.execute(reset_pk)

In [17]:
author_dump = df[['author', 'asin']].rename(columns={'author': 'fullname'}).dropna()
author_dump[['fullname']] \
    .drop_duplicates(subset='fullname') \
    .to_sql('author', engine, schema="final", if_exists="append", index=False, method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [19]:
engine.execute('SELECT COUNT(*) FROM final."author"').fetchall()

[(117374,)]

In [20]:
query = engine.execute('SELECT * FROM final."author"')
author_ids = pd.DataFrame(query.fetchall(), columns=['authorid', 'fullname'])

author_ids.head()

Unnamed: 0,authorid,fullname
0,1,Sandra Boynton
1,2,Doug the Pug
2,3,Moleskine
3,4,Workman Publishing
4,5,Sierra Club


In [21]:
writtenby = pd.merge(author_dump, author_ids, on='fullname')

print(writtenby.shape)
writtenby.head()

(193159, 3)


Unnamed: 0,fullname,asin,authorid
0,Sandra Boynton,761183272,1
1,Sandra Boynton,761177817,1
2,Sandra Boynton,761185631,1
3,Sandra Boynton,761137998,1
4,Sandra Boynton,894801996,1


In [22]:
writtenby.drop(columns='fullname') \
        .to_sql('writtenby', engine, schema="final", if_exists="replace", index=False, method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Inserting Users

In [49]:
def random_uuids(n):
    return (str(uuid.uuid4()) for _ in range(n))

In [97]:
import uuid

NUM_USERS = 1_000

users = pd.DataFrame([{'userid': uuid} for uuid in random_uuids(NUM_USERS)])
users.head()

Unnamed: 0,userid
0,2efd3928-b2de-4967-89f9-388307a90fb5
1,15246d67-79e5-4a86-856b-21a11010a4d9
2,24ef752e-ef38-4756-8af1-763fa6bc181e
3,d4de2b55-4ea3-44c8-a093-7a3c1009ddc9
4,eb019055-5dec-45ce-858f-dc4914ad9c90


In [98]:
users.to_sql('customer', engine, schema="final", if_exists="append", index=False, method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Simulating Purchases

In [46]:
from datetime import datetime

# https://stackoverflow.com/a/57722873/6766123

def random_dates(first_date, second_date, n):
    d1 = datetime.strptime(first_date, "%m/%d/%Y %I:%M %p")
    d2 = datetime.strptime(second_date, "%m/%d/%Y %I:%M %p")
    
    first_timestamp = int(d1.timestamp())
    second_timestamp = int(d2.timestamp())
    random_timestamps = np.random.randint(first_timestamp, second_timestamp, (n,))
    
    return pd.Series(random_timestamps).apply(datetime.fromtimestamp)

In [99]:
NUM_PURCHASES = np.random.randint(2000, 3000)
NUM_SAMPLED_USERS = np.random.randint(NUM_USERS//3, NUM_USERS//3*2)

purchases = users.sample(NUM_SAMPLED_USERS) \
                .sample(NUM_PURCHASES, replace=True) \
                .assign(
                    orderid=list(random_uuids(NUM_PURCHASES)),
                    ordertime=random_dates("1/1/2018 1:30 PM", "1/1/2019 4:50 AM", NUM_PURCHASES)
                )

assert purchases.userid.isin(users.userid).all()

print(purchases.shape)
purchases.head()

(2600, 3)


Unnamed: 0,userid,orderid,ordertime
117,b2280b1b-c036-4d55-b600-4b501326c3e2,b5625b33-f5ed-457f-abae-11981a26c1a6,2018-08-28 03:59:09
66,c3d708dc-9781-4089-8d9f-e412e197b4f1,7adcd152-b418-46f8-a338-6b15b6550f11,2018-08-31 14:50:40
762,b569642b-b337-45fc-bd55-85a170f67eb2,faaff1c1-f6db-4cc8-9f6f-2b036ad76ae4,2018-05-09 23:31:55
88,e84c0ee5-257c-4915-b855-5d7fcf62a6f4,0ac758ef-09ff-47e3-8ace-7878984b1b9a,2018-12-17 05:19:31
189,d26a0954-ac0a-42fc-80f0-987ec718df5c,f4a3e9af-e47a-4d39-bd41-9508db913b93,2018-06-28 02:18:15


In [106]:
# https://stackoverflow.com/a/50425683/6766123
def softmax(x, axis=None):
    x = x - x.max(axis=axis, keepdims=True)
    y = np.exp(x)
    return y / y.sum(axis=axis, keepdims=True)

MAX_QTY = 4
PURCHASE_QUANTITIES = list(range(1,MAX_QTY+1))
QTY_WEIGHTS = softmax(-2*np.array(PURCHASE_QUANTITIES))
QTY_WEIGHTS

array([0.86495488, 0.11705891, 0.0158422 , 0.00214401])

In [107]:
NUM_PURCHASED_ITEMS = np.random.randint(8000, 12_000)
purchased_items = df[['asin']].sample(NUM_PURCHASED_ITEMS, replace=True) \
                            .assign(
                                orderid=purchases.orderid.sample(NUM_PURCHASED_ITEMS, replace=True).values,
                                quantity=pd.Series(PURCHASE_QUANTITIES) \
                                            .sample(NUM_PURCHASED_ITEMS, 
                                                    replace=True, 
                                                    weights=QTY_WEIGHTS) \
                                            .values
                            )

print(purchased_items.shape)
purchased_items.head()

(9208, 3)


Unnamed: 0,asin,orderid,quantity
161362,1558964975,e1d44d9c-2623-4fd0-b1aa-4744769b7218,1
128808,870204629,4402095e-aab7-47fe-a332-556786dd6e0e,2
131587,038574238X,7624058b-b0eb-4e13-a556-6583bfa0324b,1
103694,177041133X,4952d3aa-e969-4002-885b-a086ad858419,1
10934,449224430,1d175dbe-01ab-4bce-bf23-d877b7e97dde,1


In [108]:
purchased_items.orderid.value_counts()

165a947e-fe58-472d-a1f0-e2f3e7525964    11
edee4bf8-fe4c-4c84-9aad-0bfcb9ffe242    10
4402095e-aab7-47fe-a332-556786dd6e0e    10
adf9f1dc-5137-491a-ab92-26fa89bc895e    10
78ab8c66-cd42-4887-900e-0f22545013ca    10
                                        ..
9298b769-8c6c-4d59-a177-346744307a4d     1
5b1ad1b3-456f-4850-8b4a-0f1b4dd4dae0     1
891a98c9-56d2-477a-88ab-55a581c7d9b3     1
437c6afc-e8b5-4e6d-82bf-2ccd2b1dc111     1
135c6237-9831-4393-8f65-8862e306d98b     1
Name: orderid, Length: 2525, dtype: int64

In [109]:
purchased_items.quantity.value_counts()

1    7955
2    1080
3     150
4      23
Name: quantity, dtype: int64

In [95]:
'1db54b37-1114-4f0d-a510-d51a28ab04ed' in users.userid.values

True

In [110]:
purchases.to_sql('purchase', engine, schema="final", if_exists="append", index=False, method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [111]:
purchased_items.to_sql('purchaseditem', 
                       engine, 
                       schema="final", 
                       if_exists="append", 
                       index=False, 
                       method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


