In [8]:
import io
import csv

import psycopg2
from getpass import getpass

from sqlalchemy import create_engine, text
from tqdm.auto import tqdm

import pandas as pd
import numpy as np

In [194]:
db_params = dict(user=f"{input('User: ')}@look-inna-book-postgres-hasura-pg-server",
                 password=getpass('Password: '),
                 host="look-inna-book-postgres-hasura-pg-server.postgres.database.azure.com",
                 port="5432",
                 database="look-inna-book"
                )

User: jacobgdt
Password: ········


In [195]:
conn_str = 'postgresql://{user}:{password}@{host}:{port}/{database}'
engine = create_engine(conn_str.format(**db_params))

engine.table_names()

['final."Book"',
 'PaymentInfo',
 'ShippingAddress',
 'Order',
 'Store',
 'authorbook',
 'purchaseditem',
 'paymentmethod',
 'author',
 'book',
 'shipment',
 'cardholder',
 'residentialaddress',
 'customer',
 'household',
 'phonenumber',
 'purchase',
 'Book',
 'Offer',
 'publisher']

## Data Generation Status

Currently automating creation of:

* ✅ author
* ✅ book
* ✅ cardholder 
* ✅ customer 
* ✅ household 
* ✅ paymentmethod 
* ❌ phonenumber
* ❌ publisher
* ✅ purchase 
* ✅ purchaseditem 
* ❌ residentialaddress
* ❌ shipment
* ✅ writtenby 


## Create Schema

In [11]:
# This doesn't work for whatever reason.
# Can just run in pgAdmin or Hasura.

with open('../SQL/DDL.sql') as f:
    schema = f.read()
    
engine.execute(text(schema))

<sqlalchemy.engine.result.ResultProxy at 0x11ce88050>

## Inserting Book Data

In [12]:
df = pd.read_csv('./amazon_books.csv') \
        .drop(columns=['filename', 'category_id']) \
        .rename(columns={
            'ASIN': 'asin',
            'image_url': 'coverURL'
        })

random_field = lambda low, high: np.random.randint(low, high, (len(df),))
df = df.assign(
    asin=lambda df: df.asin.apply(lambda asin: asin[:10]),
    publisherID=np.nan,
    pages=random_field(100, 1000),
    price=(np.random.rand() * random_field(2,50)).round(2),
    inventory=random_field(12, 100)
)

dump = df.drop(columns="author")
dump.head()

df.head()

Unnamed: 0,asin,coverURL,title,author,category,publisherID,pages,price,inventory
0,761183272,http://ecx.images-amazon.com/images/I/61Y5cOdH...,Mom's Family Wall Calendar 2016,Sandra Boynton,Calendars,,586,39.09,38
1,1623439671,http://ecx.images-amazon.com/images/I/61t-hrSw...,Doug the Pug 2016 Wall Calendar,Doug the Pug,Calendars,,365,11.97,36
2,B00O80WC6I,http://ecx.images-amazon.com/images/I/41X-KQqs...,"Moleskine 2016 Weekly Notebook, 12M, Large, Bl...",Moleskine,Calendars,,112,23.93,79
3,761182187,http://ecx.images-amazon.com/images/I/61j-4gxJ...,365 Cats Color Page-A-Day Calendar 2016,Workman Publishing,Calendars,,278,6.38,43
4,1578052084,http://ecx.images-amazon.com/images/I/51Ry4Tsq...,Sierra Club Engagement Calendar 2016,Sierra Club,Calendars,,901,29.52,68


In [13]:
# https://stackoverflow.com/a/55495065/6766123
def psql_insert_copy(table, conn, keys, data_iter):
    # gets a DBAPI connection that can provide a cursor
    data_iter = tqdm(data_iter)
    dbapi_conn = conn.connection
    
    with dbapi_conn.cursor() as cur:
        s_buf = io.StringIO()
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)

        columns = ', '.join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = '{}.{}'.format(table.schema, table.name)
        else:
            table_name = table.name

        sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)

In [15]:
dump.columns = [c.lower() for c in dump.columns]
dump.to_sql('book', engine, schema="final", if_exists="append", index=False, method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Inserting Author Data

In [None]:
# delete all rows; can't use replace because of dependencies
engine.execute('truncate final."author";') 

seq_key = engine.execute("""SELECT pg_get_serial_sequence('final."author"', 'authorid');""").fetchall()[0][0]
reset_pk = "ALTER SEQUENCE {} RESTART WITH 1".format(seq_key)
engine.execute(reset_pk)

In [17]:
author_dump = df[['author', 'asin']].rename(columns={'author': 'fullname'}).dropna()
author_dump[['fullname']] \
    .drop_duplicates(subset='fullname') \
    .to_sql('author', engine, schema="final", if_exists="append", index=False, method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [19]:
engine.execute('SELECT COUNT(*) FROM final."author"').fetchall()

[(117374,)]

In [20]:
query = engine.execute('SELECT * FROM final."author"')
author_ids = pd.DataFrame(query.fetchall(), columns=['authorid', 'fullname'])

author_ids.head()

Unnamed: 0,authorid,fullname
0,1,Sandra Boynton
1,2,Doug the Pug
2,3,Moleskine
3,4,Workman Publishing
4,5,Sierra Club


In [21]:
writtenby = pd.merge(author_dump, author_ids, on='fullname')

print(writtenby.shape)
writtenby.head()

(193159, 3)


Unnamed: 0,fullname,asin,authorid
0,Sandra Boynton,761183272,1
1,Sandra Boynton,761177817,1
2,Sandra Boynton,761185631,1
3,Sandra Boynton,761137998,1
4,Sandra Boynton,894801996,1


In [22]:
writtenby.drop(columns='fullname') \
        .to_sql('writtenby', engine, schema="final", if_exists="replace", index=False, method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Inserting Users

In [49]:
def random_uuids(n):
    return (str(uuid.uuid4()) for _ in range(n))

In [97]:
import uuid

NUM_USERS = 1_000

users = pd.DataFrame([{'userid': uuid} for uuid in random_uuids(NUM_USERS)])
users.head()

Unnamed: 0,userid
0,2efd3928-b2de-4967-89f9-388307a90fb5
1,15246d67-79e5-4a86-856b-21a11010a4d9
2,24ef752e-ef38-4756-8af1-763fa6bc181e
3,d4de2b55-4ea3-44c8-a093-7a3c1009ddc9
4,eb019055-5dec-45ce-858f-dc4914ad9c90


In [98]:
users.to_sql('customer', engine, schema="final", if_exists="append", index=False, method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Inserting Credit Cards

In [160]:
from faker import Faker

fake = Faker()
Faker.seed(0)

def random_credit_cards(n):
    for _ in range(n):
        yield {
            'cardnumber': fake.credit_card_number(card_type='visa'),
            'cvv': fake.credit_card_security_code(card_type='visa'),
            'expirationdate': fake.credit_card_expire(end='+5y', date_format='20%y-%m-01')
        }

In [161]:
NUM_CARDS = int(NUM_USERS*1.75)

random_cardholders = users.sample(NUM_CARDS, replace=True).values.squeeze()
random_cc_iter = zip(random_cardholders, random_credit_cards(NUM_CARDS))

cc_users = pd.DataFrame([{'userid': uid, **cc} for (uid, cc) in random_cc_iter])

print(cc_users.shape)
cc_users.head()

(1750, 4)


Unnamed: 0,userid,cardnumber,cvv,expirationdate
0,36197d06-b097-498f-88f7-071fb4a85a94,4604876475938242,219,2022-06-01
1,d37b1d38-7798-4f15-bc9a-872d97ff935a,4892411578156590,387,2024-01-01
2,bbf13de6-a76b-4d6d-b8d1-0db788c4d936,4408016097535138,933,2021-07-01
3,edb8ef92-5f75-4184-8877-81dc6dca60b6,4711587148418583,398,2025-04-01
4,3814b635-8886-43e6-8cb8-2b4d31cfee82,4196593423209477,112,2021-07-01


In [162]:
payment_method = cc_users.drop(columns='userid')
payment_method.to_sql('paymentmethod', engine, schema="final", if_exists="append", index=False, method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [163]:
cardholders = cc_users[['userid', 'cardnumber']]
cardholders.to_sql('cardholder', engine, schema="final", if_exists="append", index=False, method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Simulating Purchases

In [46]:
from datetime import datetime

# https://stackoverflow.com/a/57722873/6766123

def random_dates(first_date, second_date, n):
    d1 = datetime.strptime(first_date, "%m/%d/%Y %I:%M %p")
    d2 = datetime.strptime(second_date, "%m/%d/%Y %I:%M %p")
    
    first_timestamp = int(d1.timestamp())
    second_timestamp = int(d2.timestamp())
    random_timestamps = np.random.randint(first_timestamp, second_timestamp, (n,))
    
    return pd.Series(random_timestamps).apply(datetime.fromtimestamp)

In [164]:
NUM_PURCHASES = np.random.randint(2000, 3000)
NUM_SAMPLED_USERS = np.random.randint(NUM_USERS//3, NUM_USERS//3*2)

purchases = cardholders.sample(NUM_SAMPLED_USERS) \
                .sample(NUM_PURCHASES, replace=True) \
                .assign(
                    orderid=list(random_uuids(NUM_PURCHASES)),
                    ordertime=random_dates("1/1/2018 1:30 PM", "1/1/2019 4:50 AM", NUM_PURCHASES)
                )

assert purchases.userid.isin(cardholders.userid).all()

print(purchases.shape)
purchases.head()

(2392, 4)


Unnamed: 0,userid,cardnumber,orderid,ordertime
238,339d6cec-bc9b-4446-88f2-069cb9977dae,4585798032331211,a622d9d2-a1f8-46cd-83e3-bbc125355dd9,2018-05-24 07:16:17
640,3937a551-808a-4256-a347-d57256cf56b1,4117209234437951,2c1d3c41-b183-4f25-82b8-9fd2a703af40,2018-05-02 15:23:21
189,419ca2eb-2051-4525-9e60-395cc28615a6,4290421242648656,0fff2202-5dfa-44ca-a80f-3a7732eda3a4,2018-05-23 03:01:35
1482,c5a79af9-75ab-4b11-8b26-d61ff73d01f8,4726252170614475,fff51b89-d35f-4319-bd87-bed1df42abae,2018-09-04 17:33:16
878,bfea4a4f-c75e-4c84-9ad8-2f0f4317738d,4670488300424288,1818cf0d-b8ea-4819-b286-6b4a37377528,2018-07-11 05:12:52


In [165]:
# https://stackoverflow.com/a/50425683/6766123
def softmax(x, axis=None):
    x = x - x.max(axis=axis, keepdims=True)
    y = np.exp(x)
    return y / y.sum(axis=axis, keepdims=True)

MAX_QTY = 4
PURCHASE_QUANTITIES = list(range(1,MAX_QTY+1))
QTY_WEIGHTS = softmax(-2*np.array(PURCHASE_QUANTITIES))
QTY_WEIGHTS

array([0.86495488, 0.11705891, 0.0158422 , 0.00214401])

In [166]:
NUM_PURCHASED_ITEMS = np.random.randint(8000, 12_000)
purchased_items = df[['asin']].sample(NUM_PURCHASED_ITEMS, replace=True) \
                            .assign(
                                orderid=purchases.orderid.sample(NUM_PURCHASED_ITEMS, replace=True).values,
                                quantity=pd.Series(PURCHASE_QUANTITIES) \
                                            .sample(NUM_PURCHASED_ITEMS, 
                                                    replace=True, 
                                                    weights=QTY_WEIGHTS) \
                                            .values
                            )

print(purchased_items.shape)
purchased_items.head()

(10046, 3)


Unnamed: 0,asin,orderid,quantity
205418,1564648990,ef1cd360-073a-4703-9f15-88a4ed409ce6,1
158462,1452896623,0062ac6b-7568-4435-8229-6069fc747f27,1
189837,159775451X,f58b8b9b-1866-442c-b06e-7c98be482fec,1
22693,62128450,952ce858-d059-44f0-9ae5-f2d8a9084364,1
535,1419717529,1d36c23d-0602-44a1-9434-9ec79a50704f,1


In [167]:
purchased_items.orderid.value_counts()

07bed1da-add4-4a46-85ea-fe0457db1597    14
bda313f7-3667-42a7-bcae-1735361c9eef    14
fb561465-5eed-455e-b8bd-24948bc6d993    12
1b444404-9e43-4381-831c-adbf93bceea1    11
d2812f61-70ce-4256-a2af-1114ea9a92cd    11
                                        ..
7d20f08e-96ee-453e-8ef3-99edc95eb17a     1
bfa8e91a-3dcf-410b-aba0-50fd28e59a58     1
07e10ff8-3774-4431-b262-4533d262091a     1
275b7b92-2b3a-4f24-a24d-a22f0b2b157d     1
6d94b173-e3a5-4c9e-821c-9a62606775d4     1
Name: orderid, Length: 2360, dtype: int64

In [168]:
purchased_items.quantity.value_counts()

1    8686
2    1187
3     156
4      17
Name: quantity, dtype: int64

In [169]:
purchases.to_sql('purchase', engine, schema="final", if_exists="append", index=False, method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [170]:
purchased_items.to_sql('purchaseditem', 
                       engine, 
                       schema="final", 
                       if_exists="append", 
                       index=False, 
                       method=psql_insert_copy)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


