In [1]:
%run 02_function_get_database_connection.ipynb

In [5]:
def get_cursor(connection):
    """Open and return a new cursor on the given database connection.

    `connection` is any object exposing a `cursor()` method; whatever that
    method returns is handed back unchanged.
    """
    new_cursor = connection.cursor()
    return new_cursor

In [2]:
%run 06_reading_data_from_file.ipynb

In [3]:
orders.head(3)

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE


In [4]:
order_items.head(3)

Unnamed: 0,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
0,1,1,957,1,299.98,299.98
1,2,2,1073,1,199.99,199.99
2,3,2,502,5,250.0,50.0


In [6]:
# Parameterized INSERT for the orders table: one %s placeholder per column,
# filled from each record passed to cursor.execute / cursor.executemany.
query = """INSERT INTO orders
         (order_id, order_date, order_customer_id, order_status)
         VALUES
         (%s, %s, %s, %s)"""

```{note}
Each iteration inserts and commits one row. Commit is quite expensive, as it results in a database checkpoint.
```

In [56]:
def load_orders(connection, cursor, query, data):
    """Insert records one at a time, committing after every single insert.

    Parameters:
        connection: object whose `commit()` is called after each insert.
        cursor: object whose `execute(query, params)` performs the insert.
        query: parameterized statement passed to every `execute` call.
        data: iterable of records; each record supplies the parameters
            for one `execute` call.
    """
    for row in data:
        # One statement and one commit per record.
        cursor.execute(query, row)
        connection.commit()

In [57]:
cursor = get_cursor(retail_connection)

In [58]:
%%time
# Only the first 10,000 orders are loaded here (note the [:10000] slice) —
# presumably because committing after every row is too slow for the full set.
load_orders(retail_connection, cursor, query, orders.values.tolist()[:10000])

CPU times: user 394 ms, sys: 419 ms, total: 813 ms
Wall time: 6.67 s


In [59]:
# Empty the orders table so the next load variant starts from an empty table.
# No commit is issued in this cell — presumably the TRUNCATE is committed
# together with the next load; TODO confirm the connection's autocommit mode.
cursor.execute('TRUNCATE TABLE orders')

```{note}
Inserting one row at a time but committing only once at the end. Even though this is much faster than the previous approach, it still transfers one record at a time between the Python engine and the database engine.

We can further tune by leveraging batch insert.
```

In [73]:
def load_orders(connection, cursor, query, data):
    """Insert records one at a time, then commit once after the last insert.

    Parameters:
        connection: object whose `commit()` is called once, after the loop.
        cursor: object whose `execute(query, params)` performs each insert.
        query: parameterized statement passed to every `execute` call.
        data: iterable of records; each record supplies the parameters
            for one `execute` call.
    """
    for row in data:
        cursor.execute(query, row)

    # Single commit covering every insert issued above.
    connection.commit()

In [74]:
cursor = get_cursor(retail_connection)

In [75]:
%%time
# Inserting all orders
load_orders(retail_connection, cursor, query, orders.values.tolist())

CPU times: user 1.14 s, sys: 998 ms, total: 2.14 s
Wall time: 4.8 s


In [72]:
# Empty the orders table so the next load variant starts from an empty table.
cursor.execute('TRUNCATE TABLE orders')

```{note}
All the records will be inserted as part of one batch insert operation. If there is a lot of data to be inserted, this might start running into issues such as running out of memory.

Also, if the job fails in the middle, all the data transferred thus far will be lost. Hence it is better to split the data into batches of a manageable size, and to insert and commit one batch at a time.
```

In [69]:
def load_orders(connection, cursor, query, data):
    """Insert all records with a single `executemany` call, then commit.

    Parameters:
        connection: object whose `commit()` is called once at the end.
        cursor: object whose `executemany(query, rows)` performs the insert.
        query: parameterized statement applied to every record.
        data: collection of records, passed to `executemany` as-is.
    """
    # The whole data set goes to the cursor in one call.
    cursor.executemany(
        query,
        data,
    )
    connection.commit()

In [70]:
cursor = get_cursor(retail_connection)

In [71]:
%%time
# Inserting all orders
load_orders(retail_connection, cursor, query, orders.values.tolist())

CPU times: user 1.33 s, sys: 927 ms, total: 2.26 s
Wall time: 4.91 s


```{note}
You might not see a significant difference in performance, as our database is running on the same server as the code that inserts the data.
```

In [76]:
# Empty the orders table so the next load variant starts from an empty table.
cursor.execute('TRUNCATE TABLE orders')

In [77]:
def load_orders(connection, cursor, query, data, batch_size=10000):
    """Insert records in batches, committing after each batch.

    Each slice of at most `batch_size` records is sent with one
    `cursor.executemany` call and committed immediately.

    Parameters:
        connection: object whose `commit()` is called after every batch.
        cursor: object whose `executemany(query, rows)` performs the insert.
        query: parameterized statement applied to every record.
        data: sequence of records; must support `len()` and slicing.
        batch_size: maximum number of records per batch; must be positive.

    Raises:
        ValueError: if `batch_size` is zero or negative. A negative value
            would otherwise make `range()` empty and silently insert nothing.
    """
    if batch_size <= 0:
        raise ValueError(
            "batch_size must be a positive integer, got {!r}".format(batch_size)
        )

    for start in range(0, len(data), batch_size):
        # Slicing past the end is safe: the last batch is simply shorter.
        cursor.executemany(query, data[start:start + batch_size])
        connection.commit()

In [78]:
cursor = get_cursor(retail_connection)

In [79]:
%%time
# Inserting all orders
load_orders(retail_connection, cursor, query, orders.values.tolist())

CPU times: user 1.14 s, sys: 991 ms, total: 2.13 s
Wall time: 4.8 s


In [81]:
%load_ext sql

In [82]:
# NOTE(review): the database password is hardcoded in this URL and %env echoes
# it into the cell output; consider supplying DATABASE_URL from outside the
# notebook and clearing the output before sharing.
%env DATABASE_URL=postgresql://itversity_retail_user:retail_password@localhost:5432/itversity_retail_db

env: DATABASE_URL=postgresql://itversity_retail_user:retail_password@localhost:5432/itversity_retail_db


In [83]:
%%sql

SELECT count(1) FROM orders

1 rows affected.


count
68883
