## Best Practices

Let us go through some best practices for performing batch loads.
* We should minimize the number of connections to database.
* We should avoid executing queries using hard coded values. Rather, we should prefer bind variables.
* Too much committing is bad as commit incurs overhead.
* If we have to load a considerable amount of data, we should consider committing every 1,000 records or 10,000 records, or even more, depending on the capacity of the database.
* Most mainstream databases support direct path I/O or batch loading, which might perform better compared to looping, inserting and committing the data record by record.

In [None]:
import mysql.connector as mc
from mysql.connector import errorcode as ec
import pandas as pd
import datetime

In [None]:
def get_connection(user, password, host, db):
    """Open a MySQL connection using ``mysql.connector``.

    Args:
        user: Database user name.
        password: Password for ``user``.
        host: Host name of the MySQL server.
        db: Name of the database to connect to.

    Returns:
        The connection object returned by ``mc.connect``.

    Raises:
        mc.Error: If the connection cannot be established. A short message
            is printed first, then the original error is re-raised.
    """
    try:
        return mc.connect(
            user=user,
            password=password,
            host=host,
            database=db,
        )
    except mc.Error as error:
        if error.errno == ec.ER_ACCESS_DENIED_ERROR:
            print("Invalid Credentials")
        else:
            # Print the caught exception; the previous code referenced an
            # undefined name here and failed with NameError instead.
            print(error)
        # There is no connection to return, so surface the real cause
        # rather than failing later on an unbound local variable.
        raise

In [None]:
def get_cursor(connection):
    """Create a new cursor on ``connection`` and return it."""
    cursor = connection.cursor()
    return cursor

In [None]:
def get_orders(orders_path=None):
    """Read the header-less orders CSV file into a pandas DataFrame.

    Args:
        orders_path: Path of the CSV file to read. When omitted, the
            notebook's original hard-coded location is used, so existing
            ``get_orders()`` calls keep working.

    Returns:
        A DataFrame with the columns ``order_id``, ``order_date``,
        ``order_customer_id`` and ``order_status``, in that order.
    """
    if orders_path is None:
        # Default location used by the original notebook.
        orders_path = "/Users/itversity/Research/data/retail_db/orders/orders.csv"
    orders_schema = [
        "order_id",
        "order_date",
        "order_customer_id",
        "order_status",
    ]
    # The file has no header row, so column names are supplied explicitly.
    return pd.read_csv(
        orders_path,
        header=None,
        names=orders_schema,
    )

In [None]:
def load_orders(connection, cursor, query, orders):
    """Insert every row of ``orders`` using ``query``, committing per row.

    Args:
        connection: Database connection; ``commit()`` is called after each
            inserted row.
        cursor: Cursor on which ``query`` is executed.
        query: Parameterized INSERT statement taking four values.
        orders: DataFrame with ``order_id``, ``order_date``,
            ``order_customer_id`` and ``order_status`` columns.
    """
    for _, row in orders.iterrows():
        params = (
            row.order_id,
            row.order_date,
            row.order_customer_id,
            row.order_status,
        )
        cursor.execute(query, params)
        # One commit per record: this is the slow baseline of the comparison.
        connection.commit()

In [None]:
# Connect to the demo_db database on localhost as demo_user.
# NOTE(review): credentials are hard-coded here.
connection = get_connection('demo_user', 'itversity', 'localhost', 'demo_db')

In [None]:
# Cursor passed to load_orders to execute the INSERT statements.
cursor = get_cursor(connection)

In [None]:
# Read the orders CSV into a DataFrame; count() reports the non-null values per column.
orders = get_orders()
orders.count()

In [None]:
# Parameterized INSERT: the four %s placeholders are bound to each order's values.
query = ("""INSERT INTO orders
         (order_id, order_date, order_customer_id, order_status)
         VALUES
         (%s, %s, %s, %s)""")

In [None]:
%%time
# The %%time cell magic has to stay on the first line of the cell; it reports how long the load takes.
load_orders(connection, cursor, query, orders)

* Truncate the table, then reduce the frequency of commits: the version below commits only once, after all the records are inserted.

In [None]:
def load_orders(connection, cursor, query, orders):
    """Insert every row of ``orders`` using ``query`` and commit once.

    The current timestamp is printed before the load starts.

    Args:
        connection: Database connection; ``commit()`` is called a single
            time after all rows have been executed.
        cursor: Cursor on which ``query`` is executed.
        query: Parameterized INSERT statement taking four values.
        orders: DataFrame with ``order_id``, ``order_date``,
            ``order_customer_id`` and ``order_status`` columns.
    """
    started_at = datetime.datetime.now()
    print(started_at)
    for _, row in orders.iterrows():
        params = (
            row.order_id,
            row.order_date,
            row.order_customer_id,
            row.order_status,
        )
        cursor.execute(query, params)
    # Single commit for the whole load.
    connection.commit()

In [None]:
# Open a fresh connection to demo_db on localhost as demo_user.
# NOTE(review): credentials are hard-coded here.
connection = get_connection('demo_user', 'itversity', 'localhost', 'demo_db')

In [None]:
# Cursor passed to load_orders to execute the INSERT statements.
cursor = get_cursor(connection)

In [None]:
# Read the orders CSV into a DataFrame; count() reports the non-null values per column.
orders = get_orders()
orders.count()

In [None]:
# Parameterized INSERT: the four %s placeholders are bound to each order's values.
query = ("""INSERT INTO orders
         (order_id, order_date, order_customer_id, order_status)
         VALUES
         (%s, %s, %s, %s)""")

In [None]:
%%time
# Times the load_orders version defined just above (one commit at the end).
# NOTE: no TRUNCATE is issued in these cells; the table has to be emptied separately first.
load_orders(connection, cursor, query, orders)

* Commit every 1,000 records by loading the data in batches. Make sure to truncate the table before invoking this version of the `load_orders` function, which commits once per batch.
* In this case, one insert statement is used to insert 1,000 records at a time. This is more efficient than issuing 1,000 statements for 1,000 records (one statement per record).

In [None]:
def load_orders(connection, cursor, query, orders, batch_size=1000):
    """Insert ``orders`` in batches, committing after each batch.

    The current timestamp is printed before the load starts. Rows are
    accumulated and sent with ``cursor.executemany`` once ``batch_size``
    rows are collected; any remaining rows are sent after the loop.

    Args:
        connection: Database connection; ``commit()`` is called after every
            batch and once more at the end.
        cursor: Cursor on which ``query`` is executed.
        query: Parameterized INSERT statement with one placeholder per
            column of ``orders``.
        orders: DataFrame whose rows are inserted; each row is passed as a
            tuple of all its column values.
        batch_size: Number of rows per ``executemany`` call and commit.
            Defaults to 1000, the value that was previously hard-coded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    print(datetime.datetime.now())
    orders_batch = []
    for _, order in orders.iterrows():
        orders_batch.append(tuple(order))
        if len(orders_batch) == batch_size:
            cursor.executemany(query, orders_batch)
            connection.commit()
            orders_batch = []
    # Flush the final partial batch; skip the call when nothing is left
    # (empty input or a row count that is an exact multiple of batch_size).
    if orders_batch:
        cursor.executemany(query, orders_batch)
    # Always finish with a commit, as the function did before.
    connection.commit()

In [None]:
# Open a fresh connection to demo_db on localhost as demo_user.
# NOTE(review): credentials are hard-coded here.
connection = get_connection('demo_user', 'itversity', 'localhost', 'demo_db')

In [None]:
# Cursor passed to load_orders to execute the batched INSERT statements.
cursor = get_cursor(connection)

In [None]:
# Read the orders CSV into a DataFrame; count() reports the non-null values per column.
orders = get_orders()
orders.count()

In [None]:
# Parameterized INSERT: the four %s placeholders are bound to each order's values.
query = ("""INSERT INTO orders
         (order_id, order_date, order_customer_id, order_status)
         VALUES
         (%s, %s, %s, %s)""")

In [None]:
%%time
# Times the batched load_orders defined just above (executemany plus a commit every 1000 rows).
# NOTE: no TRUNCATE is issued in these cells; the table has to be emptied separately first.
load_orders(connection, cursor, query, orders)