In [1]:
import mysql.connector
from mysql.connector import errorcode

In [2]:
CONFIG_USER = "admin"
CONFIG_PASSWORD = "adminadmin"
CONFIG_HOST = "sf-fires-1.cmbkjcuzgzbs.us-east-2.rds.amazonaws.com"

conn = mysql.connector.connect(
    user=CONFIG_USER, 
    password=CONFIG_PASSWORD,
    host=CONFIG_HOST,
    buffered=True
)

cursor = conn.cursor()

In [3]:
DB_NAME = 'sf_fires_testing'
cursor.execute(f"USE {DB_NAME}")

## Create Table

### Setup

In [4]:
import json

In [5]:
import pickle

with open('col_data_types.pickle', 'rb') as handle:
    columns = pickle.load(handle)

In [6]:
def get_create_str():
    create_str = "CREATE TABLE IF NOT EXISTS sf_fires ("
    for k, v in columns.items():
        create_str += f"`{k.lower()}` {v},"
    create_str += "PRIMARY KEY (id)) ENGINE=InnoDB"
    return create_str

In [7]:
print(get_create_str()[:500])
print()
print(get_create_str()[-500:])

CREATE TABLE IF NOT EXISTS sf_fires (`incident_number` INT,`exposure_number` SMALLINT,`suppression_units` SMALLINT,`suppression_personnel` SMALLINT,`ems_units` SMALLINT,`ems_personnel` SMALLINT,`other_units` SMALLINT,`other_personnel` SMALLINT,`estimated_property_loss` FLOAT,`estimated_contents_loss` FLOAT,`fire_fatalities` SMALLINT,`fire_injuries` SMALLINT,`civilian_fatalities` SMALLINT,`civilian_injuries` SMALLINT,`number_of_alarms` SMALLINT,`floor_of_fire_origin` FLOAT,`number_of_floors_with_

t` VARCHAR(255),`detector_type` VARCHAR(255),`detector_operation` VARCHAR(255),`detector_effectiveness` VARCHAR(255),`detector_failure_reason` VARCHAR(255),`automatic_extinguishing_system_present` VARCHAR(255),`automatic_extinguishing_sytem_type` VARCHAR(255),`automatic_extinguishing_sytem_perfomance` VARCHAR(255),`automatic_extinguishing_sytem_failure_reason` VARCHAR(255),`supervisor_district` VARCHAR(255),`neighborhood_district` VARCHAR(255),`point` VARCHAR(255),PRIMARY KEY (id)) ENGINE=Inno

### Table Creation

In [8]:
def drop_table(cursor):
    drop_table_query = f"DROP TABLE IF EXISTS `sf_fires`"
    cursor.execute(drop_table_query)
    print("Table dropped")

def create_table(cursor):
    create_str = "CREATE TABLE IF NOT EXISTS sf_fires ("
    for k, v in columns.items():
        create_str += f"`{k.lower()}` {v},"
    create_str += "PRIMARY KEY (id)) ENGINE=InnoDB"

    try:
        cursor.execute(create_str)
        print("table created")
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
            print("This table already exists")
        else:
            print(err.msg)

def reinit_table(cursor):
    drop_table(cursor)
    create_table(cursor)

In [9]:
reinit_table(cursor)
conn.commit()

Table dropped
table created


## Populate Table

### Data

In [10]:
import json

In [11]:
with open('data.json') as json_file:
    data = json.load(json_file)

In [12]:
def get_cell_value(item, col):
    if col not in item.keys():
        return None
    if col == "point":
        coords = item["point"].get("coordinates")
        if coords:
            return str(coords)
        else:
            return None
    if "INT" in columns[col]:
        return int(item[col])
    if "FLOAT" in columns[col]:
        return float(item[col])
    return item[col]

In [14]:
from datetime import datetime

In [15]:
import pandas as pd

In [21]:
results = pd.read_csv("comparison_results.csv")
results.head(3)

Unnamed: 0,date,strategy,rows,elapsed


In [105]:
def monitor_strategy(function, data, results, reinit=True):
    init_time = datetime.now()
    if reinit:
        reinit_table(cursor)
    function(data)
    time_now = datetime.now()
    time_diff = time_now - init_time
    row = (init_time, str(function), len(data), time_diff)
    results.loc[len(results)] = row
    results.to_csv("comparison_results.csv")


### Method 1: for loop

In [19]:
# building INSERT query
def get_query_for():
    insert_query = "INSERT INTO sf_fires ("
    for col in columns:
        insert_query += f"{col}, "

    insert_query = f"{insert_query[:-2]}) VALUES ("
    insert_query += "%s, " * len(columns)
    insert_query = insert_query[:-2] + ")"

    return insert_query

print(get_query_for()[:500])
print()
print(get_query_for()[-500:])

INSERT INTO sf_fires (incident_number, exposure_number, suppression_units, suppression_personnel, ems_units, ems_personnel, other_units, other_personnel, estimated_property_loss, estimated_contents_loss, fire_fatalities, fire_injuries, civilian_fatalities, civilian_injuries, number_of_alarms, floor_of_fire_origin, number_of_floors_with_minimum_damage, number_of_floors_with_significant_damage, number_of_floors_with_heavy_damage, number_of_floors_with_extreme_damage, number_of_sprinkler_heads_oper

ector_failure_reason, automatic_extinguishing_system_present, automatic_extinguishing_sytem_type, automatic_extinguishing_sytem_perfomance, automatic_extinguishing_sytem_failure_reason, supervisor_district, neighborhood_district, point) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %

In [42]:
def update_table_for(data):
    insert_query = get_query_for()
    for i, item in enumerate(data):
        insert_query_values = tuple([ get_cell_value(item, col) for col in columns ])
        cursor.execute(insert_query, insert_query_values)
    conn.commit()

In [39]:
monitor_strategy(update_table_for, data[:5])

Table dropped
table created


In [40]:
results

Unnamed: 0,date,strategy,rows,elapsed
0,2021-12-01 00:30:30.956731,<function update_table_for at 0x7f36a2ba71f0>,5,0 days 00:00:02.072320


### Method 2: execute many

In [13]:
def update_table_many(data):
    insert_query = get_query_for()
    values = []
    for item in data:
        row = [ get_cell_value(item, col) for col in columns ]
        values.append(tuple(row))
    cursor.executemany(insert_query, values)
    conn.commit()

In [16]:
def update_table_many_batch(data, batches):
    batch_size = int(len(data) / batches) + 1
    first, last = 0, batch_size
    for i in range(batches):
        data_batch = data[first:last]
        update_table_many(data_batch)
        first = last
        last_new = last + batch_size
        last = last_new if last_new < len(data) else len(data)

In [26]:
reinit_table(cursor)
update_table_many_batch(data, 4)

Table dropped
table created


In [24]:
4000*200

800000

In [9]:
print(max_aux)
print(index)

32
0


In [53]:
update_table_many(data[7:10])

In [54]:
monitor_strategy(update_table_many, data[10:100])

Table dropped
table created


In [58]:
monitor_strategy(update_table_many, data[:100000])

Table dropped
table created


In [59]:
results

Unnamed: 0,date,strategy,rows,elapsed
0,2021-12-01 00:30:30.956731,<function update_table_for at 0x7f36a2ba71f0>,5,0 days 00:00:02.072320
1,2021-12-01 00:35:56.507644,<function update_table_many at 0x7f36a2ba7d30>,90,0 days 00:00:01.452725
2,2021-12-01 00:36:32.500884,<function update_table_many at 0x7f36a2ba7d30>,10000,0 days 00:00:13.283197
3,2021-12-01 00:37:08.909547,<function update_table_many at 0x7f36a2ba7d30>,100000,0 days 00:04:54.775291


### Method 3: long execute command

In [132]:
def get_cell_value_str(item, col):
    if col not in item.keys():
        return "\'None\'"
    if col == "point":
        coords = item["point"].get("coordinates")
        if coords:
            return f"\'{str(coords)}\'"
        else:
            return "\'None\'"
    
    data_type = columns[col]
    value = item[col]
    if "INT" in data_type:
        return str(int(value))
    if "TIMESTAMP" in data_type:
        return f"\'{value.replace('T', ' ')}\'"
    
    result = value
    result = result.replace(",", "\,")
    result = result.replace("'", "\'")
    return f"\'{result}\'"

In [133]:
# building INSERT query
def get_query_long():
    insert_query = "INSERT INTO sf_fires ("
    for col in columns:
        insert_query += f"{col}, "
    insert_query = f"{insert_query[:-2]}) VALUES "
    return insert_query

# VALUES for insert query
def update_table_long(data):
    insert_query = get_query_long()
    for i, item in enumerate(data):
        values = "("
        for j, col in enumerate(columns):
            ending = "), " if j == len(columns) - 1 else ", "
            values += get_cell_value_str(item, col) + ending
        insert_query += values
    insert_query = insert_query[:-2] 
    print(insert_query)
    cursor.execute(insert_query)
    conn.commit()

### Method 4: using pandas

In [112]:
conn.close()

In [87]:
import pandas as pd
from sqlalchemy import create_engine

In [108]:
SQL_USER = "admin"
SQL_PASSWORD = "adminadmin"
HOST = "sf-fires-1.cmbkjcuzgzbs.us-east-2.rds.amazonaws.com"
DB_NAME = "sf_fires_testing"
MYSQL_CONN = "mysqlconnector"
sql_config = f"mysql+{MYSQL_CONN}://{SQL_USER}:{SQL_PASSWORD}@{HOST}/{DB_NAME}"
sql_engine = create_engine(sql_config)
conn = sql_engine.connect()

In [92]:
import numpy as np

In [101]:
def get_df(data):
    df = pd.DataFrame(data)
    for col, data_type in columns.items():
        if "INT" in data_type:
            df[col] = pd.to_numeric(df[col], downcast="integer")
        elif "TIME" in data_type:
            df[col] = pd.to_datetime(df[col])
    return df

def fill_point(df):
    for i in df.index:
        value = df.at[i, "point"]
        try:
            df.at[i, "point"] = str(value["coordinates"])
        except:
            df.at[i, "point"] = None

def update_table_pandas(data):
    df = get_df(data)
    fill_point(df)
    df.to_sql("sf_fires", conn, if_exists="replace")

In [106]:
monitor_strategy(update_table_pandas, data[:50], results, reinit=False)

In [107]:
results

Unnamed: 0,date,strategy,rows,elapsed
0,2021-12-01 00:30:30.956731,<function update_table_for at 0x7f36a2ba71f0>,5,0 days 00:00:02.072320
1,2021-12-01 00:35:56.507644,<function update_table_many at 0x7f36a2ba7d30>,90,0 days 00:00:01.452725
2,2021-12-01 00:36:32.500884,<function update_table_many at 0x7f36a2ba7d30>,10000,0 days 00:00:13.283197
3,2021-12-01 00:37:08.909547,<function update_table_many at 0x7f36a2ba7d30>,100000,0 days 00:04:54.775291
4,2021-12-01 01:06:25.579237,<function update_table_pandas at 0x7f36a1f04b80>,50,0 days 00:00:06.032035
