In [1]:
import os

import mysql.connector
from mysql.connector import errorcode

In [2]:
# Connection settings are read from the environment so real credentials do not
# have to live in the notebook. The literal fallbacks keep the notebook running
# exactly as before when the variables are unset.
# SECURITY: the fallback password is stored here in plain text -- rotate it and
# remove the defaults once SF_FIRES_DB_* are set wherever this notebook runs.
CONFIG_USER = os.environ.get("SF_FIRES_DB_USER", "admin")
CONFIG_PASSWORD = os.environ.get("SF_FIRES_DB_PASSWORD", "adminadmin")
CONFIG_HOST = os.environ.get(
    "SF_FIRES_DB_HOST",
    "sf-fires-1.cmbkjcuzgzbs.us-east-2.rds.amazonaws.com",
)

# No database is selected at connect time; a later cell issues USE.
conn = mysql.connector.connect(
    user=CONFIG_USER,
    password=CONFIG_PASSWORD,
    host=CONFIG_HOST,
    buffered=True,
)

# Single shared cursor used by every cell below.
cursor = conn.cursor()

In [3]:
# Select the working schema so the statements below can use unqualified table names.
DB_NAME = 'sf_fires_testing'
cursor.execute("USE {}".format(DB_NAME))

## Create Table

### Setup

In [4]:
import json

In [5]:
import pickle

# Load the mapping of column name -> MySQL column type used to build the schema.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# a col_data_types.pickle that you produced yourself.
with open('col_data_types.pickle', 'rb') as pickle_file:
    columns = pickle.load(pickle_file)

In [6]:
def get_create_str(cols=None):
    """Build the ``CREATE TABLE`` statement for the ``sf_fires`` table.

    Args:
        cols: Optional mapping of column name -> MySQL column type. When
            omitted, the notebook-level ``columns`` mapping is used.

    Returns:
        str: The statement, with one backtick-quoted, lower-cased column
        definition per mapping entry followed by ``PRIMARY KEY (id)``.
    """
    if cols is None:
        cols = columns
    # Every definition keeps its trailing comma because the PRIMARY KEY clause
    # always follows. The key is hard-coded to `id`, so the mapping is expected
    # to contain an `id` column.
    column_defs = "".join(
        f"`{name.lower()}` {sql_type}," for name, sql_type in cols.items()
    )
    return (
        "CREATE TABLE IF NOT EXISTS sf_fires ("
        + column_defs
        + "PRIMARY KEY (id)) ENGINE=InnoDB"
    )

In [7]:
# Show only the first and last 500 characters of the generated statement,
# separated by a blank line.
print(get_create_str()[:500], "", get_create_str()[-500:], sep="\n")

CREATE TABLE IF NOT EXISTS sf_fires (`incident_number` INT,`exposure_number` SMALLINT,`suppression_units` SMALLINT,`suppression_personnel` SMALLINT,`ems_units` SMALLINT,`ems_personnel` SMALLINT,`other_units` SMALLINT,`other_personnel` SMALLINT,`estimated_property_loss` FLOAT,`estimated_contents_loss` FLOAT,`fire_fatalities` SMALLINT,`fire_injuries` SMALLINT,`civilian_fatalities` SMALLINT,`civilian_injuries` SMALLINT,`number_of_alarms` SMALLINT,`floor_of_fire_origin` FLOAT,`number_of_floors_with_

t` VARCHAR(255),`detector_type` VARCHAR(255),`detector_operation` VARCHAR(255),`detector_effectiveness` VARCHAR(255),`detector_failure_reason` VARCHAR(255),`automatic_extinguishing_system_present` VARCHAR(255),`automatic_extinguishing_sytem_type` VARCHAR(255),`automatic_extinguishing_sytem_perfomance` VARCHAR(255),`automatic_extinguishing_sytem_failure_reason` VARCHAR(255),`supervisor_district` VARCHAR(255),`neighborhood_district` VARCHAR(255),`point` VARCHAR(255),PRIMARY KEY (id)) ENGINE=Inno

### Table Creation

In [8]:
def drop_table(cursor):
    """Drop the ``sf_fires`` table if it exists and print a confirmation.

    Args:
        cursor: Database cursor on which the statement is executed.
    """
    cursor.execute("DROP TABLE IF EXISTS `sf_fires`")
    print("Table dropped")

def create_table(cursor):
    """Create the ``sf_fires`` table from the statement built by ``get_create_str``.

    Connector errors are reported by printing and are not re-raised, so the
    caller continues even if the table could not be created.

    Args:
        cursor: Database cursor on which the statement is executed.
    """
    # Reuse the shared builder instead of duplicating the string assembly here.
    create_sql = get_create_str()

    try:
        cursor.execute(create_sql)
        print("table created")
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
            print("This table already exists")
        else:
            print(err.msg)

def reinit_table(cursor):
    """Recreate ``sf_fires`` from scratch: drop it, then create it again.

    Args:
        cursor: Database cursor passed to ``drop_table`` and ``create_table``.
    """
    for step in (drop_table, create_table):
        step(cursor)

In [9]:
# Start from a freshly created sf_fires table; dropping it discards any rows
# that were loaded earlier.
reinit_table(cursor)
conn.commit()

Table dropped
table created


## Populate Table

### Data

In [10]:
import json

In [11]:
# Read the whole dataset into memory; later cells slice and iterate over `data`.
with open('data.json') as data_file:
    data = json.load(data_file)

In [12]:
def get_cell_value(item, col):
    """Convert one field of a record into the value bound to the INSERT.

    Args:
        item: One record (a dict) from the loaded data.
        col: Column name; used as the key into both ``item`` and ``columns``.

    Returns:
        ``None`` when the field is missing or null. For ``point``, the
        stringified ``coordinates`` list, or ``None`` if there are no
        coordinates. For columns whose declared type contains ``INT`` or
        ``FLOAT``, the value converted with ``int`` / ``float``. Otherwise the
        raw value.
    """
    value = item.get(col)
    # Missing and explicitly-null fields are both stored as NULL; without this
    # guard a null would crash in int()/float() or on the .get() call below.
    if value is None:
        return None

    if col == "point":
        coords = value.get("coordinates")
        return str(coords) if coords else None

    declared_type = columns[col]
    # Substring match on purpose: "INT" also covers types such as SMALLINT.
    if "INT" in declared_type:
        return int(value)
    if "FLOAT" in declared_type:
        return float(value)
    return value

In [13]:
# building INSERT query
def get_query_for(cols=None):
    """Build the parameterized ``INSERT`` statement for ``sf_fires``.

    Args:
        cols: Optional iterable of column names (a mapping works too; its keys
            are used). When omitted, the notebook-level ``columns`` is used.

    Returns:
        str: ``INSERT INTO sf_fires (<names>) VALUES (<placeholders>)`` with
        one ``%s`` placeholder per column. Column names are inserted as-is,
        without quoting.
    """
    if cols is None:
        cols = columns
    names = [f"{name}" for name in cols]
    # join() avoids the trailing-separator slicing, which mangled the statement
    # when there were no columns.
    column_list = ", ".join(names)
    placeholders = ", ".join(["%s"] * len(names))
    return f"INSERT INTO sf_fires ({column_list}) VALUES ({placeholders})"

# Show only the first and last 500 characters of the generated statement,
# separated by a blank line.
print(get_query_for()[:500], "", get_query_for()[-500:], sep="\n")

INSERT INTO sf_fires (incident_number, exposure_number, suppression_units, suppression_personnel, ems_units, ems_personnel, other_units, other_personnel, estimated_property_loss, estimated_contents_loss, fire_fatalities, fire_injuries, civilian_fatalities, civilian_injuries, number_of_alarms, floor_of_fire_origin, number_of_floors_with_minimum_damage, number_of_floors_with_significant_damage, number_of_floors_with_heavy_damage, number_of_floors_with_extreme_damage, number_of_sprinkler_heads_oper

ector_failure_reason, automatic_extinguishing_system_present, automatic_extinguishing_sytem_type, automatic_extinguishing_sytem_perfomance, automatic_extinguishing_sytem_failure_reason, supervisor_district, neighborhood_district, point) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %

In [14]:
def update_table_many(data):
    """Insert every record of ``data`` into ``sf_fires`` and commit.

    Each record becomes one tuple holding a value per entry of ``columns`` (in
    that order), produced by ``get_cell_value``. All tuples are sent with a
    single ``executemany`` call on the notebook-level ``cursor``.

    Args:
        data: Iterable of records (dicts).
    """
    statement = get_query_for()
    rows = [
        tuple(get_cell_value(record, col) for col in columns)
        for record in data
    ]
    cursor.executemany(statement, rows)
    conn.commit()

In [15]:
def update_table_inner_batch(data, batches):
    """Insert ``data`` in consecutive slices, one ``update_table_many`` call each.

    The slice size is ``len(data) // batches + 1``, so at most ``batches``
    slices are needed to cover the data. The bounds of each slice are printed
    before it is inserted. Empty trailing slices are skipped.

    Args:
        data: Sequence of records; must support ``len`` and slicing.
        batches: Maximum number of slices; must be a positive integer.

    Raises:
        ValueError: If ``batches`` is less than 1.
    """
    if batches < 1:
        raise ValueError("batches must be a positive integer")

    total = len(data)
    batch_size = total // batches + 1
    # Stepping by batch_size stops as soon as the data is exhausted, so no
    # empty batches are issued; min() keeps the printed upper bound in range.
    for first in range(0, total, batch_size):
        last = min(first + batch_size, total)
        print(first, " - ", last)
        update_table_many(data[first:last])

In [None]:
def update_table_outer_batch(data, batches):
    """Insert ``data`` in ``batches`` consecutive slices.

    This function's batching logic was identical to
    ``update_table_inner_batch``, so it delegates to it instead of keeping a
    second copy.

    Args:
        data: Sequence of records; must support ``len`` and slicing.
        batches: Number of slices to split ``data`` into.
    """
    update_table_inner_batch(data, batches)

In [16]:
# reinit_table(cursor)
# update_table_many_batch(data[:10000], 4)

In [17]:
# reinit_table(cursor)
# update_table_many_batch(data[:100000], 4)

In [28]:
# Recreate the table and load the full dataset in 20 batches.
# The previous callee, update_table_many_batch, is not defined in this
# notebook, so the call failed with NameError on a fresh kernel.
reinit_table(cursor)
update_table_inner_batch(data, 20)

Table dropped
table created
0  -  28689
28689  -  57378
57378  -  86067
86067  -  114756
114756  -  143445
143445  -  172134
172134  -  200823
200823  -  229512
229512  -  258201
258201  -  286890
286890  -  315579
315579  -  344268
344268  -  372957
372957  -  401646
401646  -  430335
430335  -  459024
459024  -  487713
487713  -  516402
516402  -  545091
545091  -  573769


In [29]:
# Recreate the table and load the full dataset in 40 batches.
# The previous callee, update_table_many_batch, is not defined in this
# notebook, so the call failed with NameError on a fresh kernel.
reinit_table(cursor)
update_table_inner_batch(data, 40)

Table dropped
table created
0  -  14345
14345  -  28690
28690  -  43035
43035  -  57380
57380  -  71725
71725  -  86070
86070  -  100415
100415  -  114760
114760  -  129105
129105  -  143450
143450  -  157795
157795  -  172140
172140  -  186485
186485  -  200830
200830  -  215175
215175  -  229520
229520  -  243865
243865  -  258210
258210  -  272555
272555  -  286900
286900  -  301245
301245  -  315590
315590  -  329935
329935  -  344280
344280  -  358625
358625  -  372970
372970  -  387315
387315  -  401660
401660  -  416005
416005  -  430350
430350  -  444695
444695  -  459040
459040  -  473385
473385  -  487730
487730  -  502075
502075  -  516420
516420  -  530765
530765  -  545110
545110  -  559455
559455  -  573769


In [30]:
# Recreate the table and load the full dataset in 100 batches.
# The previous callee, update_table_many_batch, is not defined in this
# notebook, so the call failed with NameError on a fresh kernel.
reinit_table(cursor)
update_table_inner_batch(data, 100)

Table dropped
table created
0  -  5738
5738  -  11476
11476  -  17214
17214  -  22952
22952  -  28690
28690  -  34428
34428  -  40166
40166  -  45904
45904  -  51642
51642  -  57380
57380  -  63118
63118  -  68856
68856  -  74594
74594  -  80332
80332  -  86070
86070  -  91808
91808  -  97546
97546  -  103284
103284  -  109022
109022  -  114760
114760  -  120498
120498  -  126236
126236  -  131974
131974  -  137712
137712  -  143450
143450  -  149188
149188  -  154926
154926  -  160664
160664  -  166402
166402  -  172140
172140  -  177878
177878  -  183616
183616  -  189354
189354  -  195092
195092  -  200830
200830  -  206568
206568  -  212306
212306  -  218044
218044  -  223782
223782  -  229520
229520  -  235258
235258  -  240996
240996  -  246734
246734  -  252472
252472  -  258210
258210  -  263948
263948  -  269686
269686  -  275424
275424  -  281162
281162  -  286900
286900  -  292638
292638  -  298376
298376  -  304114
304114  -  309852
309852  -  315590
315590  -  321328
32132

In [32]:
# Recreate the table and load the full dataset in 70 batches.
# The previous callee, update_table_many_batch, is not defined in this
# notebook, so the call failed with NameError on a fresh kernel.
reinit_table(cursor)
update_table_inner_batch(data, 70)

Table dropped
table created
0  -  8197
8197  -  16394
16394  -  24591
24591  -  32788
32788  -  40985
40985  -  49182
49182  -  57379
57379  -  65576
65576  -  73773
73773  -  81970
81970  -  90167
90167  -  98364
98364  -  106561
106561  -  114758
114758  -  122955
122955  -  131152
131152  -  139349
139349  -  147546
147546  -  155743
155743  -  163940
163940  -  172137
172137  -  180334
180334  -  188531
188531  -  196728
196728  -  204925
204925  -  213122
213122  -  221319
221319  -  229516
229516  -  237713
237713  -  245910
245910  -  254107
254107  -  262304
262304  -  270501
270501  -  278698
278698  -  286895
286895  -  295092
295092  -  303289
303289  -  311486
311486  -  319683
319683  -  327880
327880  -  336077
336077  -  344274
344274  -  352471
352471  -  360668
360668  -  368865
368865  -  377062
377062  -  385259
385259  -  393456
393456  -  401653
401653  -  409850
409850  -  418047
418047  -  426244
426244  -  434441
434441  -  442638
442638  -  450835
450835  -  45

In [22]:
# Append records 400000-499999 in 4 batches (the table is not recreated first).
# The previous callee, update_table_many_batch, is not defined in this
# notebook, so the call failed with NameError on a fresh kernel.
update_table_inner_batch(data[400000:500000], 4)

0  -  25001
25001  -  50002
50002  -  75003
75003  -  100000


In [23]:
# Append records from index 500000 onward in 4 batches (the table is not
# recreated first).
# The previous callee, update_table_many_batch, is not defined in this
# notebook, so the call failed with NameError on a fresh kernel.
update_table_inner_batch(data[500000:], 4)

0  -  18443
18443  -  36886
36886  -  55329
55329  -  73769


In [24]:
# reinit_table(cursor)
# update_table_many_batch(data[:200000], 4)

In [25]:
# reinit_table(cursor)
# update_table_many_batch(data[:350000], 4)

In [26]:
# reinit_table(cursor)
# update_table_many_batch(data[:500000], 4)

In [27]:
# reinit_table(cursor)
# update_table_many_batch(data, 4)