<IMG SRC="https://github.com/jacquesroy/byte-size-data-science/raw/master/images/Banner.png" ALT="BSDS Banner" WIDTH=1195 HEIGHT=200>

# Generating table definition from csv
A lot of data is available through open government portals (see videos 19 and 20), but most of it is in CSV format with varying levels of documentation.
This notebook shows a quick way to generate a basic table definition from a CSV file.

Once the basic table definition is generated, it can be adjusted for data types, primary key, indexes and so on.

This is particularly advantageous when you have CSV files with a large number of columns.

In [None]:
from IPython.display import IFrame

# Render the YouTube video as a 560x315 iframe in the cell output.
IFrame(src="https://www.youtube.com/embed/CU4nS9Z8TDE?rel=0&amp;controls=0&amp;showinfo=0", width=560, height=315)

## Import the appropriate libraries and set up needed connections

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import ibm_db
import ibm_db_dbi

from ftplib import FTP
import requests, zipfile, io

In [None]:
# Connection settings consumed by the next cell when it builds the DSN.
# The password and host values below are placeholders: fill in your own
# before running, and avoid saving real secrets in a shared notebook.
credentials = dict(
    username='bluadmin',
    password="""PASSWORD""",
    sg_service_url='https://sgmanager.ng.bluemix.net',
    database='BLUDB',
    host='dashdb. . . .bluemix.net',
    port='50001',
    url='https://undefined',
)


In [None]:
# Pull the five values the DSN template needs, in placeholder order {0}..{4}.
dsn_fields = [credentials[key]
              for key in ('database', 'host', 'port', 'username', 'password')]

# Adjacent literals are concatenated first, so .format() applies to the whole
# template; the doubled braces around the driver name come out as single ones.
dsn = (
    "DRIVER={{IBM DB2 ODBC DRIVER}};"
    "DATABASE={0};"
    "HOSTNAME={1};"
    "PORT={2};"
    "PROTOCOL=TCPIP;"
    "SECURITY=ssl;"
    "UID={3};"
    "PWD={4};"
).format(*dsn_fields)

# Open the ibm_db connection, then wrap it with ibm_db_dbi so it can be
# handed to pd.read_sql and used to create cursors.
conn = ibm_db.connect(dsn, "", "")
pconn = ibm_db_dbi.Connection(conn)

In [None]:
# Smoke-test the connection: count the rows of the SYSIBM.SYSTABLES table.
sql = """
  SELECT count(*)
  FROM SYSIBM.SYSTABLES ;
"""

data_pd = pd.read_sql(sql=sql, con=pconn)
data_pd.head()

## Chicago accident data
We saw this dataset in videos 17, 18, and 32.

We have 221,600 records in this dataset.

In [None]:
# Download the zipped CSV and unpack it into the current working directory.
url = 'https://github.com/jacquesroy/byte-size-data-science/raw/master/data/ChicagoTrafficCrashes20180917.csv.zip'
r = requests.get(url, timeout=60)
# Stop here with a clear HTTP error rather than letting an error page reach
# zipfile and surface as a confusing BadZipFile exception.
r.raise_for_status()
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()
# The extracted CSV is expected to be named like the archive minus ".zip".
filename = url.rsplit('/', 1)[-1].rsplit('.', 1)[0]

# To limit the number of records, use the "nrows" parameter
collisions_pd = pd.read_csv(filename)

print("Number of records: {}".format(collisions_pd.shape[0]))
collisions_pd.head(2)

In [None]:
# Convert the two datetime columns to the proper type.
# pd.to_datetime is called once per column (vectorised) instead of once per
# value through Series.apply, which is far slower on a dataset this size.
# infer_datetime_format is dropped: it had no effect on single values and is
# deprecated in pandas 2. Values that cannot be parsed become NaT.
# NOTE(review): assumes each column uses one consistent date format - confirm.
for date_column in ('CRASH_DATE', 'DATE_POLICE_NOTIFIED'):
    collisions_pd[date_column] = pd.to_datetime(collisions_pd[date_column], errors='coerce')

In [None]:
# Show the dtype pandas assigned to every column of the loaded data.
collisions_pd.dtypes

### Dictionary for mapping

In [None]:
# Lookup from a pandas/numpy dtype name to the SQL column type used when
# generating the table definition.
toDBtype = dict([
    ('object', 'VARCHAR'),
    ('int8', 'SMALLINT'),
    ('int16', 'SMALLINT'),
    ('int32', 'INTEGER'),
    ('int64', 'BIGINT'),
    ('float32', 'REAL'),
    ('float64', 'DOUBLE'),
    ('geometry', 'DB2GSE.ST_Geometry'),
    ('datetime64[ns]', 'TIMESTAMP'),
])

### Generate a dataframe of column names, data type and field length
Field lengths are only relevant for character (object) columns; every other column keeps a length of 0.

In [None]:
# Element-wise len() over an array of strings.
measurer = np.vectorize(len)

# One row per source column: its name, its pandas dtype, and a Length that
# stays 0 for everything except character (object) columns.
column_info_pd = collisions_pd.dtypes.reset_index()
column_info_pd.columns = ['Name', 'ColType']
column_info_pd['Length'] = 0

# The builtin `object` replaces the np.object alias, which newer NumPy
# releases no longer provide.
# NOTE: astype(str) renders missing values as the string 'nan', so a column
# that contains nulls reports a length of at least 3.
text_pd = collisions_pd.select_dtypes(include=[object]).astype(str)
is_text = column_info_pd['ColType'] == 'object'

# Skip the measurement when there are no character columns, because
# np.vectorize cannot be called on an empty input.
if is_text.any():
    # A single .loc[rows, column] assignment writes into column_info_pd
    # itself; the chained form df['Length'].loc[...] may update a temporary
    # copy and triggers SettingWithCopyWarning.
    column_info_pd.loc[is_text, 'Length'] = measurer(text_pd).max(axis=0)
column_info_pd.head(10)

### Generate a table definition
We may need to adjust some data types but at least we have a starting point and the character fields have the proper maximum size.

In [None]:
# Print a CREATE TABLE statement with one line per row of column_info_pd.
print('CREATE TABLE ChicagoAccidents (')
comma = ","
last_ix = column_info_pd.shape[0] - 1
for ix, col in enumerate(column_info_pd.itertuples(index=False)):
    # The final column definition is followed by a space instead of a comma.
    if ix == last_ix:
        comma = " "
    dtype_name = col.ColType.name
    if dtype_name != 'object':
        # Non-character column: mapped SQL type, no length.
        ddl_line = "  {0:30} {1}{2}".format(col.Name, toDBtype[dtype_name], comma)
    elif col.Length > 4:
        # Character column longer than 4: mapped type with its maximum length.
        ddl_line = "  {0:30} {1}({2}){3}".format(col.Name, toDBtype[dtype_name],
                                                 col.Length, comma)
    else:
        # Short character column: fixed-width CHAR.
        ddl_line = "  {0:30} CHAR({1}){2}".format(col.Name, col.Length, comma)
    print(ddl_line)
print(') ORGANIZE BY ROW;')

### Create the table using the database connection

In [None]:
# Table definition for the jroy schema. Compared with the generated output
# format, RD_NO is declared NOT NULL PRIMARY KEY and LOCATION uses the
# DB2GSE.ST_Point type.
table_def = """
CREATE TABLE jroy.ChicagoAccidents (
  RD_NO                          VARCHAR(8) NOT NULL PRIMARY KEY,
  CRASH_DATE_EST_I               CHAR(3),
  CRASH_DATE                     TIMESTAMP,
  POSTED_SPEED_LIMIT             BIGINT,
  TRAFFIC_CONTROL_DEVICE         VARCHAR(23),
  DEVICE_CONDITION               VARCHAR(24),
  WEATHER_CONDITION              VARCHAR(22),
  LIGHTING_CONDITION             VARCHAR(22),
  FIRST_CRASH_TYPE               VARCHAR(28),
  TRAFFICWAY_TYPE                VARCHAR(31),
  LANE_CNT                       DOUBLE,
  ALIGNMENT                      VARCHAR(21),
  ROADWAY_SURFACE_COND           VARCHAR(15),
  ROAD_DEFECT                    VARCHAR(17),
  REPORT_TYPE                    VARCHAR(26),
  CRASH_TYPE                     VARCHAR(32),
  INTERSECTION_RELATED_I         CHAR(3),
  NOT_RIGHT_OF_WAY_I             CHAR(3),
  HIT_AND_RUN_I                  CHAR(3),
  DAMAGE                         VARCHAR(13),
  DATE_POLICE_NOTIFIED           TIMESTAMP,
  PRIM_CONTRIBUTORY_CAUSE        VARCHAR(80),
  SEC_CONTRIBUTORY_CAUSE         VARCHAR(80),
  STREET_NO                      BIGINT,
  STREET_DIRECTION               CHAR(3),
  STREET_NAME                    VARCHAR(31),
  BEAT_OF_OCCURRENCE             DOUBLE,
  PHOTOS_TAKEN_I                 CHAR(3),
  STATEMENTS_TAKEN_I             CHAR(3),
  DOORING_I                      CHAR(3),
  WORK_ZONE_I                    CHAR(3),
  WORK_ZONE_TYPE                 VARCHAR(12),
  WORKERS_PRESENT_I              CHAR(3),
  NUM_UNITS                      DOUBLE,
  MOST_SEVERE_INJURY             VARCHAR(24),
  INJURIES_TOTAL                 DOUBLE,
  INJURIES_FATAL                 DOUBLE,
  INJURIES_INCAPACITATING        DOUBLE,
  INJURIES_NON_INCAPACITATING    DOUBLE,
  INJURIES_REPORTED_NOT_EVIDENT  DOUBLE,
  INJURIES_NO_INDICATION         DOUBLE,
  INJURIES_UNKNOWN               DOUBLE,
  CRASH_HOUR                     BIGINT,
  CRASH_DAY_OF_WEEK              BIGINT,
  CRASH_MONTH                    BIGINT,
  LATITUDE                       DOUBLE,
  LONGITUDE                      DOUBLE,
  LOCATION                       DB2GSE.ST_Point 
) ORGANIZE BY ROW;
"""
# Run the DDL through a cursor on the ibm_db_dbi connection.
# NOTE(review): this cursor is not closed in this cell - confirm whether it should be.
cur = pconn.cursor()
cur.execute(table_def)


In [None]:
# Check two columns to make sure the create table worked: ask the connection
# for the RD_NO and CRASH_DATE column metadata of jroy.ChicagoAccidents.
# NOTE(review): presumably an empty result means the table is missing - confirm.
pconn.columns(schema_name="jroy", table_name="ChicagoAccidents", column_names=["RD_NO","CRASH_DATE"])

### Insert rows
Since I already have the data, might as well write it to the database.

For demo purposes, insert only 10 rows.

In [None]:
# Keep the first 10 records as the demo batch, report its shape and show the
# raw values of its first record.
rows_pd = collisions_pd.head(10)
print("row shape: {0}".format(rows_pd.shape))
rows_pd.iloc[0].values

In [None]:
# Build an INSERT with one "?" parameter marker per DataFrame column.
insert_prefix = """
INSERT INTO jroy.ChicagoAccidents
  VALUES(
"""
column_count = rows_pd.shape[1]
# Every marker but the last is followed by a comma; the last closes the list.
insert_stmt = insert_prefix + "?," * (column_count - 1) + "?);"
print(insert_stmt)

In [None]:
### TODO: see how to use a prepared statement to speed up the insert. ###

# One parameter marker per plain column, plus a final marker whose text value
# is converted by DB2GSE.ST_PointFromText for the LOCATION column.
# NOTE(review): 4269 is passed as the second argument of ST_PointFromText,
# presumably the spatial reference system id - confirm.
insert_stmt = """
INSERT INTO jroy.ChicagoAccidents
  VALUES(
?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,
DB2GSE.ST_PointFromText(?, 4269) );
"""

cur = pconn.cursor()
errors = 0
rejected = []  # (RD_NO, exception) for every row the database refused
try:
    for row_label, record in rows_pd.iterrows():
        # Send pandas missing values (NaN / NaT) as None so the driver binds
        # them as SQL NULL instead of receiving a float NaN or NaT object.
        params = tuple(None if pd.isna(value) else value for value in record)
        try:
            cur.execute(insert_stmt, params)
        except Exception as e:
            # Best effort: keep loading the remaining rows, but remember what
            # failed so the rejects can be reported below.
            errors = errors + 1
            rejected.append((record.get('RD_NO'), e))
finally:
    # Release the cursor even if the loop is interrupted.
    cur.close()
print("Rejected records: {0}".format(errors))
# Show the first few failures rather than hiding them completely.
for rd_no, err in rejected[:5]:
    print("Error on: {0}: {1}".format(rd_no, err))

In [None]:
# Read the table back to check what was inserted.
sql = """
  SELECT *
  FROM jroy.ChicagoAccidents ;
"""

data_pd = pd.read_sql(sql=sql, con=pconn)
data_pd.head(5)