# Using OmniSci Pymapd API

Pymapd (https://github.com/omnisci/pymapd)is the python DB API compliant interface for OmniSci. 

Packages are available on conda-forge and PyPI:
conda install -c conda-forge pymapd
pip install pymapd

To install cudf for GPU Dataframe support (conda-only):
conda install -c nvidia/label/cuda10.0 -c rapidsai/label/cuda10.0 -c numba -c conda-forge -c defaults cudf=0.6 pymapd python=3.6

In [12]:
import argparse
import sys
import csv
import string
import os
import time
import re
import pandas as pd
import numpy as np
from pymapd import connect

Function to connect to the OmniSci database.

In [13]:
# Connect to the OmniSci database
def connect_to_omnisci(str_user, str_password, str_host, str_dbname, isCloud):
  try:
    if (isCloud):
      connection = connect(user=str_user, password=str_password, host=str_host, dbname=str_dbname, port=443, protocol='https')
    else:
      connection = connect(user=str_user, password=str_password, host=str_host, dbname=str_dbname, port=6274)
  except Exception as ex:
    template = "An exception of type {0} occurred. Arguments:\n{1!r}"
    message = template.format(type(ex).__name__, ex.args)
    print(message)
    if 'OmniSci Core not ready, try again' in message:
      print("Set connection to RETRY!")
      connection = "RETRY"
    else:
      connection = "ERROR"
  return connection

Call connect function passing the following arguments:
user = mapd,
password = HyperInteractive,
host = localhost,
database = mapd
Also note the last argument which is flag to indicate whether you are connecting to OmniSci Cloud instance or not.

In [14]:
# Connect to OmniSci with 5 trys, this applies to OmniSci cloud instance which is paused during inactivity
for i in range(5):
  # connecting to a non-OmniSci Cloud instance
  # connection = connect_to_omnisci("mapd", "HyperInteractive", "localhost", "mapd", False)
  # connecting to an OmniSci Cloud instance
  # connection = connect_to_omnisci("F0A7xxxx", "fiNNxxxx", "use2-api.mapd.cloud", "mapd", True)
  connection = connect_to_omnisci("F0A7674FB728C4DE89A0", "fiNNiSG6YFZYac7Y9qLROCbbFRSif7L12BWfErSn", "use2-api.mapd.cloud", "mapd", True)  
  if connection == "RETRY":
    # recommended time to sleep is 20 seconds before instance wakes up
    time.sleep(20)
    continue
  if connection == "ERROR":
    sys.exit(1)
  print(connection)
  break

Connection(mapd://F0A7674FB728C4DE89A0:***@https://use2-api.mapd.cloud:443/mapd?protocol=https)


List tables in the database

In [15]:
list_of_tables = connection.get_tables()
print('\n'.join(list_of_tables))

taxi_weather_tracts_factual
nyc_trees_2015_683k
flights_2008_7M
cb_2017_us_county_20m
veda_utility_lines
utility_lines
veda_citylots
veda_sffacs_current
veda_SFMTA_Bikeway_Network
aaron_citylots
btc_final_table5
vs_Crimes_2001_to_present
sf_citylots
sf_facility
sf_bikeway
urbanlogiq_counts
mapdcommunityeditiondl_allwebsitedata
mapdcommunityeditiondl_singa
worldcities
all_countries
tenM_small
vs_mapd_downloads_seq
dl_test1
dl_test2
CA2018CaliforniaOilandGasWellProduction_merged
CA_2017CaliforniaOilandGasWellProduction_merged
CaliforniaOilandGasWellProduction_merged
vs_alldownloads
taztest
taztest2
uber_movement_data2
uber_movement_data1
SanFrancisco_TAZ_data
san_francisco_taz2
bay_area_taz2
NaturalGas_Pipelines_US_201804
us_faults
stats_load_omnisci_cloud
vs_test_final
probeVehicle_final
vs_NaturalGas_Pipelines_US_201804
nfl_flat
nfl_flat_10games_10
nfl_flat_10games
temp_test_final
vs_gas_production_us_2018
test_final_small
ta
BigDataBowl_alldata
ARW_weather
ARW_weather2
ARW_weather3
AR

In [16]:
table_name = 'tutela_converge'

Get details of a preloaded table

In [17]:
table_details = connection.get_table_details(table_name)
print(table_details)

[ColumnDetails(name='LocalYear', type='SMALLINT', nullable=True, precision=0, scale=0, comp_param=0), ColumnDetails(name='LocalWeek', type='SMALLINT', nullable=True, precision=0, scale=0, comp_param=0), ColumnDetails(name='Location_Geohash6', type='STR', nullable=True, precision=0, scale=0, comp_param=32), ColumnDetails(name='Latitude_Center', type='FLOAT', nullable=True, precision=0, scale=0, comp_param=0), ColumnDetails(name='Longitude_Center', type='FLOAT', nullable=True, precision=0, scale=0, comp_param=0), ColumnDetails(name='Location_City', type='STR', nullable=True, precision=0, scale=0, comp_param=32), ColumnDetails(name='Connection_Category', type='STR', nullable=True, precision=0, scale=0, comp_param=8), ColumnDetails(name='Device_SIMServiceProviderBrandName', type='STR', nullable=True, precision=0, scale=0, comp_param=16), ColumnDetails(name='Average_DownloadThroughput', type='FLOAT', nullable=True, precision=0, scale=0, comp_param=0), ColumnDetails(name='Median_DownloadThro

Use SQL to select all flights by Southwest Airlines for a specified number of columns and save them to a Pandas dataframe if the number of rows returned is non-zero.

In [19]:
query = f"""
SELECT LocalWeek, Longitude_Center, Latitude_Center,
   Location_City, Average_Latency
FROM tutela_converge 
WHERE Average_Latency > 3000
"""      
%time df = connection.execute(query)
print(df.rowcount)
if df.rowcount != 0:
  mylist = list(df)
  df2 = pd.DataFrame(mylist, columns=['Week', 'Longitude', 'Latitude', 'City', 'AverageLatency'])
  print(df2.shape)
  df2.dropna(inplace=True)
  print(df2.shape)
  print(df2.head())
else:
  print("No rows returned!")

CPU times: user 4.61 s, sys: 0 ns, total: 4.61 s
Wall time: 5.01 s
20510
(20510, 5)
(4718, 5)
    Week   Longitude   Latitude          City  AverageLatency
5     11  -71.460571  41.767273      Cranston     3039.122070
8     20 -122.557983  45.557556      Portland     3147.346924
20    23  -71.021118  42.063904      Brockton     3525.406982
30    32  -90.225220  38.800964  Spanish Lake     4533.570312
34    33  -79.777222  40.448914   Monroeville     4460.000000


In [20]:
df2['CityPoint'] = np.nan
for idx, items in df2.iloc[0:].iterrows():
    pointstring = "POINT(" + df2.loc[idx, 'Longitude'].astype(str) +" " + df2.loc[idx, 'Latitude'].astype(str) +")"
    df2.loc[idx, 'CityPoint'] = pointstring
df2 = df2.drop(columns=['Longitude', 'Latitude'])
print(df2.head())
print(df2.shape)

    Week          City  AverageLatency  \
5     11      Cranston     3039.122070   
8     20      Portland     3147.346924   
20    23      Brockton     3525.406982   
30    32  Spanish Lake     4533.570312   
34    33   Monroeville     4460.000000   

                                      CityPoint  
5    POINT(-71.4605712890625 41.76727294921875)  
8   POINT(-122.5579833984375 45.55755615234375)  
20   POINT(-71.0211181640625 42.06390380859375)  
30   POINT(-90.2252197265625 38.80096435546875)  
34   POINT(-79.7772216796875 40.44891357421875)  
(4718, 4)


In [23]:
create_table_str = 'CREATE TABLE IF NOT EXISTS tutela_test (Week SMALLINT, City TEXT ENCODING DICT(8), AverageLatncy FLOAT, CityPoint GEOMETRY(POINT, 4326) ENCODING COMPRESSED(32))'
print(create_table_str)
connection.execute(create_table_str)
table_details = connection.get_table_details('tutela_test')
print(table_details)

CREATE TABLE IF NOT EXISTS tutela_test (Week SMALLINT, City TEXT ENCODING DICT(8), AverageLatncy FLOAT, CityPoint GEOMETRY(POINT, 4326) ENCODING COMPRESSED(32))
[ColumnDetails(name='Week', type='SMALLINT', nullable=True, precision=0, scale=0, comp_param=0), ColumnDetails(name='City', type='STR', nullable=True, precision=0, scale=0, comp_param=8), ColumnDetails(name='AverageLatncy', type='FLOAT', nullable=True, precision=0, scale=0, comp_param=0), ColumnDetails(name='CityPoint', type='POINT', nullable=True, precision=23, scale=4326, comp_param=32)]


In [30]:
df2['AverageLatency'] = pd.to_numeric(df2['AverageLatency'], downcast='float')
df2['Week'] = pd.to_numeric(df2['Week'], downcast='integer')
print(df2.head())
print(df2.shape)
df2.to_csv("tutela_test.csv", index=False)

    Week          City  AverageLatency  \
5     11      Cranston     3039.122070   
8     20      Portland     3147.346924   
20    23      Brockton     3525.406982   
30    32  Spanish Lake     4533.570312   
34    33   Monroeville     4460.000000   

                                      CityPoint  
5    POINT(-71.4605712890625 41.76727294921875)  
8   POINT(-122.5579833984375 45.55755615234375)  
20   POINT(-71.0211181640625 42.06390380859375)  
30   POINT(-90.2252197265625 38.80096435546875)  
34   POINT(-79.7772216796875 40.44891357421875)  
(4718, 4)


In [29]:
#load_table errors. load_table works if you drop the POINT column.
#connection.load_table('tutela_test', df2, preserve_index=False)
# load_table_rowwise no errors but does not do anything
connection.load_table_rowwise('tutela_test', df2)

I manually loaded the table tutela_test from the CSV file that I crrated in the previous step. Run a geospatial query using the POINT column.

In [37]:
#AND ST_Distance(CAST(A.CityPoint as GEOGRAPHY), CAST(B.CityPoint as GEOGRAPHY)) < 1000
query = f"""
SELECT A.City, A.AverageLatency, ST_Distance(CAST(A.CityPoint as GEOGRAPHY), CAST(B.CityPoint as GEOGRAPHY)) 
FROM tutela_test A, tutela_test B 
WHERE A.AverageLatency > 3000
AND B.AverageLatency > 3000 
AND ST_Distance(CAST(A.CityPoint as GEOGRAPHY), CAST(B.CityPoint as GEOGRAPHY)) BETWEEN 1 AND 1000
"""      
%time df = connection.execute(query)
print(df.rowcount)
if df.rowcount != 0:
  mylist = list(df)
  df2 = pd.DataFrame(mylist, columns=['City', 'AverageLatency', 'Distance'])
  print(df2.shape)
  df2.dropna(inplace=True)
  print(df2.shape)
  print(df2.head())
else:
  print("No rows returned!")

CPU times: user 83 ms, sys: 0 ns, total: 83 ms
Wall time: 844 ms
404
(404, 3)
(404, 3)
              City  AverageLatency    Distance
0        Lexington     3066.285400  610.984314
1           Selden     3557.372559  610.984314
2  Massapequa Park     3730.742676  610.984314
3       Park Ridge     3700.766846  610.984314
4        Oceanside     3010.862549  610.984314
