In [54]:
def install_and_import(module):
    '''Imports a module by name, installing it with pip first when the import fails.
            Parameters:
                    module (str): name used both for the import and for the pip install
            Returns:
                    None (the imported module is stored in this file's globals under its name)
            Raises:
                    ImportError: if the install fails or the module still cannot be imported
    '''
    import importlib
    import subprocess
    import sys

    print (f'Importing module: {module}')
    try:
        imported = importlib.import_module(module)
    except ImportError:
        print (f'Installing module: {module}')
        # pip.main() is not a supported entry point; running pip through the current
        # interpreter installs into the environment this kernel is actually using.
        try:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', module])
        except subprocess.CalledProcessError as exc:
            # Keep ImportError as the single failure type callers have to handle.
            raise ImportError(f'pip could not install {module!r}') from exc
        # Let the import machinery notice the package that was just installed.
        importlib.invalidate_caches()
        imported = importlib.import_module(module)

    # Only reached when an import succeeded, so a failure is never masked by a second attempt.
    globals()[module] = imported

In [55]:
def import_modules(modules):
    '''Passes each module name to install_and_import, one at a time and in order.
            Parameters:
                    modules : an iterable of module names (str)
            Returns:
                    None
    '''
    print (f'Importing modules...')
    for module_name in modules:
        install_and_import(module_name)


In [56]:
def idpDataInit():

    '''Initialises the notebook by importing (and, if missing, installing) the required python libraries.
            Parameters:
                    None
            Returns:
                    None
    '''

    print (f'Initialising...')
    # JAVA_HOME is the name of a Java environment variable, not a Python module, so it must
    # not be in this list: trying to import it fails and stops the remaining imports.
    required_modules = ["jaydebeapi","socket","pandas","json","decimal","numpy","getpass"]
    import_modules(required_modules)

In [57]:
def get_config():
    '''Reads idpDataConfig.json (path relative to the working directory) and parses it as JSON.
            Parameters:
                    None
            Returns:
                    config : the parsed content of the file
    '''
    with open('idpDataConfig.json') as handle:
        return json.load(handle)


In [58]:
def idp_get_connection_info(database):

    '''Builds the JDBC connection details for a virtual database from the notebook configuration.
            Parameters:
                    database (str): the name of the Virtual Database to connect to
            Returns:
                    conn_info : a dict with the keys 'denododriver_class', 'denododriver_path' and 'conn_uri'
    '''

    # Load the configuration once; every value below comes from this single read.
    config = get_config()

    # The user agent sent to the server is "<driver module name>-<client host name>".
    client_hostname = socket.gethostname()
    useragent = f"{jaydebeapi.__name__}-{client_hostname}"

    denodoserver_name = config['denodoserver_name']
    denodoserver_jdbc_port = config['denodoserver_jdbc_port']
    conn_uri = f"jdbc:vdb://{denodoserver_name}:{denodoserver_jdbc_port}/{database}?userAgent={useragent}"

    return {
        'denododriver_class': config['denododriver_class'],
        'denododriver_path': config['denododriver_path'],
        'conn_uri': conn_uri,
    }

In [59]:
def map_datatypes(crsr):

    '''Fetches all rows from an executed cursor into a pandas dataframe and casts each column
       according to the type code reported in the cursor description.
            Parameters:
                    crsr : a cursor on which a query has already been executed
            Returns:
                    df_results : a Pandas dataframe
    '''

    results = crsr.fetchall()
    columns = [c[0] for c in crsr.description]
    df_results = pandas.DataFrame.from_records(results, columns=columns)

    jdbc_types = jaydebeapi.DBAPITypeObject._mappings
    # Dtypes are given as concrete names: the numpy.float alias has been removed from NumPy,
    # and a unit-less datetime64 is rejected by newer pandas when used with astype.
    types_dict = {
            int                     :  pandas.Int64Dtype(),
            decimal.Decimal         :  'float64',
            jdbc_types['DATE']      :  'datetime64[ns]',
            jdbc_types['TIMESTAMP'] :  'datetime64[ns]',
            jdbc_types['CHAR']      :  object,
            jdbc_types['INTEGER']   :  pandas.Int64Dtype(),
            jdbc_types['FLOAT']     :  'float64'
            }

    for c, column_desc in zip(df_results.columns, crsr.description):
        tp = types_dict.get(column_desc[1])
        if tp is None:
            # No mapping for this type code: keep the column as fetched instead of failing.
            continue
        df_results[c] = df_results[c].astype(tp)

    return df_results

In [60]:
def idpDataConnect(username, database):

    '''Connects to IDP's virtual database and hands back the open connection.
       The password is not a parameter: it is requested interactively via getpass.
            Parameters:
                    username (str): your work email address
                    database (str): the name of the Virtual Database to connect to
            Returns:
                    conn : a connection object
    '''

    details = idp_get_connection_info(database)

    driver_class = details['denododriver_class']
    uri = details['conn_uri']
    # Prompt only after the connection details have been resolved.
    credentials = {"user":username, "password":getpass.getpass()}
    driver_jar = details['denododriver_path']

    return jaydebeapi.connect(driver_class, uri, driver_args = credentials, jars = driver_jar)


In [61]:
def idpDataDesc(database,dataset,conn):

    '''Describes the schema of a dataset within the virtual database, returning a pandas dataframe object.
            Parameters:
                    database (str): the name of the Virtual Database
                    dataset (str): the name of the dataset to describe
                    conn : a connection object
            Returns:
                    df_schema : a Pandas dataframe
    '''

    # Both names are embedded as SQL string literals, so any single quote inside them is
    # doubled; otherwise such a name would end the literal early and change the statement.
    database_literal = database.replace("'", "''")
    dataset_literal = dataset.replace("'", "''")

    sql = ("SELECT column_name, column_type_name, column_type_length "
           "FROM   CATALOG_METADATA_VIEWS() "
           "WHERE  input_database_name = '" + database_literal + "' "
           "AND    input_view_name =  '" + dataset_literal + "';")

    df_schema = idpDataQuery(sql=sql, conn=conn)
    return df_schema


In [62]:
def idpDataQuery(sql,conn):

    '''Executes a SQL query against the virtual database, returning a pandas dataframe object.
            Parameters:
                    sql (str): a valid SQL query
                    conn : a connection object
            Returns:
                    df_results : a Pandas dataframe
    '''

    cursor = conn.cursor()
    try:
        cursor.execute(sql)
        # map_datatypes fetches every row, so the cursor is no longer needed afterwards.
        df_results = map_datatypes(cursor)
    finally:
        # Release the cursor whether the query succeeded or raised.
        cursor.close()
    return df_results



In [63]:
def idpDataDisconnect(conn):

    '''Ends a session with IDP's virtual database by closing the given connection.
            Parameters:
                    conn : a connection object
            Returns:
                    None
    '''
    conn.close()
    return None

In [64]:
#Import modules
import sys
import os
import pandas as pd

In [65]:
#### Initialise the notebook after opening it or re-starting the kernel.
#### idpDataInit takes no arguments; it is imported from the idpData module.
from idpData import idpDataInit
idpDataInit()


Initialising...
Importing modules...
Importing module: jaydebeapi
Importing module: socket
Importing module: pandas
Importing module: json
Importing module: decimal
Importing module: numpy
Importing module: getpass


In [66]:
#### Open a connection to the Virtual Database. NB: this prompts for your Denodo password.
#### The returned connection object is kept in `conn` and passed to the later query calls.
from idpData import idpDataConnect

conn = idpDataConnect("nikola.bowers","ids")


 ················


In [72]:
#### Query the virtual database by passing an SQL query and the conn (connection) object.
##### NB: queries currently require column names to be double quoted (each quote escaped with `\`), and column names are case sensitive.
##### This is because the Denodo database has to run in UNICODE rather than RESTRICTED mode; a fix for this inconvenience is underway.
from idpData import idpDataQuery

epc_std = idpDataQuery(sql="SELECT * FROM ids.bv_epc limit 10",conn=conn)
epc_std_georef = idpDataQuery(sql="SELECT * FROM ids.bv_epc_georef limit 10",conn=conn)
rdmf = idpDataQuery(sql="SELECT * FROM ids.bv_eeoh_rdmf limit 10",conn=conn)

In [73]:
# Show the dataframe held in `rdmf` as the cell output.
rdmf

Unnamed: 0,UARN,UPRN,COUNTRY_CODE,REGION_CODE,LA_CODE,MSOA_CODE,LSOA_CODE
0,172558197,100100406019,W92000004,,W06000011,W02000179,W01000777
1,181123204,100100470333,W92000004,,W06000019,W02000320,W01001450
2,184028202,100100953986,W92000004,,W06000003,W02000027,W01000132
3,203995193,10004873835,W92000004,,W06000010,W02000144,W01000646
4,205091204,100100462447,W92000004,,W06000019,W02000319,W01001463
5,209552206,100100947341,W92000004,,W06000004,W02000045,W01000226
6,220578202,10013464772,W92000004,,W06000001,W02000005,W01000035
7,17310193,100100159213,W92000004,,W06000010,W02000151,W01000719
8,173432196,200002664583,W92000004,,W06000013,W02000223,W01001057
9,18202197,10011727974,W92000004,,W06000011,W02000197,W01000776


In [53]:
# Set the database for standard EPC data
import numpy as np
import pandas as pd
import os
import glob


## Setting the directory where the data is
# Raw string: "\H" is not a valid escape sequence, so the plain literal only produced the
# intended backslash by accident. The resulting path text is exactly the same as before.
os.chdir (r'Z:\Housing Analysis/Dwelling Stock Estimates/2012 to 2018/Raw Data')


NameError: name 'spark' is not defined

In [52]:
# NOTE(review): this line is not valid Python (the recorded result of the cell is a SyntaxError).
# It looks like an R-style assignment of an OData service URL - confirm the intent before replacing it.
epc_std <- http://localhost:9090/denodo-odata4-service/denodo-odata.svc/ids/bv_epc

SyntaxError: invalid syntax (1443534963.py, line 1)

In [None]:
# NOTE(review): `spark` is not defined in any of the cells visible here, and neither is `df1`;
# presumably both come from a different session - confirm before relying on this cell.
nsul = spark.sql("SELECT * FROM national_statistics_uprn_lookup.nsul_jan_2022_gb_std")
                   
df1

In [None]:
#### Close the connection at the end of your session, passing in the conn object:
from idpData import idpDataDisconnect

idpDataDisconnect(conn)
