In [176]:
def install_and_import(module):
    import importlib
    try:
        print (f'Importing module: {module}')
        importlib.import_module(module)
    except ImportError:
        import pip
        print (f'Installing module: {module}')
        pip.main(['install', module])
    finally:
        globals()[module] = importlib.import_module(module)

In [177]:
def import_modules(modules):
    print (f'Importing modules...')
    for module in modules:
        install_and_import(module)


In [178]:
def idpDataInit():
    
    '''Initialises the notebook and install | import required python libraries.
            Parameters: 
                    None
            Returns:
                    None
    '''
    
    print (f'Initialising...')
    import_modules(["jaydebeapi","socket","pandas","json","decimal","JAVA_HOME","numpy","getpass"])

In [179]:
def get_config():
    with open('idpDataConfig.json') as config_file:
        config = json.load(config_file)    
    return config    


In [180]:
def idp_get_connection_info(database):
    
    config = get_config()
    
    conn_info = {}
    conn_info['denododriver_class'] = config['denododriver_class']
    conn_info['denododriver_path']  = config['denododriver_path']
    
    denodoserver_database  = database    
    
    config = get_config()
    denodoserver_name      = config['denodoserver_name']
    denodoserver_jdbc_port = config['denodoserver_jdbc_port']
    
    client_hostname = socket.gethostname()
    useragent = "%s-%s" % (jaydebeapi.__name__,client_hostname)
    conn_uri = "jdbc:vdb://%s:%s/%s?userAgent=%s" % (denodoserver_name,denodoserver_jdbc_port,denodoserver_database,useragent)
    conn_info['conn_uri']  = conn_uri
    
    return conn_info

In [181]:
def map_datatypes(crsr):
        
    results = crsr.fetchall()
    columns = [c[0] for c in crsr.description]
    df_results = pandas.DataFrame.from_records(results, columns=columns)
    
    types_dict = {
            int                                               :  pandas.Int64Dtype(),
            decimal.Decimal                                   :  numpy.float,
            jaydebeapi.DBAPITypeObject._mappings['DATE']      :  numpy.datetime64(),
            jaydebeapi.DBAPITypeObject._mappings['TIMESTAMP'] :  numpy.datetime64(),
            jaydebeapi.DBAPITypeObject._mappings['CHAR']      :  object,
            jaydebeapi.DBAPITypeObject._mappings['INTEGER']   :  pandas.Int64Dtype(),
            jaydebeapi.DBAPITypeObject._mappings['FLOAT']     :  numpy.float
            }
    
    types = [types_dict[c[1]] for c in crsr.description]
        
    for c,tp in  zip(df_results.columns,types):
        df_results[c] = df_results[c].astype(tp)
            
    return df_results

In [182]:
def idpDataConnect(username, database):
    
    '''Opens a connection to IDP's virtual database, returning a connection object.
            Parameters:
                    username (str): your work email address
                    database (str): the name of the Virtual Database to connect to
            Returns:
                    conn : a connection object
    '''  
    
    conn_info = idp_get_connection_info(database)
    
    conn = jaydebeapi.connect(conn_info['denododriver_class'],
                              conn_info['conn_uri'],
                              driver_args = {"user":username,
                                             "password":getpass.getpass()},
                              jars = conn_info['denododriver_path']
                             )
    
    return conn


In [183]:
def idpDataDesc(database,dataset,conn):

    '''Describes the schema of a dataset within the virtual database, returning a pandas dataframe object.
            Parameters:
                    database (str): the name of the Virtual Database
                    dataset (str): the name of the dataset to describe
                    conn : a connection object
            Returns:
                    df_schema : a Pandas dataframe
    '''
    
    df_schema = idpDataQuery(sql="SELECT column_name, column_type_name, column_type_length \
                                  FROM   CATALOG_METADATA_VIEWS() \
                                  WHERE  input_database_name = '"+database+"' \
                                  AND    input_view_name =  '"+dataset+"';",
                             conn=conn)
    return df_schema


In [184]:
def idpDataQuery(sql,conn):

    '''Executes a SQL query against the virtual database, returning a pandas dataframe object.
            Parameters:
                    sql (str): a valid SQL query
                    conn : a connection object
            Returns:
                    df_results : a Pandas dataframe
    '''
    
    cursor = conn.cursor()
    cursor.execute(sql)
    df_results = map_datatypes(cursor)
    return df_results 



In [185]:
def idpDataDisconnect(conn):
    
    '''Closes an open connection to IDP's virtual database.
            Parameters:
                    conn : a connection object
            Returns:
                    None
    '''
    conn.close()

In [165]:
## May need to run in Terminal for the first time - then this code isnt needed again
pip install findspark
pip install pyspark

SyntaxError: invalid syntax (285895791.py, line 2)

In [196]:
#Import modules
import sys
import os
import pandas as pd
import findspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os, IPython
import pyspark
from pyspark.sql import *
import pyspark.sql.functions as f
from pyspark.sql.types import IntegerType, DoubleType
import numpy as np
import pandas as pd
from itertools import chain
from pyspark import __version__ as current_pyspark_version

In [197]:
#### Initialise the notebook, after opening or re-starting the kernel
from idpData import idpDataInit
idpDataInit()


Initialising...
Importing modules...
Importing module: jaydebeapi
Importing module: socket
Importing module: pandas
Importing module: json
Importing module: decimal
Importing module: numpy
Importing module: getpass


In [194]:
#### Open a connection into the Virtual Database. NB Prompts for your Denodo password.
from idpData import idpDataConnect

conn = idpDataConnect("nikola.bowers","ids")


 ················


In [199]:
# ------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------
## Configure the Spark Session
# ------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------

spark = (
    SparkSession.builder.appName('epc')
    .config("spark.executor.memory", "1500m")
    .config("spark.executor.cores", 2)
    .config("spark.dynamicAllocation.enabled", 'true')
    .config('spark.dynamicAllocation.maxExecutors', 4)
    .config('spark.shuffle.service.enabled','true')
    .config('spark.ui.showConsoleProgress', 'false')
    .getOrCreate()
)

import pandas as pd
pd.set_option("display.html.table_schema", True)

In [202]:
epc_std = idpDataDesc(ids, bv_epc, conn)

NameError: name 'denododriver_path' is not defined

In [119]:
spark.sql("USE ids")

22/06/23 09:54:43 WARN ObjectStore: Failed to get database ids, returning NoSuchObjectException


AnalysisException: Database 'ids' not found

In [136]:
epc_std = spark.read('http://localhost:9090/denodo-odata4-service/denodo-odata.svc/ids/bv_epc')


TypeError: 'DataFrameReader' object is not callable

In [None]:
# Reading EPC Data from Denodo
epc_std = spark.read.table("bv_epc")

In [87]:
#### Query the virtual database, passing an SQL query and the conn (connection) object:
##### NB Queries currently require columns to be double quoted (complete with an escape character `\`) and column names are case sensitive.
##### This is because the Denodo database has to run in UNICODE rather than RESTRICTED mode, a solution to improve this incovenience is underway.
from idpData import idpDataQuery

epc_std = idpDataQuery(sql="SELECT * FROM ids.bv_epc limit 10",conn=conn)
epc_std_georef = idpDataQuery(sql="SELECT * FROM ids.bv_epc_georef limit 10",conn=conn)
rdmf = idpDataQuery(sql="SELECT * FROM ids.bv_eeoh_rdmf limit 10",conn=conn)

In [88]:
rdmf

Unnamed: 0,UARN,UPRN,COUNTRY_CODE,REGION_CODE,LA_CODE,MSOA_CODE,LSOA_CODE
0,32232204,100100784836,W92000004,,W06000020,W02000332,W01001526
1,333694195,10011741377,W92000004,,W06000023,W02000107,W01000464
2,335670195,10011790829,W92000004,,W06000023,W02000416,W01000505
3,356297204,100100269803,W92000004,,W06000021,W02000338,W01001564
4,360706000,10013701577,W92000004,,W06000005,W02000067,W01000282
5,36838202,200003173020,W92000004,,W06000002,W02000012,W01000096
6,36866202,10070366296,W92000004,,W06000002,W02000012,W01000097
7,38301204,100100650276,W92000004,,W06000022,W02000354,W01001608
8,39986196,100100502901,W92000004,,W06000013,W02000219,W01001014
9,40049196,100100503219,W92000004,,W06000013,W02000219,W01001017


In [53]:
# Set the database for standard EPC data
import numpy as np
import pandas as pd
import os
import glob


## Setting the directory where the data is
os.chdir ('Z:\Housing Analysis/Dwelling Stock Estimates/2012 to 2018/Raw Data')


NameError: name 'spark' is not defined

In [52]:
epc_std <- http://localhost:9090/denodo-odata4-service/denodo-odata.svc/ids/bv_epc

SyntaxError: invalid syntax (1443534963.py, line 1)

In [None]:
nsul = spark.sql("SELECT * FROM national_statistics_uprn_lookup.nsul_jan_2022_gb_std")
                   
df1

In [None]:
#### Close the connection at the end of your session,  passing in the conn onject:
from idpData import idpDataDisconnect

idpDataDisconnect(conn)
