In [3]:
def install_and_import(module):
    import importlib
    try:
        print (f'Importing module: {module}')
        importlib.import_module(module)
    except ImportError:
        import pip
        print (f'Installing module: {module}')
        pip.main(['install', module])
    finally:
        globals()[module] = importlib.import_module(module)

In [4]:
def import_modules(modules):
    print (f'Importing modules...')
    for module in modules:
        install_and_import(module)


In [5]:
def idpDataInit():
    
    '''Initialises the notebook and install | import required python libraries.
            Parameters: 
                    None
            Returns:
                    None
    '''
    
    print (f'Initialising...')
    import_modules(["jaydebeapi","socket","pandas","json","decimal","JAVA_HOME","numpy","getpass"])

In [6]:
def get_config():
    with open('idpDataConfig.json') as config_file:
        config = json.load(config_file)    
    return config    


In [7]:
def idp_get_connection_info(database):
    
    config = get_config()
    
    conn_info = {}
    conn_info['denododriver_class'] = config['denododriver_class']
    conn_info['denododriver_path']  = config['denododriver_path']
    
    denodoserver_database  = database    
    
    config = get_config()
    denodoserver_name      = config['denodoserver_name']
    denodoserver_jdbc_port = config['denodoserver_jdbc_port']
    
    client_hostname = socket.gethostname()
    useragent = "%s-%s" % (jaydebeapi.__name__,client_hostname)
    conn_uri = "jdbc:vdb://%s:%s/%s?userAgent=%s" % (denodoserver_name,denodoserver_jdbc_port,denodoserver_database,useragent)
    conn_info['conn_uri']  = conn_uri
    
    return conn_info

In [8]:
def map_datatypes(crsr):
        
    results = crsr.fetchall()
    columns = [c[0] for c in crsr.description]
    df_results = pandas.DataFrame.from_records(results, columns=columns)
    
    types_dict = {
            int                                               :  pandas.Int64Dtype(),
            decimal.Decimal                                   :  numpy.float,
            jaydebeapi.DBAPITypeObject._mappings['DATE']      :  numpy.datetime64(),
            jaydebeapi.DBAPITypeObject._mappings['TIMESTAMP'] :  numpy.datetime64(),
            jaydebeapi.DBAPITypeObject._mappings['CHAR']      :  object,
            jaydebeapi.DBAPITypeObject._mappings['INTEGER']   :  pandas.Int64Dtype(),
            jaydebeapi.DBAPITypeObject._mappings['FLOAT']     :  numpy.float
            }
    
    types = [types_dict[c[1]] for c in crsr.description]
        
    for c,tp in  zip(df_results.columns,types):
        df_results[c] = df_results[c].astype(tp)
            
    return df_results

In [9]:
def idpDataConnect(username, database):
    
    '''Opens a connection to IDP's virtual database, returning a connection object.
            Parameters:
                    username (str): your work email address
                    database (str): the name of the Virtual Database to connect to
            Returns:
                    conn : a connection object
    '''  
    
    conn_info = idp_get_connection_info(database)
    
    conn = jaydebeapi.connect(conn_info['denododriver_class'],
                              conn_info['conn_uri'],
                              driver_args = {"user":username,
                                             "password":getpass.getpass()},
                              jars = conn_info['denododriver_path']
                             )
    
    return conn


In [10]:
def idpDataDesc(database,dataset,conn):

    '''Describes the schema of a dataset within the virtual database, returning a pandas dataframe object.
            Parameters:
                    database (str): the name of the Virtual Database
                    dataset (str): the name of the dataset to describe
                    conn : a connection object
            Returns:
                    df_schema : a Pandas dataframe
    '''
    
    df_schema = idpDataQuery(sql="SELECT column_name, column_type_name, column_type_length \
                                  FROM   CATALOG_METADATA_VIEWS() \
                                  WHERE  input_database_name = '"+database+"' \
                                  AND    input_view_name =  '"+dataset+"';",
                             conn=conn)
    return df_schema


In [11]:
def idpDataQuery(sql,conn):

    '''Executes a SQL query against the virtual database, returning a pandas dataframe object.
            Parameters:
                    sql (str): a valid SQL query
                    conn : a connection object
            Returns:
                    df_results : a Pandas dataframe
    '''
    
    cursor = conn.cursor()
    cursor.execute(sql)
    df_results = map_datatypes(cursor)
    return df_results 



In [12]:
def idpDataDisconnect(conn):
    
    '''Closes an open connection to IDP's virtual database.
            Parameters:
                    conn : a connection object
            Returns:
                    None
    '''
    conn.close()

In [13]:
## May need to run in Terminal for the first time - then this code isnt needed again
pip install findspark
pip install pyspark

SyntaxError: invalid syntax (285895791.py, line 2)

In [14]:
#Import modules
import sys
import os
import pandas
import findspark
import decimal
import jaydebeapi
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os, IPython
import pyspark
from pyspark.sql import *
import pyspark.sql.functions as f
from pyspark.sql.types import IntegerType, DoubleType
import numpy as np
from itertools import chain
from pyspark import __version__ as current_pyspark_version

In [15]:
#### Initialise the notebook, after opening or re-starting the kernel
from idpData import idpDataInit
idpDataInit()


Initialising...
Importing modules...
Importing module: jaydebeapi
Importing module: socket
Importing module: pandas
Importing module: json
Importing module: decimal
Importing module: numpy
Importing module: getpass


In [16]:
#### Open a connection into the Virtual Database. NB Prompts for your Denodo password.
from idpData import idpDataConnect

conn = idpDataConnect("nikola.bowers","ids")


 ················


In [17]:
# ------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------
## Configure the Spark Session
# ------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------

spark = (
    SparkSession.builder.appName('epc')
    .config("spark.executor.memory", "1500m")
    .config("spark.executor.cores", 2)
    .config("spark.dynamicAllocation.enabled", 'true')
    .config('spark.dynamicAllocation.maxExecutors', 4)
    .config('spark.shuffle.service.enabled','true')
    .config('spark.ui.showConsoleProgress', 'false')
    .getOrCreate()
)

import pandas as pd
pd.set_option("display.html.table_schema", True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/06/23 13:34:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/23 13:34:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [20]:
epc_std = idpDataQuery(sql="SELECT \"guid\",\"construction_age_band\",\"current_energy_rating\",\"current_energy_efficiency\",\"property_type\",\"built_form\",\"inspection_date\",\"walls_description\",\"walls_energy_eff\",\"roof_description\",\"roof_energy_eff\" FROM ids.bv_epc",conn=conn)

java.sql.SQLException: java.sql.SQLException: Error executing query. Total time 900.066 seconds.

QUERY [VIRTUAL] [QUERY_TIMEOUT] 
 

In [None]:
epc_georef = idpDataQuery(sql="SELECT \"id\",\"ons_uprn\" FROM ids.bv_epc_georef",conn=conn)
epc_georef

In [None]:
rdmf = idpDataQuery(sql="SELECT \"uprn\",\"la_code\" ,\"region_code\" 
                       FROM ids.bv_eeoh_rdmf",conn=conn)
rdmf

In [237]:
epc_df = spark.sql("SELECT * FROM ids.bv_epc")

22/06/23 13:10:46 WARN ObjectStore: Failed to get database ids, returning NoSuchObjectException


AnalysisException: Table or view not found: ids.bv_epc; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [ids, bv_epc], [], false


In [240]:
epc_std = spark.sql(sql="SELECT * FROM ids.bv_epc",conn=conn)

TypeError: sql() missing 1 required positional argument: 'sqlQuery'

In [233]:
epc_std = idpDataDesc(database='ids', dataset='bv_epc', conn=conn)
epc_std

Unnamed: 0,column_name,column_type_name,column_type_length
0,lmk_key,VARCHAR,65535
1,address1,VARCHAR,65535
2,address2,VARCHAR,65535
3,address3,VARCHAR,65535
4,postcode,VARCHAR,65535
...,...,...,...
86,lodgement_datetime,TIMESTAMP,23
87,tenure,VARCHAR,65535
88,fixed_lighting_outlets_count,BIGINT,20
89,low_energy_fixed_light_count,BIGINT,20


In [119]:
spark.sql("USE ids")

22/06/23 09:54:43 WARN ObjectStore: Failed to get database ids, returning NoSuchObjectException


AnalysisException: Database 'ids' not found

In [None]:
# Reading EPC Data from Denodo
epc_std = spark.read.table("bv_epc")

In [228]:
epc_std = idpDataQuery(sql="SELECT * FROM ids.bv_epc limit 10",conn=conn)
epc_std

Unnamed: 0,lmk_key,address1,address2,address3,postcode,building_reference_number,current_energy_rating,potential_energy_rating,current_energy_efficiency,potential_energy_efficiency,...,address,local_authority_label,constituency_label,posttown,construction_age_band,lodgement_datetime,tenure,fixed_lighting_outlets_count,low_energy_fixed_light_count,guid
0,f8530f58e312f283ce53d12f0003d84657c7b5d2d91675...,101 Lorimer House,9 Navigators Walk,,E3 2TG,10000025885,B,B,83,83,...,"101 Lorimer House, 9 Navigators Walk",Tower Hamlets,Bethnal Green and Bow,LONDON,,NaT,Not defined - use in the case of a new dwellin...,1,1.0,e79d2f99-4719-4657-b986-f51086402853
1,128d47b3038724a85b81f4b3d44441c2aa309e231b25a8...,"502, 8 Rolling Street",,,M5 4RG,10000361962,B,B,84,84,...,"502, 8 Rolling Street",Salford,Salford and Eccles,Salford,2021.0,2021-03-30 08:09:40,Not defined - use in the case of a new dwellin...,6,,3249f500-b1d5-46dd-b640-cddfce9f7732
2,b276c18e7237de68ee5d5fece18f07a06537aee81edb2b...,Apartment 301,Cliveland Street Lofts,"25, Cliveland Street",B19 3AJ,10000464949,B,B,83,83,...,"Apartment 301, Cliveland Street Lofts, 25, Cli...",Birmingham,"Birmingham, Ladywood",BIRMINGHAM,,NaT,Not defined - use in the case of a new dwellin...,5,5.0,2a977e78-665a-439d-847b-4b115f3761e8
3,d8983d36ffc580049af489a134d98aa94929eea3064ff2...,309,1 Newfoundland Place,,E14 4BJ,10000217818,B,B,84,84,...,"309, 1 Newfoundland Place",Tower Hamlets,Poplar and Limehouse,London,,NaT,Not defined - use in the case of a new dwellin...,1,1.0,5d8c8d97-8d7e-40a0-87b2-e0a847ee7e26
4,67c7d0352946acfa9621f9b09636a55725bd503c9d059f...,5202,1 Newfoundland Place,,E14 4BW,10000665408,B,B,84,84,...,"5202, 1 Newfoundland Place",Tower Hamlets,Poplar and Limehouse,London,,NaT,Not defined - use in the case of a new dwellin...,1,1.0,85a47d0f-5ffe-4341-bf8f-92fe9d1fdf01
5,4239cff28e64a04d3de57352508c560d205db67d05290b...,Flat 101,Gent Court,1 Barchester Street,E14 6UL,10000795039,B,B,87,87,...,"Flat 101, Gent Court, 1 Barchester Street",Tower Hamlets,Poplar and Limehouse,London,,NaT,Not defined - use in the case of a new dwellin...,50,50.0,b736b42a-76f7-4742-96ef-16861afa2b3f
6,0effbce4bd303c6b330a349844c0b78f770a68ce67f83a...,Apartment 1506,1 Park Drive,,E14 9GG,10000722884,B,B,83,83,...,"Apartment 1506, 1 Park Drive",Tower Hamlets,Poplar and Limehouse,LONDON,,NaT,Owner-occupied,15,15.0,bf9299bd-bc11-4d63-b9f6-91809d3f3e85
7,cb36464bf062418c8124c3c1f2f251cfa3d9c598ca5b58...,Apartment 1803,1 Park Drive,,E14 9GG,10000723157,B,B,85,85,...,"Apartment 1803, 1 Park Drive",Tower Hamlets,Poplar and Limehouse,LONDON,,NaT,Owner-occupied,25,25.0,0380f24c-c784-4c64-b4bb-8ffad7631d59
8,2e822653dbd9fb1c7541d36cdb853d8c7ad669b6a45458...,Apartment 1814 10 Marsh Wall,,,E14 9GU,10000723163,B,B,84,84,...,Apartment 1814 10 Marsh Wall,Tower Hamlets,Poplar and Limehouse,LONDON,,NaT,Not defined - use in the case of a new dwellin...,1,1.0,1130499a-146b-46f7-83c4-de4e1be13d36
9,cdaa43ab7df06fa7580e157fec69cf354d0c3ad88d9d49...,"Apartment 3509, Hampton Tower, 75, Marsh Wall",London,,E14 9RW,10000725232,B,B,86,86,...,"Apartment 3509, Hampton Tower, 75, Marsh Wall,...",Tower Hamlets,Poplar and Limehouse,Canary Wharf,,NaT,Not defined - use in the case of a new dwellin...,25,25.0,8b1686ce-53aa-4d52-bddc-377d7c4ae9a3


In [87]:
#### Query the virtual database, passing an SQL query and the conn (connection) object:
##### NB Queries currently require columns to be double quoted (complete with an escape character `\`) and column names are case sensitive.
##### This is because the Denodo database has to run in UNICODE rather than RESTRICTED mode, a solution to improve this incovenience is underway.
from idpData import idpDataQuery

epc_std = idpDataQuery(sql="SELECT * FROM ids.bv_epc limit 10",conn=conn)
epc_std_georef = idpDataQuery(sql="SELECT * FROM ids.bv_epc_georef limit 10",conn=conn)
rdmf = idpDataQuery(sql="SELECT * FROM ids.bv_eeoh_rdmf limit 10",conn=conn)

In [88]:
rdmf

Unnamed: 0,UARN,UPRN,COUNTRY_CODE,REGION_CODE,LA_CODE,MSOA_CODE,LSOA_CODE
0,32232204,100100784836,W92000004,,W06000020,W02000332,W01001526
1,333694195,10011741377,W92000004,,W06000023,W02000107,W01000464
2,335670195,10011790829,W92000004,,W06000023,W02000416,W01000505
3,356297204,100100269803,W92000004,,W06000021,W02000338,W01001564
4,360706000,10013701577,W92000004,,W06000005,W02000067,W01000282
5,36838202,200003173020,W92000004,,W06000002,W02000012,W01000096
6,36866202,10070366296,W92000004,,W06000002,W02000012,W01000097
7,38301204,100100650276,W92000004,,W06000022,W02000354,W01001608
8,39986196,100100502901,W92000004,,W06000013,W02000219,W01001014
9,40049196,100100503219,W92000004,,W06000013,W02000219,W01001017


In [53]:
# Set the database for standard EPC data
import numpy as np
import pandas as pd
import os
import glob


## Setting the directory where the data is
os.chdir ('Z:\Housing Analysis/Dwelling Stock Estimates/2012 to 2018/Raw Data')


NameError: name 'spark' is not defined

In [52]:
epc_std <- http://localhost:9090/denodo-odata4-service/denodo-odata.svc/ids/bv_epc

SyntaxError: invalid syntax (1443534963.py, line 1)

In [None]:
nsul = spark.sql("SELECT * FROM national_statistics_uprn_lookup.nsul_jan_2022_gb_std")
                   
df1

In [None]:
#### Close the connection at the end of your session,  passing in the conn onject:
from idpData import idpDataDisconnect

idpDataDisconnect(conn)
