In [None]:
import os

import findspark
findspark.init()

from pyspark.sql import SparkSession
import pyspark.sql.functions as sf

# Mocking data for ingestion job of operators

For simplicity, the code is written for only two countries: DK and NL.

In [None]:
# Country codes that the mocked data sets are restricted to.
country_codes = {
    'DK',
    'NL',
}

In [None]:
# Spark settings for this notebook, applied to the builder in the order listed:
# dynamic allocation is switched off and the executor/driver sizes are fixed.
spark_settings = [
    ('spark.dynamicAllocation.enabled', False),
    ('spark.executorEnv.PYTHON_EGG_CACHE', '/tmp'),
    ('spark.executor.instances', 4),
    ('spark.executor.cores', 13),
    ('spark.executor.memory', '14g'),
    ('spark.driver.memory', '7g'),
]

session_builder = SparkSession.builder.appName("NameMatching_Notebook")
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already running session if there is one, otherwise starts a new one.
spark = session_builder.getOrCreate()
sc = spark.sparkContext

## Mock current operator data

This will consist of only 5% of the current data.

In [None]:
operators_old_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/'
test_output_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/test/'

# Draw a 5% sample (without replacement) of the current merged operators and
# keep only the configured countries. The seed is fixed so that re-running the
# notebook on the same input files yields the same mocked data set instead of
# a different random sample every time.
oprs_old = (
    spark
    .read.parquet(operators_old_dir + 'OPERATORS_MERGED.parquet')
    .sample(False, 0.05, seed=0)
    .filter(sf.col('countryCode').isin(country_codes))
)

# Store the mocked "current" operator data, partitioned by country, replacing
# whatever an earlier run wrote to the test location.
(
    oprs_old
    .write
    .partitionBy('countryCode')
    .parquet(test_output_dir + 'OPERATORS_MERGED.parquet', mode='overwrite')
)

In [None]:
# Read the mocked current-operator data back and show the row count per country.
(
    spark.read
    .parquet(test_output_dir + 'OPERATORS_MERGED.parquet')
    .groupby('countryCode')
    .count()
    .show(10)
)

In [None]:
# Needed by the later steps that look up known operators in the input:
# every entry of `refIds` becomes its own row, exposed in the `id` column.
# NOTE(review): this cell reassigns `oprs_old` from itself, so running it a
# second time without re-running the cell that creates `oprs_old` explodes the
# already exploded rows again.
exploded_ref_id = sf.explode(sf.col('refIds'))
oprs_old = oprs_old.withColumn('id', exploded_ref_id)

## Mock input data as a 1% sample of the original

In [None]:
# Read the incoming operators; the `id` used for matching is assigned in a
# later cell.
operators_matched_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/'
test_output_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/test/'

# Draw a 1% sample (without replacement) and keep only the configured
# countries. The seed is fixed so that re-running the notebook on the same
# input files yields the same mocked input instead of a new random sample.
input_oprs_raw = (
    spark
    .read.parquet(operators_matched_dir + 'OPERATORS.parquet')
    .sample(False, 0.01, seed=0)
    .filter(sf.col('COUNTRY_CODE').isin(country_codes))
)

**Check for overlap and alter some records**

In [None]:
# Build the record id as "<COUNTRY_CODE>~<SOURCE>~<REF_OPERATOR_ID>".
# NOTE(review): presumably the same format as the entries of `refIds` in the
# current operator data, since the two are joined on `id` later -- confirm.
input_id = sf.concat_ws(
    '~',
    sf.col('COUNTRY_CODE'),
    sf.col('SOURCE'),
    sf.col('REF_OPERATOR_ID'),
)
input_oprs = input_oprs_raw.withColumn('id', input_id)

In [None]:
# Match every incoming record against the current operator data. The left join
# keeps all incoming records, whether or not their `id` is already known.
oprs_joined = input_oprs.join(oprs_old, on='id', how='left')

# A record is treated as "known" when `ohubOperatorId` is filled after the join
# (presumably a column that only `oprs_old` provides -- verify against its schema).
is_known = sf.col('ohubOperatorId').isNotNull()

# 15% of the already known operators will be altered on the `STREET_CLEANSED`
# (the original code referenced an undefined `df` here).
oprs_altered = oprs_joined.withColumn(
    'STREET_CLEANSED',
    sf.when(is_known & (sf.rand(seed=0) < 0.15), 'street_changed')
    .otherwise(sf.col('STREET_CLEANSED')))

# assign column with the type of incoming operator record: known changed, known unchanged, new
# (the original code referenced an undefined `df_altered` here).
street_is_changed = sf.col('STREET_CLEANSED') == 'street_changed'
# A NULL street must be handled explicitly: comparing NULL with `!=` yields
# NULL, which would leave `record_type` empty for those records.
street_is_unchanged = sf.col('STREET_CLEANSED').isNull() | (sf.col('STREET_CLEANSED') != 'street_changed')
oprs = oprs_altered.withColumn(
    'record_type',
    sf.when(is_known & street_is_unchanged, 'known_unchanged')
    .when(is_known & street_is_changed, 'known_changed')
    .when(sf.col('ohubOperatorId').isNull(), 'new'))

# Cache the labelled records and materialise them once; the cells below reuse
# `oprs` several times.
oprs.persist()
oprs.count()

In [None]:
# Sanity check: every record must have received a `record_type`.
n_records = oprs.count()
n_labelled = oprs.dropna(subset=['record_type']).count()
if n_records != n_labelled:
    raise ValueError("There are some NULL values in record_type")

Ohub known Operators: Operators already grouped and in the Ohub data storage

In [None]:
# Row count of the current operator data per country.
(
    oprs_old
    .groupby('countryCode')
    .count()
    .show()
)

Input operators: incoming data of operators 

In [None]:
# Row count of all incoming records per country.
(
    oprs
    .groupby('COUNTRY_CODE')
    .count()
    .show()
)

Input Known: incoming operator data already in Ohub

In [None]:
# Row count per country of incoming records labelled as known, changed or not.
known_record_types = {'known_unchanged', 'known_changed'}
(
    oprs
    .filter(sf.col('record_type').isin(known_record_types))
    .groupby('COUNTRY_CODE')
    .count()
    .show()
)

Input Known unchanged: incoming operator data already in Ohub with no change in its data

In [None]:
# Row count per country of incoming records labelled 'known_unchanged'.
(
    oprs
    .filter(sf.col('record_type') == 'known_unchanged')
    .groupby('COUNTRY_CODE')
    .count()
    .show()
)

Input Known Changed: incoming operator data already in Ohub with change in its data

In [None]:
# Row count per country of incoming records labelled 'known_changed'.
(
    oprs
    .filter(sf.col('record_type') == 'known_changed')
    .groupby('COUNTRY_CODE')
    .count()
    .show()
)

Input New: Completely new operator data (id not in Ohub)

In [None]:
# Row count per country of incoming records labelled 'new'.
(
    oprs
    .filter(sf.col('record_type') == 'new')
    .groupby('COUNTRY_CODE')
    .count()
    .show()
)

## Write mocked input operator data

In [None]:
# Keep only the columns of the raw input plus the `record_type` label; the
# columns brought in by the join are dropped again.
mocked_input_columns = input_oprs_raw.columns + ['record_type']
mocked_input = oprs.select(mocked_input_columns)

# Write the mocked input partitioned by country, replacing any earlier output.
(
    mocked_input
    .write
    .partitionBy('COUNTRY_CODE')
    .parquet(test_output_dir + 'OPERATORS.parquet', mode='overwrite')
)

In [None]:
# Read the mocked input back and show the row count per country.
(
    spark.read
    .parquet(test_output_dir + 'OPERATORS.parquet')
    .groupby('COUNTRY_CODE')
    .count()
    .show()
)