## This notebook creates the edges used to calculate some of the graph features, and an activity-code heatmap

### Load Packages

In [ ]:
%%configure -f
{
"conf": {
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 40
   }
}

In [ ]:
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, when
import pyspark.sql.functions as F
import time
import numpy as np

### Load Input Files

In [ ]:
# Run parameters for this notebook. All default to empty strings here;
# presumably they are overridden by the calling pipeline at run time — TODO confirm.
# Identifier of the current run; recorded in the logging custom dimensions.
batch_id = ''
# Location of the cleaned invoice Parquet data that is loaded into `df`.
invoice_cleaned_path = ''
# Output location for the edge list (Parquet); its '/'-separated parts are
# also used to derive the external SQL table name and LOCATION.
edge_path = ''
# Folder under which the fitted issuer StringIndexer model is saved.
model_path = ''
# Output location for the heatmap (Parquet); its '/'-separated parts are
# also used to derive the external SQL table name and LOCATION.
heatmap_path = ''

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Application Insights connection string, fetched through mssparkutils credentials.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS(
    "keyvault",
    "AppInsightsConnectionString",
)
# Enable the OpenCensus integration for the standard `logging` module.
config_integration.trace_integrations(['logging'])

# Notebook logger: INFO and above, exported through the Azure log handler.
azure_log_handler = AzureLogHandler(connection_string=instrumentation_connection_string)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(azure_log_handler)

# Tracer used by the `with tracer.span(...)` blocks in the cells below;
# AlwaysOnSampler is configured, so no span is sampled out.
azure_trace_exporter = AzureExporter(connection_string=instrumentation_connection_string)
tracer = Tracer(exporter=azure_trace_exporter, sampler=AlwaysOnSampler())

# Spool parameters: attached to log records as custom dimensions.
# NOTE(review): model_path and heatmap_path are not included here.
run_time_parameters = dict(
    custom_dimensions=dict(
        batch_id=batch_id,
        invoice_cleaned_path=invoice_cleaned_path,
        edge_path=edge_path,
        notebook_name=mssparkutils.runtime.context['notebookname'],
    )
)

logger.info(f"{mssparkutils.runtime.context['notebookname']}: INITIALISED", extra=run_time_parameters)

In [ ]:
with tracer.span('Loading cleaned invoice files'):
    # Parquet files carry their own schema and have no header row, so the
    # CSV-only reader options `inferSchema` / `header` that used to be passed
    # here had no effect on the read and were removed to avoid confusion.
    df = spark.read.parquet(invoice_cleaned_path)

### Create edges and save output

In [ ]:
with tracer.span('Creating edges for dataset'):
    # Fit a string -> index mapping on issuer ids, ordered by descending frequency.
    # handleInvalid='keep' is presumably there so that ids not seen while fitting
    # (e.g. receivers that never issue) do not make transform() fail — confirm.
    indexer = StringIndexer(
        inputCol='issuer_id',
        outputCol='issuer_id_indexed',
        stringOrderType='frequencyDesc',
        handleInvalid='keep',
    )
    model = indexer.fit(df)
    # The model is saved before its input/output columns are re-pointed below,
    # so the persisted copy is the issuer_id -> issuer_id_indexed mapping.
    model.write().overwrite().save(model_path + '/' + '_feature_engineering_indexer_issuer_id.pkl')
    df = model.transform(df)

    # Reuse the issuer-fitted model on receiver_id so both id columns are
    # encoded with the same index values.
    model.setInputCol("receiver_id")
    model.setOutputCol("receiver_id_indexed")
    df = model.transform(df)

    # Cast both indexed columns to integers.
    for indexed_column in ('issuer_id_indexed', 'receiver_id_indexed'):
        df = df.withColumn(indexed_column, col(indexed_column).cast("Integer"))

    # Every index that occurs as an issuer (the count itself is dropped later).
    all_issuers = df.groupby('issuer_id_indexed').count()

    # One row per (issuer, receiver) pair with the number of invoices between them.
    edges = (
        df.groupby('issuer_id_indexed', 'receiver_id_indexed')
        .count()
        .toDF('seller', 'buyer', 'edge_count')
    )

    # Inner join: only edges whose buyer also appears as an issuer survive.
    buyer_is_issuer = edges['buyer'] == all_issuers['issuer_id_indexed']
    edges_trimmed = edges.join(all_issuers, buyer_is_issuer).drop('issuer_id_indexed', 'count')

    # Restore the *_indexed column names for the remaining seller/buyer columns.
    edges_trimmed_df = edges_trimmed.toDF('issuer_id_indexed', 'receiver_id_indexed', 'edge_count')

    # Remove self-loops. The when/otherwise form is kept so that a null
    # comparison result still evaluates to 0 (row kept), as before.
    self_loop_flag = when(col("issuer_id_indexed") == col("receiver_id_indexed"), 1).otherwise(0)
    edges_trimmed_df = edges_trimmed_df.filter(self_loop_flag == 0)

with tracer.span('Saving edges to ADLS'):
    # NOTE(review): "header" is a CSV option and presumably has no effect on
    # Parquet output — confirm before removing.
    (
        edges_trimmed_df.write
        .mode("overwrite")
        .option("header", "true")
        .format('parquet')
        .save(edge_path)
    )

In [ ]:
# serverless SQL config
# Values below are concatenated into the pyodbc connection string used by the
# cells that create the external SQL tables.
import pyodbc
# Database name placed in the DATABASE= part of the connection string.
database = 'eiad'
# ODBC driver name placed in the DRIVER= part of the connection string.
driver= '{ODBC Driver 17 for SQL Server}'

# SQL login, password and endpoint are read as secrets via mssparkutils credentials.
sql_user_name = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseSQLUserName")
sql_user_pwd = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseSQLPassword")
# NOTE(review): the secret name is spelled "Syanpse..." — presumably it matches
# the name under which the secret is actually stored; confirm before correcting.
serverless_sql_endpoint = mssparkutils.credentials.getSecretWithLS("keyvault", "SyanpseServerlessSQLEndpoint")

In [ ]:
def generate_schema_string(dataframe):
    """Return the column list for a T-SQL ``CREATE EXTERNAL TABLE`` statement.

    Each field of ``dataframe.schema`` becomes ``[name] type``; the entries are
    joined with ``", "``. The type is the field's Spark ``simpleString()``
    unless it is one of the names remapped below, which have no same-named
    T-SQL type.

    Args:
        dataframe: object exposing ``schema.fieldNames()`` and
            ``schema[name].dataType.simpleString()`` (a Spark DataFrame).

    Returns:
        The comma-separated column definitions, or ``""`` for an empty schema.
    """
    # Spark type name -> T-SQL type name; anything not listed passes through as-is.
    sql_type_overrides = {
        'double': 'float',
        'string': 'nvarchar(MAX)',
        'timestamp': 'datetime2(7)',
        'boolean': 'bit',
    }
    column_definitions = []
    for name in dataframe.schema.fieldNames():
        spark_type = str(dataframe.schema[name].dataType.simpleString())
        sql_type = sql_type_overrides.get(spark_type, spark_type)
        # A literal "]" would terminate the bracket-quoted identifier early;
        # T-SQL escapes it by doubling.
        quoted_name = "[" + name.replace("]", "]]") + "]"
        column_definitions.append(quoted_name + " " + sql_type)
    return ", ".join(column_definitions)

In [ ]:
with tracer.span('Creating SQL table for edges'):
    # edge_path is split on '/'. Element 2 is expected to look like
    # "<container>@<host>" and elements 3 onwards form the folder path
    # (presumably an abfss:// style URI — confirm).
    path_parts = edge_path.split('/')
    # Table name: <first folder>_<part of element 2 before '@'>_<second folder>.
    table_name = path_parts[3] + '_' + path_parts[2].split('@')[0] + '_' + path_parts[4]
    schema_string = generate_schema_string(edges_trimmed_df)
    drop_table_command = f"DROP EXTERNAL TABLE [{table_name}]"
    # LOCATION is the path with the scheme and container/host parts removed.
    location = "/".join(path_parts[3:])
    df_sql_command = f"CREATE EXTERNAL TABLE [{table_name}] ({schema_string}) WITH (LOCATION = '{location}/**', DATA_SOURCE = [output_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], FILE_FORMAT = [SynapseParquetFormat])"
    connection_string = 'DRIVER='+driver+';SERVER=tcp:'+serverless_sql_endpoint+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD='+ sql_user_pwd
    with pyodbc.connect(connection_string) as conn:
        with conn.cursor() as cursor:
            try:
                cursor.execute(drop_table_command)
            except pyodbc.Error as drop_error:
                # Best effort: the drop presumably fails when the table does not
                # exist yet (e.g. first run). Only database errors are tolerated;
                # anything else (including KeyboardInterrupt) now propagates.
                logger.info(
                    "DROP EXTERNAL TABLE [%s] skipped: %s",
                    table_name,
                    drop_error,
                    extra=run_time_parameters,
                )
            cursor.execute(df_sql_command)

### Create activity heatmap and save output

In [ ]:
with tracer.span('Creating heatmap of dataset'):
    # Activity code = first two characters of the issuer's activity field.
    df = df.withColumn('DD_code', col('activity_issuer').substr(1, 2))
    # One row per issuer holding the distinct activity codes it issued under.
    activity_types = df.groupby('issuer_id').agg(F.array_distinct(F.collect_list('DD_code')))
    # Number of invoices per (issuer activity code, receiver).
    code_receiver_counts = df.groupby('DD_code', 'receiver_id').count()
    # Attach each receiver's own issuing codes (inner join on receiver == issuer).
    # Collected row layout: 0=DD_code, 1=receiver_id, 2=count, 3=issuer_id, 4=codes array.
    receiver_is_issuer = code_receiver_counts['receiver_id'] == activity_types['issuer_id']
    code_versus_code = code_receiver_counts.join(activity_types, receiver_is_issuer).collect()

    # Codes are handled as the integers 10..99; a code outside that range (or a
    # non-numeric one) raises in the loop below.
    activity_codes = range(10, 100)
    heatmap_cells = {issuer_code: dict.fromkeys(activity_codes, 0) for issuer_code in activity_codes}
    # Joined-row count per issuer code. It is not read again in the visible cells.
    dd_code_hist = dict.fromkeys(activity_codes, 0)
    for joined_row in code_versus_code:
        issuer_code = int(joined_row[0])
        dd_code_hist[issuer_code] += 1
        for receiver_code in joined_row[4]:
            heatmap_cells[issuer_code][int(receiver_code)] += joined_row[2]

    # Flatten the matrix into one record per (issuer code, receiver code) holding
    # the share of the issuer code's total that went to that receiver code.
    heatmap_list = []
    for issuer_code, receiver_counts in heatmap_cells.items():
        # max(1, ...) avoids dividing by zero for issuer codes with no invoices.
        row_total = max(1, np.sum(list(receiver_counts.values())))
        for receiver_code, invoice_count in receiver_counts.items():
            # NOTE(review): this keeps only receiver code 10 out of 10..99, so
            # every issuer code yields a single record — confirm the
            # restriction is intended.
            if receiver_code >= 11:
                continue
            heatmap_list.append({
                "issuer_activity_code": issuer_code,
                "receiver_activity_code": receiver_code,
                "fraction": float(invoice_count / row_total),
            })

with tracer.span('Saving heatmap to ADLS'):
    heatmap_df = spark.createDataFrame(heatmap_list)
    # NOTE(review): "header" is a CSV option and presumably has no effect on
    # Parquet output — confirm before removing.
    (
        heatmap_df.write
        .mode("overwrite")
        .option("header", "true")
        .format('parquet')
        .save(heatmap_path)
    )

In [ ]:
with tracer.span('Creating SQL table for heatmap'):
    # heatmap_path is split on '/'. Element 2 is expected to look like
    # "<container>@<host>" and elements 3 onwards form the folder path
    # (presumably an abfss:// style URI — confirm).
    path_parts = heatmap_path.split('/')
    # Table name: <first folder>_<part of element 2 before '@'>_<second folder>.
    table_name = path_parts[3] + '_' + path_parts[2].split('@')[0] + '_' + path_parts[4]
    schema_string = generate_schema_string(heatmap_df)
    drop_table_command = f"DROP EXTERNAL TABLE [{table_name}]"
    # LOCATION is the path with the scheme and container/host parts removed.
    location = "/".join(path_parts[3:])
    df_sql_command = f"CREATE EXTERNAL TABLE [{table_name}] ({schema_string}) WITH (LOCATION = '{location}/**', DATA_SOURCE = [output_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], FILE_FORMAT = [SynapseParquetFormat])"
    connection_string = 'DRIVER='+driver+';SERVER=tcp:'+serverless_sql_endpoint+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD='+ sql_user_pwd
    with pyodbc.connect(connection_string) as conn:
        with conn.cursor() as cursor:
            try:
                cursor.execute(drop_table_command)
            except pyodbc.Error as drop_error:
                # Best effort: the drop presumably fails when the table does not
                # exist yet (e.g. first run). Only database errors are tolerated;
                # anything else (including KeyboardInterrupt) now propagates.
                logger.info(
                    "DROP EXTERNAL TABLE [%s] skipped: %s",
                    table_name,
                    drop_error,
                    extra=run_time_parameters,
                )
            cursor.execute(df_sql_command)