In [1]:
%%configure -f
{
    "numExecutors": 12,
    "driverMemory": "20g"
}

In [2]:
import pyspark.sql.functions as f
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.ml.feature import OneHotEncoder, Normalizer, MinMaxScaler
from pyspark.ml.linalg import SparseVector, DenseVector, Vectors, VectorUDT
import numpy as np



Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1673908281065_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [78]:
@f.udf(returnType=StringType())
def resolve_sector_name(cell_name):
    """
    resolve cell name to match with EDS event data
    """
    try:
        split_parts = cell_name.split("_")
        m2000id = split_parts[0]
        site_name = split_parts[1]
        cell_number = split_parts[3]
        return f"{m2000id}_{site_name}_{cell_number}"
    except Exception as _:
        return cell_name
@f.udf(returnType=VectorUDT())
def sparse_vector_sum(vectors):
    res = None
    for vec in vectors:
        if res is None:
            res = vec
        else:
            res = np.add(vec, res)
    return SparseVector(len(res), {k: v for k, v in enumerate(res) if v != 0})

@f.udf(returnType=ArrayType(FloatType()))
def to_dense(sparse_vector):
    v = DenseVector(sparse_vector)
    feature = [float(x) for x in v]
    feature.append(0.0)  # adding util placeholder
    return feature

@f.udf(returnType=ArrayType(FloatType()))
def to_dense_flat(sparse_vector):
    v = DenseVector(sparse_vector)
    feature = [float(x) for x in v]
    return feature

@f.udf(returnType=ArrayType(ArrayType(StringType())))
def construct_nodes(edge_input):
    """
    node types:
    0 - user
    1 - cell
    2 - sector
    """
    node_list = []
    for a in edge_input:
        if [a[0], 0] not in node_list:
            node_list.append([a[0], 0])
        if [a[1], 1] not in node_list:
            node_list.append([a[1], 1])
        if [a[2], 2] not in node_list:
            node_list.append([a[2], 2])
    return node_list

@f.udf(returnType=VectorUDT())
def add_sparse31(v1, val):
    v1d = dict(zip(v1.indices, v1.values))
    new_idx = v1.size
    v1d[new_idx] = val
    return SparseVector(new_idx+1, {k: v for k, v in v1d.items() if v != 0.0})

@f.udf(returnType=ArrayType(ArrayType(StringType())))
def construct_edge_index(edge_input):

    edge_index = []
    for a in edge_input:
        if [a[0], a[1]] not in edge_index:
            edge_index.append([a[0], a[1]])
        if [a[1], a[2]] not in edge_index:
            edge_index.append([a[1], a[2]])
    return edge_index

@f.udf(returnType=ArrayType(ArrayType(IntegerType())))
def construct_node_types(node_list):
    node_types = [[] for i in range(3)]
    for ntype, nid in node_list:
        node_types[int(ntype)].append(nid)
    return node_types
        

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Event Data
eventdf = spark.read.parquet('s3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/tokenised_ebmr_sample/dt=2023-01-10/')

# CEI Data
cei_df = spark.read.parquet('s3://kl-prod-bi-analy-data-extract/features/kl-prod/Mobility/customer_exp_index_v3/part_dt=2023-01-11/')
cei_df =cei_df.withColumnRenamed('subs_id', 'Subs_Id')

# Cell INFO
@f.udf(returnType=StringType())
def resolve_cell_name(cell_name):
    """
    resolve cell name to match with EDS event data
    """
    try:
        split_parts = cell_name.split("_")
        m2000id = split_parts[0]
        site_name = split_parts[1]
        technology = split_parts[3]
        cell_number = split_parts[4]
        return f"{m2000id}_{site_name}_{technology}_{cell_number}"
    except Exception as _:
        return cell_name

locglbl = spark.read.parquet(
    's3://kl-prod-bi-analy-data-extract/features/kl-prod/ICE/EDS_locglbl_v3_extract/part_dt=2023-01-11/'
)
cell_lookup = spark.read.parquet(
    's3://kl-prod-bi-analy-data-extract/features/kl-prod/ICE/TD_cell_look_up/part_dt=2023-01-11/'
)
site_df = spark.read.parquet(
    's3://kl-prod-bi-analy-data-extract/features/kl-prod/ICE/TD_site_table/part_dt=2023-01-11/'
)

cell_info = (
    locglbl.join(
        cell_lookup.filter(
            f.col("Radio_Type_Cd").isin(["L", "U", "N"])
        ),  # ignore 3G cells. ie. 'U'. Ignores 5G for test. ie 'N'
        on=locglbl.glbl_loc_id == cell_lookup.Glbl_loc_Id,
        how="inner",
    )
    .withColumn("m2000_id", f.split(f.col("Cell_Name"), "_").getItem(0))
    .join(
        site_df,  # .filter(f.col("sa6_state_nm") == sa6_nm),
        on="m2000_id",
        how="inner",
    )
    .withColumn("cell_name_adjusted", resolve_cell_name(f.col("Cell_Name")))
    .select(
        locglbl.glbl_loc,
        locglbl.glbl_loc_id,
        f.col("cell_name_adjusted").alias("cell_name"),
        locglbl.mcc,
        locglbl.mnc,
        site_df.state
    )
)
cell_info

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[glbl_loc: string, glbl_loc_id: string, cell_name: string, mcc: string, mnc: string, state: string]

In [28]:
cell_info = cell_info.withColumn('m2000id', f.split('cell_name', '_').getItem(0).cast('string'))
cell_info = cell_info.withColumn('sector_name', resolve_sector_name(f.col('cell_name')))
cell_info.persist()
cell_info.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------------+--------------------------+-----------------------------+---+---+---------------+-------+------------------------+
|glbl_loc                  |glbl_loc_id               |cell_name                    |mcc|mnc|state          |m2000id|sector_name             |
+--------------------------+--------------------------+-----------------------------+---+---+---------------+-------+------------------------+
|U_505_03_00345__7264_     |U_505_03_00345_07264_07264|303455_MillPark_U21B_1       |505|03 |Victoria       |303455 |303455_MillPark_1       |
|L_505_03__20282__006112278|L_505_03_20282_|_006112278|201593_CampsieNorth_L18B_2   |505|03 |New South Wales|201593 |201593_CampsieNorth_2   |
|L_505_03__30345__8471850  |L_505_03_30345_|_008471850|338598_Greensborough_L21H_2  |505|03 |Victoria       |338598 |338598_Greensborough_2  |
|L_505_03__30338__7707966  |L_505_03_30338_|_007707966|303039_DocklandsStadm_L18A_12|505|03 |Victoria       |303039 |303039_DocklandsStadm_12|

In [21]:
eventdf = eventdf.withColumn('hour', f.hour('event_timestamp'))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# site timezone
site_tz = spark.read.csv('s3://kl-prod-bi-analy-data-extract/data/kl-prod/site_timezone/part_dt=2023-01-11/', header=True)
site_tz.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+---------------+--------+
|M2000_ID|State          |timezone|
+--------+---------------+--------+
|202096  |New South Wales|null    |
|444470  |Queensland     |null    |
|202670  |New South Wales|null    |
|404682  |Queensland     |null    |
|220130  |New South Wales|null    |
+--------+---------------+--------+
only showing top 5 rows

In [27]:
utilisation = spark.read.parquet('s3://kl-prod-bi-analy-data-extract/features/kl-prod/ICE/RAN_util_hr_multibeam/part_dt=2023-01-11/')
utilisation = utilisation.withColumn('hour', f.hour('starttime'))
utilisation.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+------------------------+---------+--------+------------------+------------------------+----------------------------------+------------------+----------------------+--------------------+----------------------+-------------+-------------+-----------+-----------+----+
|starttime          |sector_name             |multibeam|m2000_id|prb_dl_util_pct   |weighted_prb_dl_util_pct|prb_dl_util_pct_multibeam_weighted|utilisation       |chmeas_prb_dl_used_avg|mean_prb_avail_pdsch|traf_activeuser_dl_avg|thrp_dl_qci_8|thrp_ul_qci_8|dlpayload  |ulpayload  |hour|
+-------------------+------------------------+---------+--------+------------------+------------------------+----------------------------------+------------------+----------------------+--------------------+----------------------+-------------+-------------+-----------+-----------+----+
|2023-01-10 00:00:00|200027_MLCCentre_1      |0        |200027  |1.5300000000000002|1.5300000000000002      |1.5300000000000002         

## Get Valid Subs for 1 hour

In [34]:
eventdf.filter('hour = 19').count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

145659782

In [49]:
flat_event = eventdf.filter('hour = 19').join(
    cell_info,
    on=eventdf.loc_info==cell_info.glbl_loc
).drop('glbl_loc', 'glbl_loc_id', 'loc_info', 'mcc', 'mnc').join(
    utilisation,
    on=['hour', 'sector_name']
)
flat_event.persist()
flat_event.count()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

53386050

In [50]:
flat_event = flat_event.select(
    'hour', 'Subs_Id', 'cell_name', 'sector_name', 'event_type_cd', 'utilisation'
).filter('utilisation IS NOT NULL AND Subs_Id IS NOT NULL AND length(Subs_Id) < 10')

flat_event.show(20, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+---------+-------------------------+--------------------+-------------+-----------------+
|hour|Subs_Id  |cell_name                |sector_name         |event_type_cd|utilisation      |
+----+---------+-------------------------+--------------------+-------------+-----------------+
|19  |389246765|227885_CurransHill_L21C_2|227885_CurransHill_2|13           |60.98153846153846|
|19  |389246765|227885_CurransHill_L21C_2|227885_CurransHill_2|13           |60.98153846153846|
|19  |389246765|227885_CurransHill_L21H_2|227885_CurransHill_2|13           |60.98153846153846|
|19  |389246765|227885_CurransHill_L21H_2|227885_CurransHill_2|13           |60.98153846153846|
|19  |389246765|227885_CurransHill_L21H_2|227885_CurransHill_2|13           |60.98153846153846|
|19  |389246765|227885_CurransHill_L21H_2|227885_CurransHill_2|13           |60.98153846153846|
|19  |389246765|227885_CurransHill_L21H_2|227885_CurransHill_2|13           |60.98153846153846|
|19  |389246765|227885_CurransHill_L21H_

In [51]:
flat_event.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

37595869

In [52]:
flat_event = flat_event.withColumn('event_type_cd', f.col('event_type_cd').cast('int'))
encoder = OneHotEncoder(inputCol='event_type_cd', outputCol='event_type_encode')
flat_event = encoder.fit(flat_event).transform(flat_event).groupBy(
    'Subs_Id', 'cell_name', 'sector_name'
).agg(
    sparse_vector_sum(f.collect_list('event_type_encode')).alias('event_code_feature'),
    f.max('utilisation').alias('utilisation')
)
flat_event.persist()
flat_event.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+--------------------+--------------------+------------------+
|  Subs_Id|           cell_name|         sector_name|  event_code_feature|       utilisation|
+---------+--------------------+--------------------+--------------------+------------------+
|279224204|477187_NerangWest...| 477187_NerangWest_2|(30,[7,12,13],[23...| 42.60727272727272|
|280136956|477089_KurabyNort...|477089_KurabyNorth_2|     (30,[13],[6.0])|            64.604|
|280146139|688043_OsbPkEast_...|  688043_OsbPkEast_3|      (30,[7],[1.0])| 61.35272727272727|
|280165254|688078_Balcatta_L...|   688078_Balcatta_1|     (30,[13],[1.0])|             79.08|
|280166981|558006_Lewiston_L...|   558006_Lewiston_2|(30,[7,13],[2.0,1...|             38.36|
|280170043|201803_Penhust594...|201803_Penhust594...|      (30,[8],[2.0])|             97.82|
|280183720|202922_Bogangar_L...|   202922_Bogangar_1|(30,[7,12,13],[3....|             87.02|
|280191478|227658_Wareemba_L...|   227658_Wareemba_3|(30,[9,

In [53]:
flat_event.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

6370738

In [59]:
flat_event.write.parquet('s3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/dataset_hr/flat_event/')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [62]:
# subs event
subs_feature = flat_event.groupBy('Subs_Id').agg(
    sparse_vector_sum(f.collect_list('event_code_feature')).alias('event_code_feature')
)
subs_feature.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+
|  Subs_Id|  event_code_feature|
+---------+--------------------+
|279180139|     (30,[13],[1.0])|
|279227892|(30,[7,8,12,13],[...|
|279269062|(30,[12,13],[1.0,...|
|279577183|    (30,[13],[42.0])|
|279805437|(30,[6,7,8,9,10,1...|
|280137083|(30,[9,10,13,16],...|
|280137356|     (30,[13],[5.0])|
|280137993|(30,[7,13],[10.0,...|
|280138236|(30,[2,4,7,13,15]...|
|280139073|(30,[7,8,9,10,11,...|
|280139131|(30,[7,13],[7.0,4...|
|280140644|     (30,[13],[8.0])|
|280141119|      (30,[7],[2.0])|
|280142043|(30,[8,13],[1.0,3...|
|280145283|     (30,[13],[2.0])|
|280145900|     (30,[13],[5.0])|
|280148028|     (30,[13],[1.0])|
|280149147|    (30,[13],[82.0])|
|280153704|    (30,[13],[81.0])|
|280154233|     (30,[15],[1.0])|
+---------+--------------------+
only showing top 20 rows

In [61]:
# subs event
cell_feature = flat_event.groupBy('cell_name', 'Subs_Id').agg(
    sparse_vector_sum(f.collect_list('event_code_feature')).alias('event_code_feature')
)
cell_feature.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------+--------------------+
|           cell_name|  Subs_Id|  event_code_feature|
+--------------------+---------+--------------------+
|200027_MLCCentre_...|364998406|      (30,[7],[1.0])|
|200027_MLCCentre_...|385444631|(30,[9,10,13,16],...|
|200027_MLCCentre_...|385971490|     (30,[13],[2.0])|
|200027_MLCCentre_...|387116334|(30,[6,7,8,10,13,...|
|200027_MLCCentre_...|390199932|(30,[7,13],[8.0,5...|
|200028_AustraliaS...|362085952|      (30,[8],[2.0])|
|200028_AustraliaS...|375554753|(30,[7,13],[1.0,2...|
|200031_ChifleyTwr...|378050132|      (30,[7],[1.0])|
|200031_ChifleyTwr...|390449224|(30,[7,8,9,10,13]...|
|200032_GrosvenorP...|388292129|      (30,[7],[1.0])|
|200032_GrosvenorP...|388891740|(30,[7,9,10,13,16...|
|200032_GrosvenorP...|385268601|      (30,[7],[1.0])|
|200033_Gateway_L0...|390609488|    (30,[13],[19.0])|
|200033_Gateway_L1...|362841800|(30,[8,11,13,16],...|
|200033_Gateway_L1...|375411567|(30,[7,13],[3.0,1...|
|200033_Gateway_U0...|384371

In [58]:
sector_feature = flat_event.groupBy('sector_name', 'Subs_Id').agg(
    sparse_vector_sum(f.collect_list('event_code_feature')).alias('event_code_feature'),
    f.max('utilisation').alias('utilisation')
)
sector_feature.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------+--------------------+------------------+
|         sector_name|  Subs_Id|  event_code_feature|       utilisation|
+--------------------+---------+--------------------+------------------+
|  200027_MLCCentre_1|371462724|(30,[8,11,12,13],...|             8.685|
|  200027_MLCCentre_1|386167527|     (30,[13],[1.0])|             8.685|
|  200027_MLCCentre_1|389514751|    (30,[13],[34.0])|             8.685|
|  200027_MLCCentre_1|390629170|(30,[2,4,6,7,8,11...|             8.685|
|  200027_MLCCentre_1|391192847|(30,[2,4,6,7,8,11...|             8.685|
|200028_AustraliaS...|388293965|(30,[2,15],[1.0,1...|             10.74|
| 200031_ChifleyTwr_1|283460450|(30,[7,8],[1.0,10...|              9.56|
| 200031_ChifleyTwr_1|373187164|(30,[7,8,11,12,13...|              9.56|
| 200031_ChifleyTwr_1|378893831|(30,[7,8,13],[2.0...|              9.56|
|200032_GrosvenorP...|364407217|(30,[7,9,10,13],[...|             2.475|
|    200033_Gateway_2|383954442|(30,[2,4,15],[4.0..

In [79]:
import numpy as np
@f.udf(returnType=FloatType())
def sum_vector(v):
    s = 0.0
    for i in v.values:
        s += i
    return float(s)

@f.udf(returnType=FloatType())
def irregular_event_pct(v):
    vd = dict(zip(v.indices, v.values))
    irregular_count = 0
    regular_count = 0
    for k, v in vd.items():
        if k in [13, 7, 22, 26, 30, 23, 24, 25]:
            regular_count += v
        else:
            irregular_count += v
    return float(1.0 * irregular_count / (irregular_count + regular_count))
            
    

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [75]:
# create cei label
weighted_sector_utilisation = sector_feature.join(
    subs_feature.withColumn('event_count', sum_vector('event_code_feature')).drop('event_code_feature'),
    on='Subs_Id'
).withColumn(
    'sector_event_count', sum_vector('event_code_feature')
).withColumn(
    'sector_usage_pct', 1.0 * f.col('sector_event_count') / f.col('event_count') * f.col('utilisation')
).groupBy('Subs_Id').agg(f.sum('sector_usage_pct').alias('weighted_sector_utilisation'))
weighted_sector_utilisation.persist()
weighted_sector_utilisation.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------------------------+
|  Subs_Id|weighted_sector_utilisation|
+---------+---------------------------+
|341898685|         15.290703296703297|
|345173688|          57.86940055623389|
|387140283|          48.09715315315316|
|375626765|          73.88402777777779|
|390246017|          77.93041666666667|
|385466902|          72.17905183946489|
|387919448|          50.92038013268783|
|376321824|          75.37409050719397|
|368606061|                     87.015|
|375840354|          64.21046153846154|
|389916995|         28.096140778140786|
|386545558|          81.58851851851853|
|391167934|          64.65333333333334|
|365535524|                       4.72|
|391682411|          55.59076923076923|
|388730789|          52.69576923076923|
|388473108|          50.57230769230769|
|373276124|          57.32718481518482|
|390480490|          18.85333333333333|
|378341527|          58.14940828402367|
+---------+---------------------------+
only showing top 20 rows

In [82]:
cei_df = subs_feature.withColumn('irregular_pct', irregular_event_pct('event_code_feature')).join(
    weighted_sector_utilisation,
    on='Subs_Id'
).withColumn('cei', 0.7 * (100 - 100 * f.col('irregular_pct')) + 0.3 * (100 - f.col('weighted_sector_utilisation')))

cei_df.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+----------------------------------+-------------+---------------------------+-----------------+
|Subs_Id  |event_code_feature                |irregular_pct|weighted_sector_utilisation|cei              |
+---------+----------------------------------+-------------+---------------------------+-----------------+
|341898685|(30,[7,8,13],[4.0,3.0,7.0])       |0.21428572   |15.290703296703297         |80.4127874851101 |
|345173688|(30,[7,8,13],[44.0,9.0,1.0])      |0.16666667   |57.86940055623389          |70.97250960607904|
|387140283|(30,[7,8,13],[13.0,7.0,17.0])     |0.1891892    |48.09715315315316          |72.32761369760874|
|375626765|(30,[7,8,11,13],[4.0,6.0,1.0,1.0])|0.5833333    |73.88402777777779          |37.00145922342936|
|390246017|(30,[7,8,13],[15.0,21.0,4.0])     |0.525        |77.93041666666667          |39.87087767028808|
+---------+----------------------------------+-------------+---------------------------+-----------------+
only showing top 5 rows

In [83]:
cei_df.write.parquet('s3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/dataset_hr/cei/')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [84]:
subs_feature.show(5, False)
cell_feature.show(5, False)
sector_feature.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------------------------------------------------------------+
|Subs_Id  |event_code_feature                                             |
+---------+---------------------------------------------------------------+
|279180139|(30,[13],[1.0])                                                |
|279227892|(30,[7,8,12,13],[2.0,1.0,1.0,13.0])                            |
|279269062|(30,[12,13],[1.0,2.0])                                         |
|279577183|(30,[13],[42.0])                                               |
|279805437|(30,[6,7,8,9,10,11,13,16],[3.0,3.0,49.0,3.0,2.0,1.0,10.0,28.0])|
+---------+---------------------------------------------------------------+
only showing top 5 rows

+-----------------------+---------+-----------------------------------------------+
|cell_name              |Subs_Id  |event_code_feature                             |
+-----------------------+---------+-----------------------------------------------+
|200027_MLCCentre_L08A_1|364998406|(30,

In [87]:
# write node overview
flat_event.groupBy('Subs_Id').agg(
    construct_nodes(
            f.collect_list(f.struct('Subs_Id', 'cell_name', 'sector_name'))
        ).alias('node_list')
).withColumn(
    'node_id_type', f.explode('node_list')
).drop('node_list').withColumn(
    'node_name', f.col('node_id_type')[0]
).withColumn(
    'node_type', f.col('node_id_type')[1]
).drop('node_id_type').withColumn(
    'node_id', f.row_number().over(Window.partitionBy('Subs_Id').orderBy('node_type')) - 1
).write.mode('overwrite').parquet('s3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/dataset_hr/node_overview/')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [91]:
normaler = Normalizer(inputCol='event_code_feature', outputCol='feature_norm', p=1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [92]:
subs_feature = normaler.transform(subs_feature)
cell_feature = normaler.transform(cell_feature)
sector_feature = normaler.transform(sector_feature)
subs_feature.show(5, False)
cell_feature.show(5, False)
sector_feature.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Subs_Id  |event_code_feature                                             |feature_norm                                                                                                                                                                                  |
+---------+---------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|279180139|(30,[13],[1.0])                                                |(30,[13],[1.0])                                                                                                             

In [93]:
sector_feature = sector_feature.withColumn('utilisation', f.col('utilisation')/100.0)
subs_feature = subs_feature.withColumn('util_vec', add_sparse31(f.col('feature_norm'), f.lit(0.0)))
cell_feature = cell_feature.withColumn('util_vec', add_sparse31(f.col('feature_norm'), f.lit(0.0)))
sector_feature = sector_feature.withColumn('util_vec', add_sparse31(f.col('feature_norm'), f.col('utilisation')))

subs_feature.show(5, False)
cell_feature.show(5, False)
sector_feature.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Subs_Id  |event_code_feature                                             |feature_norm                                                                                                                                                                                  |util_vec                                                                                                                                                                                      |
+---------+---------------------------------------------------------------+---------

In [95]:
node_feature = subs_feature.select(
    f.col('Subs_Id').alias('trace_id'), f.col('Subs_Id').alias('node_name'), 'util_vec'
).union(cell_feature.select(
    f.col('Subs_Id').alias('trace_id'), f.col('cell_name').alias('node_name'), 'util_vec'
)).union(sector_feature.select(
    f.col('Subs_Id').alias('trace_id'), f.col('sector_name').alias('node_name'), 'util_vec'
))
node_feature.persist()
node_feature.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------+--------------------+
| trace_id|node_name|            util_vec|
+---------+---------+--------------------+
|280155403|280155403|(31,[7,13],[0.028...|
|280166904|280166904|(31,[8,13],[0.090...|
|280180096|280180096|     (31,[13],[1.0])|
|280199973|280199973|     (31,[13],[1.0])|
|280238701|280238701|     (31,[13],[1.0])|
|280244365|280244365|(31,[7,13],[0.333...|
|280257816|280257816|(31,[7,13],[0.5,0...|
|280300968|280300968|     (31,[13],[1.0])|
|280330831|280330831|     (31,[13],[1.0])|
|280346661|280346661|     (31,[13],[1.0])|
|280358968|280358968|(31,[7,8,11,13,15...|
|280387876|280387876|     (31,[13],[1.0])|
|280406521|280406521|     (31,[13],[1.0])|
|280406628|280406628|(31,[7,8,13],[0.4...|
|280411512|280411512|(31,[2,6,7,8,11,1...|
|280424708|280424708|(31,[11,12,13],[0...|
|280427282|280427282|(31,[7,8,13],[0.0...|
|280456481|280456481|(31,[7,12,13],[0....|
|280492014|280492014|     (31,[13],[1.0])|
|280545534|280545534|(31,[7,9,10,13,16...|
+---------+

In [96]:
node_feature.write.parquet('s3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/dataset_hr/node_feature/')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Edge Index

In [98]:
edge_index = flat_event.groupBy('Subs_Id').agg(
        construct_edge_index(
            f.collect_list(f.struct('Subs_Id', 'cell_name', 'sector_name'))
        ).alias('edge_index_list')
).withColumn(
    'edge_pair', f.explode('edge_index_list')
).drop('edge_index_list').withColumn(
    'src_name', f.col('edge_pair')[0]
).withColumn(
    'dst_name', f.col('edge_pair')[1]
).drop('edge_pair')

edge_index.persist()
edge_index.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+--------------------+
|  Subs_Id|            src_name|            dst_name|
+---------+--------------------+--------------------+
|280155403|           280155403|588264_MorphettVa...|
|280155403|588264_MorphettVa...|588264_MorphettVa...|
|280155403|           280155403|588264_MorphettVa...|
|280155403|588264_MorphettVa...|588264_MorphettVa...|
|280155403|           280155403|588264_MorphettVa...|
|280155403|588264_MorphettVa...|588264_MorphettVa...|
|280166904|           280166904|606215_Dunsboroug...|
|280166904|606215_Dunsboroug...|606215_Dunsborough_1|
|280166904|           280166904|606215_Dunsboroug...|
|280166904|606215_Dunsboroug...|606215_Dunsborough_1|
|280180096|           280180096|640125_Anketell_L...|
|280180096|640125_Anketell_L...|   640125_Anketell_2|
|280180096|           280180096|640125_Anketell_L...|
|280180096|640125_Anketell_L...|   640125_Anketell_2|
|280180096|           280180096|640125_Anketell_L...|
|280180096|640125_Anketell_L

In [101]:
edge_index.write.parquet('s3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/dataset_hr/edge_index/')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [100]:
edge_index.dropDuplicates().count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

12741476

In [103]:
cei_df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+-------------+---------------------------+------------------+
|  Subs_Id|  event_code_feature|irregular_pct|weighted_sector_utilisation|               cei|
+---------+--------------------+-------------+---------------------------+------------------+
|279180139|     (30,[13],[1.0])|          0.0|                     31.024|           90.6928|
|279227892|(30,[7,8,12,13],[...|   0.11764706|         24.946176470588235| 84.28085042796415|
|279269062|(30,[12,13],[1.0,...|   0.33333334|                     94.656| 48.26986488647461|
|279577183|    (30,[13],[42.0])|          0.0|                      43.56|            86.932|
|279805437|(30,[6,7,8,9,10,1...|   0.86868685|          57.07272727272726|22.070103815252132|
|280137083|(30,[9,10,13,16],...|    0.5833333|                      94.11|30.933667556762693|
|280137356|     (30,[13],[5.0])|          0.0|          50.88727272727273| 84.73381818181818|
|280137993|(30,[7,13],[10.0,...|          0.0|         52.42

## Prep 

In [10]:
root_dir = 's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/dataset_hr/'
output_csv_dir = 's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/dataset_hr/'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
cei = spark.read.parquet('s3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/dataset_hr/cei/')
cei.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+------------------------------------------------------------------+-------------+---------------------------+------------------+
|Subs_Id  |event_code_feature                                                |irregular_pct|weighted_sector_utilisation|cei               |
+---------+------------------------------------------------------------------+-------------+---------------------------+------------------+
|361100902|(30,[2,4,6,7,8,11,13,15,16],[2.0,1.0,2.0,5.0,9.0,2.0,5.0,4.0,1.0])|0.67741936   |45.87325062034739          |38.818669802909454|
|371481963|(30,[7,13],[1.0,1.0])                                             |0.0          |7.443636363636363          |97.7669090909091  |
|391311988|(30,[7,8,11,13],[5.0,2.0,1.0,4.0])                                |0.25         |37.19428095053095          |71.34171571484072 |
|355568500|(30,[7,8,9,11,13,16],[36.0,13.0,1.0,1.0,18.0,1.0])                |0.22857143   |53.74485215810884          |67.87654130080954 |
|364768390|(30,[7,8,

In [5]:
node_overview = f'{root_dir}/node_overview/'
node_feature = f'{root_dir}/node_feature/'
flat_event = f'{root_dir}flat_event/'
edge_index = f'{root_dir}edge_index/'

node_overview = spark.read.parquet(node_overview)
node_feature = spark.read.parquet(node_feature)
flat_event = spark.read.parquet(flat_event)
edge_index = spark.read.parquet(edge_index)

node_overview.show(5, False)
node_feature.show(5, False)
flat_event.show(5, False)
edge_index.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+------------------------+---------+-------+
|Subs_Id  |node_name               |node_type|node_id|
+---------+------------------------+---------+-------+
|279258404|279258404               |0        |0      |
|279258404|201823_GundarooDr_L18P_2|1        |1      |
|279258404|227292_Ngunnawal_L21H_2 |1        |2      |
|279258404|201823_GundarooDr_L21H_2|1        |3      |
|279258404|201823_GundarooDr_2     |2        |4      |
+---------+------------------------+---------+-------+
only showing top 5 rows

+---------+---------------------------+-----------------------------------------------------------------------------------------------------------------------+
|trace_id |node_name                  |util_vec                                                                                                               |
+---------+---------------------------+-----------------------------------------------------------------------------------------------------------------------+


### Edge Index CSV

In [7]:
node_overview.select('node_type').dropDuplicates().show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+
|node_type|
+---------+
|        0|
|        1|
|        2|
+---------+

In [8]:
node_overview = node_overview.withColumnRenamed('Subs_Id', 'trace_id')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
csv_edge_index = edge_index.withColumnRenamed('Subs_Id', 'trace_id').join(
    node_overview.select(
        'trace_id', 'node_name', 'node_id').withColumnRenamed(
        'node_name', 'src_name').withColumnRenamed('node_id', 'src_id'),
    on=['trace_id', 'src_name'],
    how='inner'
).join(node_overview.select(
        'trace_id', 'node_name', 'node_id').withColumnRenamed(
        'node_name', 'dst_name').withColumnRenamed('node_id', 'dst_id'),
    on=['trace_id', 'dst_name'],
    how='inner'
)
csv_edge_index.persist()
csv_edge_index.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+--------------------+------+------+
| trace_id|            dst_name|            src_name|src_id|dst_id|
+---------+--------------------+--------------------+------+------+
|280142812|201566_DeeWhySout...|           280142812|     0|     3|
|280142812|201566_DeeWhySouth_2|201566_DeeWhySout...|     3|     4|
|280142812|201566_DeeWhySout...|           280142812|     0|     1|
|280142812|201566_DeeWhySouth_2|201566_DeeWhySout...|     1|     4|
|280142812|201566_DeeWhySout...|           280142812|     0|     2|
|280142812|201566_DeeWhySouth_2|201566_DeeWhySout...|     2|     4|
|280147067|660002_BurswdEntC...|           280147067|     0|     1|
|280147067|660002_BurswdEntC...|660002_BurswdEntC...|     1|     2|
|280158936|477186_AshmoreNer...|           280158936|     0|     1|
|280158936|477186_AshmoreNer...|477186_AshmoreNer...|     1|     2|
|280248981|202525_Leumeah_L1...|           280248981|     0|     5|
|280248981|    202525_Leumeah_2|202525_Leumeah_L

In [20]:
csv_edge_index.write.mode('overwrite').csv(f'{output_csv_dir}/edge_index/', header=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Node Features

In [13]:
node_feature.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------------------------+-----------------------------------------------------------------------------------------------------------------------+
|trace_id |node_name                  |util_vec                                                                                                               |
+---------+---------------------------+-----------------------------------------------------------------------------------------------------------------------+
|391202425|200031_ChifleyTwr_L18A_1   |(31,[7,9,10,13,16],[0.5555555555555556,0.05555555555555555,0.05555555555555555,0.2777777777777778,0.05555555555555555])|
|389811731|200040_MacquarieShop_L08A_2|(31,[7],[1.0])                                                                                                         |
|389793371|200040_MacquarieShop_L18A_1|(31,[13],[1.0])                                                                                                        |
|375827306|200040_MacquarieShop_L18B_1|(

In [15]:
node_feature = node_feature.withColumn(
    'feature_list', to_dense_flat(f.col('util_vec'))
)

for idx in range(31):
    node_feature = node_feature.withColumn(f'f_{idx}', f.col('feature_list')[idx])
node_feature.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------------------------+-----------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---+---+---+---+---+---+---+---------+---+-----------+-----------+----+----+---------+----+----+-----------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|trace_id |node_name                  |util_vec                                                                                                               |feature_list                                                                                                                                                                                   |f_0|f_1|f_2|f_3|f_4|f_5|f_6|f_7      |f_8|f_9        |f_10       |f_11|f_12|f_13     |f_14|f_15|f_16       |f_17|f_18|f

In [18]:
csv_node_feature = node_feature.join(
    node_overview.select('trace_id', 'node_name', 'node_id'),
    on=['trace_id', 'node_name']
).drop('util_vec', 'feature_list', 'node_name')

csv_node_feature.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---+---+---+---+---+---+---+---------+---+-----------+-----------+----+----+---------+----+----+-----------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|trace_id |f_0|f_1|f_2|f_3|f_4|f_5|f_6|f_7      |f_8|f_9        |f_10       |f_11|f_12|f_13     |f_14|f_15|f_16       |f_17|f_18|f_19|f_20|f_21|f_22|f_23|f_24|f_25|f_26|f_27|f_28|f_29|f_30|node_id|
+---------+---+---+---+---+---+---+---+---------+---+-----------+-----------+----+----+---------+----+----+-----------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|391202425|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.5555556|0.0|0.055555556|0.055555556|0.0 |0.0 |0.2777778|0.0 |0.0 |0.055555556|0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |4      |
|389811731|0.0|0.0|0.0|0.0|0.0|0.0|0.0|1.0      |0.0|0.0        |0.0        |0.0 |0.0 |0.0      |0.0 |0.0 |0.0        |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |6      |
|389793371

In [72]:
csv_node_feature.write.mode('overwrite').csv(f'{output_csv_dir}/node_feature_norm/', header=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### trace_info

In [23]:
csv_cei = cei.withColumn(
    'trace_bool', f.col('cei') > 70
).withColumnRenamed('Subs_Id', 'trace_id').withColumn(
    'error_trace_type', f.col('trace_bool')
).drop('event_code_feature', 'irregular_pct', 'weighted_sector_utilisation')
csv_cei.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+------------------+----------+----------------+
| trace_id|               cei|trace_bool|error_trace_type|
+---------+------------------+----------+----------------+
|390163328| 82.54266666666666|      true|            true|
|362095265| 7.780984615384614|     false|           false|
|381328738|  84.3061858974359|      true|            true|
|388943505|  75.6978866970807|      true|            true|
|382766889| 83.31732150858849|      true|            true|
|386713217| 68.67036462476506|     false|           false|
|373490918| 80.01532718766478|      true|            true|
|379723115|  74.0327734984268|      true|            true|
|391556428| 81.88011764705882|      true|            true|
|380122769|52.061310989379876|     false|           false|
|391623026| 78.79599999999999|      true|            true|
|373115335| 74.77004950185922|      true|            true|
|384773101| 82.76677358156022|      true|            true|
|385835686| 61.62842485798458|     false|           fals

In [24]:
csv_cei.write.mode('overwrite').csv(f'{output_csv_dir}/trace_info/', header=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Train/Eval/Test

In [25]:
csv_cei.persist()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[trace_id: string, cei: double, trace_bool: boolean, error_trace_type: boolean]

In [27]:
csv_cei.groupBy('cei', 'trace_bool').count().orderBy(f.col('count').desc()).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------+----------+-----+
|              cei|trace_bool|count|
+-----------------+----------+-----+
|76.62307692307692|      true| 1252|
| 75.8236923076923|      true|  805|
|          71.1508|      true|  783|
|79.22153846153846|      true|  777|
|76.32492307692307|      true|  763|
|72.18584615384616|      true|  753|
|75.22553846153846|      true|  752|
|76.09969230769231|      true|  738|
| 73.8796923076923|      true|  728|
| 80.1436923076923|      true|  711|
|76.12369230769231|      true|  700|
|72.03907692307692|      true|  666|
|76.90646153846154|      true|  649|
|           76.752|      true|  648|
|           74.884|      true|  643|
|77.26654545454545|      true|  623|
|78.18584615384616|      true|  622|
| 81.5930909090909|      true|  607|
|           79.084|      true|  604|
|79.33876923076923|      true|  578|
+-----------------+----------+-----+
only showing top 20 rows

In [31]:
csv_cei.groupBy('trace_bool').count().show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+-------+
|trace_bool|  count|
+----------+-------+
|      true|1302700|
|     false| 544927|
+----------+-------+

In [42]:
csv_cei.filter('trace_bool = True').sample(
    fraction=0.6
).withColumn('cei', f.col('cei').cast('int')).groupBy('cei', 'trace_bool').count().orderBy(f.col('count').desc()).show(50, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+----------+-----+
|cei|trace_bool|count|
+---+----------+-----+
|79 |true      |43846|
|78 |true      |42890|
|82 |true      |42478|
|80 |true      |40609|
|81 |true      |40113|
|77 |true      |38431|
|76 |true      |37894|
|84 |true      |37700|
|83 |true      |37635|
|75 |true      |34121|
|85 |true      |33703|
|74 |true      |30344|
|86 |true      |30295|
|87 |true      |28985|
|73 |true      |27861|
|72 |true      |26313|
|88 |true      |25183|
|89 |true      |23131|
|90 |true      |21907|
|71 |true      |21412|
|91 |true      |20499|
|92 |true      |17688|
|93 |true      |16055|
|94 |true      |13808|
|70 |true      |12891|
|95 |true      |11891|
|96 |true      |10243|
|97 |true      |7240 |
|98 |true      |4612 |
|99 |true      |1015 |
+---+----------+-----+

In [43]:
csv_cei.filter('trace_bool = True').withColumn('cei', f.col('cei').cast('int')).groupBy('cei', 'trace_bool').count().orderBy(f.col('count').desc()).show(50, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+----------+-----+
|cei|trace_bool|count|
+---+----------+-----+
|79 |true      |73136|
|78 |true      |71377|
|82 |true      |71236|
|80 |true      |67890|
|81 |true      |66720|
|77 |true      |64072|
|76 |true      |63013|
|83 |true      |62907|
|84 |true      |62733|
|75 |true      |57101|
|85 |true      |56418|
|74 |true      |50666|
|86 |true      |50286|
|87 |true      |48493|
|73 |true      |46450|
|72 |true      |43835|
|88 |true      |42238|
|89 |true      |38652|
|90 |true      |36495|
|71 |true      |35403|
|91 |true      |34176|
|92 |true      |29602|
|93 |true      |26777|
|94 |true      |23219|
|70 |true      |21567|
|95 |true      |19826|
|96 |true      |17012|
|97 |true      |12080|
|98 |true      |7640 |
|99 |true      |1680 |
+---+----------+-----+

In [45]:
train_ids = csv_cei.filter('trace_bool = True').sample(
    fraction=0.6
).select('trace_id')
train_ids.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

781405

In [46]:
rest_trace = csv_cei.join(
    train_ids,
    on='trace_id',
    how='left_anti'
)
rest_trace.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1066222

In [52]:
eval_ids = rest_trace.sample(fraction=0.2).select('trace_id')

eval_ids.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

213659

In [61]:
test_ids = rest_trace.join(
    eval_ids,
    on='trace_id',
    how='left_anti'
)#.sample(fraction=0.5)

test_ids.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

852563

In [62]:
rest_trace.groupBy('trace_bool').count().show()
# eval_ids.groupBy('trace_bool').count().show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+------+
|trace_bool| count|
+----------+------+
|      true|521295|
|     false|544927|
+----------+------+

In [63]:
train_ids.count(), eval_ids.count(), test_ids.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(781405, 213659, 852563)

In [64]:
dataset_ids = train_ids.select('trace_id').union(eval_ids.select('trace_id')).union(test_ids.select('trace_id'))
dataset_ids = dataset_ids.withColumn('index', f.row_number().over(
    Window.orderBy('trace_id')
))

dataset_ids.select(f.max('index')).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+
|max(index)|
+----------+
|   1847627|
+----------+

In [65]:
dataset_ids.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1847627

In [66]:
dataset_ids.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+-----+
| trace_id|index|
+---------+-----+
|279136044|    1|
|279145285|    2|
|279150070|    3|
|279151007|    4|
|279153120|    5|
|279155053|    6|
|279161055|    7|
|279161293|    8|
|279178583|    9|
|279179734|   10|
|279180139|   11|
|279183407|   12|
|279196207|   13|
|279224140|   14|
|279224204|   15|
|279224510|   16|
|279224607|   17|
|279224659|   18|
|279224894|   19|
|279225611|   20|
+---------+-----+
only showing top 20 rows

### Reindex and output

In [112]:
# train/eval/test sets
train_ids.join(
    dataset_ids,
    on='trace_id',
).drop('trace_id').withColumnRenamed('index', 'trace_id').repartition(1).write.mode('overwrite').csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/train/')

eval_ids.join(
    dataset_ids,
    on='trace_id',
).drop('trace_id').withColumnRenamed('index', 'trace_id').repartition(1).write.mode('overwrite').csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/eval/')

test_ids.join(
    dataset_ids,
    on='trace_id',
).drop('trace_id').withColumnRenamed('index', 'trace_id').select('trace_info')..repartition(1).write.mode('overwrite').csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/test/')

# train_ids.repartition(1).write.csv(
#     's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/train/')
# eval_ids.repartition(1).write.csv(
#     's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/eval/')
# test_ids.repartition(1).write.csv(
#     's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/test/')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [109]:
eval_ids.join(
    dataset_ids,
    on='trace_id',
).drop('trace_id').withColumnRenamed('index', 'trace_id').show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+
|trace_id|
+--------+
|       2|
|       4|
|       8|
|      14|
|      42|
|      44|
|      46|
|      52|
|      60|
|      68|
|      82|
|      83|
|      84|
|     102|
|     111|
|     113|
|     115|
|     116|
|     120|
|     129|
+--------+
only showing top 20 rows

In [114]:
test_ids.join(
    dataset_ids,
    on='trace_id',
).drop('trace_id').withColumnRenamed('index', 'trace_id').show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+----------+----------------+--------+
|               cei|trace_bool|error_trace_type|trace_id|
+------------------+----------+----------------+--------+
| 62.31248366873604|     false|           false|       1|
| 6.591996330275229|     false|           false|       3|
| 60.82758796339768|     false|           false|       5|
|           23.9205|     false|           false|       6|
| 80.21753846153847|      true|            true|       7|
| 79.78798418412224|      true|            true|      15|
|           77.9812|      true|            true|      16|
|24.041999999999998|     false|           false|      18|
| 82.68981505779006|      true|            true|      20|
|25.901329772949218|     false|           false|      22|
| 75.48149163879599|      true|            true|      23|
| 81.70553846153847|      true|            true|      24|
| 80.41471709090908|      true|            true|      26|
| 76.90646153846154|      true|            true|      28|
| 48.269864886

In [94]:
csv_node_feature.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- trace_id: string (nullable = true)
 |-- f_0: float (nullable = true)
 |-- f_1: float (nullable = true)
 |-- f_2: float (nullable = true)
 |-- f_3: float (nullable = true)
 |-- f_4: float (nullable = true)
 |-- f_5: float (nullable = true)
 |-- f_6: float (nullable = true)
 |-- f_7: float (nullable = true)
 |-- f_8: float (nullable = true)
 |-- f_9: float (nullable = true)
 |-- f_10: float (nullable = true)
 |-- f_11: float (nullable = true)
 |-- f_12: float (nullable = true)
 |-- f_13: float (nullable = true)
 |-- f_14: float (nullable = true)
 |-- f_15: float (nullable = true)
 |-- f_16: float (nullable = true)
 |-- f_17: float (nullable = true)
 |-- f_18: float (nullable = true)
 |-- f_19: float (nullable = true)
 |-- f_20: float (nullable = true)
 |-- f_21: float (nullable = true)
 |-- f_22: float (nullable = true)
 |-- f_23: float (nullable = true)
 |-- f_24: float (nullable = true)
 |-- f_25: float (nullable = true)
 |-- f_26: float (nullable = true)
 |-- f_27: float (nu

In [95]:
dataset_ids.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- trace_id: string (nullable = true)
 |-- index: integer (nullable = false)

In [91]:
# node feature
csv_node_feature.join(
    dataset_ids,
    on='trace_id'
).drop('trace_id').withColumnRenamed('index', 'trace_id').repartition(1).write.mode('overwrite').csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/node_feature_norm/', header=True
)



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [76]:
# edge index
edge_index_ = spark.read.csv('s3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/dataset_hr/edge_index/', header=True)
edge_index_.show()

edge_index_.join(
    dataset_ids,
    on='trace_id'
).drop('trace_id', 'dst_name', 'src_name').withColumnRenamed('index', 'trace_id').repartition(1).write.csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/edge_index/', header=True
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+--------------------+------+------+
| trace_id|            dst_name|            src_name|src_id|dst_id|
+---------+--------------------+--------------------+------+------+
|280142812|201566_DeeWhySout...|           280142812|     0|     3|
|280142812|201566_DeeWhySouth_2|201566_DeeWhySout...|     3|     4|
|280142812|201566_DeeWhySout...|           280142812|     0|     1|
|280142812|201566_DeeWhySouth_2|201566_DeeWhySout...|     1|     4|
|280142812|201566_DeeWhySout...|           280142812|     0|     2|
|280142812|201566_DeeWhySouth_2|201566_DeeWhySout...|     2|     4|
|280147067|660002_BurswdEntC...|           280147067|     0|     1|
|280147067|660002_BurswdEntC...|660002_BurswdEntC...|     1|     2|
|280158936|477186_AshmoreNer...|           280158936|     0|     1|
|280158936|477186_AshmoreNer...|477186_AshmoreNer...|     1|     2|
|280248981|202525_Leumeah_L1...|           280248981|     0|     5|
|280248981|    202525_Leumeah_2|202525_Leumeah_L

In [79]:
# node types

node_types = node_overview.join(
    dataset_ids,
    on='trace_id'
).drop('trace_id').withColumnRenamed('index', 'trace_id').groupBy('trace_id').agg(
    construct_node_types(
        f.collect_list(f.struct('node_type', 'node_id'))
    ).alias('node_types')
).orderBy('trace_id')

node_types.show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+------------------------+
|trace_id|node_types              |
+--------+------------------------+
|1       |[[0], [1, 2], [3, 4]]   |
|2       |[[0], [1], [2]]         |
|3       |[[0], [1, 2, 3], [4, 5]]|
|4       |[[0], [1], [2]]         |
|5       |[[0], [1, 2], [3]]      |
+--------+------------------------+
only showing top 5 rows

In [87]:
node_types.withColumn('node_types', f.col('node_types').cast('string')).repartition(1).write.mode('overwrite').csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/node_types/')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [88]:
# trace_info
trace_info_ = spark.read.csv('s3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/dataset_hr/trace_info/', header=True)
trace_info_.show()

trace_info_.join(
    dataset_ids,
    on='trace_id'
).drop('trace_id').withColumnRenamed('index', 'trace_id').repartition(1).write.mode('overwrite').csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/trace_info/', header=True)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+------------------+----------+----------------+
| trace_id|               cei|trace_bool|error_trace_type|
+---------+------------------+----------+----------------+
|390163328| 82.54266666666666|      true|            true|
|362095265| 7.780984615384614|     false|           false|
|381328738|  84.3061858974359|      true|            true|
|388943505|  75.6978866970807|      true|            true|
|382766889| 83.31732150858849|      true|            true|
|386713217| 68.67036462476506|     false|           false|
|373490918| 80.01532718766478|      true|            true|
|379723115|  74.0327734984268|      true|            true|
|391556428| 81.88011764705882|      true|            true|
|380122769|52.061310989379876|     false|           false|
|391623026| 78.79599999999999|      true|            true|
|373115335| 74.77004950185922|      true|            true|
|384773101| 82.76677358156022|      true|            true|
|385835686| 61.62842485798458|     false|           fals

In [106]:
# check
spark.read.csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/edge_index/', header=True).show()

spark.read.csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/node_feature_norm/', header=True).show()

spark.read.csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/trace_info/', header=True).show()

spark.read.csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/node_types/', header=False).show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------+------+--------+
|src_id|dst_id|trace_id|
+------+------+--------+
|     0|     1|     131|
|     1|     3|     131|
|     0|     2|     131|
|     2|     4|     131|
|     0|     1|     891|
|     1|     3|     891|
|     0|     2|     891|
|     2|     3|     891|
|     0|     1|    1086|
|     1|     2|    1086|
|     0|     2|    1626|
|     2|     7|    1626|
|     0|     4|    1626|
|     4|     7|    1626|
|     0|     1|    1626|
|     1|     6|    1626|
|     0|     3|    1626|
|     3|     7|    1626|
|     0|     5|    1626|
|     5|     8|    1626|
+------+------+--------+
only showing top 20 rows

+---+---+---+---+---+---+---+---------+---+----------+----------+----+----+----------+----+----+----------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+--------+
|f_0|f_1|f_2|f_3|f_4|f_5|f_6|      f_7|f_8|       f_9|      f_10|f_11|f_12|      f_13|f_14|f_15|      f_16|f_17|f_18|f_19|f_20|f_21|f_22|f_23|f_24|f_25|f_26|f_27|f_28|f_29|f_30|no

In [93]:
spark.read.csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/node_feature_norm/',
    header=True).select(f.max('trace_id')).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+
|max(trace_id)|
+-------------+
|       999999|
+-------------+

In [96]:
spark.read.csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/edge_index/',
    header=True).select(f.max('trace_id')).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+
|max(trace_id)|
+-------------+
|       999999|
+-------------+

In [105]:
edge_index_.join(
    dataset_ids,
    on='trace_id'
).drop('trace_id', 'dst_name', 'src_name').withColumnRenamed('index', 'trace_id').repartition(1).write.mode('overwrite').csv(
    's3://kl-prod-tpgt-knowledge-lake-sandpit/TempDir/tmp/case_study/csv/input/edge_index/', header=True
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Check

In [119]:
testdf = cei.join(
    dataset_ids,
    on=cei.Subs_Id == dataset_ids.trace_id
)

testdf.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+-------------+---------------------------+------------------+---------+-----+
|  Subs_Id|  event_code_feature|irregular_pct|weighted_sector_utilisation|               cei| trace_id|index|
+---------+--------------------+-------------+---------------------------+------------------+---------+-----+
|279136044|(30,[7,8,13],[3.0...|    0.2857143|         58.958392857142854| 62.31248366873604|279136044|    1|
|279145285|(30,[8,13],[1.0,5...|   0.16666667|                       5.86| 86.57532977294922|279145285|    2|
|279150070|(30,[2,6,8,16],[1...|          1.0|          78.02667889908257| 6.591996330275229|279150070|    3|
|279151007|     (30,[15],[1.0])|          1.0|         21.206666666666667|            23.638|279151007|    4|
|279153120|(30,[8,12,13],[1....|   0.33333334|          52.79692307692308| 60.82758796339768|279153120|    5|
|279155053|(30,[8,11,12,16],...|          1.0|                     20.265|           23.9205|279155053|    6|
|279161055

In [120]:
testdf.filter('index = 1').show(10, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+----------------------------+-------------+---------------------------+-----------------+---------+-----+
|Subs_Id  |event_code_feature          |irregular_pct|weighted_sector_utilisation|cei              |trace_id |index|
+---------+----------------------------+-------------+---------------------------+-----------------+---------+-----+
|279136044|(30,[7,8,13],[3.0,8.0,17.0])|0.2857143    |58.958392857142854         |62.31248366873604|279136044|1    |
+---------+----------------------------+-------------+---------------------------+-----------------+---------+-----+

In [121]:
testdf.filter('index = 2').show(10, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------------------+-------------+---------------------------+-----------------+---------+-----+
|Subs_Id  |event_code_feature   |irregular_pct|weighted_sector_utilisation|cei              |trace_id |index|
+---------+---------------------+-------------+---------------------------+-----------------+---------+-----+
|279145285|(30,[8,13],[1.0,5.0])|0.16666667   |5.86                       |86.57532977294922|279145285|2    |
+---------+---------------------+-------------+---------------------------+-----------------+---------+-----+

In [122]:
flat_event.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+--------------------+--------------------+------------------+
|  Subs_Id|           cell_name|         sector_name|  event_code_feature|       utilisation|
+---------+--------------------+--------------------+--------------------+------------------+
|279153120|201607_LalorparkE...|201607_LalorparkE...|      (30,[8],[1.0])| 52.79692307692308|
|280135519|703813_Risdonvale...| 703813_Risdonvale_1|(30,[10,12,13],[1...|32.272000000000006|
|280137017|201580_Cherrybroo...|201580_Cherrybrook_1|     (30,[13],[6.0])| 30.63428571428572|
|280148312|404131_Narangba_L...|   404131_Narangba_3|(30,[8,11,12,16],...| 83.75636363636363|
|280149231|227592_Cabramatta...|227592_Cabramatta...|      (30,[7],[1.0])| 98.28615384615385|
|280154329|200216_CastlerghK...|200216_CastlerghK...|      (30,[7],[2.0])|13.675555555555556|
|280155319|688304_Madora_L08A_2|     688304_Madora_2|(30,[7,13],[3.0,3...| 42.68666666666667|
|280159756|201720_StHelensPa...|201720_StHelensPa...|(30,[9,