In [None]:
# Bee Hive data https://drive.google.com/file/d/142IBcs6OyQiJxO7owPfkEBFbkrudnh0g/view?usp=sharing

In [None]:
# Application name: passed to Spark as the app name and used as the key of this
# app's section in the project configuration
APP = 'BeeHive'

In [None]:
# Install a pip package in the current Jupyter kernel
!{sys.executable} -m pip install -e '../../../Wielder/'
!{sys.executable} -m pip install -e '../'

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType
from pyspark.sql.functions import split, row_number, udf, col
from pyspark.sql.window import Window

from pep_data.project import get_project_conf
from pep_data.spark.util import field_to_struct

import random
#from treelib import Node, Tree
from treelib import Tree

In [None]:
# Start the Spark session for this app (getOrCreate reuses a session that already exists)
spark = (
    SparkSession.builder
    .appName(APP)
    .getOrCreate()
)

In [None]:
# Load the project configuration through pep_data's get_project_conf()
# (per the original note it comes from the project.conf file - confirm in pep_data.project).
# Later cells index the result by APP to read this app's settings.
conf = get_project_conf()

In [None]:
# Build the schema for the data from the column lists in the app configuration
cols_name, cols_double, cols_integer = (
    conf[APP][key] for key in ('cols_name', 'cols_double', 'cols_integer')
)

# One field per column name, in the order given by cols_name
fields = [
    field_to_struct(header, doubles=cols_double, integers=cols_integer)
    for header in cols_name
]

# Assemble the schema from the fields
schema = StructType(fields)

In [None]:
# Load the csv found at the configured path, applying the schema built above
data_path = conf[APP]['data_path']
df = (
    spark.read
    .schema(schema)
    .csv(data_path)
)

df.show()

In [None]:
# Drop the unneeded columns: keep only the columns whose name does not contain 'remove'
cols_to_keep = list(filter(lambda name: 'remove' not in name, df.columns))
df = df.select(*cols_to_keep)

df.show()

In [None]:
# Split the Bee ID column at '_' into two new columns:
#   Cycle    - the part before the '_', converted from string to integer
#   Cycle ID - the part after the '_'
df_cleaned = (
    df
    .withColumn('Cycle', split(col('Bee ID'), '_').getItem(0).cast(IntegerType()))
    .withColumn('Cycle ID', split(col('Bee ID'), '_').getItem(1))
)

df_cleaned.show()

In [None]:
# Number the rows in ascending Cycle order and store the number in a new column
# named Continuous ID (rows sharing a Cycle get consecutive numbers)
w = Window.orderBy('Cycle')
df_cleaned = df_cleaned.withColumn('Continuous ID', row_number().over(w))

df_cleaned.show()

In [None]:
# Map every cycle (key) to the smallest Continuous ID among its rows (value)
continuous_min_id_per_cycle = {
    row[0]: row[1]
    for row in df_cleaned.groupBy('Cycle').min('Continuous ID').collect()
}

continuous_min_id_per_cycle

In [None]:
# All distinct values of the Cycle column, in ascending order
cycles = [row['Cycle'] for row in df_cleaned.select('Cycle').distinct().collect()]
cycles.sort()

cycles

In [None]:
# Return a random parent continuous id for a bee, according to its cycle and n
def assert_parent_bee_id(cycle, n=3):
    """Pick a random parent Continuous ID for a bee that belongs to `cycle`.

    The candidate parents are the rows of the (up to) `n` cycles that come
    directly before `cycle` in the module-level `cycles` list. Their Continuous
    IDs form one range, bounded by the values in the module-level
    `continuous_min_id_per_cycle` dictionary, and one id is drawn from it
    with `random.randint`.

    Args:
        cycle: a value present in `cycles`.
        n: how many preceding cycles a parent may come from (default 3,
            the value that used to be hard-coded).

    Returns:
        A Continuous ID from the candidate range, or None when `cycle` is the
        first entry of `cycles` (there is no earlier cycle to pick from).

    Raises:
        ValueError: if `n` is smaller than 1, or `cycle` is not in `cycles`.
    """
    # n < 1 would leave no earlier cycle to pick from (empty random range)
    if n < 1:
        raise ValueError('n must be at least 1')

    # Position of the cycle in the sorted cycles list
    cycle_index = cycles.index(cycle)

    # The first cycle (index 0, whatever its value) has no possible parent
    if cycle_index == 0:
        return None

    # Earliest cycle a parent may come from: n cycles back, clamped to the first cycle
    min_cycle_index = max(0, cycle_index - n)

    # Lowest candidate: the first Continuous ID of that earliest cycle
    min_rand_value = continuous_min_id_per_cycle[cycles[min_cycle_index]]

    # Highest candidate: the last Continuous ID before this cycle's own ids start
    max_rand_value = continuous_min_id_per_cycle[cycles[cycle_index]] - 1

    # randint is inclusive on both ends
    return random.randint(min_rand_value, max_rand_value)

In [None]:
# Convert assert_parent_bee_id(cycle) to a user defined function.
# - IntegerType(): without an explicit return type udf() defaults to StringType,
#   which would store the parent ids as strings instead of integers
# - asNondeterministic(): the function draws random numbers, so Spark must not
#   assume that evaluating it again yields the same value
assert_parent_bee_id_udf = udf(assert_parent_bee_id, IntegerType()).asNondeterministic()

# Create the new column Parent Continuous ID from the Cycle column using the udf.
# cache() keeps the computed data frame in the memory of the cluster's workers;
# several actions run on this data frame afterwards, so it is cached.
df_cleaned = (
    df_cleaned
    .withColumn('Parent Continuous ID', assert_parent_bee_id_udf(col('Cycle')))
    .cache()
)

df_cleaned.show()

In [None]:
# Create a tree by walking from the root bee down through all of its descendants

# Get the root parent (first node of the tree)
root_parent_continuous_id = 1
root_matches = df_cleaned.filter(df_cleaned['Continuous ID'] == root_parent_continuous_id).take(1)
if not root_matches:
    raise ValueError('No row with Continuous ID {}'.format(root_parent_continuous_id))
root_parent = root_matches[0]

# Fetch the three columns the tree needs in a single Spark job and group the rows
# by their parent's Continuous ID. Looking the kids up in this dictionary replaces
# running one Spark filter + collect per node of the tree.
kids_by_parent_id = {}
for bee in df_cleaned.select('Bee ID', 'Continuous ID', 'Parent Continuous ID').collect():
    parent_id = bee['Parent Continuous ID']
    # Rows without a parent (null) are nobody's kids
    if parent_id is not None:
        # int() keeps the lookup working even if the column holds the ids as strings
        kids_by_parent_id.setdefault(int(parent_id), []).append(bee)

# Create tree
tree = Tree()

# Add first node (root) to tree
tree.create_node(root_parent['Bee ID'], root_parent['Bee ID'])

# Rows already in the tree whose kids still have to be added.
# pop() takes the most recently added row, so the walk is depth-first; the order
# does not change the result because every kid is attached to its own parent.
tree_nodes = [root_parent]

while tree_nodes:
    parent = tree_nodes.pop()

    # All rows whose Parent Continuous ID equals the parent's Continuous ID
    kids = kids_by_parent_id.get(parent['Continuous ID'], [])

    # Add each kid to the tree and queue it so its own kids get processed
    for k in kids:
        tree_nodes.append(k)
        tree.create_node(k['Bee ID'], k['Bee ID'], parent['Bee ID'])


tree.show()