In [None]:
# Bee Hive data https://drive.google.com/file/d/142IBcs6OyQiJxO7owPfkEBFbkrudnh0g/view?usp=sharing

In [None]:
# Name of this application: used as the Spark app name and as the key into the project configuration
APP = "BeeHive"

In [None]:
# Install a pip package in the current Jupyter kernel
!{sys.executable} -m pip install -e '../../../Wielder/'
#!{sys.executable} -m pip install -e '../'

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StringType
from pyspark.sql.functions import split, row_number, udf, col, min
from pyspark.sql.window import Window

from pep_data.project import quick_conf
from pep_data.spark.util import field_to_struct

import random
#from treelib import Node, Tree
from treelib import Tree

In [None]:
# Build (or reuse) the Spark session registered under this notebook's application name
session_builder = SparkSession.builder.appName(APP)
spark = session_builder.getOrCreate()

In [None]:
# Load the project configuration through pep_data's quick_conf().
# Later cells index the result by application name: conf[APP][...].
# NOTE(review): the settings are said to come from a project.conf file - confirm against quick_conf.
conf = quick_conf()

In [None]:
# Column metadata for this application, read from the project configuration
app_conf = conf[APP]
cols_name = app_conf['cols_name']
cols_double = app_conf['cols_double']
cols_integer = app_conf['cols_integer']

# One struct field per column header; the double/integer column lists are handed to field_to_struct
fields = [
    field_to_struct(column_header, doubles=cols_double, integers=cols_integer)
    for column_header in cols_name
]

# Assemble the schema from the fields
schema = StructType(fields)

In [None]:
# Load the CSV found at the configured data path, applying the explicit schema
data_path = conf[APP]['data_path']
df = spark.read.csv(data_path, schema=schema)

df.show()

In [None]:
# Drop the unneeded columns: keep only those whose name does not contain the word "remove"
cols_to_keep = list(filter(lambda column_name: 'remove' not in column_name, df.columns))
df = df.select(cols_to_keep)

df.show()

In [None]:
# Split 'Bee ID' on '_': the first part becomes 'Cycle', the second part becomes 'Cycle ID'
bee_id_parts = split(col('Bee ID'), '_')
df_cleaned = (
    df
    .withColumn('Cycle', bee_id_parts.getItem(0))
    .withColumn('Cycle ID', bee_id_parts.getItem(1))
)

# The split yields strings; turn 'Cycle' into an integer column
cycle_as_int = col("Cycle").cast(IntegerType())
df_cleaned = df_cleaned.withColumn("Cycle", cycle_as_int)
df_cleaned.show()

In [None]:
# Number all rows 1..N in ascending 'Cycle' order and store the number in a new 'Continuous ID' column.
# 'Bee ID' is a secondary sort key: ordering by 'Cycle' alone leaves the numbering of bees that
# share a cycle arbitrary, so it could differ between runs (assumes 'Bee ID' is unique - TODO confirm).
w = Window().orderBy('Cycle', 'Bee ID')
df_cleaned = df_cleaned.withColumn('Continuous ID', row_number()\
                                   .over(w))

df_cleaned.show()

In [None]:
# Map every cycle to the smallest 'Continuous ID' found in that cycle
min_id_rows = df_cleaned.groupBy('Cycle').min('Continuous ID').collect()
continuous_min_id_per_cycle = {row[0]: row[1] for row in min_id_rows}

continuous_min_id_per_cycle

In [None]:
# Ascending list of the distinct values of the 'Cycle' column
distinct_cycle_rows = df_cleaned.select('Cycle').distinct().collect()
cycles = sorted(row[0] for row in distinct_cycle_rows)

cycles

In [None]:
# Pick a random parent (by Continuous ID) for a bee of the given cycle
def assert_parent_bee_id(cycle, n=3, cycle_order=None, min_id_per_cycle=None):
    """Return a random parent 'Continuous ID' for a bee that belongs to ``cycle``.

    The parent is drawn with ``random.randint`` from the ID range that starts at the
    smallest ID of the cycle ``n`` positions earlier (or of the first cycle when
    fewer than ``n`` earlier cycles exist) and ends just below the smallest ID of
    ``cycle`` itself. This relies on Continuous IDs being assigned consecutively
    in ascending cycle order.

    Args:
        cycle: Cycle value of the bee that needs a parent.
        n: How many preceding cycles a parent may come from (at least 1).
        cycle_order: Sorted list of all cycle values. Defaults to the
            notebook-level ``cycles`` list.
        min_id_per_cycle: Mapping cycle -> smallest Continuous ID of that cycle.
            Defaults to the notebook-level ``continuous_min_id_per_cycle``.

    Returns:
        The chosen parent Continuous ID, or None when ``cycle`` is the first
        cycle in ``cycle_order`` (those bees have no parent).

    Raises:
        ValueError: If ``n`` is smaller than 1, or ``cycle`` is not in ``cycle_order``.
    """
    if n < 1:
        raise ValueError('n must be at least 1')

    # Fall back to the notebook-level lookups when the caller supplies none
    if cycle_order is None:
        cycle_order = cycles
    if min_id_per_cycle is None:
        min_id_per_cycle = continuous_min_id_per_cycle

    # Position of this cycle among all cycles
    cycle_index = cycle_order.index(cycle)

    # Bees of the first cycle (position 0, whatever its value) have no parent
    if cycle_index == 0:
        return None

    # Earliest cycle a parent may come from, clamped to the first cycle
    min_cycle_index = max(cycle_index - n, 0)

    # Smallest candidate: first ID of the earliest allowed cycle
    min_rand_value = min_id_per_cycle[cycle_order[min_cycle_index]]

    # Largest candidate: the ID right before this cycle's first ID
    max_rand_value = min_id_per_cycle[cycle_order[cycle_index]] - 1

    # randint is inclusive on both ends
    return random.randint(min_rand_value, max_rand_value)

In [None]:
# Wrap assert_parent_bee_id as a Spark user defined function.
# The function draws random numbers, so the UDF is marked non-deterministic: Spark treats UDFs as
# deterministic by default and may otherwise eliminate or repeat invocations while optimising.
# No return type is passed, so the resulting column uses udf's default (string) type.
assert_parent_bee_id_udf = udf(lambda z: assert_parent_bee_id(z)).asNondeterministic()

# Create new column Parent Continuous ID using the assert_parent_bee_id_udf function and Cycle column
# cache() asks Spark to keep the computed frame so that the actions which follow
# reuse it instead of recomputing it
df_cleaned = df_cleaned.withColumn("Parent Continuous ID", assert_parent_bee_id_udf(col('Cycle')))\
                        .cache()

df_cleaned.show()

# Create forest start

In [None]:
# Lookup frame of potential parents: each bee's ID ('Parent Bee ID') next to its
# Continuous ID, exposed as a string column named 'Temp Continuous ID'
df_parent_bees = df_cleaned.select(
    col('Bee ID').alias("Parent Bee ID"),
    col('Continuous ID').cast(StringType()).alias("Temp Continuous ID"),
)

df_parent_bees.show()

In [None]:
# Left-join every bee with the lookup row whose 'Temp Continuous ID' equals the bee's
# 'Parent Continuous ID'; this attaches the parent's Bee ID to each bee
parent_match = df_cleaned['Parent Continuous ID'] == df_parent_bees['Temp Continuous ID']
df_beeId_parent_beeId = df_cleaned.join(df_parent_bees, on=parent_match, how='left')
df_beeId_parent_beeId.show()

In [None]:
# Create a forest
def create_tree():
    """Build one tree that holds every bee, hung under an artificial 'God' root.

    Reads the notebook-level ``df_beeId_parent_beeId`` frame and ``cycles`` list.
    Bees of the first cycle in ``cycles`` become direct children of 'God'; every
    other bee is attached below the node of its parent bee. Node identifiers have
    the form '<Bee ID>-<Continuous ID>', node tags are the Bee ID.

    Returns:
        The populated treelib ``Tree``. It contains only the 'God' node when
        ``cycles`` is empty.
    """
    def node_id(bee_id, continuous_id):
        # Identifier format shared by a node and the reference to its parent
        return str(bee_id) + '-' + str(continuous_id)

    tree = Tree()
    tree.create_node('God', 'God')

    if not cycles:
        return tree

    # Fetch all rows with a single Spark job and bucket them by cycle,
    # instead of running one filter + collect per cycle
    rows_by_cycle = {}
    for row in df_beeId_parent_beeId.collect():
        rows_by_cycle.setdefault(row['Cycle'], []).append(row)

    # Root bees are those of the first cycle, whatever its value is; this is the
    # same set of bees that assert_parent_bee_id leaves without a parent
    first_cycle = cycles[0]

    # Insert cycle by cycle in ascending order so a parent always exists before its children
    for c in cycles:
        for row in rows_by_cycle.get(c, []):
            if c == first_cycle:
                parent_node = 'God'
            else:
                parent_node = node_id(row['Parent Bee ID'], row['Parent Continuous ID'])
            tree.create_node(str(row['Bee ID']), node_id(row['Bee ID'], row['Continuous ID']), parent_node)

    return tree

In [None]:
# Build the full forest: every bee, hung under the artificial 'God' root
tree = create_tree()

In [None]:
# Write the forest to a text file in the configured save location.
# The file name is appended directly to the configured value; no path separator is inserted.
file_name = 'forest.txt'
forest_path = conf[APP]['data_save'] + file_name
tree.save2file(forest_path)

# Create forest end

In [None]:
# Pick the root bee(s) of the tree: every bee whose 'Continuous ID' does not exceed the threshold
bee_ancestor_continuous_id = 1
is_ancestor = col('Continuous ID') <= bee_ancestor_continuous_id
bee_ancestors = df_cleaned.filter(is_ancestor).collect()

In [None]:
# Create a tree breadth-first, starting from the given root bee(s)
# One Spark query is issued per visited bee: slow for a lot of nodes,
# fast for a subtree (only one or a few bees as roots)
def create_tree_bfs(tree_nodes):
    """Build the tree of all descendants of the given root bees.

    Reads the notebook-level ``df_cleaned`` frame to find the children of each
    visited bee (rows whose 'Parent Continuous ID' equals the bee's 'Continuous ID').

    Args:
        tree_nodes: List of rows, each providing 'Bee ID' and 'Continuous ID',
            used as roots. The list is not modified.

    Returns:
        A treelib ``Tree``. With exactly one root, that bee is the tree's root;
        otherwise the roots are placed under an artificial 'God' node.
    """
    def node_id(bee):
        # Node identifier format: '<Bee ID>-<Continuous ID>'
        return str(bee['Bee ID']) + '-' + str(bee['Continuous ID'])

    tree = Tree()

    # Add the root node(s) to the tree
    if len(tree_nodes) == 1:
        tree.create_node(str(tree_nodes[0]['Bee ID']), node_id(tree_nodes[0]))
    else:
        tree.create_node('God', 'God')
        for root in tree_nodes:
            tree.create_node(str(root['Bee ID']), node_id(root), 'God')

    # Work on a private queue so the caller's list stays intact, and read it
    # front to back (first in, first out) so the traversal really is breadth-first
    queue = list(tree_nodes)
    head = 0
    while head < len(queue):
        parent = queue[head]
        head += 1

        # All rows of df_cleaned whose parent is the current bee
        kids = df_cleaned.filter(col('Parent Continuous ID') == parent['Continuous ID'])

        # Attach each kid below its parent and schedule it for its own lookup
        for k in kids.collect():
            queue.append(k)
            tree.create_node(str(k['Bee ID']), node_id(k), node_id(parent))

    return tree

In [None]:
# Build the tree that descends from the chosen ancestor bee(s).
# This rebinds `tree`, which held the full forest built by create_tree() before.
tree = create_tree_bfs(bee_ancestors)

In [None]:
# Report the number of nodes, then draw the tree.
# The size is printed explicitly: only the last expression of a cell is displayed
# automatically, so a bare tree.size() on the first line would be discarded.
print(tree.size())
tree.show()

In [None]:
# Keep, for every cycle, the bee(s) whose DaughtersEfficiencyScore equals the cycle's minimum.
# `min` here is the pyspark.sql.functions aggregate imported above, not the Python builtin.
w = Window.partitionBy('Cycle')
min_score_in_cycle = min('DaughtersEfficiencyScore').over(w)
df_best_bee = (
    df_cleaned
    .withColumn('minDaughtersEfficiencyScore', min_score_in_cycle)
    .filter(col('DaughtersEfficiencyScore') == col('minDaughtersEfficiencyScore'))
    .drop('minDaughtersEfficiencyScore')
)

df_best_bee.show()

In [None]:
# Show, for each cycle, the tree that descends from that cycle's best bee.
# collect() brings the rows to the driver; looping over the DataFrame object itself
# does not yield rows.
for row in df_best_bee.collect():
    # create_tree() takes no arguments and always builds the whole forest, so the
    # single-root subtree is built with create_tree_bfs instead
    create_tree_bfs([row]).show()