In [1]:
# Bee Hive data https://drive.google.com/file/d/142IBcs6OyQiJxO7owPfkEBFbkrudnh0g/view?usp=sharing

In [2]:
APP = 'BeeHive'

In [3]:
# Install a pip package in the current Jupyter kernel
# ! '../../package_py.bash'

''..' is not recognized as an internal or external command,
operable program or batch file.


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StringType
from pyspark.sql.functions import split, row_number, udf, col, min
from pyspark.sql.window import Window

import random
#from treelib import Node, Tree
from treelib import Tree

from pep_data.project import quick_conf
from src.pep_data.spark.util import base_df



In [5]:
# Create Spark session
spark = SparkSession.builder.appName(APP).getOrCreate()

In [6]:
# Get app configuration from project.conf file
conf = quick_conf()

In [7]:
df = base_df(spark, conf, APP)

df.show()

+------+------------------------+-----------+-----------+------------+-----------+-----------+
|Bee ID|DaughtersEfficiencyScore|Father SIZE|Father TYPE|           X|          Y|          Z|
+------+------------------------+-----------+-----------+------------+-----------+-----------+
|   0_0|           -0.1647494899|          5|     107027|    5.345708|25.54008605|24.79858692|
|   0_9|          -0.09111780618|          8|      35473|  3.61617713|15.39993678|14.11150683|
|  0_16|           -0.2783737487|          9|      72732| 6.487132473|27.96111467|23.50405554|
|  0_76|           0.01988315069|          6|      49069| 8.285176906|21.88111447|10.80561155|
|  0_35|           -0.0758420403|          5|        187|        -1.0|       13.0|       17.0|
|  0_17|           -0.3362110457|          5|      74276|        -0.6|       12.0|       16.0|
|  0_49|           -0.1311013747|         10|     108253| 4.123056465|16.28346035| 12.7359525|
|  0_11|           -0.1881197044|          5|     

In [8]:
# Split Bee_ID column to Cycle(Bee value) and Cycle ID(ID value) columns
df_cleaned = df.withColumn('Cycle', split(col('Bee ID'), '_')\
                           .getItem(0))\
               .withColumn('Cycle ID', split(col('Bee ID'), '_')\
                           .getItem(1))

# Change the type of value in Cycle column from string to integer
df_cleaned= df_cleaned.withColumn("Cycle",col("Cycle")\
                                  .cast(IntegerType()))
df_cleaned.show()

+------+------------------------+-----------+-----------+------------+-----------+-----------+-----+--------+
|Bee ID|DaughtersEfficiencyScore|Father SIZE|Father TYPE|           X|          Y|          Z|Cycle|Cycle ID|
+------+------------------------+-----------+-----------+------------+-----------+-----------+-----+--------+
|   0_0|           -0.1647494899|          5|     107027|    5.345708|25.54008605|24.79858692|    0|       0|
|   0_9|          -0.09111780618|          8|      35473|  3.61617713|15.39993678|14.11150683|    0|       9|
|  0_16|           -0.2783737487|          9|      72732| 6.487132473|27.96111467|23.50405554|    0|      16|
|  0_76|           0.01988315069|          6|      49069| 8.285176906|21.88111447|10.80561155|    0|      76|
|  0_35|           -0.0758420403|          5|        187|        -1.0|       13.0|       17.0|    0|      35|
|  0_17|           -0.3362110457|          5|      74276|        -0.6|       12.0|       16.0|    0|      17|
|  0_49|  

In [9]:
# Sort the data frame by Cycle column and add row number for each row(new column with the name Continuous ID)
w = Window().orderBy('Cycle')
df_cleaned = df_cleaned.withColumn('Continuous ID', row_number()\
                                   .over(w))

df_cleaned.show()

+------+------------------------+-----------+-----------+------------+-----------+-----------+-----+--------+-------------+
|Bee ID|DaughtersEfficiencyScore|Father SIZE|Father TYPE|           X|          Y|          Z|Cycle|Cycle ID|Continuous ID|
+------+------------------------+-----------+-----------+------------+-----------+-----------+-----+--------+-------------+
|   0_0|           -0.1647494899|          5|     107027|    5.345708|25.54008605|24.79858692|    0|       0|            1|
|   0_9|          -0.09111780618|          8|      35473|  3.61617713|15.39993678|14.11150683|    0|       9|            2|
|  0_16|           -0.2783737487|          9|      72732| 6.487132473|27.96111467|23.50405554|    0|      16|            3|
|  0_76|           0.01988315069|          6|      49069| 8.285176906|21.88111447|10.80561155|    0|      76|            4|
|  0_35|           -0.0758420403|          5|        187|        -1.0|       13.0|       17.0|    0|      35|            5|
|  0_17|

In [10]:
# Create a dictionary with key = cycle , value = minimum value of Continuous ID of cycle(key)
continuous_min_id_per_cycle = {key : value for key, value  in df_cleaned.groupBy('Cycle').min('Continuous ID').collect()}

continuous_min_id_per_cycle

{0: 1,
 1: 551,
 2: 1859,
 3: 3224,
 4: 4453,
 5: 5171,
 6: 5840,
 7: 6541,
 8: 7394,
 9: 8071,
 10: 8782,
 11: 9290,
 12: 10863,
 13: 12233,
 14: 13375,
 15: 14053,
 16: 14766,
 17: 15458,
 18: 16203,
 19: 16915,
 20: 17700,
 21: 18323,
 22: 19045,
 23: 19859,
 24: 20535,
 25: 21226,
 26: 21862,
 27: 22595,
 28: 23384,
 29: 24147,
 30: 24944,
 31: 25730,
 32: 26403,
 33: 27153,
 34: 27796,
 35: 28645,
 36: 29505,
 37: 30313,
 38: 31053,
 39: 31688,
 40: 32331,
 41: 33170,
 42: 34095,
 43: 34753,
 44: 35502,
 45: 36433,
 46: 37375,
 47: 38147,
 48: 38878,
 49: 39590,
 50: 40388,
 51: 41283,
 52: 43136,
 53: 44282,
 54: 45148,
 55: 46070,
 56: 46859,
 57: 47902,
 58: 48614,
 59: 49444,
 60: 50387,
 61: 51066,
 62: 51920,
 63: 52674,
 64: 53467,
 65: 54124,
 66: 54863,
 67: 55537,
 68: 56387,
 69: 57250,
 70: 58143,
 71: 58892,
 72: 59758,
 73: 60472,
 74: 61223,
 75: 61953,
 76: 62685,
 77: 63550,
 78: 64324,
 79: 65146,
 80: 65767,
 81: 66350,
 82: 67220,
 83: 67951,
 84: 68655,
 85: 6

In [11]:
# Create sorted list of all distinct values of Cycle column
cycles = sorted([i[0] for i in df_cleaned.select('Cycle').distinct().collect()])

cycles

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99]

In [12]:
# Return parent continuous id according to cycle and n
def assert_parent_bee_id(cycle):
    n = 3

    # Get index  of cycle in cycles list
    cycle_index = cycles.index(cycle)

    # Return parent bee id if cycle is 0
    if  not cycle_index :
        return None

    min_cycle_index = 0

    # Update min_cycle_index according to cycle_index and n
    if cycle_index > n:
        min_cycle_index = cycle_index - n

    # Calculate the minimum value for random parent continuous id
    min_rand_value = continuous_min_id_per_cycle[cycles[min_cycle_index]]

    # Calculate the maximum value for random parent continuous id
    max_rand_vale = continuous_min_id_per_cycle[cycles[cycle_index]] - 1

    # Get random value of parent_continuous_id (from min_rand_value to max_rand_vale)
    parent_continuous_id = random.randint(min_rand_value, max_rand_vale)

    return parent_continuous_id

In [13]:
# Convert assert_parent_bee_id(cycle) to user defined function
assert_parent_bee_id_udf = udf(lambda z: assert_parent_bee_id(z))

# Create new column Parent Continuous ID using the assert_parent_bee_id_udf function and Cycle column
# cache() caches the specified data frame in the memory of your cluster's workers
# If executing multiple actions on the same data frame then cache it
df_cleaned = df_cleaned.withColumn("Parent Continuous ID", assert_parent_bee_id_udf(col('Cycle')))\
                        .cache()

df_cleaned.show()

Py4JJavaError: An error occurred while calling o100.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 11.0 failed 1 times, most recent failure: Lost task 0.0 in stage 11.0 (TID 17) (DESKTOP-2H598RF.mshome.net executor driver): java.io.IOException: Cannot run program "python3": CreateProcess error=3, The system cannot find the path specified
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:167)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:81)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:131)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:855)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:855)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.$anonfun$getOrCompute$1(RDD.scala:378)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1518)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1445)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1509)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1332)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:376)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:327)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: CreateProcess error=3, The system cannot find the path specified
	at java.lang.ProcessImpl.create(Native Method)
	at java.lang.ProcessImpl.<init>(ProcessImpl.java:444)
	at java.lang.ProcessImpl.start(ProcessImpl.java:139)
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1029)
	... 49 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:506)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:459)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3868)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3858)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3856)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3856)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3084)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:288)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:327)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: Cannot run program "python3": CreateProcess error=3, The system cannot find the path specified
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:167)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:81)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:131)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:855)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:855)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.$anonfun$getOrCompute$1(RDD.scala:378)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1518)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1445)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1509)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1332)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:376)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:327)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.io.IOException: CreateProcess error=3, The system cannot find the path specified
	at java.lang.ProcessImpl.create(Native Method)
	at java.lang.ProcessImpl.<init>(ProcessImpl.java:444)
	at java.lang.ProcessImpl.start(ProcessImpl.java:139)
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1029)
	... 49 more


# Create forest start

In [None]:
# Create a DataFrame of parent bees
df_parent_bees = df_cleaned.select('Bee ID', 'Continuous ID')
df_parent_bees = df_parent_bees.withColumnRenamed("Bee ID","Parent Bee ID")
df_parent_bees = df_parent_bees.withColumnRenamed("Continuous ID","Temp Continuous ID")
df_parent_bees= df_parent_bees.withColumn("Temp Continuous ID",col("Temp Continuous ID")\
                                  .cast(StringType()))

df_parent_bees.show()

In [None]:
# Add the parent bee id to each bee id with the use of join
df_beeId_parent_beeId= df_cleaned.join(df_parent_bees, df_cleaned['Parent Continuous ID'] == df_parent_bees['Temp Continuous ID'],'left')
df_beeId_parent_beeId.show()

In [None]:
# Create a forest
def create_forest(df_childID_parentID):
    tree =Tree()
    tree.create_node('God', 'God')

    # Add all the bees for cycle 0 to god
    df_cleaned_0_cycle = df_childID_parentID.filter(col('Cycle') == 0)
    for row in df_cleaned_0_cycle.collect():
        tree.create_node(row['Bee ID'], row['Bee ID']+'-'+str(row['Continuous ID']), 'God')

    # get all bees in cycle c
    for cycle in cycles[1:]:
        df_cleaned_cycle = df_childID_parentID.filter(col('Cycle') == cycle)
        # Add bee from cycle c to tree
        for row in df_cleaned_cycle.collect():
            tree.create_node(row['Bee ID'], row['Bee ID']+'-'+str(row['Continuous ID']), row['Parent Bee ID']+'-'+row['Parent Continuous ID'])

    return tree

forest = create_forest(df_beeId_parent_beeId)

In [None]:
forest = create_forest(df_beeId_parent_beeId)

In [None]:
file_name = 'forest.txt'
forest.save2file(conf[APP]['data_save']+file_name)

# Create forest end

In [None]:
# Choose bee root for the tree
bee_ancestor_id = '31_1711'
bee_ancestors = df_cleaned.filter(df_cleaned['Bee ID'] == bee_ancestor_id).collect()

In [None]:
# Create a tree using the idea of bfs algorithm
# Slow version for alot of nodes
# Works fast for a subtree (only one or few bees as roots)
def create_tree_bfs(tree_nodes, df):

    tree = Tree()

    # Add first node/s(root/s) to tree
    if len(tree_nodes) == 1:
        tree.create_node(tree_nodes[0]['Bee ID'], tree_nodes[0]['Bee ID']+'-'+str(tree_nodes[0]['Continuous ID']))

    else:
        tree.create_node('God', 'God')
        for root in tree_nodes:
            tree.create_node(root['Bee ID'], root['Bee ID']+'-'+str(root['Continuous ID']), 'God')

    # While tree_nodes not empty pop first value (parent) search for its kids append them to tree_nodes and add them to the tree
    while tree_nodes:
        # Get first value in tree_nodes
        parent = tree_nodes.pop()

        #find all rows in df_cleaned which their value in Parent Continuous ID column equals parent's Continuous ID
        children = df.filter(col('Parent Continuous ID') == parent['Continuous ID'])

        # for each kid in kids append to tree_nodes and add it to tree
        for child in children.collect():
                tree_nodes.append(child)
                tree.create_node(child['Bee ID'], child['Bee ID']+'-'+str(child['Continuous ID']), parent['Bee ID']+'-'+str(parent['Continuous ID']) )

    return tree

In [None]:
# Create tree
tree = create_tree_bfs(bee_ancestors,df_cleaned)

In [None]:
tree.size()
tree.show()

In [None]:
# Create a DataFrame with the best bee per cycle
w = Window.partitionBy('Cycle')
df_best_bee = df_cleaned.withColumn('minDaughtersEfficiencyScore', min('DaughtersEfficiencyScore').over(w))\
    .where(col('DaughtersEfficiencyScore') == col('minDaughtersEfficiencyScore'))\
    .drop('minDaughtersEfficiencyScore')

df_best_bee.show()

In [None]:
# Create the trees of all best bees using the function create_tree_bfs (option 1)
best_bee_trees = {}

for bee in df_best_bee.collect():
    row = df_cleaned.filter(col('Bee ID')==bee['Bee ID']).collect()
    best_bee_trees[bee['Bee ID']] = create_tree_bfs(row, df_cleaned)

#best_bee_trees['<bee id from best bees>'].show()

In [None]:
# Get the trees of all best bees using the subtree of the forest(option 2)
best_bee_trees = {}

for bee in df_best_bee.collect():
    best_bee_trees[bee['Bee ID']] = forest.subtree(bee['Bee ID']+'-'+str(bee['Continuous ID']))

#best_bee_trees['<bee id from best bees>'].show()

In [None]:
# Create a tree of only the ancestors of the node
def bee_ancestor_tree_1(node, tree):
    # Get the depth of the node int the tree
    depth_of_node = tree.depth(node['Bee ID']+'-'+str(node['Continuous ID']))
    child = node['Bee ID']+'-'+str(node['Continuous ID'])
    ancestors = [(node['Bee ID'],node['Bee ID']+'-'+str(node['Continuous ID'] ))]
    # Add the ancestors of the node to the list
    for i in range(depth_of_node):
        parent = tree.parent(child)
        ancestors.append((parent.tag,parent.identifier))
        child = parent.identifier

    # Reverse the ancestor list and add the oldest ancestor to the tree
    tree_ancestors = Tree()
    ancestors = ancestors[::-1]
    oldest_ancestor = ancestors[0]
    tree_ancestors.create_node(oldest_ancestor[0],oldest_ancestor[1])

    # Add the ancestors of the node to the tree
    for ancestor in ancestors[1:]:
        tree_ancestors.create_node(ancestor[0],ancestor[1],oldest_ancestor[1])
        oldest_ancestor = ancestor

    return  tree_ancestors

In [None]:
# Create the trees of all best bees ancestors using the function bee_ancestor_tree (option 1)
best_bee_ancestors_trees = {}

for bee in df_best_bee.collect():
    best_bee_ancestors_trees[bee['Bee ID']] = bee_ancestor_tree_1(bee, forest)

#best_bee_ancestors_trees['<bee id from best bees>'].show()

In [None]:
def bee_ancestor_tree_2(row, df_childID_parentID):
    ancestors = [(row['Bee ID'], row['Bee ID']+'-'+str(row['Continuous ID']))]
    ancestor = df_beeId_parent_beeId.filter(col('Continuous ID')==row['Parent Continuous ID']).collect()
    tree_ancestors = Tree()

    # If no ancestors return the tree with one node
    if not ancestor:
        tree_ancestors.create_node((row['Bee ID'], row['Bee ID']+'-'+str(row['Continuous ID'])))
        return tree_ancestors

    ancestor = ancestor[0]

    while ancestor['Parent Bee ID']:
        ancestors.append((ancestor['Bee ID'], ancestor['Bee ID']+'-'+str(ancestor['Continuous ID'])))
        ancestor = df_beeId_parent_beeId.filter(col('Continuous ID')==ancestor['Parent Continuous ID']).collect()[0]

    ancestors.append((ancestor['Bee ID'], ancestor['Bee ID']+'-'+str(ancestor['Continuous ID'])))

    # Reverse the ancestor list and add the oldest ancestor to the tree
    ancestors = ancestors[::-1]
    oldest_ancestor = ancestors[0]
    tree_ancestors.create_node(oldest_ancestor[0],oldest_ancestor[1])

    # Add the ancestors of the node to the tree
    for ancestor in ancestors[1:]:
        tree_ancestors.create_node(ancestor[0],ancestor[1],oldest_ancestor[1])
        oldest_ancestor = ancestor

    return  tree_ancestors

In [None]:
# Create the trees of all best bees ancestors using the function bee_ancestor_tree (option 1)
best_bee_ancestors_trees = {}
i=0
for bee in df_best_bee.collect():
    if not i%10:
        print(i)
    best_bee_ancestors_trees[bee['Bee ID']] = bee_ancestor_tree_2(bee, df_beeId_parent_beeId)
    i+=1

#best_bee_ancestors_trees['<bee id from best bees>'].show()