In [None]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql import functions as f

In [None]:
# Build the "BeeHive" Spark session (getOrCreate reuses a running one if present)
spark = (
    SparkSession.builder
    .appName("BeeHive")
    .getOrCreate()
)

In [None]:
# Schema inputs for the bee CSV.

# Every column of the file, in file order; the 'remove*' names mark columns
# that are dropped later in the notebook.
headers = [
    "remove",
    "Bee ID",
    "remove1",
    "DaughtersEfficiencyScore",
    "remove2",
    "Father SIZE",
    "Father TYPE",
    "remove3",
    "X",
    "Y",
    "Z",
]

# Columns whose values are doubles
doubles = ["DaughtersEfficiencyScore", "X", "Y", "Z"]

# Columns whose values are integers
integers = ["Father SIZE"]

def struct_field(header, doubles, integers):
    """Build the schema field for one column.

    Args:
        header: column name.
        doubles: collection of column names whose values are doubles.
        integers: collection of column names whose values are integers.
    Returns:
        A StructField named ``header`` typed as DoubleType when the name is in
        ``doubles``, IntegerType when it is in ``integers``, and StringType
        otherwise. ``doubles`` is checked first.
    """
    if header in doubles:
        column_type = DoubleType()
    elif header in integers:
        column_type = IntegerType()
    else:
        # Anything not listed as numeric is read as a string
        column_type = StringType()

    return StructField(header, column_type)

# One field per column name, in header order
fields = [
    struct_field(column_name, doubles, integers)
    for column_name in headers
]

# Assemble the schema from the fields
schema = StructType(fields)

In [None]:
# Load the CSV, applying the explicit schema built above
df = spark.read.csv('Downloads/BeeHiveTestData.csv', schema=schema)

# Preview the first 5 rows
df.show(5)

In [None]:
# Keep only the columns whose name does not contain the string 'remove'
cols_to_keep = list(filter(lambda name: 'remove' not in name, df.columns))
df = df.select(cols_to_keep)

# Preview the first 5 rows
df.show(5)

In [None]:
# Remove unnecessary columns
# The trailing comma makes this a real tuple (without it the parentheses are
# just grouping and the value is a plain string), so more column names can be
# appended safely; the tuple is unpacked because drop() takes one name per argument.
cols_to_drop = ('Father TYPE',)
df_cleaned = df.drop(*cols_to_drop)

df_cleaned.show(5)

In [None]:
# Create a data frame with the best bee (lowest DaughtersEfficiencyScore) for each Father SIZE

# Lowest score found within each Father SIZE group
best_score_per_size = df_cleaned.groupBy('Father SIZE').agg(
    f.min('DaughtersEfficiencyScore').alias('DaughtersEfficiencyScore'))

# Keep only the bees that hold the lowest score of their OWN group. The join
# must match on Father SIZE as well as on the score: matching on the score
# alone would also keep a bee whose score merely equals the minimum of some
# other Father SIZE group. Bees tied for the minimum of a group are all kept.
# NOTE(review): an equality join does not match null keys, so rows with a null
# Father SIZE are no longer selected — confirm that is acceptable for this data.
df_cleaned = df_cleaned.join(
    best_score_per_size,
    on=['Father SIZE', 'DaughtersEfficiencyScore'],
    how='leftsemi')

df_cleaned.show()

In [None]:
# Convert from pyspark data frame to pandas data frame.
# The plotting cells below read coordinates, Father SIZE and Bee ID from df_cleaned_pd.
df_cleaned_pd = df_cleaned.toPandas()

# Last expression of the cell: show the resulting frame
df_cleaned_pd

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

In [None]:
# Create scatter plot based on bees location with text annotation over data point

# Colors for bee groups (groups of Father SIZE)
colors = ('blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan', 'yellow')

# Number of rows in the data frame
length = len(df_cleaned_pd.index)

# Create plot figure
fig = plt.figure()

# Create 3D axes on that figure (explicitly, rather than on "the current figure")
ax = fig.add_subplot(111, projection='3d')

for i in range(length):

    # Create dot on graph with color and label.
    # The palette is cycled (i % len(colors)) so that more rows than colors
    # no longer raises an IndexError; .iloc reads by position, so the loop
    # does not depend on the frame's index labels.
    ax.scatter3D(df_cleaned_pd['X'].iloc[i], df_cleaned_pd['Y'].iloc[i], df_cleaned_pd['Z'].iloc[i],
            c = colors[i % len(colors)], label = f"Father Size = {df_cleaned_pd['Father SIZE'].iloc[i]}")

    # Add text(Bee_ID) for dot
    ax.text(df_cleaned_pd['X'].iloc[i], df_cleaned_pd['Y'].iloc[i], df_cleaned_pd['Z'].iloc[i],
            '%s' % (df_cleaned_pd['Bee ID'].iloc[i]), size=10)

# Add Plot Title
ax.set_title('X Y Z of all father sizes')

# Set x axis name
ax.set_xlabel('X')

# Set y axis name
ax.set_ylabel('Y')

# Set z axis name
ax.set_zlabel('Z')

# Create legend(descriptive labels for each plotted data series)
fig.legend(loc="upper left")

# Show plot, not necessary but used to remove unwanted output
plt.show()

In [None]:
import numpy as np

In [None]:
# Build the (numPoints, 3) numpy array of X, Y, Z coordinates.
# It is taken from the pandas frame rather than from a second Spark collect():
# the hover plot below looks up Bee IDs in df_cleaned_pd by row position, and a
# separate Spark action is not guaranteed to return rows in the same order as
# the earlier toPandas() call, which could silently attach the wrong Bee ID.
df_cleaned_np = df_cleaned_pd[['X', 'Y', 'Z']].to_numpy()

df_cleaned_np

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import proj3d

In [None]:
# Create scatter plot based on bees location with text annotation over data point closest to mouse

def visualize3DData(df_cleaned_np):
    """Visualize data in 3d plot with popover next to mouse position.

    One dot is drawn per row of the notebook-level ``df_cleaned_pd`` frame,
    colored from the notebook-level ``colors`` palette (cycled when there are
    more rows than colors). Moving the mouse over the figure labels the point
    of ``df_cleaned_np`` nearest to the cursor with its Bee ID, which is looked
    up by row position in ``df_cleaned_pd`` -- so row ``i`` of both must
    describe the same bee.

    Args:
        df_cleaned_np (np.array) - array of points, of shape (numPoints, 3)
    Returns:
        None
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # Use the frame's current row count (not a value cached in another cell)
    # and read by position so index labels do not matter.
    for i in range(len(df_cleaned_pd.index)):
        ax.scatter3D(df_cleaned_pd['X'].iloc[i], df_cleaned_pd['Y'].iloc[i], df_cleaned_pd['Z'].iloc[i],
            c = colors[i % len(colors)], label = f"Father Size = {df_cleaned_pd['Father SIZE'].iloc[i]}",
            depthshade = False, picker = True)

    # Popover currently on screen; None until the first mouse move
    hover_label = None

    def distance(point, event):
        """Return distance between mouse position and given data point

        Args:
            point (np.array): np.array of shape (3,), with x,y,z in data coords
            event (MouseEvent): mouse event (mouse position is read from .x and .y)
        Returns:
            distance (np.float64): distance (in screen coords) between mouse pos and data point
        """
        # Wrap the shape in a 1-tuple: '%' with a bare multi-element tuple would
        # raise TypeError instead of producing the intended message.
        assert point.shape == (3,), "distance: point.shape is wrong: %s, must be (3,)" % (point.shape,)

        # Project 3d data space to 2d data space, always with THIS plot's axes
        # (the "current" axes may belong to another figure by the time the event fires)
        x2, y2, _ = proj3d.proj_transform(point[0], point[1], point[2], ax.get_proj())
        # Convert 2d data space to 2d screen space
        x3, y3 = ax.transData.transform((x2, y2))

        return np.sqrt((x3 - event.x)**2 + (y3 - event.y)**2)


    def calcClosestDatapoint(df_cleaned_np, event):
        """Calculate which data point is closest to the mouse position.

        Args:
            df_cleaned_np (np.array) - array of points, of shape (numPoints, 3)
            event (MouseEvent) - mouse event (containing mouse position)
        Returns:
            smallestIndex (int) - the index (into df_cleaned_np) of the element closest to the mouse position
        """
        distances = [distance(df_cleaned_np[i, 0:3], event) for i in range(df_cleaned_np.shape[0])]
        return np.argmin(distances)


    def annotatePlot(df_cleaned_np, index):
        """Create popover label in 3d chart

        Args:
            df_cleaned_np (np.array) - array of points, of shape (numPoints, 3)
            index (int) - index (into df_cleaned_np) of item which should be labelled
        Returns:
            None
        """
        nonlocal hover_label

        # If we have previously displayed another label, remove it first
        if hover_label is not None:
            hover_label.remove()

        # Project the selected point to the 2d coordinates the annotation is anchored to
        x2, y2, _ = proj3d.proj_transform(df_cleaned_np[index, 0], df_cleaned_np[index, 1], df_cleaned_np[index, 2], ax.get_proj())
        hover_label = ax.annotate("Bee ID = %s" % df_cleaned_pd['Bee ID'].iloc[index],
            xy = (x2, y2), xytext = (-20, 20), textcoords = 'offset points', ha = 'right', va = 'bottom',
            bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
            arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))

        # Request a redraw instead of forcing one synchronously on every mouse move
        fig.canvas.draw_idle()


    def onMouseMotion(event):
        """Event that is triggered when mouse is moved. Shows text annotation over data point closest to mouse."""
        # With no points there is nothing to label (np.argmin raises on an empty sequence)
        if len(df_cleaned_np) == 0:
            return
        closestIndex = calcClosestDatapoint(df_cleaned_np, event)
        annotatePlot(df_cleaned_np, closestIndex)

    fig.canvas.mpl_connect('motion_notify_event', onMouseMotion)  # on mouse motion
    plt.show()


# Launch the interactive hover plot for the selected bees.
# NOTE(review): in a notebook kernel __name__ is presumably '__main__', so this
# guard is expected to always run here — confirm if this code is ever imported as a module.
if __name__ == '__main__':
    visualize3DData (df_cleaned_np)