In [None]:
# Bee Hive data https://drive.google.com/file/d/142IBcs6OyQiJxO7owPfkEBFbkrudnh0g/view?usp=sharing

In [None]:
# Application name: used below as the Spark application name and as the
# section key when reading this app's settings from the project configuration.
APP = "BeeHive"

In [None]:
# Install a pip package in the current Jupyter kernel
!{sys.executable} -m pip install -e '../../../Wielder/'
!{sys.executable} -m pip install -e '../'

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.functions import split, col

from pep_data.project import quick_conf
from pep_data.spark.util import field_to_struct

%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

In [None]:
# Start (or reuse an already running) Spark session named after the application
spark = (
    SparkSession.builder
    .appName(APP)
    .getOrCreate()
)

In [None]:
# Get app configuration from project.conf file
# (quick_conf is imported from pep_data.project; the result is indexed by APP
# further down to obtain the column lists and the data path)
conf = quick_conf()

In [None]:
# Build the schema for the data from the column lists in the app configuration
cols_name, cols_double, cols_integer = (
    conf[APP][key] for key in ('cols_name', 'cols_double', 'cols_integer')
)

# One struct field per column header; the double/integer column lists are
# handed to field_to_struct together with each header
fields = [
    field_to_struct(column_header, doubles=cols_double, integers=cols_integer)
    for column_header in cols_name
]

# Create the schema from the fields
schema = StructType(fields)

In [None]:
# Load the csv from the configured location, applying the explicit schema
data_path = conf[APP]['data_path']
df = (
    spark.read
    .schema(schema)
    .csv(data_path)
)

df.show()

In [None]:
# Drop the unnecessary columns, i.e. every column whose name contains the word "remove"
cols_to_keep = list(filter(lambda column_name: 'remove' not in column_name, df.columns))
df = df.select(*cols_to_keep)

df.show()

In [None]:
# Create the Cycle column from the Bee ID column
# (the part of 'Bee ID' that precedes the first '_')
df_cleaned = df.withColumn('Cycle', split(col('Bee ID'), '_').getItem(0))

# Show the frame that actually contains the new Cycle column
# (showing `df` here would display the data without it)
df_cleaned.show()

In [None]:
# Convert from pyspark data frame to pandas data frame
# NOTE(review): 'Cycle' is produced by a string split upstream, so it is presumably
# still a string column here - convert it before any numeric use.
df_cleaned_pd = df_cleaned.toPandas()

# Display the resulting pandas data frame as the cell output
df_cleaned_pd

In [None]:
# Create scatter plot with a capped number of x ticks

# Create plot figure and axes
fig, ax = plt.subplots()

# Ticks format (thousands separator, no decimals)
label_format = '{:,.0f}'

# Upper bound on the number of x-axis intervals
num_of_ticks = 25

# 'Cycle' is the result of a string split, so convert it to integers before plotting.
# Plotting the raw strings gives a categorical axis (ticks sit at category positions in
# order of appearance, not at cycle values), and min()/max() on strings compare
# lexicographically ('9' > '10').
cycles = df_cleaned_pd['Cycle'].astype(int)

# Creates dots on the graph
ax.scatter(cycles, df_cleaned_pd['DaughtersEfficiencyScore'], s=5)

# Set x axis name, including the numeric range of the cycles
ax.set_xlabel(f'Cycle {cycles.min()} - {cycles.max()}')

# Set y axis name
ax.set_ylabel('DaughtersEfficiencyScore')

# Let MaxNLocator choose the tick positions (integer-valued, to match the label format and
# avoid cramped x-labels), then freeze them with FixedLocator so that set_xticklabels
# receives exactly one label per tick.
ax.xaxis.set_major_locator(mticker.MaxNLocator(num_of_ticks, integer=True))
ticks_loc = ax.get_xticks().tolist()
ax.xaxis.set_major_locator(mticker.FixedLocator(ticks_loc))
ax.set_xticklabels([label_format.format(x) for x in ticks_loc])

plt.show()

In [None]:
# Create scatter plot with one x tick per cycle and rotated (vertical) tick labels

# Create plot figure and axes
fig, ax = plt.subplots()

# 'Cycle' is the result of a string split, so convert it to integers before plotting.
# Otherwise the points land on a categorical axis (positions 0..n-1) while the ticks
# below are placed at the integer cycle values, and the two do not line up.
cycles = df_cleaned_pd['Cycle'].astype(int)

# Creates dots on the graph
ax.scatter(cycles, df_cleaned_pd['DaughtersEfficiencyScore'], s=5)

# Get all distinct cycles (x ticks for the graph)
x_ticks = sorted(int(cycle) for cycle in cycles.unique())

# Add grid to the graph
ax.grid()

# Change margins of the graph
ax.margins(x=0.01)

# One tick per cycle, with rotated labels and a smaller font size
ax.set_xticks(x_ticks)
ax.tick_params(axis='x', labelrotation=90, labelsize=9)

# Set x axis name
ax.set_xlabel('Cycle')

# Set y axis name
ax.set_ylabel('DaughtersEfficiencyScore')

plt.show()

In [None]:
# Create scatter plot whose figure width is scaled so the x tick labels have room

# Create plot figure and axes
fig, ax = plt.subplots()

# 'Cycle' is the result of a string split, so convert it to integers before plotting.
# Otherwise the points land on a categorical axis (positions 0..n-1) while the ticks
# below are placed at the integer cycle values, and the two do not line up.
cycles = df_cleaned_pd['Cycle'].astype(int)

# Creates dots on the graph
ax.scatter(cycles, df_cleaned_pd['DaughtersEfficiencyScore'], s=5)

# Get all distinct cycles (x ticks for the graph)
x_ticks = sorted(int(cycle) for cycle in cycles.unique())

# Load x ticks to graph
ax.set_xticks(x_ticks)

# Number of label slots used for the width: one per integer from 0 to the largest cycle
N = max(x_ticks) + 1

# Change margins of the graph
ax.margins(x=0.01)

# Calculate the space between x ticks.
# The canvas has to be drawn first so the tick labels have a measurable extent.
fig.canvas.draw()
tl = ax.get_xticklabels()
maxsize = max(t.get_window_extent().width for t in tl)  # widest label, in pixels
m = 0.2  # inch margin
s = maxsize / fig.dpi * N + 2 * m  # new figure width in inches
margin = m / fig.get_size_inches()[0]  # margin as a fraction of the current width

# Change the fontsize of x ticks
ax.tick_params(axis='x', labelsize=8)

# Adjust graph according to new x ticks
fig.subplots_adjust(left=margin + 0.01, right=1. - margin + 0.01)
fig.set_size_inches(s, fig.get_size_inches()[1])

# Set x axis name
ax.set_xlabel('Cycle')

# Set y axis name
ax.set_ylabel('DaughtersEfficiencyScore')

plt.show()