In [None]:
# Bee Hive dataset (shared on Google Drive): https://drive.google.com/file/d/142IBcs6OyQiJxO7owPfkEBFbkrudnh0g/view?usp=sharing

In [None]:
# Application name: used as the Spark app name and as the section key in the project configuration
APP = "BeeHive"

In [None]:
# Run the project's packaging script through the shell (path is relative to this notebook's working directory).
# Per the original note, this installs a pip package into the current Jupyter kernel.
# NOTE(review): presumably the package is the local `pep_data` project imported below — confirm against package_py.bash
! '../../package_py.bash'

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

from pep_data.project import quick_conf
from pep_data.spark.util import field_to_struct

%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Start a Spark session named after the application (or reuse the one that already exists)
spark = (
    SparkSession.builder
    .appName(APP)
    .getOrCreate()
)

In [None]:
# Load the application configuration via pep_data.project.quick_conf().
# Per the original note the values come from the project.conf file.
# The result is indexed as conf[APP][<key>] by the cells below
# (keys used: 'cols_name', 'cols_double', 'cols_integer', 'data_path').
conf = quick_conf()

In [None]:
# Build an explicit schema for the data from the column lists in the app configuration:
#   cols_name    - every column header
#   cols_double  - passed to field_to_struct as `doubles`
#   cols_integer - passed to field_to_struct as `integers`
cols_name, cols_double, cols_integer = (
    conf[APP][key] for key in ('cols_name', 'cols_double', 'cols_integer')
)

# Convert every header into a schema field
fields = list(map(
    lambda header: field_to_struct(header, doubles=cols_double, integers=cols_integer),
    cols_name,
))

# Assemble the schema from the fields
schema = StructType(fields)

In [None]:
# Load the CSV found at the configured path, applying the explicit schema
data_path = conf[APP]['data_path']
df = (
    spark.read
    .schema(schema)
    .csv(data_path)
)

# Preview the loaded rows
df.show()

In [None]:
# Count the number of bees for each Father SIZE.
# Spark does not guarantee the row order of a groupBy result, and the charts
# further down draw their bars/slices in row order, so sort by Father SIZE to
# keep the table and every chart in a stable, ascending order across runs.
df_cleaned = df.groupBy('Father SIZE').count().orderBy('Father SIZE')

df_cleaned.show()

In [None]:
# Bring the aggregated rows to the driver and pack them into a numpy array
# (column 0: Father SIZE, column 1: count)
df_cleaned_np = np.array(
    df_cleaned
    .select('Father SIZE', 'count')
    .collect()
)

# Display the array
df_cleaned_np

In [None]:
# Vertical bar chart: number of bees in each Father SIZE group

fig, ax = plt.subplots(figsize=(8, 7))

# One bar per group; the sizes are turned into text to serve as the bar labels
size_labels = [str(size) for size in df_cleaned_np[:, 0]]
ax.bar(size_labels, df_cleaned_np[:, 1])

# Faint dash-dot gridlines on both axes
ax.grid(color='grey', linestyle='-.', linewidth=0.5, alpha=0.2)

# Print each bar's height (no decimals) above the bar: the label is anchored at
# the top-centre of the bar and shifted 8 points upwards so it does not touch it
for rect in ax.patches:
    top_centre = (rect.get_x() + rect.get_width() / 2, rect.get_height())
    ax.annotate(
        format(rect.get_height(), '.0f'),
        top_centre,
        xytext=(0, 8),
        textcoords='offset points',
        ha='center',
        va='center',
        size=15,
    )

# Title and axis names
ax.set_title('Bee population distribution by father size', fontsize=25)
ax.set_xlabel("Father size", fontsize=18)
ax.set_ylabel('Bees amount', fontsize=18)

# Not strictly necessary, but suppresses the unwanted text output of the last call
plt.show()

In [None]:
# Create horizontal bar chart based on amount of bees in each Father SIZE group

# Create plot figure and axes
fig, ax = plt.subplots(figsize=(9, 8))

# Create horizontal bars and their labels
ax.barh([str(i) for i in df_cleaned_np[:, 0]], df_cleaned_np[:, 1])

# Add x, y gridlines
ax.grid(color='grey', linestyle='-.', linewidth=0.5, alpha=0.2)

# Add the bee count next to the end of every bar.
# The label is drawn on this chart's own axes (not on pyplot's "current" axes),
# anchored at the bar's end and vertical centre, and pushed 4 points to the
# right. An offset in points keeps the gap constant whatever the magnitude of
# the counts, and centring on the bar's real thickness avoids a hard-coded
# vertical shift. Counts are printed without decimals, like the vertical chart.
for bar in ax.patches:
    ax.annotate(format(bar.get_width(), '.0f'),
                (bar.get_width(), bar.get_y() + bar.get_height() / 2),
                xytext=(4, 0), textcoords='offset points',
                ha='left', va='center',
                fontsize=10, fontweight='bold', color='grey')

# Add Plot Title
ax.set_title('Bee population distribution by father size', fontsize=25)

# Set x axis name
ax.set_xlabel('Bees amount', fontsize=18)

# Set y axis name
ax.set_ylabel('Father size', fontsize=18)

# Show plot, not necessary but used to remove unwanted output
plt.show()

In [None]:
# Pie chart: share of bees (in %) belonging to each Father SIZE group

fig1, ax1 = plt.subplots()

# Slice sizes come from the counts; every slice is labelled with its
# Father SIZE and shows its percentage of the total with two decimals
ax1.pie(
    df_cleaned_np[:, 1],
    labels=df_cleaned_np[:, 0],
    autopct='%1.2f%%',
)

# Equal axis scaling keeps the pie circular
ax1.axis('equal')

# Not strictly necessary, but suppresses the unwanted text output of the last call
plt.show()