In [0]:
%pip install -r ../requirements.txt

## First we install Python packages and initialize our globals

In [0]:
# load table into spark dataframe
cows_bff = spark.read.table("db.cows_bff")

## Some graphs and charts

In [0]:
display(cows_bff.limit(10))

## Lets compute meal overlap

In [0]:
from pyspark.sql.functions import udf, countDistinct, sum, col
from pyspark.sql.types import IntegerType

calculate_time_overlap = lambda start_interval_1, end_interval_1, start_interval_2, end_interval_2: \
    0 if end_interval_1 <= start_interval_2 or end_interval_2 <= start_interval_1 else \
    max(end_interval_1, end_interval_2) - min(start_interval_1, start_interval_2) 

calculate_time_overlap_udf = udf(calculate_time_overlap, IntegerType())

cow1 = cows_bff\
    .withColumnRenamed('cow_name','cow1')\
    .withColumnRenamed('meal_start','meal_start1')\
    .withColumnRenamed('meal_end','meal_end1')\
    .withColumnRenamed('date','date1')\
    .select('cow1','meal_start1','meal_end1','date1')


cow2 = cows_bff\
    .withColumnRenamed('cow_name','cow2')\
    .withColumnRenamed('meal_start','meal_start2')\
    .withColumnRenamed('meal_end','meal_end2')\
    .withColumnRenamed('date','date2')\
    .select('cow2','meal_start2','meal_end2','date2')

df = cow1.crossJoin(cow2)\
  .where((cow1.cow1 != cow2.cow2) & (cow1.date1 == cow2.date2))

df = df\
  .withColumn('overlap', calculate_time_overlap_udf("meal_start1", "meal_end1", "meal_start2", "meal_end2"))\
  .select('cow1','cow2','date1','overlap')

df = df.groupBy('cow1', 'cow2').agg(sum('overlap').alias('total_overlap'), countDistinct('date1').alias('distinct_days'))
df = df\
  .withColumn('avg_overlap', df.total_overlap / df.distinct_days)\
  .select('cow1','cow2','avg_overlap')\
  .withColumnRenamed('avg_overlap','closeness')\
  .sort(col('cow1').asc(), col('cow2').asc())


# Display Heatmap

In [0]:
pdf = df.toPandas()
pdf = pdf.pivot(index='cow1', columns='cow2', values='closeness').fillna(0)

import plotly.express as px
fig = px.imshow(pdf, x=pdf.columns, y=pdf.index, labels=dict(x="Cow 2", y="Cow 1", color="closeness"), title="Cow BFFs", color_continuous_scale='redor')
px.imshow(pdf, x=pdf.columns, y=pdf.index, labels=dict(x="Cow 2", y="Cow 1", color="closeness"),)

fig.update_layout(width=800,height=500)
fig.show()



We know that cows are selective in who they eat with, but the heatmap isn't showing that and look like just noise. What's going on with our cow BFF calculation? Let's find out.