In [17]:
from typing import Optional

from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)

# Start (or reuse) the Spark session for this job.
spark = SparkSession.builder.appName("df_most_interviewed").getOrCreate()

# Explicit schema for the CSV: two string IDs and an integer rating,
# each declared with nullable=False.
# NOTE(review): "interviwer_id" looks like a misspelling of "interviewer_id";
# it is left unchanged because the column name is part of the runtime schema.
schema_fields = [
    t.StructField("interviwer_id", t.StringType(), False),
    t.StructField("occupation_id", t.StringType(), False),
    t.StructField("rating", t.IntegerType(), False),
]
table_schema = t.StructType(schema_fields)

csv_file_path = "file:///home/jovyan/work/sample/like.csv"
reader = spark.read.schema(table_schema)
df = reader.csv(csv_file_path)

# Number of rows per occupation_id, largest count first.
rows_per_occupation = df.groupBy("occupation_id").count()
interviewer_count = rows_per_occupation.orderBy(f.desc("count"))

# Collect the aggregated rows to the driver and print them as "id: count".
counts_renamed = interviewer_count.select("occupation_id", f.col("count").alias("cnt"))
for row in counts_renamed.collect():
    print(f"{row.occupation_id}: {row.cnt}")


# But what if we want to know which occupation each occupation_id stands for?
# 1100: engineer
# 2030: developer
# 3801: painter
# 3021: chemistry teacher
# 9382: priest

# Lookup table: occupation_id (a string, matching the schema's StringType
# column) -> human-readable occupation name.
meta = {
    "1100": "engineer",
    "2030": "developer",
    "3801": "painter",
    "3021": "chemistry teacher",
    "9382": "priest",
}

# Broadcast the table through the SparkContext; get_occupation_name reads it
# back via `occupation_dict.value`.
spark_context = spark.sparkContext
occupation_dict = spark_context.broadcast(meta)

def get_occupation_name(occupation_id: str) -> Optional[str]:
    """Return the occupation name for *occupation_id*, or ``None`` if unknown.

    The lookup table is read from the ``occupation_dict`` broadcast variable.
    An ID that is missing from the table (including ``None``) yields ``None``
    instead of raising ``KeyError``, so that when this function runs as a
    Spark UDF an unmapped row produces a null value rather than failing the
    whole job.

    Args:
        occupation_id: Occupation identifier as it appears in the data.

    Returns:
        The mapped occupation name, or ``None`` when there is no mapping.
    """
    # dict.get never raises for a missing (or None) key; it returns None.
    return occupation_dict.value.get(occupation_id)

# Wrap the Python lookup as a Spark UDF. No return type is passed, so
# PySpark's default return type applies.
occupation_lookup_udf = f.udf(get_occupation_name)

# Derive the name column from occupation_id and attach it to the counts.
occupation_name_col = occupation_lookup_udf(f.col("occupation_id"))
occupation_with_name = interviewer_count.withColumn("occupation_name", occupation_name_col)

# Display up to 10 rows of the enriched result.
occupation_with_name.show(10)

1100: 217
3801: 203
2030: 200
3021: 191
9382: 189
+-------------+-----+-----------------+
|occupation_id|count|  occupation_name|
+-------------+-----+-----------------+
|         1100|  217|         engineer|
|         3801|  203|          painter|
|         2030|  200|        developer|
|         3021|  191|chemistry teacher|
|         9382|  189|           priest|
+-------------+-----+-----------------+

