In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, desc

import util

# Create (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("job-profile-analysis")
    .getOrCreate()
)

# Read every JSON file under test_small/ and hand the result straight to the
# project helper that flattens it, so `df` is only ever bound to the flattened
# frame that the later cells expect.
# NOTE(review): "inferSchema" appears to be a CSV reader option; the JSON
# reader infers the schema by default when none is supplied — confirm whether
# this option has any effect here.
df = util.get_flattened_job_profile_data(
    spark.read
    .option("inferSchema", "true")
    .json("test_small/*.json")
)

# Preview the first 10 rows of the flattened frame.
df.show(n=10)

+--------------------+---------+--------+--------------------+
|                  id|firstName|lastName|           jobDetail|
+--------------------+---------+--------+--------------------+
|da313df5-9613-450...|   Daniel|  Pierce|{2016-09-08, Pert...|
|da313df5-9613-450...|   Daniel|  Pierce|{2012-12-26, Pert...|
|da313df5-9613-450...|   Daniel|  Pierce|{2008-03-28, Pert...|
|da313df5-9613-450...|   Daniel|  Pierce|{2003-09-13, Pert...|
|da313df5-9613-450...|   Daniel|  Pierce|{2003-07-03, Pert...|
|da313df5-9613-450...|   Daniel|  Pierce|{1998-05-19, Pert...|
|2238d6ef-ff70-4d5...|    Louis|  Hanson|{2015-03-10, Sydn...|
|2238d6ef-ff70-4d5...|    Louis|  Hanson|{2014-02-10, Sydn...|
|2238d6ef-ff70-4d5...|    Louis|  Hanson|{2012-04-22, Sydn...|
|2238d6ef-ff70-4d5...|    Louis|  Hanson|{2011-05-04, Sydn...|
+--------------------+---------+--------+--------------------+
only showing top 10 rows



In [2]:
from pyspark.sql.functions import explode, asc, desc, round, lower, isnull, col, sum, countDistinct, max

In [3]:
# Latest job per person.
#
# Step 1: for every profile id, find the greatest jobDetail.fromDate.
# `max` here is pyspark.sql.functions.max (imported in an earlier cell),
# not the Python builtin.
df_max_dates = (
    df.groupBy('id')
    .agg(max('jobDetail.fromDate').alias('maxFromDate'))
)

# Step 2: join that per-id maximum back onto the flattened rows and keep only
# the rows whose fromDate equals it. Because this is an equality filter, rows
# that tie on the maximum fromDate for one id are all kept.
df_result = (
    df.join(df_max_dates, on='id')
    .filter(col('jobDetail.fromDate') == col('maxFromDate'))
    .select('id', 'firstName', 'lastName', 'jobDetail')
)

# Print without truncation so the whole jobDetail struct is visible.
df_result.show(truncate=False)

+------------------------------------+---------+--------+------------------------------------------------------+
|id                                  |firstName|lastName|jobDetail                                             |
+------------------------------------+---------+--------+------------------------------------------------------+
|da313df5-9613-450a-9e71-45118dc6384a|Daniel   |Pierce  |{2016-09-08, Perth, 103000, dentist, 2019-04-08}      |
|2238d6ef-ff70-4d50-9e93-39e95c6fded9|Louis    |Hanson  |{2015-03-10, Sydney, 61000, principal, 2019-04-10}    |
|11214286-41bb-4d09-9821-98a6d5c4d026|Olive    |Bays    |{2013-06-02, Perth, 94000, paralegal, 2019-03-02}     |
|3c035b6e-8483-49a5-92a1-131a6f256c91|Joe      |Johnson |{2014-09-23, Melbourne, 164000, devops engineer, null}|
|3c035b6e-8483-49a5-92a1-131a6f256c97|Andrea   |Berryman|{2014-09-23, Melbourne, 144000, devops engineer, null}|
|3c035b6e-8483-49a5-92a1-131a6f256c92|George   |Kastanza|{2014-09-23, Melbourne, 164000, actuary

In [4]:
# Ask the project helper for each profile's most recent job and print it with
# show()'s default row limit and truncation.
# NOTE(review): presumably this helper packages the same "latest job per id"
# logic as the manual join in the previous cell — confirm against util.
latest_jobs_via_util = util.get_most_recent_jobs_by_profile(df)
latest_jobs_via_util.show()

+--------------------+---------+--------+--------------------+
|                  id|firstName|lastName|           jobDetail|
+--------------------+---------+--------+--------------------+
|da313df5-9613-450...|   Daniel|  Pierce|{2016-09-08, Pert...|
|2238d6ef-ff70-4d5...|    Louis|  Hanson|{2015-03-10, Sydn...|
|11214286-41bb-4d0...|    Olive|    Bays|{2013-06-02, Pert...|
|3c035b6e-8483-49a...|      Joe| Johnson|{2014-09-23, Melb...|
|3c035b6e-8483-49a...|   Andrea|Berryman|{2014-09-23, Melb...|
|3c035b6e-8483-49a...|   George|Kastanza|{2014-09-23, Melb...|
|3c035b6e-8483-49a...|     Bob1|   Barry|{2018-09-23, Melb...|
|3c035b6e-8483-49a...|     Bob2|   Barry|{2019-09-23, Melb...|
|3c035b6e-8483-49a...|     Bob3|   Barry|{2019-09-30, Melb...|
|3c035b6e-8483-49a...|     Bob4|   Barry|{2019-11-30, Melb...|
|3c035b6e-8483-49a...|     Bob5|   Barry|{2019-09-23, Melb...|
|3c035b6e-8483-49a...|     Bob6|   Barry|{2019-04-23, Melb...|
|3c035b6e-8483-49a...|     Bob7|   Barry|{2019-06-23, M