#### 1 - Please load the dataset into a Spark dataframe

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, desc

import util

spark = SparkSession.builder.appName("job-profile-analysis").getOrCreate()

df = spark.read.option("inferSchema", "true").json("test_data/*.json")

# flatten the df to make analysis easier
df = util.get_flattened_job_profile_data(df)

df.show(10)

+--------------------+---------+--------+--------------------+
|                  id|firstName|lastName|           jobDetail|
+--------------------+---------+--------+--------------------+
|e23c7ab2-6479-401...|Elizabeth| Robledo|{2013-03-13, Pert...|
|a4c6238d-0aed-4eb...|    Karen|   Bozek|{2013-10-12, Hoba...|
|a4c6238d-0aed-4eb...|    Karen|   Bozek|{2011-11-25, Hoba...|
|a4c6238d-0aed-4eb...|    Karen|   Bozek|{2008-11-18, Hoba...|
|a4c6238d-0aed-4eb...|    Karen|   Bozek|{2006-09-02, Hoba...|
|a4c6238d-0aed-4eb...|    Karen|   Bozek|{2003-07-19, Hoba...|
|a4c6238d-0aed-4eb...|    Karen|   Bozek|{2001-01-26, Hoba...|
|a4c6238d-0aed-4eb...|    Karen|   Bozek|{2000-04-14, Hoba...|
|a4c6238d-0aed-4eb...|    Karen|   Bozek|{1996-08-04, Hoba...|
|dcbae85f-4971-4fd...|     Lisa|   Grell|{2015-07-14, Bris...|
+--------------------+---------+--------+--------------------+
only showing top 10 rows



#### 2 - Print the schema

In [2]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- jobDetail: struct (nullable = true)
 |    |-- fromDate: string (nullable = true)
 |    |-- location: string (nullable = true)
 |    |-- salary: long (nullable = true)
 |    |-- title: string (nullable = true)
 |    |-- toDate: string (nullable = true)



#### 3 - How many records are there in the dataset?

In [3]:
df.count()

77135383

#### 4 - What is the average salary for each profile?
##### Display the first 10 results, ordered by lastName in descending order

In [4]:
util.get_average_salaries_by_profile(df) \
    .orderBy(desc('avgSalary')) \
    .limit(10) \
    .show()

+--------------------+---------+----------+---------+
|                  id|firstName|  lastName|avgSalary|
+--------------------+---------+----------+---------+
|01603b1b-34cd-49c...|   Hector|     Myers| 159000.0|
|5a0a4a63-cdc4-4db...|    Daren| Bjorklund| 159000.0|
|0159f5b4-87e7-40f...|       Ha|   Pearson| 159000.0|
|e9a4feb9-490b-4c3...|  Christy|    Packer| 159000.0|
|00ec95b7-8ae5-457...|     Tana|Lethbridge| 159000.0|
|d28f679e-26c8-4a4...|      Eva|   Barrese| 159000.0|
|01c195ad-2ae3-4eb...|    Karen|   Kilgore| 159000.0|
|9a7072e2-7023-491...|  Pauline|   Wallace| 159000.0|
|02256b8c-08b2-4e6...| Margaret|    Miller| 159000.0|
|84ce538f-0803-432...|  William|   Cernoch| 159000.0|
+--------------------+---------+----------+---------+



#### 5 - What is the average salary across the whole dataset?

In [5]:
util.get_average_salary_for_all_profiles(df).show()

+---------+
|avgSalary|
+---------+
| 97473.62|
+---------+



#### 6 - On average, what are the top 5 paying jobs? Bottom 5 paying jobs?
##### If there is a tie, please order by title, ~~location~~.

In [6]:
print('Top 5 paying jobs')
util.get_average_salaries_by_job_title(df) \
    .orderBy(desc('avgSalary'), 'jobTitle') \
    .limit(5) \
    .show()

print('Bottom 5 paying jobs')
util.get_average_salaries_by_job_title(df) \
    .orderBy('avgSalary', 'jobTitle') \
    .limit(5) \
    .show()

Top 5 paying jobs
+--------------------+---------+
|            jobTitle|avgSalary|
+--------------------+---------+
|      internal sales| 97555.94|
|  service technician| 97539.87|
|     support analyst| 97515.95|
|clinical psycholo...| 97515.49|
|             dentist| 97515.09|
+--------------------+---------+

Bottom 5 paying jobs
+--------------------+---------+
|            jobTitle|avgSalary|
+--------------------+---------+
|business developm...| 97410.55|
|    research analyst| 97412.93|
|retail sales cons...| 97419.07|
|administration of...| 97423.83|
|           paralegal| 97432.44|
+--------------------+---------+



#### 7 - Who is currently making the most money?
##### If there is a tie, please order in lastName descending, fromDate descending.

In [None]:
result = util.get_current_salaries_by_profile(df)
result = util.get_max_rows_for_column(result, 'currentSalary')
result.show()

#### 8 - What was the most popular job title that started in 2019?

In [None]:
util.get_most_popular_job_titles(df, 2019).show(1)

#### 9 - How many people are currently working?

In [None]:
util.get_all_current_jobs(df) \
    .select(countDistinct('id').alias('count_of_current_people_working')) \
    .show()

#### 10 - For each person, list only their latest job
##### Display the first 10 results, ordered by lastName descending, firstName ascending order.

In [None]:
util.get_most_recent_jobs_by_profile(df) \
    .orderBy(desc('lastName'), asc('firstName')) \
    .limit(10) \
    .show()

#### 11 - For each person, list their highest paying job along with their first name, last name, salary and the year they made this salary
##### Store the results in a dataframe, and then print out 10 results

In [None]:
util.get_highest_paying_job_by_profile(df) \
    .show(truncate=False)

#### 12 - Write out the last result (question 11) in parquet format, compressed, partitioned by the year of their highest paying job

In [None]:
df_result.write.partitionBy('year') \
    .parquet('output_data/', compression='gzip', mode='overwrite')