#### 1 - Please load the dataset into a Spark dataframe

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, asc, desc
from modules.common import get_flattened_job_profile_data

spark = SparkSession.builder.appName("job-profile-analysis").getOrCreate()

df = spark.read.option("inferSchema", "true").json("test_data/*.json")

# flatten the df to make analysis easier
df = get_flattened_job_profile_data(df)

df.show(10)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

#### 2 - Print the schema

In [2]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- jobDetail: struct (nullable = true)
 |    |-- fromDate: string (nullable = true)
 |    |-- location: string (nullable = true)
 |    |-- salary: long (nullable = true)
 |    |-- title: string (nullable = true)
 |    |-- toDate: string (nullable = true)



#### 3 - How many records are there in the dataset?

In [3]:
df.count()

77135383

#### 4 - What is the average salary for each profile?
##### Display the first 10 results, ordered by lastName in descending order

In [3]:
from modules.dataframes_by_profile import get_average_salaries_by_profile

get_average_salaries_by_profile(df) \
    .orderBy(desc('avgSalary')) \
    .limit(10) \
    .show()

+--------------------+---------+--------+---------+
|                  id|firstName|lastName|avgSalary|
+--------------------+---------+--------+---------+
|3c035b6e-8483-49a...|   George|Kastanza| 149000.0|
|3c035b6e-8483-49a...|      Joe| Johnson| 149000.0|
|3c035b6e-8483-49a...|     Bob2|   Barry| 139000.0|
|3c035b6e-8483-49a...|     Bob4|   Barry| 139000.0|
|3c035b6e-8483-49a...|     Bob3|   Barry| 139000.0|
|3c035b6e-8483-49a...|     Bob6|   Barry| 139000.0|
|3c035b6e-8483-49a...|   Andrea|Berryman| 139000.0|
|3c035b6e-8483-49a...|     Bob8|   Barry| 139000.0|
|3c035b6e-8483-49a...|     Bob1|   Barry| 139000.0|
|3c035b6e-8483-49a...|     Bob5|   Barry| 139000.0|
+--------------------+---------+--------+---------+



#### 5 - What is the average salary across the whole dataset?

In [4]:
from modules.dataframes_by_profile import get_average_salary_for_all_profiles

get_average_salary_for_all_profiles(df).show()

+---------+
|avgSalary|
+---------+
|109880.95|
+---------+



#### 6 - On average, what are the top 5 paying jobs? Bottom 5 paying jobs?
##### If there is a tie, please order by title, ~~location~~.

In [6]:
from modules.dataframes_by_title import get_average_salaries_by_job_title

result = get_average_salaries_by_job_title(df)

print('Top 5 paying jobs')
result.orderBy(desc('avgSalary'), 'jobTitle').limit(5).show()

print('Bottom 5 paying jobs')
result.orderBy(asc('avgSalary'), 'jobTitle').limit(5).show()

Top 5 paying jobs
+---------------+---------+
|       jobTitle|avgSalary|
+---------------+---------+
|        actuary| 164000.0|
|devops engineer| 154000.0|
|doctor engineer| 144000.0|
|   new engineer| 144000.0|
|   old engineer| 144000.0|
+---------------+---------+

Bottom 5 paying jobs
+--------------------+---------+
|            jobTitle|avgSalary|
+--------------------+---------+
|     support analyst|  42000.0|
|           evaluator|  44000.0|
|  service technician|  51000.0|
|          technician|  54000.0|
|corporate consultant|  60000.0|
+--------------------+---------+



#### 7 - Who is currently making the most money?
##### If there is a tie, please order in lastName descending, fromDate descending.

In [9]:
from modules.common import get_max_rows_for_column
from modules.dataframes_by_profile import get_current_salaries_by_profile

result = get_current_salaries_by_profile(df)
result = get_max_rows_for_column(result, 'currentSalary')
result.show()

+--------------------+---------+--------+-------------+
|                  id|firstName|lastName|currentSalary|
+--------------------+---------+--------+-------------+
|3c035b6e-8483-49a...|   George|Kastanza|       164000|
|3c035b6e-8483-49a...|      Joe| Johnson|       164000|
+--------------------+---------+--------+-------------+



#### 8 - What was the most popular job title that started in 2019?

In [11]:
from modules.dataframes_by_title import get_most_popular_job_titles

get_most_popular_job_titles(df, 2019).show(1)

+---------------+-------------+----------+
|          title|firstSeenDate|occurrence|
+---------------+-------------+----------+
|prompt engineer|   2019-04-23|         2|
+---------------+-------------+----------+
only showing top 1 row



#### 9 - How many people are currently working?

In [12]:
from pyspark.sql.functions import countDistinct
from modules.common import get_all_current_jobs

get_all_current_jobs(df) \
    .select(countDistinct('id').alias('count_of_current_people_working')) \
    .show()

+-------------------------------+
|count_of_current_people_working|
+-------------------------------+
|                             11|
+-------------------------------+



#### 10 - For each person, list only their latest job
##### Display the first 10 results, ordered by lastName descending, firstName ascending order.

In [13]:
from modules.dataframes_by_profile import get_most_recent_jobs_by_profile

get_most_recent_jobs_by_profile(df) \
    .orderBy(desc('lastName'), asc('firstName')) \
    .limit(10) \
    .show()

+--------------------+---------+--------+--------------------+
|                  id|firstName|lastName|           jobDetail|
+--------------------+---------+--------+--------------------+
|da313df5-9613-450...|   Daniel|  Pierce|{2016-09-08, Pert...|
|3c035b6e-8483-49a...|   George|Kastanza|{2014-09-23, Melb...|
|3c035b6e-8483-49a...|      Joe| Johnson|{2014-09-23, Melb...|
|2238d6ef-ff70-4d5...|    Louis|  Hanson|{2015-03-10, Sydn...|
|3c035b6e-8483-49a...|   Andrea|Berryman|{2014-09-23, Melb...|
|11214286-41bb-4d0...|    Olive|    Bays|{2013-06-02, Pert...|
|3c035b6e-8483-49a...|     Bob1|   Barry|{2018-09-23, Melb...|
|3c035b6e-8483-49a...|     Bob2|   Barry|{2019-09-23, Melb...|
|3c035b6e-8483-49a...|     Bob3|   Barry|{2019-09-30, Melb...|
|3c035b6e-8483-49a...|     Bob4|   Barry|{2019-11-30, Melb...|
+--------------------+---------+--------+--------------------+



#### 11 - For each person, list their highest paying job along with their first name, last name, salary and the year they made this salary
##### Store the results in a dataframe, and then print out 10 results

In [18]:
from modules.dataframes_by_profile import get_highest_paying_job_by_profile

df_result = get_highest_paying_job_by_profile(df)
df_result.show(truncate=False)

+------------------------------------+---------+--------+---------------------+----------------------+--------------------+
|id                                  |firstName|lastName|highestPayingJobTitle|highestPayingJobSalary|highestPayingJobYear|
+------------------------------------+---------+--------+---------------------+----------------------+--------------------+
|11214286-41bb-4d09-9821-98a6d5c4d026|Olive    |Bays    |paralegal            |95000                 |2012                |
|2238d6ef-ff70-4d50-9e93-39e95c6fded9|Louis    |Hanson  |principal            |61000                 |2015                |
|3c035b6e-8483-49a5-92a1-131a6f256c17|Bob8     |Barry   |new engineer         |144000                |2020                |
|3c035b6e-8483-49a5-92a1-131a6f256c18|Bob7     |Barry   |doctor engineer      |144000                |2019                |
|3c035b6e-8483-49a5-92a1-131a6f256c19|Bob6     |Barry   |prompt engineer      |144000                |2019                |
|3c035b6

#### 12 - Write out the last result (question 11) in parquet format, compressed, partitioned by the year of their highest paying job

In [20]:
df_result.write.partitionBy('highestPayingJobYear') \
    .parquet('output_data/', compression='gzip', mode='overwrite')