# PysparkParseGroup Task

First, a Spark context and a SQL context are created.

In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Reuse the already-running SparkContext when this cell is re-executed:
# constructing a second SparkContext in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
# SQL entry point used by the later cells (`sqlContext.read.csv(...)`).
sqlContext = SQLContext(sc)



The data is then loaded from `jobs.csv` into a DataFrame, using the first row as the header and letting Spark infer the column types.

In [2]:
# Read jobs.csv into a DataFrame: the first line supplies the column names
# and Spark infers each column's type from the data.
jobs = sqlContext.read.csv(
    path="jobs.csv",
    header=True,
    inferSchema=True,
)

Inspect the first 5 rows of the DataFrame.

In [3]:
# Preview only the first five rows rather than the whole DataFrame.
jobs.show(n=5)

+------------------+--------------+
|              name|           job|
+------------------+--------------+
|    Candice Turner|        Banker|
|      Keith Flores|      Designer|
|     Jason Collins|         Pilot|
|      Jimmy Lawson|       Barista|
|Ms. Crystal Travis|Office manager|
+------------------+--------------+
only showing top 5 rows



Inspecting the schema

In [5]:
# Print each column's name, inferred type and nullability as a tree.
jobs.printSchema()

root
 |-- name: string (nullable = true)
 |-- job: string (nullable = true)



The DataFrame is then grouped by job, the entries in each group are counted, and the result is sorted in ascending order by count, with ties broken alphabetically by job.

In [17]:
# One row per job with its number of entries, ordered by ascending count;
# jobs with the same count are ordered alphabetically.
job_frequency = (
    jobs
    .groupby("job")
    .agg({"job": "count"})      # produces a column named "count(job)"
    .sort("count(job)", "job")
)

In [18]:
# Display the per-job counts in their sorted order.
job_frequency.show()

+-----------------+----------+
|              job|count(job)|
+-----------------+----------+
|            Pilot|        15|
|          Teacher|        24|
|          Barista|        28|
|         Designer|        29|
|           Banker|        31|
|   Office manager|        31|
|Software engineer|        33|
|           Dancer|        34|
|            Nurse|        34|
|      Film editor|        40|
+-----------------+----------+



The resulting DataFrame is then collected to the driver and converted into a dictionary that maps each job to its count.

In [26]:
# Each collected row is a (job, count) pair; build the mapping in row order.
{title: total for title, total in job_frequency.collect()}

{'Pilot': 15,
 'Teacher': 24,
 'Barista': 28,
 'Designer': 29,
 'Banker': 31,
 'Office manager': 31,
 'Software engineer': 33,
 'Dancer': 34,
 'Nurse': 34,
 'Film editor': 40}

## The final function

In [27]:
def group_sort(input_path):
    """Count how often each job occurs in a CSV file.

    The file is read through the notebook-level ``sqlContext`` with the first
    line treated as the header and column types inferred. Rows are grouped by
    the ``job`` column, counted, and ordered by ascending count, then by job
    name.

    Parameters
    ----------
    input_path : str
        Location of the CSV file; it must contain a ``job`` column.

    Returns
    -------
    dict
        Maps each job to its count, with keys inserted in the sorted order
        described above.

    Notes
    -----
    NOTE(review): ``count(job)`` presumably counts only non-null values, so
    rows with a missing job would show up as a ``None`` key with count 0 --
    confirm whether the input can contain nulls.
    """
    frame = sqlContext.read.csv(input_path, header=True, inferSchema=True)
    # The dict-style aggregation names its output column "count(job)".
    counted = frame.groupby("job").agg({"job": "count"})
    ordered_rows = counted.sort("count(job)", "job").collect()
    # Every row is a (job, count) pair.
    return {title: total for title, total in ordered_rows}

### Result

In [28]:
# Run the complete pipeline on the same file used in the cells above.
group_sort(input_path="jobs.csv")

{'Pilot': 15,
 'Teacher': 24,
 'Barista': 28,
 'Designer': 29,
 'Banker': 31,
 'Office manager': 31,
 'Software engineer': 33,
 'Dancer': 34,
 'Nurse': 34,
 'Film editor': 40}