# Getting and Knowing your Data - Occupation

### Import the dataset

In [13]:
import org.apache.spark.sql.functions._

import org.apache.spark.sql.functions._


In [4]:
// Location of the pipe-delimited users file, relative to the working directory.
val path = "datasets/users.csv"

// Reader settings gathered in one place: the first line is a header row,
// column types are inferred from the data, and fields are separated by '|'.
val rawDF = spark.read
  .options(Map(
    "header" -> "true",
    "inferschema" -> "true",
    "delimiter" -> "|"
  ))
  .csv(path)

// Quick sanity check: preview the first rows, then print the inferred schema.
rawDF.show()
rawDF.printSchema()

+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|      1| 24|     M|   technician|   85711|
|      2| 53|     F|        other|   94043|
|      3| 23|     M|       writer|   32067|
|      4| 24|     M|   technician|   43537|
|      5| 33|     F|        other|   15213|
|      6| 42|     M|    executive|   98101|
|      7| 57|     M|administrator|   91344|
|      8| 36|     M|administrator|   05201|
|      9| 29|     M|      student|   01002|
|     10| 53|     M|       lawyer|   90703|
|     11| 39|     F|        other|   30329|
|     12| 28|     F|        other|   06405|
|     13| 47|     M|     educator|   29206|
|     14| 45|     M|    scientist|   55106|
|     15| 49|     F|     educator|   97301|
|     16| 21|     M|entertainment|   10309|
|     17| 30|     M|   programmer|   06355|
|     18| 35|     F|        other|   37212|
|     19| 40|     M|    librarian|   02138|
|     20| 42|     F|    homemake

path: String = datasets/users.csv
rawDF: org.apache.spark.sql.DataFrame = [user_id: int, age: int ... 3 more fields]


### What is the number of observations in the dataset?

In [5]:
// Number of observations: count() is an action that returns the row count of rawDF as a Long.
rawDF.count()

res4: Long = 943


### What is the number of columns in the dataset?

In [6]:
// Number of columns: `columns` is an Array[String] of column names, so its length is the column count.
rawDF.columns.length

res5: Int = 5


### Print the name of all the columns.

In [7]:
// Print every column name on its own line, in schema order.
for (columnName <- rawDF.columns) println(columnName)

user_id
age
gender
occupation
zip_code


### How many different occupations are in this dataset?

In [12]:
// Project the occupation column and drop duplicate values.
val distinctOccupations =
  rawDF
    .select(col("occupation"))
    .distinct()

// Display up to 21 distinct values (show() would otherwise stop at its default row limit).
distinctOccupations.show(21)

// The answer to the question: how many distinct occupations exist.
distinctOccupations.count()

+-------------+
|   occupation|
+-------------+
|   programmer|
|    marketing|
|     salesman|
|     engineer|
|     educator|
|    librarian|
|    scientist|
|administrator|
|   healthcare|
|entertainment|
|    homemaker|
|       writer|
|    executive|
|      student|
|      retired|
|        other|
|       doctor|
|       lawyer|
|         none|
|       artist|
|   technician|
+-------------+



distinctOccupations: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [occupation: string]
res11: Long = 21


### What is the most frequent occupation?

In [18]:
// Count users per occupation, sort by that count from highest to lowest,
// and display only the top row, i.e. the most frequent occupation.
rawDF
  .groupBy(col("occupation"))
  .agg(count(col("occupation")).alias("occupation_count"))
  .sort(desc("occupation_count"))
  .show(1)

+----------+----------------+
|occupation|occupation_count|
+----------+----------------+
|   student|             196|
+----------+----------------+
only showing top 1 row



### What is the mean age of users?

In [20]:
// Mean age over all users: a global aggregation (no grouping keys) yielding a single row.
rawDF.agg(avg(col("age"))).show()

+-----------------+
|         avg(age)|
+-----------------+
|34.05196182396607|
+-----------------+



### What is the age with least occurrence?

In [28]:
// Count how many users share each age, sort by that count ascending,
// and display the five least frequent ages.
// NOTE(review): only the count is a sort key, so the order among ages with
// equal counts is presumably not guaranteed — confirm before relying on it.
rawDF
  .groupBy(col("age"))
  .agg(count(col("age")).alias("age_occurences"))
  .sort(asc("age_occurences"))
  .show(5)

+---+--------------+
|age|age_occurences|
+---+--------------+
| 10|             1|
| 73|             1|
| 11|             1|
| 66|             1|
|  7|             1|
+---+--------------+
only showing top 5 rows

