# Getting and Knowing your Data

Read the raw dataset:

In [44]:
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

// Explicit schema for the orders file, declared column by column.
// item_price is kept as a string here (no numeric parsing at read time).
val schema = new StructType()
    .add("order_id", IntegerType, nullable = false)
    .add("quantity", IntegerType, nullable = false)
    .add("item_name", StringType, nullable = false)
    .add("choice_description", StringType, nullable = true)
    .add("item_price", StringType, nullable = false)

// Input path: resolve the relative dataset location to a canonical path.
val path = new java.io.File("datasets/chipotle.tsv").getCanonicalPath()

// The file is tab-separated and its first line is a header row.
val rawDF = spark.read
    .options(Map("header" -> "true", "delimiter" -> "\t"))
    .schema(schema)
    .csv(path)



import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
schema: org.apache.spark.sql.types.StructType = StructType(StructField(order_id,IntegerType,false), StructField(quantity,IntegerType,false), StructField(item_name,StringType,false), StructField(choice_description,StringType,true), StructField(item_price,StringType,false))
path: String = /home/kuba/projects/pd-exercises-scala-spark/datasets/chipotle.tsv
rawDF: org.apache.spark.sql.DataFrame = [order_id: int, quantity: int ... 3 more fields]


Inspect the data:

In [39]:
// Print the first five rows as a text table to get a feel for the columns.
rawDF.show(numRows = 5)

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|       1|       1|Chips and Fresh T...|                NULL|    $2.39 |
|       1|       1|                Izze|        [Clementine]|    $3.39 |
|       1|       1|    Nantucket Nectar|             [Apple]|    $3.39 |
|       1|       1|Chips and Tomatil...|                NULL|    $2.39 |
|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|   $16.98 |
+--------+--------+--------------------+--------------------+----------+
only showing top 5 rows



### What is the number of observations in the dataset?

In [13]:
// Number of observations = number of rows in the DataFrame (returned as a Long).
rawDF.count()

res5: Long = 4622


### What is the number of columns in the dataset?

In [14]:
// columns is an Array[String] of column names; its length is the column count.
rawDF.columns.length

res6: Int = 5


### Print the names of all the columns.

In [16]:
// Emit each column name on its own line, in schema order.
for (columnName <- rawDF.columns) println(columnName)

order_id
quantity
item_name
choice_description
item_price


### Which was the most-ordered item?

In [54]:
// Sum the ordered quantity per item, sort the totals from largest to
// smallest, and print only the first row (the most-ordered item).
rawDF
    .groupBy(col("item_name"))
    .agg(sum(col("quantity")).as("orders_sum"))
    .sort(desc("orders_sum"))
    .show(1)

+------------+----------+
|   item_name|orders_sum|
+------------+----------+
|Chicken Bowl|       761|
+------------+----------+
only showing top 1 row

