# Ex2 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the session that every later cell reads from.
# Note: the variable holds a SparkSession even though it is named `spark_context`.
spark_context = (
    SparkSession.builder
    .appName('Spark Exercises')
    .getOrCreate()
)

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

In [None]:
import os
import tempfile
import urllib.request

import pandas

URL_DATA = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"

# 1. Using pandas: read the TSV straight from the URL, then hand the frame to Spark.
pandas_df_chipotle = pandas.read_csv(URL_DATA, sep="\t")

df_chipotle = spark_context.createDataFrame(pandas_df_chipotle)
df_chipotle.printSchema()
df_chipotle.show(5)

# 2. Using PySpark: the CSV reader is given a local path here, so fetch the file
# into the system temp directory instead of relying on a path that only exists
# on one machine.
FILE_PATH = os.path.join(tempfile.gettempdir(), "chipotle.tsv")
if not os.path.exists(FILE_PATH):
    # Download to a side file first so an interrupted transfer never leaves a
    # truncated chipotle.tsv behind for the next run to pick up.
    partial_path = FILE_PATH + ".part"
    urllib.request.urlretrieve(URL_DATA, partial_path)
    os.replace(partial_path, FILE_PATH)

# This rebinds df_chipotle, so the cells below work on the Spark-parsed frame.
df_chipotle = spark_context.read.csv(
    path=FILE_PATH, sep='\t', header=True, inferSchema=True
)
df_chipotle.printSchema()
df_chipotle.show(5)

### Step 3. Assign it to a variable called chipo.

In [None]:
# `chipo` is a second name for the same DataFrame object; no data is copied.
chipo = df_chipotle

### Step 4. See the first 10 entries

In [None]:
# In pandas
# Only the last expression of a cell is rendered, so print intermediate results
# explicitly; otherwise they are computed and silently discarded.
print(pandas_df_chipotle.head(10))

# In PySpark
# head(n) collects the rows to the driver and returns them as a list, so it
# should only be used when the result is expected to be small.
print(df_chipotle.head(10))
# show(n) prints a formatted table itself.
df_chipotle.show(10)

### Step 5. What is the number of observations in the dataset?

In [None]:
# In pandas
# info() prints the index range (the number of entries) together with the
# non-null count and dtype of every column.
pandas_df_chipotle.info()

In [None]:
# In PySpark
# count() returns the number of rows. summary() is not used here because its
# "count" line is a per-column non-null count, and it also computes statistics
# that this question does not need.
df_chipotle.count()

### Step 6. What is the number of columns in the dataset?

In [None]:
# `columns` is the list of column names, so its length is the column count.
len(df_chipotle.columns)

### Step 7. Print the name of all the columns.

In [None]:
# Column names, in schema order.
df_chipotle.columns

### Step 8. How is the dataset indexed?

### Step 9. Which was the most-ordered item? 

In [None]:
from pyspark.sql import functions as F

# Rank items by units ordered. `quantity` is summed because a single row can
# cover several units; counting rows would ignore that.
# The alias `item_order_count` is kept so later cells can keep using it.
df_chipotle_grouped = (
    df_chipotle
    .groupBy(df_chipotle.item_name)
    .agg(F.sum(df_chipotle.quantity).alias('item_order_count'))
)
(
    df_chipotle_grouped
    .orderBy(df_chipotle_grouped.item_order_count.desc())
    .select(df_chipotle_grouped.item_name)
    .limit(1)
    .show()
)

### Step 10. For the most-ordered item, how many items were ordered?

In [None]:
# Units ordered of the top item. The total is aggregated here from `quantity`
# (not from a row count) so the figure answers "how many items were ordered".
df_item_quantities = (
    df_chipotle
    .groupBy('item_name')
    .agg(F.sum('quantity').alias('item_order_count'))
)
(
    df_item_quantities
    .orderBy(F.desc('item_order_count'))
    .select('item_name', 'item_order_count')
    .limit(1)
    .show()
)

### Step 11. What was the most ordered item in the choice_description column?

In [None]:
# Rank choice descriptions by units ordered (sum of `quantity`), not by the
# number of rows they appear in.
df_chipotle_grouped = (
    df_chipotle
    .groupBy(df_chipotle.choice_description)
    .agg(F.sum(df_chipotle.quantity).alias('item_order_count'))
)
# Show the top five rather than one.
# NOTE(review): a missing/placeholder description may rank first - confirm
# against the data before reading off the answer.
(
    df_chipotle_grouped
    .orderBy(df_chipotle_grouped.item_order_count.desc())
    .select(df_chipotle_grouped.choice_description, df_chipotle_grouped.item_order_count)
    .limit(5)
    .show()
)

### Step 12. How many items were ordered in total?

In [None]:
# Total units across the whole dataset: a single global aggregate over `quantity`.
total_quantity = F.sum('quantity').alias('order_total')
df_chipotle.agg(total_quantity).show()

### Step 13. Turn the item price into a float

In [None]:
# Strip the leading "$" only when it is present. Dropping the first character
# unconditionally would corrupt the values if this cell is run a second time
# (after item_price is already numeric). The explicit string cast keeps the
# regex step valid on such a re-run.
df_chipotle = df_chipotle.withColumn(
    'item_price',
    F.regexp_replace(F.col('item_price').cast('string'), r'^\$', ''),
)
df_chipotle.show(5)

# Convert the cleaned text to a float column.
df_chipotle = df_chipotle.withColumn(
    'item_price', F.col('item_price').cast('float')
)
df_chipotle.printSchema()
df_chipotle.show(5)

#### Step 13.a. Check the item price type

In [None]:
# printSchema() lists every column with its type; check the item_price entry.
df_chipotle.printSchema()

#### Step 13.b. Create a lambda function and change the type of item price

#### Step 13.c. Check the item price type

### Step 14. How much was the revenue for the period in the dataset?

In [None]:
# Revenue for the period: the sum of item_price over every row.
total_revenue = F.sum('item_price').alias('revenue')
df_chipotle.agg(total_revenue).show()

### Step 15. How many orders were made in the period?

In [None]:
# An order can span several rows, so count distinct order ids rather than rows.
df_chipotle.select(
    F.count_distinct(df_chipotle.order_id).alias('order_count')
).show()

### Step 16. What is the average revenue amount per order?

In [None]:
# Solution 1
# Total revenue divided by the number of distinct orders. Dividing by the row
# count would give the average per row, not per order.
df_chipotle.select(
    (
        F.sum(df_chipotle.item_price) / F.count_distinct(df_chipotle.order_id)
    ).alias('avg_revenue_per_order')
).show()

In [ ]:
# Solution 2
# Total each order first, then average those per-order totals. Averaging
# item_price directly would give the mean per row, not per order.
(
    df_chipotle
    .groupBy('order_id')
    .agg(F.sum('item_price').alias('order_revenue'))
    .select(F.avg('order_revenue').alias('avg_revenue_per_order'))
    .show()
)

In [None]:
# Solution 3
# Same per-order approach using F.mean: sum item_price within each order, then
# take the mean of those totals so the result is per order, not per row.
(
    df_chipotle
    .groupBy('order_id')
    .agg(F.sum('item_price').alias('order_revenue'))
    .select(F.mean('order_revenue').alias('avg_revenue_per_order'))
    .show()
)

### Step 17. How many different items are sold?

In [None]:
# Number of different items sold: distinct values of item_name.
distinct_items = F.count_distinct('item_name').alias('distinct_item')
df_chipotle.agg(distinct_items).show()