In [106]:
# Libraries.
import numpy as np
import pandas as pd

In [107]:
# Download the tab-separated Chipotle orders file and load it into a DataFrame.
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"
chipo = pd.read_csv(url, delimiter="\t")

# Show the first ten rows as a quick check that the load worked.
chipo.head(n=10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


In [108]:
# Dataset overview: number of rows, number of columns, column names and index.
row_count, column_count = chipo.shape

# What's the number of observations in the dataset?
print(f"Total observations: {row_count}.")

# What's the number of columns in the dataset?
print(f"Total columns: {column_count}.")

# Print the name of all columns.
# str.join builds the comma-separated list in one step and also works when the
# frame has no columns; end="." closes the sentence without a trailing newline.
column_list = ", ".join(map(str, chipo.columns))
print(f"\nColumn names: {column_list}", end=".")

# How's the data indexed?
# The leading newlines terminate the previous (newline-less) line and add a blank one.
print(f"\n\nIndex: {chipo.index}")

Total observations: 4622.
Total columns: 5.

Column names: order_id, quantity, item_name, choice_description, item_price.

Index: RangeIndex(start=0, stop=4622, step=1)


In [109]:
# Per-item summary, one row per item_name:
#   - "order_id": number of rows (order lines) in which the item appears
#   - "quantity": total units of the item
# Rows are ranked by the "order_id" count, highest first.
item_name_grouped = (
    chipo
    .groupby("item_name")
    .agg({"order_id": "count", "quantity": "sum"})
    .sort_values("order_id", ascending=False)
)

# The first row of the ranked summary is the most ordered item.
top_item_name = item_name_grouped.index[0]
top_item_times_ordered = item_name_grouped["order_id"].iloc[0]
top_item_quantity = item_name_grouped["quantity"].iloc[0]

# Which was the most ordered item?
print(f"Most ordered item: {top_item_name}.")
print(f" - Times ordered: {top_item_times_ordered}.")

# For the most ordered item, how many items were ordered?
print(f" - Total prepared: {top_item_quantity}.")

Most ordered item: Chicken Bowl.
 - Times ordered: 726.
 - Total prepared: 761.


In [110]:
# What was the most ordered item in the choice_description column?
# Count the rows for each distinct choice_description and rank them, most frequent first.
choice_description_grouped = (
    chipo[["choice_description", "order_id"]]
    .groupby("choice_description")
    .count()
    .sort_values("order_id", ascending=False)
)

# The first row of the ranked counts is the most frequent description.
top_choice = choice_description_grouped.index[0]
top_choice_count = choice_description_grouped["order_id"].iloc[0]

# Slicing off the first and last character removes the surrounding [] from the description.
print(f"Most ordered item in choice_description column: {top_choice[1:-1]}")
print(f" - Times ordered: {top_choice_count}.")

Most ordered item in choice_description column: Diet Coke
 - Times ordered: 134.


In [111]:
# How many items were ordered in total?
# Sum of the quantity column across every row of the dataset.
total_items_ordered = chipo["quantity"].sum()
print(f"Total items ordered: {total_items_ordered}.")

Total items ordered: 4972.


In [112]:
# Turn the item price into a float.
# - Check the item price type.
print(f"Current type of item_price column: {chipo['item_price'].dtype}.")

# - Convert only while the column is still text. The guard keeps this cell safe
#   to re-run: once the column is numeric there is no "$" prefix left to strip,
#   and string operations on floats would raise.
if not pd.api.types.is_numeric_dtype(chipo["item_price"]):
    print("Changing data type...")
    # Vectorized conversion: drop the leading "$" and cast the remainder to float.
    chipo["item_price"] = chipo["item_price"].str.lstrip("$").astype(float)

# - Check the item price type again.
print(f"New type of item_price column: {chipo['item_price'].dtype}.")

Current type of item_price column: object.
Changing data type...
New type of item_price column: float64.


In [113]:
# Revenue summary. Requires item_price to be numeric already.
# NOTE(review): item_price appears to be the line total rather than a unit price
# (the quantity-2 row in the preview is $16.98), so it is summed directly and not
# multiplied by quantity -- confirm against the data source.
total_revenue = chipo["item_price"].sum()
total_orders = chipo["order_id"].nunique()

# How much was the revenue for the period in the dataset?
# The .2f format always shows two decimals, so amounts never lose a trailing zero.
print(f"Total revenue: ${total_revenue:.2f}.")

# How many orders were made in the period?
print(f"Total orders: {total_orders}.")

# What is the average revenue amount per order?
print(f"Average revenue per order: ${total_revenue / total_orders:.2f}")

# How many different items are sold?
# Counted straight from the item_name column so this cell does not rely on a
# summary frame built elsewhere in the notebook.
print(f"Total different items sold: {chipo['item_name'].nunique()}.")

Total revenue: $34500.16.
Total orders: 1834.
Average revenue per order: $18.81
Total different items sold: 50.
