# Ex2 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [95]:
import pandas as pd
import numpy as np

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/thieu1995/csv-files/main/data/pandas/chipotle.tsv).

In [96]:
# url = "https://raw.githubusercontent.com/thieu1995/csv-files/main/data/pandas/chipotle.tsv"
# df = pd.read_csv(url, sep='\t')

# Use local file for faster execution
path = "chipotle.tsv"
df = pd.read_csv(path, sep='\t')

### Step 3. Assign it to a variable called chipo.

In [97]:
chipo = df

### Step 4. See the first 10 entries

In [98]:
chipo.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


### Step 5. What is the number of observations in the dataset?

In [99]:
# Solution 1
print(f"Number of observations: {len(chipo)}")

Number of observations: 4622


In [100]:
# Solution 2
print(f"Number of observations: {chipo.shape[0]}")

Number of observations: 4622


### Step 6. What is the number of columns in the dataset?

In [101]:
print(f"Number of columns: {chipo.shape[1]}")

Number of columns: 5


### Step 7. Print the name of all the columns.

In [102]:
chipo.columns

Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')

### Step 8. How is the dataset indexed?

In [103]:
chipo.index

RangeIndex(start=0, stop=4622, step=1)

### Step 9. Which was the most-ordered item? 

In [104]:
most_ordered_item = chipo.groupby('item_name')[['quantity']].sum().sort_values(by='quantity', ascending=False).head(1).index[0]
print(f"The most ordered item is {most_ordered_item}")

The most ordered item is Chicken Bowl


### Step 10. For the most-ordered item, how many items were ordered?

In [105]:
quantity_ordered = chipo[chipo['item_name'] == most_ordered_item]['quantity'].sum()
print(f"The quantity of {most_ordered_item} is {quantity_ordered}")

The quantity of Chicken Bowl is 761


### Step 11. What was the most ordered item in the choice_description column?

In [106]:
def extract_first_element(x:str):
    return x.strip('[]').split(',')[0]

# Extract the first element from the choice_description column
ordered_items_choice = chipo['choice_description'].dropna().apply(extract_first_element)
ordered_items_choice

1                            Clementine
2                                 Apple
4       Tomatillo-Red Chili Salsa (Hot)
5             Fresh Tomato Salsa (Mild)
7             Tomatillo Red Chili Salsa
                     ...               
4617                 Fresh Tomato Salsa
4618                 Fresh Tomato Salsa
4619                 Fresh Tomato Salsa
4620                 Fresh Tomato Salsa
4621                 Fresh Tomato Salsa
Name: choice_description, Length: 3376, dtype: object

In [107]:
# Count the occurrences of each item in the ordered_items_choice series
item_counts = ordered_items_choice.value_counts()

# Get the most ordered item
most_ordered_item = item_counts.idxmax()
print(f"The most ordered item in the choice_description column is {most_ordered_item} with {item_counts[most_ordered_item]} orders")

The most ordered item in the choice_description column is Fresh Tomato Salsa with 1046 orders


### Step 12. How many items were orderd in total?

In [108]:
num_items_ordered = chipo['quantity'].sum()
print(f"The total number of items ordered is {num_items_ordered}")

The total number of items ordered is 4972


### Step 13. Turn the item price into a float

#### Step 13.a. Check the item price type

In [109]:
# print the type of item_price
print(type(chipo['item_price']))
# print the first 10 rows of item_price
print(chipo['item_price'].head(10))

<class 'pandas.core.series.Series'>
0     $2.39 
1     $3.39 
2     $3.39 
3     $2.39 
4    $16.98 
5    $10.98 
6     $1.69 
7    $11.75 
8     $9.25 
9     $9.25 
Name: item_price, dtype: object


#### Step 13.b. Create a lambda function and change the type of item price

In [110]:
converter = lambda x: float(x.strip('$'))
chipo['item_price'] = chipo['item_price'].apply(converter)
chipo['item_price'].head(10)

0     2.39
1     3.39
2     3.39
3     2.39
4    16.98
5    10.98
6     1.69
7    11.75
8     9.25
9     9.25
Name: item_price, dtype: float64

#### Step 13.c. Check the item price type

In [111]:
print(type(chipo['item_price']))
print(chipo['item_price'].head(10))

<class 'pandas.core.series.Series'>
0     2.39
1     3.39
2     3.39
3     2.39
4    16.98
5    10.98
6     1.69
7    11.75
8     9.25
9     9.25
Name: item_price, dtype: float64


### Step 14. How much was the revenue for the period in the dataset?

In [112]:
# Add a column to the dataframe called 'total_price'
chipo['total_price'] = chipo['item_price'] * chipo['quantity']
chipo['total_price'].head(10)

0     2.39
1     3.39
2     3.39
3     2.39
4    33.96
5    10.98
6     1.69
7    11.75
8     9.25
9     9.25
Name: total_price, dtype: float64

In [113]:
total_revenue = chipo['total_price'].sum()
print(f"The total revenue for the period is ${total_revenue}")


The total revenue for the period is $39237.02


### Step 15. How many orders were made in the period?

In [114]:
num_orders = chipo['order_id'].nunique()
print(f"The total number of orders is {num_orders}")

The total number of orders is 1834


### Step 16. What is the average revenue amount per order?

In [115]:
# Solution 1
# Group by order_id and sum the total_price
revenue_per_order = chipo.groupby('order_id')['total_price'].sum()
print(revenue_per_order)
# Calculate the average revenue amount per order
average_revenue_per_order = revenue_per_order.mean()
print(f"The average revenue amount per order is ${average_revenue_per_order: .2f}")


order_id
1       11.56
2       33.96
3       12.67
4       21.00
5       13.70
        ...  
1830    23.00
1831    12.90
1832    13.20
1833    23.50
1834    28.75
Name: total_price, Length: 1834, dtype: float64
The average revenue amount per order is $ 21.39


In [116]:
# Solution 2
average_revenue = total_revenue / num_orders
print(f"The average revenue amount per order is ${average_revenue: .2f}")

The average revenue amount per order is $ 21.39


### Step 17. How many different items are sold?

In [117]:
num_items = chipo['item_name'].nunique()
print(f"The total number of different items sold is {num_items}")

The total number of different items sold is 50
