# Filtering and Sorting Data


In [None]:
import pandas as pd
import numpy as np

## Primer


In [None]:
# Seed NumPy's global RNG so the random column is identical on every run
np.random.seed(999)

# Build a small one-column frame of random numbers, labelled 1..n
n = 5
data = {"Value": np.random.random(size=n)}
df = pd.DataFrame(data, index=np.arange(1, n + 1))
display(df)

In [None]:
# Boolean mask given as a plain list: one flag per row, True keeps the row
filt = [True] * 3 + [False] * 2
df.loc[filt]

In [None]:
# Boolean mask derived from the index labels: True where the label is below 4
filt = df.index < 4

# Explicit .loc selection followed by the equivalent bracket shorthand
display(df.loc[filt], df[filt])

In [None]:
# Row mask: True where the "Value" entry exceeds 0.5
filt = df["Value"].gt(0.5)
df.loc[filt]

In [None]:
# Order the rows by index label, largest label first
df.sort_index(axis=0, ascending=False)

In [None]:
# Order the rows by the "Value" column, largest value first
df.sort_values("Value", ascending=False)

## Real Data


In [None]:
# Read the chipotle CSV; the file's first column becomes the row index
chipo = pd.read_csv(filepath_or_buffer="./data/chipotle.csv", index_col=0)

In [None]:
# Peek at the top five rows
chipo.head(n=5)

In [None]:
# Print a summary of the frame via DataFrame.info() (columns, dtypes, non-null counts)
chipo.info()

### Data pre-processing


In [None]:
# Replace missing "choice_description" entries with "" so the column holds text in every row
chipo = chipo.assign(choice_description=chipo["choice_description"].fillna(""))
chipo.info()

In [None]:
# Convert "item_price" to float after removing the dollar sign.
# - regex=False makes "$" a literal character. As a regular expression "$" is
#   the end-of-string anchor, and the default of `regex` changed between pandas
#   major versions, so relying on the default is fragile.
# - astype(str) first lets this cell be re-run after the column is already
#   numeric (the .str accessor is not available on a float column).
# - strip() drops any surrounding whitespace before the numeric conversion.
chipo["item_price"] = (
    chipo["item_price"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.strip()
    .astype(float)
)
chipo.head()

In [None]:
# Derive "unit_price": item_price divided element-wise by quantity
chipo["unit_price"] = chipo["item_price"].div(chipo["quantity"])
chipo.head()

### Show unit price for each item in the dataset

Note that unit prices of the same item may vary. You need to show all unit prices.


In [None]:
# One row per distinct (item_name, unit_price) pair, keeping the first occurrence,
# ordered by item name ascending and, within an item, by unit price descending
chipo_dd = (
    chipo
    .drop_duplicates(subset=["item_name", "unit_price"], keep="first")
    .sort_values(by=["item_name", "unit_price"], ascending=[True, False])
)

# Preview the top ten rows of the result
chipo_dd.head(10)

### Show item name with unit price > 10.

Each item name should appear only once.


In [None]:
# Mask over the de-duplicated frame: rows whose unit_price is above 10
filt = chipo_dd["unit_price"].gt(10)

# Names of the matching items, each listed once (first occurrence kept)
chipo_dd.loc[filt, ["item_name"]].drop_duplicates()

### What are the 10 most popular items? (By order count)


In [None]:
# Ten item names that occur on the most rows.
# The full `chipo` frame is used rather than `chipo_dd` because every order
# row must be counted, not just each distinct item/price pair.
chipo["item_name"].value_counts().iloc[:10]

### What are the statistics of the unit price of the most popular item? (mean, max, min, ...)


In [None]:
# Look up the most frequently ordered item instead of hard-coding its name,
# so this cell stays correct if the underlying data changes.
most_popular_item = chipo["item_name"].value_counts().idxmax()

# Boolean filter selecting that item's rows in the de-duplicated frame
filt = chipo_dd["item_name"] == most_popular_item

# Descriptive statistics for the unit_price of the most popular item.
# NOTE: chipo_dd is built with one row per distinct (item_name, unit_price)
# pair, so these statistics describe the distinct prices, not every order row.
chipo_dd.loc[filt, ["unit_price"]].describe()

### What are the 10 most expensive items ordered? (by unit price)


In [None]:
# Rank the de-duplicated rows by unit_price, highest first, and keep the top ten
chipo_dd.sort_values("unit_price", ascending=False).iloc[:10]

### How many times was a Veggie Salad Bowl ordered?


In [None]:
# Mask of rows whose item_name is exactly "Veggie Salad Bowl"
filt = chipo["item_name"].eq("Veggie Salad Bowl")

# Number of matching rows
len(chipo.loc[filt])

### How many times did someone order more than one Canned Soda?


In [None]:
# Mask of "Canned Soda" rows whose quantity is greater than one
is_canned_soda = chipo["item_name"].eq("Canned Soda")
is_multiple = chipo["quantity"].gt(1)
filt = is_canned_soda & is_multiple

# Number of matching rows
len(chipo.loc[filt])

### Choose (unique) items with Fresh Tomato Salsa as an ingredient


In [None]:
# Mask over the de-duplicated frame: choice_description mentions the ingredient
ingredient = "Fresh Tomato Salsa"
filt = chipo_dd["choice_description"].str.contains(ingredient)

# Show every column of the rows that matched
chipo_dd.loc[filt]

In [None]:
# Add a "num_ingredients" column counting the ingredients in choice_description.
# All square brackets (including nested ones) are removed so the text becomes a
# flat comma-separated list. regex=True is passed explicitly because the
# default of `regex` differs between pandas versions.
flat_description = (
    chipo_dd["choice_description"]
    .str.replace(r"[\[\]]", "", regex=True)
    .str.strip()
)

# "".split(",") yields [""] (length 1), so an empty description would be
# miscounted as one ingredient; force those rows to 0.
chipo_dd["num_ingredients"] = (
    flat_description.str.split(",").str.len().where(flat_description.ne(""), 0)
)

# Display the 10 items with the highest number of ingredients
chipo_dd[["choice_description", "num_ingredients"]].sort_values(
    by="num_ingredients", ascending=False
).head(10)