# Filtering and Sorting Data


In [125]:
import pandas as pd
import numpy as np

## Primer

In [None]:
# Seed NumPy's global random generator so the values below are the same on every run
np.random.seed(999)

# Build a single-column DataFrame holding n random floats,
# labelled with the integer index 1..n (instead of the default 0-based index)
n = 5
data = {"Value": np.random.random(n)}
df = pd.DataFrame(data, index=np.arange(1, n + 1))
display(df)

In [None]:
# Select rows with a plain boolean list: one flag per row, True keeps the row.
# Here the first three rows are kept and the last two are dropped.
filt = [True] * 3 + [False] * 2
df.loc[filt]

In [None]:
# Build the boolean mask from a condition on the index labels
# (True for every row whose label is below 4), then select with it
filt = df.index < 4
df.loc[filt]

In [None]:
# Shorthand: indexing the DataFrame directly with a boolean mask selects rows,
# giving the same result as df.loc[filt, :]. Reuses the `filt` mask from the previous cell.
df[filt]

In [None]:
# Keep only the rows whose "Value" is strictly greater than 0.5
filt = df["Value"].gt(0.5)
df.loc[filt]

In [None]:
# Order the rows by index label, largest label first (result is not assigned, so df is unchanged)
df.sort_index(axis=0, ascending=False)

In [None]:
# Order the rows by the "Value" column, largest value first (result is not assigned, so df is unchanged)
df.sort_values("Value", ascending=False)

## Real Data

In [133]:
# Load the Chipotle data from disk; index_col=0 uses the first CSV column as the row index
chipo = pd.read_csv("./data/chipotle.csv", index_col=0)

In [None]:
# Preview the first five rows to get a feel for the columns
chipo.head(n=5)

In [None]:
# Print a per-column summary (dtype and non-null count) to spot missing values and text columns
chipo.info()

### Data pre-processing


In [None]:
# Replace missing choice descriptions with an empty string,
# then print the summary again to confirm the column has no nulls left
chipo["choice_description"] = chipo["choice_description"].fillna(value="")
chipo.info()

In [None]:
# Convert item_price from text to float by stripping the dollar sign.
# regex=False forces "$" to be treated as a literal character: as a regular
# expression "$" is the end-of-string anchor, and the default value of `regex`
# differs between pandas versions, so being explicit keeps this cell working
# (and warning-free) regardless of the installed pandas.
# NOTE: this cell is not re-runnable; once the column is float, `.str` raises.
chipo["item_price"] = (
    chipo["item_price"].str.replace("$", "", regex=False).astype(float)
)
chipo.head()

In [None]:
# Add a unit_price column: the row's item_price divided element-wise by its quantity
# (assumes item_price is the total for the whole row, not per unit; verify against the data)
chipo["unit_price"] = chipo["item_price"].div(chipo["quantity"])
chipo.head()

### Show unit price for each item in the dataset

Note that unit prices of the same item may vary. You need to show all unit prices.


In [None]:
# Keep one row per distinct (item_name, unit_price) pair (the first occurrence),
# then order by item name A-Z and, within each item, by unit price from high to low
chipo_dd = (
    chipo
    .drop_duplicates(["item_name", "unit_price"])
    .sort_values(["item_name", "unit_price"], ascending=[True, False])
)
chipo_dd.head(10)

### Show item names with unit price > 10

Each name should appear only once.


In [None]:
# Mask the de-duplicated rows priced above 10, keep just the item_name column,
# and drop repeated names (an item can appear once per distinct unit price)
filt = chipo_dd["unit_price"].gt(10)
chipo_dd.loc[filt, ["item_name"]].drop_duplicates()

### What are the 10 most popular items? (By order count)


In [None]:
# Count on the full chipo table rather than chipo_dd: the de-duplicated table keeps
# only one row per (item, unit price), so it would undercount how often items were ordered.
# value_counts() sorts by frequency, so the first ten entries are the ten most frequent items.
item_counts = chipo["item_name"].value_counts()
item_counts.head(10)

### What are the statistics of the unit price of the most popular item? (mean, max, min, ...)


In [None]:
# Look up the most popular item from the data instead of hard-coding its name,
# so this cell stays correct if the dataset changes. Popularity is measured the
# same way as in the top-10 cell: number of rows per item in the full chipo table.
most_popular = chipo["item_name"].value_counts().idxmax()

# Describe the unit prices of that item.
# NOTE: chipo_dd holds one row per distinct (item, unit price), so these statistics
# are over the distinct prices and are not weighted by how often each price occurred.
filt = chipo_dd["item_name"] == most_popular
chipo_dd.loc[filt, ["unit_price"]].describe()

### What are the 10 most expensive items ordered? (by unit price)


In [None]:
# Rank the distinct (item, unit price) rows from most to least expensive and show the top ten
by_price_desc = chipo_dd.sort_values(by="unit_price", ascending=False)
by_price_desc.head(10)

### How many times was a Veggie Salad Bowl ordered?


In [None]:
# Flag every row for a Veggie Salad Bowl; the number of True flags is the number of matching rows
filt = chipo["item_name"].eq("Veggie Salad Bowl")
int(filt.sum())

### How many times did someone order more than one Canned Soda?


In [None]:
# Flag rows that are a Canned Soda AND have a quantity above one;
# the number of True flags is the number of such rows
filt = chipo["item_name"].eq("Canned Soda") & chipo["quantity"].gt(1)
int(filt.sum())