# From SQL to pandas challenge 10

In [1]:
# import libraries
import pandas as pd

# load data
# This code is made to load our data stored on Google Drive
def gd_path(file_id):
    """Generate a shareable link from Google Drive file id."""
    return f"https://drive.google.com/uc?export=download&id={file_id}"

# Google Drive file ids
files_id = {
    "titles": "1PLdn50N9GRa53ZbuVWo0l47F_IXdvlEm",
    "sales": "1fzFc9rwYmVIPaGOFmhLVxCi3kg19vNU2",
}

# Read data from Google Drive
sales = pd.read_csv(gd_path(files_id["sales"]), sep=";")
titles = pd.read_csv(gd_path(files_id["titles"]), sep=";")

## 1. Select everything from the sales table and create a new column called "sales_category" to categorise qty:
   
		qty >= 50 high sales
		20 <= qty < 50 medium sales
		qty < 20 low sales

In [None]:
sales['sales_category'] = "high sales"
sales.loc[sales['qty'] < 50, 'sales_category'] = "medium sales"
sales.loc[sales['qty'] < 20, 'sales_category'] = "low sales"

sales

Unnamed: 0,stor_id,ord_num,ord_date,qty,payterms,title_id,sales_category
0,6380,6871,1994-09-14 00:00:00,5,Net 60,BU1032,low sales
1,6380,722a,1994-09-13 00:00:00,3,Net 60,PS2091,low sales
2,7066,A2976,1993-05-24 00:00:00,50,Net 30,PC8888,high sales
3,7066,QA7442.3,1994-09-13 00:00:00,75,ON invoice,PS2091,high sales
4,7067,D4482,1994-09-14 00:00:00,10,Net 60,PS2091,low sales
5,7067,P2121,1992-06-15 00:00:00,40,Net 30,TC3218,medium sales
6,7067,P2121,1992-06-15 00:00:00,20,Net 30,TC4203,medium sales
7,7067,P2121,1992-06-15 00:00:00,20,Net 30,TC7777,medium sales
8,7131,N914008,1994-09-14 00:00:00,20,Net 30,PS2091,medium sales
9,7131,N914014,1994-09-14 00:00:00,25,Net 30,MC3021,medium sales


### Hint:

In SQL the syntax is:

```sql
SELECT *,
CASE
    WHEN qty >= 50 THEN "high sales"
    WHEN qty >= 20 THEN "medium sales"
    ELSE "low sales"
END AS sales_category
FROM sales;
```

## 2. Adding to your answer from the previous question. Find out the total amount of books sold (qty) in each sales category
    i.e. How many books had high sales, how many had medium sales, and how many had low sales

In [None]:
(
sales
    .groupby(by = 'sales_category')
    .aggregate({'qty': 'sum'})
)

Unnamed: 0_level_0,qty
sales_category,Unnamed: 1_level_1
high sales,125
low sales,83
medium sales,285


### Hint:

In SQL the syntax is:

```sql
SELECT sum(qty),
CASE
	WHEN qty>=50 THEN 'high sales'
    WHEN (qty>=20 AND qty<50) THEN 'medium sales'
    ELSE 'low sales'
END AS sales_category
FROM sales
GROUP BY sales_category;
```

## 3. Adding to your answer from the previous questions: output only those sales categories that have a SUM(qty) greater than 100, and order them in descending order

In [None]:
books_per_category = (
sales
    .groupby(by = 'sales_category')
    .aggregate({'qty': 'sum'})
)

(
books_per_category
    .loc[books_per_category['qty']>100]
    .sort_values(by = 'qty', ascending = False)
)

Unnamed: 0_level_0,qty
sales_category,Unnamed: 1_level_1
medium sales,285
high sales,125


### Hint:

In SQL the syntax is:

```sql
SELECT sum(qty),
CASE
    WHEN qty>=50 THEN 'high sales'
    WHEN (qty>=20 AND qty<50) THEN 'medium sales'
    ELSE 'low sales'
END AS sales_category
FROM sales
GROUP BY sales_category
HAVING sum(qty)>100
ORDER BY sum(qty) DESC;
```

## 4. Find out the average book price, per publisher, for the following book types and price categories:
		book types: business, traditional cook and psychology
		price categories: <= 5 super low, <= 10 low, <= 15 medium, > 15 high
        
        - When displaying the average prices, use ROUND() to hide decimals.

In [None]:
selected_books = titles.loc[titles["type"].isin(['business', 'trad_cook', 'psychology'])].copy()

selected_books['price_category'] = "high"
selected_books.loc[selected_books['price'] <= 15, 'price_category'] = "medium"
selected_books.loc[selected_books['price'] <= 10, 'price_category'] = "low"
selected_books.loc[selected_books['price'] <= 5, 'price_category'] = "super low"

selected_books.groupby(by = ['pub_id', 'type', 'price_category']).aggregate({'price': 'mean'}).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,price
pub_id,type,price_category,Unnamed: 3_level_1
736,business,super low,2.99
736,psychology,high,19.99
736,psychology,low,7.5
736,psychology,medium,10.95
877,psychology,high,21.59
877,trad_cook,high,20.95
877,trad_cook,medium,13.47
1389,business,high,19.99
1389,business,medium,11.95


### Hint:

In SQL the syntax is:

```sql
SELECT
    ROUND(AVG(price)),
    type,
    pub_name,
CASE
    WHEN price <= 5 THEN 'super low'
    WHEN (price > 5 AND price <= 10) THEN 'low'
    WHEN (price > 10 AND price <= 15) THEN 'medium'
    ELSE 'high'
END AS price_category
FROM titles
LEFT JOIN publishers
ON titles.pub_id=publishers.pub_id
GROUP BY
    pub_name,
    type,
    price_category
HAVING
    type IN ('business', 'trad_cook', 'psychology');
```