# From SQL to pandas challenge 11

In [None]:
# import libraries
import pandas as pd

# load data
# This code is made to load our data stored on Google Drive
def gd_path(file_id):
    """Build the direct-download URL for a Google Drive file.

    Parameters
    ----------
    file_id
        Identifier of the file on Google Drive; it is inserted verbatim
        as the ``id`` query parameter.

    Returns
    -------
    str
        A ``https://drive.google.com/uc?export=download&id=...`` URL.
    """
    url_template = "https://drive.google.com/uc?export=download&id={}"
    return url_template.format(file_id)

# Google Drive file id of every table, keyed by table name
files_id = dict(
    titleauthor="1F1JOiYXStWacOBca6coNVfyVtoST7ZgD",
    titles="1PLdn50N9GRa53ZbuVWo0l47F_IXdvlEm",
    sales="1fzFc9rwYmVIPaGOFmhLVxCi3kg19vNU2",
    roysched="1zPRZPoFPEMKyrNR5VSENeYFHGCBZmxbs",
    publishers="1s9E8_AVOziTrowb3wyh2jg3PV763VOyq",
    employee="1h9mUjsVqpP74b1w0x7KOw37n_n9Ulkt5",
    authors="1fEF89Nhe61EebAljKlwFwfEuokK0o6aJ",
)


def read_table(table_name):
    """Download one table from Google Drive and parse it as a ';'-separated CSV.

    `table_name` must be a key of `files_id`; a missing key raises KeyError.
    """
    return pd.read_csv(gd_path(files_id[table_name]), sep=";")


# Read every table into its own DataFrame (one download per table)
sales = read_table("sales")
titles = read_table("titles")
publishers = read_table("publishers")
employee = read_table("employee")
authors = read_table("authors")
titleauthor = read_table("titleauthor")
roysched = read_table("roysched")

## 1. Using LEFT JOIN: in which cities has "Is Anger the Enemy?" been sold?

In [None]:
# LEFT JOIN: every publisher row is kept, with its titles attached via pub_id
title_publishers = pd.merge(publishers, titles, how="left", on="pub_id")

# Boolean mask selecting the rows of the requested book
is_requested_title = title_publishers["title"] == "Is Anger the Enemy?"

# Show only the city column of the matching rows (kept as a DataFrame)
title_publishers.loc[is_requested_title, ["city"]]

Unnamed: 0,city
1,Boston


### Hint:

In SQL the syntax is:

```sql
SELECT p.city
FROM publishers AS p
LEFT JOIN titles AS t
ON p.pub_id = t.pub_id
WHERE t.title = 'Is Anger the Enemy?';
```

## 2. Select all the book titles that have a link to the employee Howard Snyder

(He works for the publisher that has published those books.)

In [None]:
# Keep only the employee row(s) for Howard Snyder
howard_snyder = employee.loc[
    (employee["fname"] == "Howard") & (employee["lname"] == "Snyder")
]

# Inner join on pub_id, matching the plain JOIN of the SQL hint.
# A left join would add a NaN "title" row if his publisher had no titles;
# an inner join returns real book titles only.
howard_snyder.merge(right=titles, how="inner", on="pub_id")["title"]

0                  You Can Combat Computer Stress!
1                              Is Anger the Enemy?
2                                Life Without Fear
3    Prolonged Data Deprivation: Four Case Studies
4              Emotional Security: A New Algorithm
Name: title, dtype: object

### Hint:

In SQL the syntax is:

```sql
SELECT t.title
FROM employee e
JOIN titles t
ON e.pub_id = t.pub_id
WHERE e.fname = 'Howard'
AND e.lname = 'Snyder';
```

## 3. Using the `merge` of your choice: Select the book title with highest number of sales (qty)

In [None]:
# One row per sale, reduced to the book title and the quantity sold
book_sales = pd.merge(sales, titles, how="inner", on="title_id")[["title", "qty"]]

# Total quantity sold per title
qty_per_title = book_sales.groupby(by="title")["qty"].sum()

# The single title with the largest total
qty_per_title.nlargest(1)

title
Is Anger the Enemy?    108
Name: qty, dtype: int64

### Hint:

In SQL the syntax is:

```sql
SELECT t.title, SUM(qty)
FROM sales AS s 
JOIN titles t
ON s.title_id = t.title_id
GROUP BY t.title_id
ORDER BY SUM(qty) desc
LIMIT 1;
```

## 4. Select all book titles and the full name of their author(s).

- If a book has multiple authors, all authors must be displayed (in
  multiple rows).

- Books with no authors and authors with no books should not be displayed.

In [None]:
# titles -> titleauthor -> authors; both inner joins drop books without
# authors and authors without books
titles_with_authors = pd.merge(
    titles, titleauthor, how="inner", on="title_id"
).merge(authors, how="inner", on="au_id")

# One row per (book, author) pair
titles_with_authors[["title", "au_lname", "au_fname"]]

Unnamed: 0,title,au_lname,au_fname
0,The Busy Executive's Database Guide,Green,Marjorie
1,You Can Combat Computer Stress!,Green,Marjorie
2,The Busy Executive's Database Guide,Bennet,Abraham
3,Cooking with Computers: Surreptitious Balance ...,O'Leary,Michael
4,"Sushi, Anyone?",O'Leary,Michael
5,Cooking with Computers: Surreptitious Balance ...,MacFeather,Stearns
6,Computer Phobic AND Non-Phobic Individuals: Be...,MacFeather,Stearns
7,Straight Talk About Computers,Straight,Dean
8,Silicon Valley Gastronomic Treats,del Castillo,Innes
9,The Gourmet Microwave,DeFrance,Michel


### Hint:

In SQL the syntax is:

```sql
SELECT
    t.title,
    a.au_fname,
    a.au_lname
FROM titles t
INNER JOIN titleauthor ta 
ON t.title_id = ta.title_id
INNER JOIN authors a 
ON ta.au_id = a.au_id;
```

## 5. Select the full name of authors of Psychology books

   Bonus hint: if you want to prevent duplicates but still display authors who
   share a last name, concatenate the first and last names with `str + str`
   and call the `.unique()` method on the concatenated names.

In [None]:
# Join titles to their authors and add the concatenated "first last" name
book_authors = (
    pd.merge(titles, titleauthor, how="inner", on="title_id")
    .merge(authors, how="inner", on="au_id")
    .assign(full_name=lambda df: df["au_fname"] + " " + df["au_lname"])
)

# Rows whose book type is exactly 'psychology'
is_psychology = book_authors["type"] == "psychology"

# Distinct author names among those rows (NumPy array, order of first appearance)
book_authors.loc[is_psychology, "full_name"].unique()

array(['Stearns MacFeather', 'Anne Ringer', 'Charlene Locksley',
       'Livia Karsen', 'Albert Ringer', 'Johnson White'], dtype=object)

### Hint:

In SQL the syntax is:

```sql
SELECT DISTINCT CONCAT(a.au_fname, " ", a.au_lname) AS full_name
FROM authors a
INNER JOIN titleauthor ta ON a.au_id = ta.au_id
INNER JOIN titles t ON ta.title_id = t.title_id
WHERE t.type = "Psychology";
```

## 6. Explore the table roysched and try to grasp the meaning of each column. 
   The notes below will help:
   
   - "Royalty" means the percentage of the sale price paid to the author(s).
   
   - Sometimes, the royalty may be smaller for the first few sales (which have
     to cover the publishing costs to the publisher) but higher for the sales 
     above a certain threshold.
     
   - In the "roysched" table each title_id can appear multiple times, with
     different royalty values for each range of sales.
     
   - Select all rows for a particular title_id, for example "BU1111", and
     explore the data.

In [None]:
# All royalty-schedule rows of a single title
is_bu1111 = roysched["title_id"] == "BU1111"
roysched[is_bu1111]

Unnamed: 0,title_id,lorange,hirange,royalty
49,BU1111,0,4000,10
50,BU1111,4001,8000,12
51,BU1111,8001,10000,14
52,BU1111,12001,16000,16
53,BU1111,16001,20000,18
54,BU1111,20001,24000,20
55,BU1111,24001,28000,22
56,BU1111,28001,50000,24


### Hint:

In SQL the syntax is:

```sql
SELECT * FROM roysched WHERE title_id = "BU1111";
```

## 7. Select all the book titles and the maximum royalty they can reach.

Display only titles that are present in the roysched table.

In [None]:
# Highest royalty in each title's schedule (indexed by title_id, one 'royalty' column)
max_roy = roysched.groupby(by='title_id').aggregate({'royalty': 'max'})

# Rename the aggregate BEFORE merging. The previous version renamed
# "royalty_y" afterwards, which only works while pandas adds a collision
# suffix; otherwise the rename is a silent no-op and the selection below
# fails with a KeyError.
# The inner join (as in the SQL hint) keeps only titles that have a
# royalty schedule, and never yields rows with a missing title.
max_roy_titles = titles.merge(
    right=max_roy.rename(columns={"royalty": "max_royalty"}),
    how="inner",
    on="title_id",
)

# Title and its maximum royalty, highest first
max_roy_titles.loc[:, ["title", "max_royalty"]].sort_values(by="max_royalty", ascending=False)

Unnamed: 0,title,max_royalty
1,Cooking with Computers: Surreptitious Balance ...,24
2,You Can Combat Computer Stress!,24
3,Straight Talk About Computers,24
5,The Gourmet Microwave,24
13,"Onions, Leeks, and Garlic: Cooking Secrets of ...",24
14,Fifty Years in Buckingham Palace Kitchens,22
4,Silicon Valley Gastronomic Treats,20
6,But Is It User Friendly?,18
8,Computer Phobic AND Non-Phobic Individuals: Be...,18
7,Secrets of Silicon Valley,16


### Hint:

In SQL the syntax is:

```sql
SELECT t.title, MAX(r.royalty) max_royalty
FROM titles t
INNER JOIN roysched r 
ON t.title_id = r.title_id
GROUP BY t.title
ORDER BY max_royalty DESC;
```