# SQL Join with postgreSQL and sqlalchemy

In [54]:
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.sql import select
import pandas as pd

### Create engine

In [19]:
con = create_engine('postgresql://postgres@localhost:5432/postgres')

### Check tables in db

In [24]:
engine.table_names()

['accounts', 'orders', 'region', 'sales_reps', 'web_events']

### Get data from csv and create and seed db with pandas to_sql

In [21]:
accounts = pd.read_csv('accounts.csv', sep=';', index_col='id')
accounts.to_sql('accounts', con=con, if_exists='replace')

### Sanity check - select all records from created table and load to DataFrame

In [22]:
sql_result = pd.read_sql_query("SELECT * FROM accounts", con)
sql_result.head()

Unnamed: 0,id,name,website,lat,long,primary_poc,sales_rep_id
0,1001,Walmart,www.walmart.com,40.238496,-75.103297,Tamara Tuma,321500
1,1011,Exxon Mobil,www.exxonmobil.com,41.169156,-73.849374,Sung Shields,321510
2,1021,Apple,www.apple.com,42.290495,-76.084009,Jodee Lupo,321520
3,1031,Berkshire Hathaway,www.berkshirehathaway.com,40.949021,-75.763898,Serafina Banda,321530
4,1041,McKesson,www.mckesson.com,42.217093,-75.284998,Angeles Crusoe,321540


### Load rest of the tables

In [23]:
orders = pd.read_csv('orders.csv', sep=';', index_col='id')
orders.to_sql('orders', con=con, if_exists='replace')
region = pd.read_csv('region.csv', sep=';', index_col='id')
region.to_sql('region', con=con, if_exists='replace')
sales_reps = pd.read_csv('sales_reps.csv', sep=';', index_col='id')
sales_reps.to_sql('sales_reps', con=con, if_exists='replace')
web_events = pd.read_csv('web_events.csv', sep=';', index_col='id')
web_events.to_sql('web_events', con=con, if_exists='replace')

## DB structure
![DB structure](db.png)

Provide a table that provides the region for each sales_rep along with their associated accounts. This time only for the Midwest region. Your final table should include three columns: the region name, the sales rep name, and the account name. Sort the accounts alphabetically (A-Z) according to account name.

In [28]:
sql_result = pd.read_sql_query("\
SELECT r.name region, s.name rep, a.name account \
FROM sales_reps s \
JOIN region r \
ON s.region_id = r.id \
JOIN accounts a \
ON a.sales_rep_id = s.id \
WHERE r.name = 'Midwest' \
ORDER BY a.name; \
", con)
sql_result.head()

Unnamed: 0,region,rep,account
0,Midwest,Chau Rowles,Abbott Laboratories
1,Midwest,Julie Starr,AbbVie
2,Midwest,Cliff Meints,Aflac
3,Midwest,Chau Rowles,Alcoa
4,Midwest,Charles Bidwell,Altria Group


Provide a table that provides the region for each sales_rep along with their associated accounts. This time only for accounts where the sales rep has a first name starting with S and in the Midwest region. Your final table should include three columns: the region name, the sales rep name, and the account name. Sort the accounts alphabetically (A-Z) according to account name. 

In [37]:
# Use %% for % in LIKE 
sql_result = pd.read_sql_query("\
SELECT s.name sales_rep, a.name account, r.name region \
FROM accounts a \
JOIN sales_reps s \
ON a.sales_rep_id = s.id \
JOIN region r \
ON s.region_id = r.id \
WHERE s.name LIKE 'S%%' \
AND r.name = 'Midwest' \
ORDER BY a.name; \
", con)
sql_result.head()

Unnamed: 0,sales_rep,account,region
0,Sherlene Wetherington,Community Health Systems,Midwest
1,Sherlene Wetherington,Progressive,Midwest
2,Sherlene Wetherington,Rite Aid,Midwest
3,Sherlene Wetherington,Time Warner Cable,Midwest
4,Sherlene Wetherington,U.S. Bancorp,Midwest


Provide a table that provides the region for each sales_rep along with their associated accounts. This time only for accounts where the sales rep has a last name starting with K and in the Midwest region. Your final table should include three columns: the region name, the sales rep name, and the account name. Sort the accounts alphabetically (A-Z) according to account name.

In [38]:
sql_result = pd.read_sql_query("\
SELECT s.name sales_rep, a.name account, r.name region \
FROM accounts a \
JOIN sales_reps s \
ON a.sales_rep_id = s.id \
JOIN region r \
ON s.region_id = r.id \
WHERE s.name LIKE '%% K%%' \
AND r.name = 'Midwest' \
ORDER BY a.name; \
", con)
sql_result.head()

Unnamed: 0,sales_rep,account,region
0,Delilah Krum,Amgen,Midwest
1,Delilah Krum,AutoNation,Midwest
2,Delilah Krum,Capital One Financial,Midwest
3,Delilah Krum,Cummins,Midwest
4,Carletta Kosinski,Danaher,Midwest


Provide the name for each region for every order, as well as the account name and the unit price they paid (total_amt_usd/total) for the order. However, you should only provide the results if the standard order quantity exceeds 100. Your final table should have 3 columns: region name, account name, and unit price. In order to avoid a division by zero error, adding .01 to the denominator here is helpful total_amt_usd/(total+0.01). 

In [39]:
sql_result = pd.read_sql_query("\
SELECT a.name account, r.name region, o.total_amt_usd/(o.total + 0.01) unit_price \
FROM accounts a \
JOIN sales_reps s \
ON a.sales_rep_id = s.id \
JOIN region r \
ON s.region_id = r.id \
JOIN orders o \
ON a.id = o.account_id; \
", con)
sql_result.head()

Unnamed: 0,account,region,unit_price
0,Walmart,Northeast,5.7596
1,Walmart,Northeast,5.965175
2,Walmart,Northeast,5.879706
3,Walmart,Northeast,5.444236
4,Walmart,Northeast,5.960184


Provide the name for each region for every order, as well as the account name and the unit price they paid (total_amt_usd/total) for the order. However, you should only provide the results if the standard order quantity exceeds 100 and the poster order quantity exceeds 50. Your final table should have 3 columns: region name, account name, and unit price. Sort for the smallest unit price first. In order to avoid a division by zero error, adding .01 to the denominator here is helpful (total_amt_usd/(total+0.01). 

In [40]:
sql_result = pd.read_sql_query("\
SELECT a.name account, r.name region, o.total_amt_usd/(o.total + 0.01) unit_price \
FROM accounts a \
JOIN sales_reps s \
ON a.sales_rep_id = s.id \
JOIN region r \
ON s.region_id = r.id \
JOIN orders o \
ON a.id = o.account_id \
WHERE o.standard_qty > 100 \
AND o.poster_qty > 50 \
ORDER BY unit_price; \
", con)
sql_result.head()

Unnamed: 0,account,region,unit_price
0,State Farm Insurance Cos.,Northeast,5.119282
1,DISH Network,Southeast,5.231816
2,Travelers Cos.,Northeast,5.235181
3,Best Buy,Northeast,5.260426
4,Stanley Black & Decker,West,5.266396


What are the different channels used by account id 1001? Your final table should have only 2 columns: account name and the different channels. You can try SELECT DISTINCT to narrow down the results to only the unique values.

In [41]:
sql_result = pd.read_sql_query("\
SELECT DISTINCT a.name, w.channel \
FROM accounts a \
JOIN web_events w \
ON a.id = w.account_id \
WHERE a.id = '1001'; \
", con)
sql_result.head()

Unnamed: 0,name,channel
0,Walmart,adwords
1,Walmart,banner
2,Walmart,direct
3,Walmart,facebook
4,Walmart,organic


Find all the orders that occurred in 2015. Your final table should have 4 columns: occurred_at, account name, order total, and order total_amt_usd.

In [65]:
connection = engine.connect()
result = connection.execute("SELECT o.occurred_at, a.name, o.total, o.total_amt_usd \
FROM accounts a \
JOIN orders o \
ON o.account_id = a.id \
WHERE o.occurred_at BETWEEN '2015-01-01' AND '2016-01-01' \
ORDER BY o.occurred_at DESC \
LIMIT 10;")
for row in result:
    print(row)
connection.close()

('2015-12-31T23:21:15.000Z', 'Thermo Fisher Scientific', 61, 446.97)
('2015-12-31T23:15:35.000Z', 'Thermo Fisher Scientific', 635, 3246.9)
('2015-12-31T20:44:28.000Z', 'Coca-Cola', 528, 2693.54)
('2015-12-31T15:12:41.000Z', 'Computer Sciences', 164, 875.25)
('2015-12-31T15:11:15.000Z', 'Cameron International', 513, 2626.82)
('2015-12-31T13:29:55.000Z', 'eBay', 52, 422.24)
('2015-12-31T13:08:25.000Z', 'eBay', 506, 2581.28)
('2015-12-31T09:14:45.000Z', 'Masco', 160, 847.18)
('2015-12-31T09:07:46.000Z', 'Masco', 763, 4785.99)
('2015-12-31T02:51:20.000Z', 'Cigna', 516, 2584.84)
