In [83]:
from env import host, user, password
from datetime import datetime
import pandas as pd
import numpy as np
from pydataset import data

## Exercises I

In [3]:
def get_db_url(user, hostname, password, database_name):
    return f'mysql+pymysql://{user}:{password}@{hostname}/{database_name}'

In [13]:
url = get_db_url(user, host, password, 'employees')

In [8]:
pd.read_sql('SELECT * FROM employees LIMIT 5 OFFSET 50', url)

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10051,1953-07-28,Hidefumi,Caine,M,1992-10-15
1,10052,1961-02-26,Heping,Nitsch,M,1988-05-21
2,10053,1954-09-13,Sanjiv,Zschoche,F,1986-02-04
3,10054,1957-04-04,Mayumi,Schueller,M,1995-03-13
4,10055,1956-06-06,Georgy,Dredge,M,1992-04-27


In [9]:
url = url + '332'

In [11]:
# Intentionally make typo in database url
pd.read_sql('SELECT * FROM employees LIMIT 5 OFFSET 50', url)

OperationalError: (pymysql.err.OperationalError) (1044, "Access denied for user 'innis_1659'@'%' to database 'employees332'")
(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [12]:
# The url typo results in an OperationalError and Access denied

In [14]:
# Intentionally making error in SQL query
pd.read_sql('SELECT * FROM employees LIMIT 5 OFFFSET 50', url)

ProgrammingError: (pymysql.err.ProgrammingError) (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'OFFFSET 50' at line 1")
[SQL: SELECT * FROM employees LIMIT 5 OFFFSET 50]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [15]:
# That typo in the SQL query resulted in a Programming Error

In [16]:
employees = pd.read_sql('SELECT * FROM employees', url)
titles = pd.read_sql('SELECT * FROM titles', url)

7. How many rows and columns for each DataFrame

In [18]:
# Rows and columns for each dataframe matches SQL database
employees.shape 

(300024, 6)

In [19]:
titles.shape

(443308, 4)

In [20]:
# Row and column numbers match the SQL database (checked using Sequel-Ace)

8. Summary statistics for each dataframe

In [28]:
employees.describe(include = 'all')

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
count,300024.0,300024,300024,300024,300024,300024
unique,,4750,1275,1637,2,5434
top,,1952-03-08,Shahab,Baba,M,1985-06-20
freq,,95,295,226,179973,132
mean,253321.763392,,,,,
std,161828.23554,,,,,
min,10001.0,,,,,
25%,85006.75,,,,,
50%,249987.5,,,,,
75%,424993.25,,,,,


In [29]:
titles.describe(include = 'all')

Unnamed: 0,emp_no,title,from_date,to_date
count,443308.0,443308,443308,443308
unique,,7,6393,5888
top,,Engineer,1998-10-25,9999-01-01
freq,,115003,132,240124
mean,253075.03443,,,
std,161853.292613,,,
min,10001.0,,,
25%,84855.75,,,
50%,249847.5,,,
75%,424891.25,,,


9. How many unique titles in titles DataFrame

In [35]:
# We can see 7 unique titles as show above using describe(). Also can generate list of the titles and number as follows:

print(f"There are {len(titles.title.unique())} unique title(s) in the titles DataFrame")

There are 7 unique title(s) in the titles DataFrame


10. Oldest date in the to_date column

In [38]:
print(f"The oldest date in the to_date column for the titles table is {titles.to_date.min()}")

The oldest date in the to_date column for the titles table is 1985-03-01


11. Most recent date in the to_date column

In [66]:
print(f"The most recent date in the to_date column for the titles table (besides {titles.to_date.max()}) is {titles.to_date[titles.to_date <=datetime.today().date()].max()}")

The most recent date in the to_date column for the titles table (besides 9999-01-01) is 2002-08-01


## Exercises II

1. Copy the users and roles DataFrames from the examples above.

In [69]:
# Create the users DataFrame.

users = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 6],
    'name': ['bob', 'joe', 'sally', 'adam', 'jane', 'mike'],
    'role_id': [1, 2, 3, 3, np.nan, np.nan]
})
users


Unnamed: 0,id,name,role_id
0,1,bob,1.0
1,2,joe,2.0
2,3,sally,3.0
3,4,adam,3.0
4,5,jane,
5,6,mike,


In [70]:
# Create the roles DataFrame

roles = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['admin', 'author', 'reviewer', 'commenter']
})
roles


Unnamed: 0,id,name
0,1,admin
1,2,author
2,3,reviewer
3,4,commenter


What is the result of using a right join on the DataFrames?

In [72]:
users.merge(roles, how = "right")

Unnamed: 0,id,name,role_id
0,1,admin,
1,2,author,
2,3,reviewer,
3,4,commenter,


What is the result of using an outer join on the DataFrames?

In [73]:
users.merge(roles, how = "outer")

Unnamed: 0,id,name,role_id
0,1,bob,1.0
1,2,joe,2.0
2,3,sally,3.0
3,4,adam,3.0
4,5,jane,
5,6,mike,
6,1,admin,
7,2,author,
8,3,reviewer,
9,4,commenter,


What happens if you drop the foreign keys from the DataFrames and try to merge them?

In [81]:
users.drop(columns='role_id').merge(roles.drop(columns = 'id'))

Unnamed: 0,id,name


Load the mpg dataset from PyDataset.

In [109]:
mpg = data('mpg')

Output and read the documentation for the mpg dataset.

In [90]:
data('mpg',show_doc=True)

mpg

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Fuel economy data from 1999 and 2008 for 38 popular models of car

### Description

This dataset contains a subset of the fuel economy data that the EPA makes
available on http://fueleconomy.gov. It contains only models which had a new
release every year between 1999 and 2008 - this was used as a proxy for the
popularity of the car.

### Usage

    data(mpg)

### Format

A data frame with 234 rows and 11 variables

### Details

  * manufacturer. 

  * model. 

  * displ. engine displacement, in litres 

  * year. 

  * cyl. number of cylinders 

  * trans. type of transmission 

  * drv. f = front-wheel drive, r = rear wheel drive, 4 = 4wd 

  * cty. city miles per gallon 

  * hwy. highway miles per gallon 

  * fl. 

  * class. 




How many rows and columns are in the dataset?

In [96]:
print(f'There are {mpg.shape[0]} rows and {mpg.shape[1]} columns in the mpg dataset')

There are 234 rows and 11 columns in the mpg dataset


Check out your column names and perform any cleanup you may want on them.

In [110]:
mpg = mpg.rename(columns = {'cty':'city_mpg','hwy':'highway_mpg','drv':'drive'})

In [114]:
mpg.describe(include='all')

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drive,city_mpg,highway_mpg,fl,class
count,234,234,234.0,234.0,234.0,234,234,234.0,234.0,234,234
unique,15,38,,,,10,3,,,5,7
top,dodge,caravan 2wd,,,,auto(l4),f,,,r,suv
freq,37,11,,,,83,106,,,168,62
mean,,,3.471795,2003.5,5.888889,,,16.858974,23.440171,,
std,,,1.291959,4.509646,1.611534,,,4.255946,5.954643,,
min,,,1.6,1999.0,4.0,,,9.0,12.0,,
25%,,,2.4,1999.0,4.0,,,14.0,18.0,,
50%,,,3.3,2003.5,6.0,,,17.0,24.0,,
75%,,,4.6,2008.0,8.0,,,19.0,27.0,,


How many different manufacturers are there?

In [115]:
print(f'There are {len(mpg.manufacturer.unique())} unique manufacturers in this dataset')

There are 15 unique manufacturers in this dataset


How many different models are there?

In [117]:
print(f'There are {len(mpg.model.unique())} unique models in this dataset')

There are 38 unique models in this dataset


Create a column named mileage_difference like you did in the DataFrames exercises; this column should contain the difference between highway and city mileage for each car.

In [118]:
mpg['mileage_difference'] = mpg.highway_mpg - mpg.city_mpg

Create a column named average_mileage like you did in the DataFrames exercises; this is the mean of the city and highway mileage.

In [123]:
mpg['average_mileage'] = mpg[['highway_mpg','city_mpg']].mean(axis = 1)

Create a new column on the mpg dataset named is_automatic that holds boolean values denoting whether the car has an automatic transmission.

Using the mpg dataset, find out which which manufacturer has the best miles per gallon on average?

Do automatic or manual cars have better miles per gallon?