## Query a cloud-based MySQL database using boto3 and mysql.connector

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import boto3
import mysql.connector

In [2]:
# If needed, install the library used to connect to a MySQL database. This takes about 5 minutes.
# %conda install -c anaconda mysql-connector-python

In [3]:
# Make sure this loads
import mysql.connector

In [4]:
# Initialize boto3 and make sure credentials are ready to connect to AWS.
# The STS get_caller_identity() call is used to look up the identity
# associated with the current session.
session = boto3.Session()
sts = session.client('sts')
response = sts.get_caller_identity()
# The user name is taken to be the second '/'-separated piece of the ARN
arn_parts = response['Arn'].split('/')
my_username = arn_parts[1]
print(my_username)

kcolvin


#### Query a MySQL database, step-by-step

In [5]:
# Connection parameters for the cloud-based MySQL server queried in this notebook.
#
# DNS name or IP address of the AWS RDS database server
HOST = '34.222.176.125'
#
# Credentials for the MySQL server itself. These are NOT the same as AWS credentials.
# Replace the PASSWORD placeholder with the value given in class, and avoid
# saving or sharing the notebook with the real password in it.
USER = 'student'
PASSWORD = 'to be supplied in class'
#
# Which database on the server to query
DATABASE = '212'
#
# Which TCP port to talk over (MySQL servers commonly use 3306; this one uses 80).
# Kept as an integer, which is the type mysql.connector documents for `port`.
PORT = 80

In [6]:
# Querying the database requires an open connection to it. Gather the
# parameters defined above and hand them to mysql.connector to create
# the connection object, called conn.
conn_params = dict(
    host=HOST,
    port=PORT,
    user=USER,
    password=PASSWORD,
    database=DATABASE,
)
conn = mysql.connector.connect(**conn_params)

In [7]:
# Next, create a cursor object from the connection. In this notebook every
# query is executed through the cursor and its rows are fetched from it.
mycursor = conn.cursor()

In [8]:
# Next, write an SQL query. To Python this is just a string; its contents
# must follow MySQL syntax, and any legal MySQL statement can be used.
#
# This query selects all columns (*) from the 'co2_emission' table,
# but only the first 3 rows (LIMIT 3).
query = "SELECT * FROM co2_emission LIMIT 3"

In [9]:
# Execute the query against the database using the cursor created earlier.
# The rows are retrieved from the cursor in the next step.
mycursor.execute(query)

In [10]:
# fetchall() returns all of the rows produced by the query as a list of tuples,
# one tuple per row. That shape is convenient for loading into a DataFrame.
myresult = mycursor.fetchall()

In [11]:
# Inspect the result.
# Notice this is a list of tuples: one tuple per row, one element per column.
myresult

[('Afghanistan', 'AFG', 1949, 14656.0),
 ('Afghanistan', 'AFG', 1950, 84272.0),
 ('Afghanistan', 'AFG', 1951, 91600.0)]

In [12]:
# The list of tuples has no column names, so supply the headers ourselves
headers = ['Country', 'Code', 'Year', 'CO2']
# Build a DataFrame from the fetched rows, labelling the columns with the headers
df = pd.DataFrame(myresult, columns=headers)
df

Unnamed: 0,Country,Code,Year,CO2
0,Afghanistan,AFG,1949,14656.0
1,Afghanistan,AFG,1950,84272.0
2,Afghanistan,AFG,1951,91600.0


#### Query a MySQL database with a 'real' query

In [13]:
# Write the query: same table as before, but with no LIMIT clause
query = "SELECT * FROM co2_emission"

In [14]:
# The same execute / fetch / DataFrame steps as above, combined into one cell
mycursor.execute(query)
myresult = mycursor.fetchall()
headers = ['Entity', 'Code', 'Year', 'CO2']
df = pd.DataFrame(myresult, columns=headers)
# Report the dimensions, then preview the first and last five rows
n_rows, n_cols = df.shape
print('The size of the dataframe is:', n_rows, 'rows and', n_cols, 'columns.\n')
print(df.head(5))
print(df.tail(5))

The size of the dataframe is: 20853 rows and 4 columns.

        Entity Code  Year       CO2
0  Afghanistan  AFG  1949   14656.0
1  Afghanistan  AFG  1950   84272.0
2  Afghanistan  AFG  1951   91600.0
3  Afghanistan  AFG  1952   91600.0
4  Afghanistan  AFG  1953  106256.0
         Entity Code  Year          CO2
20848  Zimbabwe  ZWE  2013  11536239.29
20849  Zimbabwe  ZWE  2014  11866348.41
20850  Zimbabwe  ZWE  2015  10907603.94
20851  Zimbabwe  ZWE  2016   9932649.88
20852  Zimbabwe  ZWE  2017  10397718.47


#### Other operations using standard MySQL syntax

In [15]:
# List the tables that exist in the '212' database
query = "SHOW TABLES"
# Same pattern as before: execute, fetch all rows, then display them.
# Each row comes back as a one-element tuple holding a table name.
mycursor.execute(query)
myresult = mycursor.fetchall()
myresult

[('athlete_events',),
 ('avocado',),
 ('car_payment',),
 ('co2_emission',),
 ('county_population',),
 ('forecast_weather_data',),
 ('titanic',)]

In [16]:
# List the databases available on the MySQL server
query = "SHOW DATABASES"
# Same pattern as before: execute, fetch all rows, then display them.
# Each row comes back as a one-element tuple holding a database name.
mycursor.execute(query)
myresult = mycursor.fetchall()
myresult

[('information_schema',),
 ('212',),
 ('312',),
 ('album',),
 ('dkraker',),
 ('dkumamot',),
 ('flighttest',),
 ('kcolvin',),
 ('kycolvin',),
 ('lknott',),
 ('mysql',),
 ('nmediati',),
 ('northwind',),
 ('nrodri31',),
 ('performance_schema',),
 ('sys',),
 ('test',),
 ('w3schools',)]

#### To change databases, one straightforward approach is to create a new connector

In [17]:
# It is good practice to close a connection once you are done with it.
# A new connection (and a new cursor) is created below for the next database.
conn.close()

In [18]:
# Point the DATABASE setting at '312' ...
DATABASE = '312'
# ... and open a fresh connection; every other parameter is reused as-is
conn_params = dict(
    host=HOST,
    port=PORT,
    user=USER,
    password=PASSWORD,
    database=DATABASE,
)
conn = mysql.connector.connect(**conn_params)

In [19]:
# Create a new cursor from the new connection; the queries below run through it
mycursor = conn.cursor()

In [20]:
# List the tables that exist in the '312' database
query = 'SHOW TABLES'
# Same pattern as before: execute, fetch all rows, then display them
mycursor.execute(query)
myresult = mycursor.fetchall()
myresult

[('clean',), ('cutting',)]

In [21]:
# Ask the server to describe the 'clean' table in the database
query = 'DESCRIBE clean'
# Same pattern as before: execute, fetch all rows, then display them.
# Each returned row describes one column of the table; the first two
# fields are the column's name and its data type.
mycursor.execute(query)
myresult = mycursor.fetchall()
myresult

[('serial_num', 'int(11)', 'YES', '', None, ''),
 ('process', 'varchar(1024)', 'YES', '', None, '')]

In [22]:
# Retrieve every row and column of the 'clean' table
query = 'SELECT * FROM clean'
# Same execute / fetch pattern as above, in a single cell
mycursor.execute(query)
myresult = mycursor.fetchall()
# Take the column headings from the cursor rather than typing them by hand.
# For this table they are 'serial_num' and 'process', matching the DESCRIBE
# output; the same line works unchanged for any other table.
headers = list(mycursor.column_names)
# Create the df
df = pd.DataFrame(data=myresult, columns=headers)
print('The size of the dataframe is:',df.shape[0],'rows and',df.shape[1],'columns.\n')
print(df.head(5))
print(df.tail(5))
# Done with this database: close the cursor first, then the connection
mycursor.close()
conn.close()

The size of the dataframe is: 9517 rows and 2 columns.

   serial_num process
0       10000   clean
1       10001   clean
2       10002   clean
3       10003   clean
4       10004   clean
      serial_num process
9512       24996   clean
9513       24997   clean
9514       24998   clean
9515       24999   clean
9516       25000   clean


## Assignment
Your assignment is to:
- connect to the 'w3schools' database
- show all tables in that database
- describe the details about the 'customers' table
- query all columns and rows from the 'customers' table and load the rows into a pandas dataframe, with column headings
- save that dataframe as a .csv file at the S3 location: bucket = gse580, key = 'your username'/data/customers.csv

Assessment of assignment: Verify valid .csv file exists at: gse580/'username'/data/customers.csv

In [30]:
# Your code starts here