In [46]:
import boto3
import botocore
import csv
import os
import io
from io import StringIO
import psycopg2
from dotenv import load_dotenv #pip install python-dotenv
from psycopg2 import connect, sql
from os import environ as env
import pandas as pd

# Load variables from a local .env file into the process environment.
load_dotenv()
conn_string = os.getenv('conn_string')

# Fail fast: every LOAD step below passes conn_string to psycopg2's connect(),
# so a missing value would otherwise surface much later as an obscure error.
if not conn_string:
    raise RuntimeError(
        "Environment variable 'conn_string' is not set; "
        "define it in your .env file before running this notebook."
    )

# Sanity check only: show just the first 35 characters so the rest of the
# connection string never ends up in the saved notebook output.
print(conn_string[:35])

dbname='etl_bites' user='joemiller'


## 04 Challenge

Your challenge is to find the average sepal length and sepal width for each species and store the results in a local PostgreSQL database.

### EXTRACT

Download the CSV file from the Amazon S3 bucket.

This S3 bucket is public, so we don't need any AWS credentials to access it.

In [47]:
def download_csv_from_s3(bucket_name, object_key):
    """Fetch a CSV object from S3 using unsigned requests and parse it.

    Args:
        bucket_name: Name of the S3 bucket holding the object.
        object_key: Key of the CSV object inside the bucket.

    Returns:
        A pandas DataFrame built from the object's UTF-8 decoded body.
    """
    # UNSIGNED signature version: the client sends requests without credentials.
    unsigned_cfg = botocore.config.Config(signature_version=botocore.UNSIGNED)
    client = boto3.client('s3', config=unsigned_cfg)

    s3_object = client.get_object(Bucket=bucket_name, Key=object_key)
    csv_text = s3_object['Body'].read().decode('utf-8')

    # The frame is returned exactly as parsed; no cleaning is applied here.
    return pd.read_csv(io.StringIO(csv_text))

# Pull the iris dataset from the public bucket into a DataFrame.
iris_df = download_csv_from_s3(
    bucket_name='data-eng-makers-public-datasets-404544469985',
    object_key='etl_bites_04_iris_dataset.csv',
)

In [48]:
iris_df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


### TRANSFORM

In [49]:
# Mean sepal length per species, rounded to 2 decimal places.
mean_sepal_length = (
    iris_df
    .groupby("Species")["SepalLengthCm"]
    .mean()
    .round(2)
)
mean_sepal_length

Species
Iris-setosa        5.01
Iris-versicolor    5.94
Iris-virginica     6.59
Name: SepalLengthCm, dtype: float64

In [50]:
# Mean sepal width per species, rounded to 2 decimal places.
mean_sepal_width = (
    iris_df
    .groupby("Species")["SepalWidthCm"]
    .mean()
    .round(2)
)
mean_sepal_width

Species
Iris-setosa        3.42
Iris-versicolor    2.77
Iris-virginica     2.97
Name: SepalWidthCm, dtype: float64

In [51]:
# Turn the Species-indexed Series into a two-column DataFrame
# (Species, SepalLengthCm) so it can be merged on Species.
sepal_length_df = mean_sepal_length.reset_index()
sepal_length_df

Unnamed: 0,Species,SepalLengthCm
0,Iris-setosa,5.01
1,Iris-versicolor,5.94
2,Iris-virginica,6.59


In [52]:
sepal_length_df.columns

Index(['Species', 'SepalLengthCm'], dtype='object')

In [53]:
# Turn the Species-indexed Series into a two-column DataFrame
# (Species, SepalWidthCm) so it can be merged on Species.
sepal_width_df = mean_sepal_width.reset_index()
sepal_width_df

Unnamed: 0,Species,SepalWidthCm
0,Iris-setosa,3.42
1,Iris-versicolor,2.77
2,Iris-virginica,2.97


In [54]:
# Join the two per-species averages into one row per species
# (width columns first, then length).
merged_sepal_df = sepal_width_df.merge(sepal_length_df, on="Species")
merged_sepal_df

Unnamed: 0,Species,SepalWidthCm,SepalLengthCm
0,Iris-setosa,3.42,5.01
1,Iris-versicolor,2.77,5.94
2,Iris-virginica,2.97,6.59


In [55]:
# Rename to the column names used by the target table. rename() returns a new
# frame; rebinding the name (rather than inplace=True) leaves the object that
# the merge cell created and displayed untouched.
merged_sepal_df = merged_sepal_df.rename(columns={
    "SepalWidthCm": "Avg_SepalWidthCm",
    "SepalLengthCm": "Avg_SepalLengthCm",
})

merged_sepal_df

Unnamed: 0,Species,Avg_SepalWidthCm,Avg_SepalLengthCm
0,Iris-setosa,3.42,5.01
1,Iris-versicolor,2.77,5.94
2,Iris-virginica,2.97,6.59


### LOAD

In [56]:
def execute_query_postgresql(conn_string, query):
    """Run a single SQL statement (or script) against PostgreSQL and commit it.

    Args:
        conn_string: Connection string passed to psycopg2's ``connect``.
        query: SQL text (or composed SQL object) handed to ``cursor.execute``.

    Raises:
        Whatever ``connect`` or ``cursor.execute`` raises; the transaction is
        rolled back and the connection is closed before the error propagates.
    """
    conn = connect(conn_string)
    try:
        # `with conn` commits on success and rolls back on error, but it does
        # NOT close the connection -- hence the explicit close() below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)
    finally:
        conn.close()

# DDL sent as one multi-statement query: drops any existing
# avg_sepal_length_width table (CASCADE also removes dependent objects) and
# recreates it empty, so re-running this cell resets the target table.
# The column identifiers are unquoted; the SELECT output further down shows
# them lowercased.
create_api_data_table = '''
DROP TABLE IF EXISTS avg_sepal_length_width  CASCADE;
CREATE TABLE avg_sepal_length_width (
Species TEXT NOT NULL,
Avg_SepalWidthCm FLOAT NOT NULL,
Avg_SepalLengthCm FLOAT NOT NULL
);
'''

# Create (or reset) the table before any rows are inserted.
execute_query_postgresql(conn_string, create_api_data_table)

In [57]:
merged_sepal_df

Unnamed: 0,Species,Avg_SepalWidthCm,Avg_SepalLengthCm
0,Iris-setosa,3.42,5.01
1,Iris-versicolor,2.77,5.94
2,Iris-virginica,2.97,6.59


In [58]:
def insert_data_to_postgresql(conn_string, table_name, data):
    """Insert every row of ``data`` into ``table_name`` in one transaction.

    Args:
        conn_string: Connection string passed to psycopg2's ``connect``.
        table_name: Target table; quoted safely via ``sql.Identifier``.
        data: DataFrame with the columns ``Species``, ``Avg_SepalWidthCm``
            and ``Avg_SepalLengthCm``.

    Raises:
        KeyError: If ``data`` lacks one of the required columns.
        Whatever ``connect`` or ``cursor.execute`` raises; the transaction is
        rolled back and the connection is closed before the error propagates.
    """
    # Built once: the statement is identical for every row.
    query = sql.SQL(
        "INSERT INTO {} (Species, Avg_SepalWidthCm, Avg_SepalLengthCm) VALUES (%s, %s, %s)"
    ).format(sql.Identifier(table_name))

    # Positional row tuples: independent of the index labels, so frames with
    # a non-default or duplicated index are handled correctly.
    columns = ['Species', 'Avg_SepalWidthCm', 'Avg_SepalLengthCm']
    rows = list(data[columns].itertuples(index=False, name=None))

    conn = connect(conn_string)
    try:
        # `with conn` commits on success and rolls back on error, but it does
        # NOT close the connection -- hence the explicit close() below.
        with conn:
            with conn.cursor() as cur:
                for row in rows:
                    cur.execute(query, row)
    finally:
        conn.close()

# Target table for the per-species averages.
table_name = "avg_sepal_length_width"
# NOTE: the function issues plain INSERTs, so each run of this cell adds the
# rows of merged_sepal_df again; re-run the CREATE TABLE cell first to reset.
insert_data_to_postgresql(conn_string, table_name, merged_sepal_df)

In [59]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [60]:
%sql postgresql+psycopg2://joemiller:@localhost:5432/etl_bites

In [61]:
%%sql

SELECT *
FROM avg_sepal_length_width;

 * postgresql+psycopg2://joemiller:***@localhost:5432/etl_bites
3 rows affected.


species,avg_sepalwidthcm,avg_sepallengthcm
Iris-setosa,3.42,5.01
Iris-versicolor,2.77,5.94
Iris-virginica,2.97,6.59
