In [1]:
import boto3
import botocore
import csv
from io import StringIO
import psycopg2

In [2]:
def download_csv_from_s3(bucket_name, object_key):
    """Download a CSV object from S3 and return its text with a cleaned header.

    The request is sent unsigned (``signature_version=botocore.UNSIGNED``),
    i.e. without AWS credentials.

    Args:
        bucket_name: Name of the S3 bucket holding the object.
        object_key: Key of the CSV object inside the bucket.

    Returns:
        The decoded CSV text. Every column name in the first line has its
        leading and trailing whitespace removed; everything after the first
        newline is returned unchanged. Content without any newline is treated
        as a header-only file.

    Note:
        The header is split on plain commas, so quoted column names that
        themselves contain a comma are not supported.
    """
    s3 = boto3.client('s3', config=botocore.config.Config(signature_version=botocore.UNSIGNED))
    response = s3.get_object(Bucket=bucket_name, Key=object_key)
    # 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading byte-order
    # mark, which would otherwise become part of the first column name.
    content = response['Body'].read().decode('utf-8-sig')

    # partition() always yields three parts, so a header-only or empty object
    # no longer fails the way unpacking split("\n", 1) did.
    header, newline, rows = content.partition("\n")
    cleaned_header = ",".join(column.strip() for column in header.split(","))

    # Re-attach the separator only if the original content actually had one.
    return cleaned_header + newline + rows

# Fetch the Iris CSV once; later cells re-parse this text instead of downloading again.
iris_csv_content = download_csv_from_s3(
    bucket_name='data-eng-makers-public-datasets-404544469985',
    object_key='etl_bites_04_iris_dataset.csv',
)

In [8]:
# Parse the downloaded text into one dict per data row (keyed by the cleaned
# header names) and print each one.
iris_reader = csv.DictReader(StringIO(iris_csv_content))
for record in iris_reader:
    print(record)

{'Id': '1', 'SepalLengthCm': '5.1', 'SepalWidthCm': '3.5', 'PetalLengthCm': '1.4', 'PetalWidthCm': '0.2', 'Species': 'Iris-setosa'}
{'Id': '2', 'SepalLengthCm': '4.9', 'SepalWidthCm': '3.0', 'PetalLengthCm': '1.4', 'PetalWidthCm': '0.2', 'Species': 'Iris-setosa'}
{'Id': '3', 'SepalLengthCm': '4.7', 'SepalWidthCm': '3.2', 'PetalLengthCm': '1.3', 'PetalWidthCm': '0.2', 'Species': 'Iris-setosa'}
{'Id': '4', 'SepalLengthCm': '4.6', 'SepalWidthCm': '3.1', 'PetalLengthCm': '1.5', 'PetalWidthCm': '0.2', 'Species': 'Iris-setosa'}
{'Id': '5', 'SepalLengthCm': '5.0', 'SepalWidthCm': '3.6', 'PetalLengthCm': '1.4', 'PetalWidthCm': '0.2', 'Species': 'Iris-setosa'}
{'Id': '6', 'SepalLengthCm': '5.4', 'SepalWidthCm': '3.9', 'PetalLengthCm': '1.7', 'PetalWidthCm': '0.4', 'Species': 'Iris-setosa'}
{'Id': '7', 'SepalLengthCm': '4.6', 'SepalWidthCm': '3.4', 'PetalLengthCm': '1.4', 'PetalWidthCm': '0.3', 'Species': 'Iris-setosa'}
{'Id': '8', 'SepalLengthCm': '5.0', 'SepalWidthCm': '3.4', 'PetalLengthCm': 

In [17]:
def calculate_averages(data):
    """Compute the mean sepal length and width for each species.

    Args:
        data: Iterable of row mappings that each provide the keys
            'Species', 'SepalLengthCm' and 'SepalWidthCm'. The two
            measurements must be convertible with float().

    Returns:
        A dict keyed by species, in order of first appearance. Each value is
        a dict with 'SepalLength' and 'SepalWidth' holding the mean of the
        respective column rounded to two decimal places. Empty input yields
        an empty dict.
    """
    measurements = {}
    for record in data:
        # One bucket of raw readings per species, created on first sight.
        bucket = measurements.setdefault(
            record['Species'], {'SepalLength': [], 'SepalWidth': []}
        )
        length = float(record['SepalLengthCm'])
        width = float(record['SepalWidthCm'])
        bucket['SepalLength'].append(length)
        bucket['SepalWidth'].append(width)

    def _rounded_mean(values):
        return round(sum(values) / len(values), 2)

    return {
        species: {
            'SepalLength': _rounded_mean(bucket['SepalLength']),
            'SepalWidth': _rounded_mean(bucket['SepalWidth']),
        }
        for species, bucket in measurements.items()
    }

# Re-parse the cleaned CSV text and aggregate it per species; the bare
# expression on the last line displays the result as the cell output.
iris_rows = csv.DictReader(StringIO(iris_csv_content))
average_sepal_values_by_species = calculate_averages(iris_rows)
average_sepal_values_by_species

{'Iris-setosa': {'SepalLength': 5.01, 'SepalWidth': 3.42},
 'Iris-versicolor': {'SepalLength': 5.94, 'SepalWidth': 2.77},
 'Iris-virginica': {'SepalLength': 6.59, 'SepalWidth': 2.97}}

In [None]:
-- Target table for the rows written by insert_data_to_postgresql:
-- one row per species with its average sepal measurements.
CREATE TABLE iris_sepal_averages(
    id SERIAL PRIMARY KEY,  -- auto-generated surrogate key
    species TEXT,  -- species label; declared without NOT NULL or UNIQUE
    average_sepal_length_cm FLOAT NOT NULL,
    average_sepal_width_cm FLOAT NOT NULL
);
-- NOTE: this table was created by running the statement above directly in TablePlus, rather than from a Python script, for the sake of variety.

In [19]:
def insert_data_to_postgresql(data, connection):
    """Insert per-species sepal averages into the iris_sepal_averages table.

    All rows are written in a single transaction: either every row is
    committed, or the transaction is rolled back and the error re-raised.

    Args:
        data: Mapping of species name to a mapping that holds the
            'SepalLength' and 'SepalWidth' averages. A missing entry is
            passed to the database as None.
        connection: Open DB-API connection (this notebook creates it with
            psycopg2.connect). It is committed or rolled back here but is
            left open for the caller.

    Raises:
        Exception: Whatever the driver raises while inserting or committing,
            re-raised after the transaction has been rolled back.
    """
    query = "INSERT INTO iris_sepal_averages(species, average_sepal_length_cm, average_sepal_width_cm) VALUES (%s, %s, %s)"

    # Build every parameter tuple up front so malformed input fails before
    # anything is sent to the database.
    rows = [
        (species, values.get('SepalLength'), values.get('SepalWidth'))
        for species, values in data.items()
    ]

    cursor = connection.cursor()
    try:
        cursor.executemany(query, rows)
        connection.commit()
    except Exception:
        # Without a rollback the connection stays in an aborted transaction
        # and every later statement on it (e.g. re-running the cell) fails.
        connection.rollback()
        raise
    finally:
        # The cursor was previously never closed.
        cursor.close()

# libpq-style connection string for the local etl_bites database.
etl_bites_conn_string = "host='localhost' port='5432' dbname='etl_bites' user='jackdench'"
conn = psycopg2.connect(dsn=etl_bites_conn_string)

# Persist the per-species averages computed earlier in the notebook.
# NOTE(review): each run of this cell issues a fresh set of INSERTs.
insert_data_to_postgresql(data=average_sepal_values_by_species, connection=conn)