In [44]:
# Extract

import csv
from io import StringIO

import boto3
import botocore
import pandas as pd
import psycopg2
import psycopg2.sql

In [23]:
def download_csv_from_s3(bucket_name, object_key):
    """Download a CSV object from a public S3 bucket and tidy its header row.

    Args:
        bucket_name: Name of the S3 bucket that holds the object.
        object_key: Key of the CSV object inside the bucket.

    Returns:
        The CSV text with leading/trailing whitespace removed from every
        header column name. Everything after the header line is returned
        unchanged.
    """
    # Unsigned requests allow reading a public bucket without AWS credentials.
    s3 = boto3.client('s3', config=botocore.config.Config(signature_version=botocore.UNSIGNED))
    response = s3.get_object(Bucket=bucket_name, Key=object_key)
    # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise stay glued to the first column name.
    content = response['Body'].read().decode('utf-8-sig')

    # partition() also copes with content that has no newline at all
    # (header-only or empty object), where split() + unpacking would raise.
    header, _, rows = content.partition("\n")
    cleaned_header = ",".join(column.strip() for column in header.split(","))

    return cleaned_header + "\n" + rows

# Fetch the Titanic passenger CSV that the following cells work on.
titanic_csv_content = download_csv_from_s3(
    bucket_name='data-eng-makers-public-datasets-404544469985',
    object_key='etl_bites_04_titanic_dataset.csv',
)

First the necessary libraries are imported. Here is what each one is for:
- boto3 = This is for interacting with AWS services
- botocore = This is for configuring the client
- csv = This is for CSV processing
- StringIO = This is for handling strings as file-like objects.

Then the download_csv_from_s3() function is defined, which takes two parameters. Inside the function, the Boto3 S3 client is created and configured. The configuration 'signature_version=botocore.UNSIGNED' is used when accessing a public S3 bucket that doesn't require authentication.

The get_object() method is called to retrieve the object from the specified bucket and key. The response content is read and decoded as UTF-8.

The header row is processed to remove any leading and trailing spaces from each column name using a generator expression. The cleaned content is then returned as a string.

Lastly the download_csv_from_s3() function is called to extract the content of a specific CSV file named 'etl_bites_04_titanic_dataset.csv' from the S3 bucket 'data-eng-makers-public-datasets-404544469985'.

Overall, this code downloads a CSV file from an S3 bucket, making it available for further processing or loading into a database.

In [24]:
def calculate_average_fare(titanic_data, pclass_filter):
    """Return the mean fare paid by passengers of one class.

    Args:
        titanic_data: Iterable of dict-like rows (for example a
            ``csv.DictReader``) exposing 'Pclass' and 'Fare' as text.
        pclass_filter: Passenger class to average over; it is compared with
            the row's 'Pclass' text via ``str(pclass_filter)``.

    Returns:
        The average fare as a float, or 0 when no row of that class carries
        a fare.

    Raises:
        KeyError: If a row has no 'Pclass' key, or a matching row has no
            'Fare' key.
        ValueError: If a non-blank fare is not a valid number.
    """
    wanted_class = str(pclass_filter)
    total_fare = 0.0
    passengers_count = 0

    for row in titanic_data:
        # Tolerate whitespace padding around the class value.
        if (row['Pclass'] or '').strip() != wanted_class:
            continue
        fare_text = row['Fare']
        # A missing fare must neither crash the run nor count as 0 in the mean.
        if fare_text is None or not fare_text.strip():
            continue
        total_fare += float(fare_text)
        passengers_count += 1

    return total_fare / passengers_count if passengers_count > 0 else 0

# Parse the downloaded CSV text into dict rows and average the first-class fares.
average_fare_class_1 = calculate_average_fare(
    titanic_data=csv.DictReader(StringIO(titanic_csv_content)),
    pclass_filter=1,
)

Here the calculate_average_fare() function is defined and takes two parameters: titanic_data and pclass_filter (This is the passenger class: 1,2,or 3). The total_fare and passengers_count variables are both set to zero.

Within the function we create a loop and iterate over each row in the titanic_data. We ensure that the passengers_count is increased by 1 if the item in the row in the column 'Pclass' matches the pclass_filter passed as an argument. We also ensure that the total_fare is increased by the amount in the 'Fare' column for that row.

We then end the loop and return the total_fare divided by the passengers_count if the passengers_count is bigger than 0, otherwise we return 0.

Finally we call the calculate_average_fare() function and convert the 'titanic_csv_content' dataset into a csv.DictReader object. The 'pclass_filter' is set to 1 to calculate the average fare for the first class.

To summarise, the code calculates the average fare for a specific passenger class (1, 2, or 3) in the Titanic dataset. It iterates over the dataset, sums the fares for the passengers in the specified class, and divides the sum by the count of passengers to compute the average fare.

In [25]:
# DDL for the table that receives the per-class average fares.
create_class_average_fares_table = "\n".join((
    "CREATE TABLE class_average_fares (",
    "    id SERIAL PRIMARY KEY,",
    "    pclass INTEGER NOT NULL,",
    "    average_fare NUMERIC(10, 2) NOT NULL",
    ");",
))

I created the table in TablePlus instead, but I'm keeping the CREATE TABLE statement above for reference.

In [26]:
def insert_data_to_postgresql(data, connection):
    """Insert one (pclass, average_fare) row into class_average_fares and commit.

    Args:
        data: Two-item sequence ``(pclass, average_fare)`` bound to the
            statement's placeholders.
        connection: Open database connection. It is left open so the caller
            can keep using it.

    Raises:
        Exception: Whatever the driver raises is re-raised after the
            transaction has been rolled back.
    """
    query = "INSERT INTO class_average_fares (pclass, average_fare) VALUES (%s, %s)"
    try:
        # `with` guarantees the cursor is closed even if execute() raises.
        with connection.cursor() as cursor:
            cursor.execute(query, data)
        connection.commit()
    except Exception:
        # A failed statement leaves the transaction aborted; roll back so a
        # connection reused by later cells is not stuck in that state.
        connection.rollback()
        raise

# Connection settings for the local `etl_bites` database. The password is
# deliberately not stored in the notebook: when it is omitted from the
# connection string, libpq (used by psycopg2) falls back to the PGPASSWORD
# environment variable or the ~/.pgpass file, so set one of those before
# running this cell.
etl_bites_conn_string = """
    host='localhost'
    port='5432'
    dbname='etl_bites'
    user='ilhaam.ahmed'"""
conn = psycopg2.connect(etl_bites_conn_string)

# Store the first-class average fare; `conn` stays open for later cells.
insert_data_to_postgresql((1, average_fare_class_1), conn)

Here the insert_data_to_postgresql() function is defined; it is used to insert and commit the data to the database. A connection is established with the database and insert_data_to_postgresql() is called in order to insert the data into the database. It takes '(1, average_fare_class_1), conn' as arguments. The 1 is the passenger class and average_fare_class_1 is the amount we previously calculated.

# Exercise

### Calculate the survival rate for passengers in the 2nd class from the Titanic dataset.

### Create a new table class_survival_rate in the local PostgreSQL database and insert the calculated survival rate into this table.

In [35]:
def calculate_survival_rate(csv_content, pclass):
    """Compute the share of passengers in one class who survived.

    Args:
        csv_content: Full CSV text whose header includes 'Pclass' and
            'Survived'.
        pclass: Passenger class as an integer.

    Returns:
        survivors / passengers for that class as a float, or None when the
        class has no passengers.
    """
    reader = csv.DictReader(csv_content.splitlines())
    # One boolean per passenger of the requested class: True if they survived.
    outcomes = [
        int(record['Survived']) == 1
        for record in reader
        if int(record['Pclass']) == pclass
    ]
    if not outcomes:
        return None
    return sum(outcomes) / len(outcomes)

# Download the Titanic CSV and work out the survival rate of 2nd-class passengers.
titanic_csv = download_csv_from_s3(
    bucket_name='data-eng-makers-public-datasets-404544469985',
    object_key='etl_bites_04_titanic_dataset.csv',
)
survival_rate_class_2 = calculate_survival_rate(csv_content=titanic_csv, pclass=2)

In [36]:
# DDL for the table that receives the per-class survival rate.
create_table = "\n".join((
    "",
    "CREATE TABLE class_survival_rate (",
    "    id SERIAL PRIMARY KEY,",
    "    pclass INTEGER NOT NULL,",
    "    survival_rate NUMERIC(10, 5) NOT NULL",
    ");",
))

In [37]:
def insert_survival_rate_to_postgresql(survival_rate_data, connection):
    """Insert one (pclass, survival_rate) row into class_survival_rate and commit.

    Args:
        survival_rate_data: Two-item sequence ``(pclass, survival_rate)``
            bound to the statement's placeholders.
        connection: Open database connection. It is left open so the caller
            can keep using it.

    Raises:
        Exception: Whatever the driver raises is re-raised after the
            transaction has been rolled back.
    """
    query = "INSERT INTO class_survival_rate (pclass, survival_rate) VALUES (%s, %s)"
    try:
        with connection.cursor() as cursor:
            cursor.execute(query, survival_rate_data)
        connection.commit()
    except Exception:
        # A failed statement leaves the transaction aborted; roll back so a
        # connection reused by other cells is not stuck in that state.
        connection.rollback()
        raise

# Store the 2nd-class survival rate using the already-open connection.
insert_survival_rate_to_postgresql(
    survival_rate_data=(2, survival_rate_class_2),
    connection=conn,
)

# Challenge

### For the challenge, we will use a different dataset, the Iris Species one.

### Your challenge is to find the average sepal length and sepal width for each species and store the results in a local PostgreSQL database.

In [41]:
# NOTE(review): this redefines download_csv_from_s3, which is already defined
# earlier in the notebook; the later definition is the one in effect from here
# on. Consider keeping a single copy.
def download_csv_from_s3(bucket_name, object_key):
    """Download a CSV object from a public S3 bucket and tidy its header row.

    Args:
        bucket_name: Name of the S3 bucket that holds the object.
        object_key: Key of the CSV object inside the bucket.

    Returns:
        The CSV text with leading/trailing whitespace removed from every
        header column name. Everything after the header line is returned
        unchanged.
    """
    # Unsigned requests allow reading a public bucket without AWS credentials.
    s3 = boto3.client('s3', config=botocore.config.Config(signature_version=botocore.UNSIGNED))
    response = s3.get_object(Bucket=bucket_name, Key=object_key)
    # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise stay glued to the first column name.
    content = response['Body'].read().decode('utf-8-sig')

    # partition() also copes with content that has no newline at all
    # (header-only or empty object), where split() + unpacking would raise.
    header, _, rows = content.partition("\n")
    cleaned_header = ",".join(column.strip() for column in header.split(","))

    return cleaned_header + "\n" + rows

# Fetch the Iris species CSV used by the challenge.
iris_species_csv_content = download_csv_from_s3(
    bucket_name='data-eng-makers-public-datasets-404544469985',
    object_key='etl_bites_04_iris_dataset.csv',
)

In [49]:
# libpq-style connection string for the local `etl_bites` database. No password
# is stored here: when it is omitted, libpq (used by psycopg2) falls back to the
# PGPASSWORD environment variable or the ~/.pgpass file, so set one of those
# before connecting.
conn_string = """
    host='localhost'
    port='5432'
    dbname='etl_bites'
    user='ilhaam.ahmed'
"""

def insert_iris_data_to_postgresql(conn_string, table_name, data):
    """Insert (species, sepal_length, sepal_width) rows into a PostgreSQL table.

    Args:
        conn_string: Connection string passed to ``psycopg2.connect``.
        table_name: Name of the target table; it is quoted as a single SQL
            identifier.
        data: Rows to insert: either an iterable of 3-item sequences or an
            object with ``itertuples`` (such as a pandas DataFrame) whose
            columns are in the order species, sepal_length, sepal_width.
    """
    if hasattr(data, "itertuples"):
        # DataFrame-style input: plain tuples without the index column.
        rows = list(data.itertuples(index=False, name=None))
    else:
        rows = [tuple(item) for item in data]

    # Compose the statement with psycopg2.sql so the table name is actually
    # substituted and safely quoted; the %s placeholders stay driver-bound.
    query = psycopg2.sql.SQL(
        "INSERT INTO {} (species, sepal_length, sepal_width) VALUES (%s, %s, %s)"
    ).format(psycopg2.sql.Identifier(table_name))

    connection = psycopg2.connect(conn_string)
    try:
        # `with connection` commits on success and rolls back on error, but
        # it does not close the connection, hence the explicit close() below.
        with connection:
            with connection.cursor() as cur:
                cur.executemany(query, rows)
    finally:
        connection.close()

def calculate_average_sepal_measurements(csv_content):
    """Average the sepal length and sepal width of each species in a CSV.

    Args:
        csv_content: CSV text with a header line followed by rows of exactly
            three comma-separated fields: sepal length, sepal width, species.

    Returns:
        A list of (species, mean_sepal_length, mean_sepal_width) tuples, one
        per species in order of first appearance.
    """
    _header, *lines = csv_content.split("\n")
    totals = {}  # species -> [length_sum, width_sum, row_count]

    for line in lines:
        if line == "":
            # e.g. the empty piece left after a trailing newline
            continue

        length_text, width_text, species = (field.strip() for field in line.split(","))
        length = float(length_text)
        width = float(width_text)

        if species not in totals:
            totals[species] = [0, 0, 0]
        running = totals[species]
        running[0] += length
        running[1] += width
        running[2] += 1

    averages = []
    for species, (length_sum, width_sum, row_count) in totals.items():
        averages.append((species, length_sum / row_count, width_sum / row_count))
    return averages

# Download the Iris CSV, average the sepal measurements per species, and load
# the results into the `iris_data` table.
iris_species_csv_content = download_csv_from_s3(
    bucket_name='data-eng-makers-public-datasets-404544469985',
    object_key='etl_bites_04_iris_dataset.csv',
)

average_measurements = calculate_average_sepal_measurements(csv_content=iris_species_csv_content)

table_name = 'iris_data'
insert_iris_data_to_postgresql(
    conn_string=conn_string,
    table_name=table_name,
    data=average_measurements,
)