In [2]:
import pandas as pd

# CONNECT to PostgreSQL

In [8]:
import dotenv
import os
import psycopg2

from os.path import expanduser
home = expanduser("~")

# dotenv.load_dotenv(fr"{home}\creds\local_postgres.txt")
# schema = 'deliverable_2'
dotenv.load_dotenv(fr"{home}\creds\pwd_redshift.txt")
schema = 'juanmlacasa_coderhouse'

connection_params = dict(
        host=os.getenv('host')
        , dbname=os.getenv('dbname')
        , user=os.getenv('user')
        , password=os.getenv('password')
        , port=os.getenv('port')
)


try:
    conn = psycopg2.connect(
        **connection_params
    )
    print("Connected to Redshift successfully!")
    
except Exception as e:
    print("Unable to connect to Redshift.")
    print(e)

Connected to Redshift successfully!


# EXTRACT

In [9]:
# Setup the parameters for the data to be fetched
ticker_info = dict(ticker_symbol = 'AAPL'
                   , data_source='Yahoo Finance'
                   , source_type = 'Platform'
                   , interval='1d'
                   , start_date = '2023-01-01'
                   , end_date = None)

In [4]:
import yfinance as yf

# Fetch data for Apple Inc. with a daily interval
data = yf.download(ticker_info['ticker_symbol']
                   , interval=ticker_info['interval']
                   , start=ticker_info['start_date']
                   , end=ticker_info['end_date'])

# Display the data
data.head()

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-03,130.279999,130.899994,124.169998,125.07,124.538658,112117500
2023-01-04,126.889999,128.660004,125.080002,126.360001,125.823189,89113600
2023-01-05,127.129997,127.769997,124.760002,125.019997,124.488869,80962700
2023-01-06,126.010002,130.289993,124.889999,129.619995,129.069336,87754700
2023-01-09,130.470001,133.410004,129.889999,130.149994,129.597076,70790800


# TRANSFORM

In [10]:
def get_or_create_record(conn, search_value, table_name, entity):
    ''''
    This function ensures that a record with a specific search value exists in the database.
    If the record already exists, it returns its ID.
    If the record does not exist, it creates one and then returns the new ID.
    '''

    # --- Search for Existing Record ---
    with conn.cursor() as cur:
        # Construct and execute a SELECT query to search for an existing record
        cur.execute(f"SELECT {entity}_id FROM {schema}.{table_name} WHERE {entity}_name = '{search_value}';")
        try:
            # If a record is found, retrieve its ID
            record_id = cur.fetchone()[0]
        except TypeError:
            # If no record is found, set record_id to None
            record_id = None

    # --- Insert Record if Not Found ---
    if not record_id:
        with conn.cursor() as cur:
            # Construct and execute an INSERT query to create a new record
            cur.execute(f"INSERT INTO {schema}.{table_name} ({entity}_name) VALUES ('{search_value}');")
            # Retrieve the ID of the newly created record
            cur.execute(f"SELECT {entity}_id FROM {schema}.{table_name} WHERE {entity}_name = '{search_value}';")
            record_id = cur.fetchone()[0]
            # Commit the transaction to save the new record in the database
            conn.commit()

    # --- Return the Record ID ---
    return record_id


In [11]:
asset_id = get_or_create_record(conn, ticker_info['ticker_symbol'], "assets", "asset")
data_source_id = get_or_create_record(conn, ticker_info['data_source'], "data_sources", "source")
source_type_id = get_or_create_record(conn, ticker_info['source_type'], "source_types", "type")
print(asset_id, source_type_id, data_source_id)

4 2 3


# LOAD

In [12]:
data.reset_index(inplace=True)
data.columns = [c.lower().replace(' ', '_') for c in data.columns]
data.rename(columns={'date':'ts'}, inplace=True)
data['asset_id'] = asset_id
data['source_id'] = data_source_id

In [13]:
from psycopg2.extras import execute_values

def load_redshift(conn, table_name, dataframe):
    '''
    This function loads data from a Pandas DataFrame into a specified table in a PostgreSQL database.
    If records with the same primary key exist, the function updates them with the new values.
    '''

    # --- Convert DataFrame to List of Tuples ---
    # Convert each row of the DataFrame into a tuple and create a list of these tuples
    values = [tuple(x) for x in dataframe.to_numpy()]

    # --- Format Column Names ---
    # Construct a string of column names separated by commas
    cols = '"'+'''", "'''.join(dataframe.columns)+'"'

    # --- Prepare SQL Queries ---
    # Construct the base INSERT INTO query and append the ON CONFLICT clause
    insert_sql = f"INSERT INTO {schema}.{table_name} ({cols}) VALUES %s"
    on_conflict_sql = f"""
        ON CONFLICT (asset_id, source_id, ts)
        DO UPDATE SET
            (open, high, low, close, adj_close, volume) = 
            (EXCLUDED.open, EXCLUDED.high, EXCLUDED.low, EXCLUDED.close, EXCLUDED.adj_close, EXCLUDED.volume)
    """
    insert_sql = insert_sql + on_conflict_sql

    # --- Execute Transaction ---
    # Execute the query using execute_values for batch insertion
    with conn.cursor() as curs:
        execute_values(curs, insert_sql, values)
        conn.commit()

    print('Proceso terminado')


In [14]:
load_redshift(conn, 'ohlcv_data', data)

SyntaxError: syntax error at or near "ON"
LINE 2:         ON CONFLICT (asset_id, source_id, ts)
                ^


In [31]:
# close redshift connection
conn.close()

In [7]:
with conn.cursor() as curs:
    curs.execute("ROLLBACK")
    conn.commit()