In [2]:
import pandas as pd
import psycopg2

In [3]:
class DataSource:
    """Base class for data sources that wrap a client object.

    Subclasses are expected to override ``fetch_data``; the implementation
    here always raises.
    """

    def __init__(self, client):
        """Keep a reference to ``client`` for subclasses to use."""
        self.client = client

    def fetch_data(self, ticker, interval, start_time=None, end_time=None):
        """Fetch data for ``ticker`` at ``interval`` (abstract).

        Raises:
            NotImplementedError: Always, in this base class.
        """
        message = "Subclasses should implement this method"
        raise NotImplementedError(message)


class YFinanceSource(DataSource):
    """Data source that delegates to the ``download`` function of its client."""

    def fetch_data(self, ticker, interval="5m", start_time=None, end_time=None):
        """Download data for ``ticker`` through ``self.client.download``.

        ``start_time`` and ``end_time`` are forwarded as the client's
        ``start`` and ``end`` keyword arguments; ``interval`` is forwarded
        unchanged. Returns whatever the client returns.
        """
        request = {"interval": interval, "start": start_time, "end": end_time}
        return self.client.download(ticker, **request)

# CONNECT REDSHIFT

In [8]:
import psycopg2

# Read the Redshift password from a local file so it is not hard-coded in the
# notebook. Trailing line terminators are dropped: a file that ends with a
# newline would otherwise have that newline sent as part of the password.
with open("pwd_redshift.txt", 'r') as f:
    pwd = f.read().rstrip("\r\n")

# Keyword arguments handed to psycopg2.connect below.
connection_params = dict(
    host='data-engineer-cluster.cyhh5bfevlmn.us-east-1.redshift.amazonaws.com',
    dbname='data-engineer-database',
    user='juanmlacasa_coderhouse',
    password=pwd,
    port='5439',
)

try:
    conn = psycopg2.connect(**connection_params)
    print("Connected to Redshift successfully!")
except Exception as e:
    # NOTE: the failure is only reported, not re-raised, so `conn` stays
    # undefined and every later cell that uses it will fail with NameError.
    print("Unable to connect to Redshift.")
    print(e)

Connected to Redshift successfully!


# EXTRACT

In [50]:
import yfinance as yf

# Wrap the yfinance module in the YFinanceSource adapter defined above.
yfinance_source = YFinanceSource(yf)

# Daily bars for Apple Inc.; start_time/end_time are left at their None defaults.
data = yfinance_source.fetch_data("AAPL", interval="1d")

# Show the first rows as a quick sanity check.
print(data.head())

[*********************100%%**********************]  1 of 1 completed
                Open      High       Low     Close  Adj Close     Volume
Date                                                                    
1980-12-12  0.128348  0.128906  0.128348  0.128348   0.099449  469033600
1980-12-15  0.122210  0.122210  0.121652  0.121652   0.094261  175884800
1980-12-16  0.113281  0.113281  0.112723  0.112723   0.087343  105728000
1980-12-17  0.115513  0.116071  0.115513  0.115513   0.089504   86441600
1980-12-18  0.118862  0.119420  0.118862  0.118862   0.092099   73449600


# TRANSFORM

In [43]:
def get_or_create_record(conn, search_value, table_name, entity):
    """Return the id of the row named ``search_value``, inserting it if absent.

    Looks up ``{entity}_id`` in ``table_name`` where ``{entity}_name`` equals
    ``search_value``. When no row matches, a new row is inserted and the same
    lookup is repeated to obtain its id.

    Args:
        conn: Open DB-API connection whose cursors work as context managers
            and accept ``%s`` placeholders (psycopg2 in this notebook).
        search_value: Name to look up; always sent as a bound query parameter.
        table_name: Target table. Interpolated into the SQL text, so it must
            come from trusted code, never from user input.
        entity: Column prefix, e.g. ``"asset"`` for ``asset_id`` /
            ``asset_name``. Interpolated into the SQL text like ``table_name``.

    Returns:
        The value of the ``{entity}_id`` column for the matching row.

    Raises:
        RuntimeError: If the row cannot be read back after the insert.

    Note:
        Nothing is committed here; the caller has to commit for a newly
        inserted row to persist.
    """
    select_sql = f"SELECT {entity}_id FROM {table_name} WHERE {entity}_name = %s;"
    insert_sql = f"INSERT INTO {table_name} ({entity}_name) VALUES (%s);"
    # Bind the value instead of formatting it into the statement: this avoids
    # SQL injection and keeps names containing quotes working.
    params = (search_value,)

    with conn.cursor() as cur:
        cur.execute(select_sql, params)
        row = cur.fetchone()
        # Compare with None explicitly so a legitimate id of 0 is not
        # mistaken for "not found".
        if row is not None and row[0] is not None:
            return row[0]

        cur.execute(insert_sql, params)
        # Read the id back by name rather than via MAX(id): MAX only matches
        # the new row if ids are strictly increasing and nobody else inserted
        # in the meantime.
        cur.execute(select_sql, params)
        row = cur.fetchone()

    if row is None or row[0] is None:
        raise RuntimeError(
            f"Could not read back {entity}_id for {search_value!r} from {table_name}"
        )
    return row[0]

In [49]:
# Look up (or create) the reference rows; asset_id and data_source_id are
# attached to the price rows further down.
asset_id = get_or_create_record(
    conn, search_value="AAPL", table_name="assets", entity="asset"
)
source_type_id = get_or_create_record(
    conn, search_value="Platform", table_name="source_types", entity="type"
)
data_source_id = get_or_create_record(
    conn, search_value="Yahoo Finance", table_name="data_sources", entity="source"
)
print(asset_id, source_type_id, data_source_id)

1 2 1


# LOAD

In [54]:
# Move the date index into a regular column. The guard keeps this cell
# idempotent: once the frame already has a plain RangeIndex, resetting it
# again would add a stray "index" column that would then be sent to the
# INSERT.
if not isinstance(data.index, pd.RangeIndex):
    data = data.reset_index()
# Normalise headers to snake_case, e.g. "Adj Close" -> "adj_close".
data.columns = [c.lower().replace(' ', '_') for c in data.columns]
# The former index column ("date") is loaded as "timestamp".
data = data.rename(columns={'date': 'timestamp'})
# Attach the ids resolved earlier to every row.
data['asset_id'] = asset_id
data['source_id'] = data_source_id

In [92]:
from psycopg2.extras import execute_values

def load_redshift(conn, table_name, dataframe):
    """Bulk-insert every row of ``dataframe`` into ``table_name``.

    The INSERT column list is taken from ``dataframe.columns`` (each name
    double-quoted), so the frame's headers must match the target table's
    columns.

    Args:
        conn: Open psycopg2 connection. It is expected to be in psycopg2's
            default transactional (non-autocommit) mode so that the whole
            load is committed or rolled back as one unit.
        table_name: Target table. Interpolated into the SQL text, so it must
            come from trusted code.
        dataframe: pandas DataFrame holding the rows to insert.

    Raises:
        Exception: Any error raised while inserting is re-raised after the
            transaction has been rolled back.
    """
    values = [tuple(row) for row in dataframe.to_numpy()]
    # Use the frame that was passed in, not a notebook-level global, so the
    # column list always matches the values being inserted.
    cols = ", ".join(f'"{col}"' for col in dataframe.columns)
    insert_sql = f"INSERT INTO {table_name} ({cols}) VALUES %s"
    try:
        with conn.cursor() as curs:
            execute_values(curs, insert_sql, values)
        conn.commit()
    except Exception:
        # Leave the connection usable instead of stuck in an aborted
        # transaction that needs a manual ROLLBACK.
        conn.rollback()
        raise
    print('Proceso terminado')

'INSERT INTO ohlcv_data ("timestamp", "open", "high", "low", "close", "adj_close", "volume", "asset_id", "source_id") VALUES %s'

In [94]:
# Recovery cell: sends a raw ROLLBACK statement over the connection and then
# calls conn.commit().
# NOTE(review): presumably here to clear a transaction left open or aborted by
# an earlier failed load attempt -- confirm. Rolling back also discards any
# uncommitted work on this connection.
with conn.cursor() as curs:
    curs.execute("ROLLBACK")
    conn.commit()

In [95]:
# Insert the transformed price rows into the ohlcv_data table.
load_redshift(conn, table_name='ohlcv_data', dataframe=data)

Proceso terminado
