In [13]:
import pandas as pd
import psycopg2

In [14]:
class DataSource:
    """Common interface for objects that fetch market data through a client.

    The client is stored as-is on the instance; concrete subclasses decide
    how to use it inside ``fetch_data``.
    """

    def __init__(self, client):
        """Keep a reference to ``client`` for use by ``fetch_data``."""
        self.client = client

    def fetch_data(self, ticker, interval, start_time=None, end_time=None):
        """Fetch data for ``ticker`` at ``interval``; must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        reason = "Subclasses should implement this method"
        raise NotImplementedError(reason)


class YFinanceSource(DataSource):
    """Data source that delegates to a client exposing a ``download`` callable.

    The notebook passes the ``yfinance`` module itself as the client.
    """

    def fetch_data(self, ticker, interval="5m", start_time=None, end_time=None):
        """Return whatever ``client.download`` yields for the given request.

        ``start_time`` and ``end_time`` are forwarded to the client as its
        ``start`` and ``end`` keyword arguments; ``interval`` defaults to "5m".
        """
        download = self.client.download
        return download(ticker, interval=interval, start=start_time, end=end_time)

# TODO: add validation methods that check the fetched data is complete:
#   - the expected number of rows was returned
#   - the expected number of columns was returned
#   - the column names match the ones the API is expected to provide

# EXTRACT

In [15]:
import yfinance as yf

# Create an instance of YFinanceSource, passing the yfinance module as the client
yfinance_source = YFinanceSource(client=yf)

# Fetch daily-interval ("1d") data for the "VOO" ticker. No start/end is given,
# so fetch_data forwards start=None and end=None to the client.
# NOTE(review): the result is named apple_data, but the ticker requested is
# "VOO", not an Apple ticker; confirm which one is intended.
apple_data = yfinance_source.fetch_data(ticker="VOO", interval="1d")

# Show the first rows of the fetched frame as plain text
print(apple_data.head())

[*********************100%%**********************]  1 of 1 completed
                  Open        High         Low       Close  Adj Close  Volume
Date                                                                         
2010-09-09  102.500000  102.500000  101.139999  101.320000  79.068352   26500
2010-09-10  101.680000  101.860001  101.300003  101.779999  79.427299    8600
2010-09-13  102.959999  103.139999  102.500000  103.059998  80.426216   33750
2010-09-14  102.839996  103.480003  102.379997  103.040001  80.410591   59400
2010-09-15  102.620003  103.379997  102.400002  103.300003  80.613510    9250


# TRANSFORM

In [None]:
class BaseLoader:
    """Common interface for objects that write data to a destination.

    ``connection_params`` is stored untouched; subclasses decide how to use
    it when opening their connection.
    """

    def __init__(self, connection_params):
        """Remember the parameters needed to reach the destination."""
        self.connection_params = connection_params

    def load_data(self, data):
        """Write ``data`` to the destination; must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        reason = "Subclasses should implement this method"
        raise NotImplementedError(reason)

import psycopg2
class RedshiftLoader(BaseLoader):
    """Loader that targets Amazon Redshift through psycopg2.

    ``connection_params`` (stored by ``BaseLoader``) is expanded as keyword
    arguments to ``psycopg2.connect``.
    """

    def load_data(self, data):
        """Load data into Amazon Redshift.

        The insert logic is not implemented yet: the method currently opens a
        connection and a cursor, does nothing with ``data``, and closes both.
        """
        conn = psycopg2.connect(**self.connection_params)
        try:
            # `with conn` only commits or rolls back the transaction; it does
            # NOT close the connection, hence the explicit close() below.
            with conn:
                with conn.cursor() as cur:
                    # TODO: implement the load. Similar to the SQLLoader but
                    # might have Redshift-specific optimizations.
                    pass
        finally:
            conn.close()


# LOAD

In [18]:
import psycopg2

# Read the Redshift password from a local file so it is not written in the notebook.
with open("pwd_redshift.txt",'r') as f:
    # Strip the line break that editors usually append to the file; left in
    # place it would become part of the password sent to the server.
    pwd= f.read().rstrip("\r\n")

connection_params = dict(
        host='data-engineer-cluster.cyhh5bfevlmn.us-east-1.redshift.amazonaws.com'
        , dbname='data-engineer-database'
        , user='juanmlacasa_coderhouse'
        , password=pwd
        , port='5439'
)

# Start from a known state: if the attempt below fails, `conn` is None rather
# than undefined or a stale connection left over from an earlier run of this cell.
conn = None
try:
    conn = psycopg2.connect(
        **connection_params
    )
    print("Connected to Redshift successfully!")

except Exception as e:
    # Best-effort: report the failure and keep the notebook running.
    print("Unable to connect to Redshift.")
    print(e)

Connected to Redshift successfully!


In [None]:
from psycopg2.extras import execute_values

def load_redshift(conn, table_name, dataframe):
    """Create ``table_name`` if it does not exist and insert every row of ``dataframe``.

    Parameters
    ----------
    conn
        Open database connection exposing ``cursor()``, ``commit()`` and
        ``rollback()`` (a psycopg2 connection in this notebook).
    table_name : str
        Target table. It is interpolated into the SQL text verbatim, so it
        must come from trusted code, never from user input.
    dataframe : pandas.DataFrame
        Rows to insert. Only the columns are loaded; the index is not.

    Raises
    ------
    KeyError
        If a column dtype has no entry in the dtype-to-SQL mapping below.
    Exception
        Any error raised while creating the table or inserting is re-raised
        after the transaction has been rolled back.
    """
    type_map = {'int64': 'INT','int32': 'INT','float64': 'FLOAT','object': 'VARCHAR(50)','bool':'BOOLEAN'}
    # Double-quote column names so that names containing spaces or other
    # special characters (e.g. "Adj Close") still produce valid SQL.
    cols = ['"' + str(col).replace('"', '""') + '"' for col in dataframe.columns]
    sql_dtypes = [type_map[str(dtype)] for dtype in dataframe.dtypes]
    # Define variable SQL data_types
    column_defs = [f"{name} {data_type}" for name, data_type in zip(cols, sql_dtypes)]
    # Combine column definitions into the CREATE TABLE statement
    table_schema = f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            {', '.join(column_defs)}
        );
        """
    # define INSERT INTO statement
    insert_sql = f"INSERT INTO {table_name} ({', '.join(cols)}) VALUES %s"
    # Build the rows column by column: this yields plain Python scalars and
    # keeps each column's own type, instead of the single coerced numpy dtype
    # (and numpy scalar objects) that DataFrame.to_numpy() returns.
    values = list(dataframe.itertuples(index=False, name=None))
    try:
        with conn.cursor() as cur:
            # Create the table (no-op when it already exists), then insert.
            cur.execute(table_schema)
            execute_values(cur, insert_sql, values)
        # Make the table creation and the inserted rows permanent together.
        conn.commit()
    except Exception:
        # Leave the connection usable for later cells instead of stuck in a
        # failed transaction.
        conn.rollback()
        raise
    print('Proceso terminado')

In [None]:
# Load the frame into Redshift with `load_redshift`, the loader this notebook
# defines; the previous call used the name `cargar_en_redshift`, which is not
# defined.
# NOTE(review): `iot` is not created in any cell visible here; confirm it is
# defined before this cell runs.
load_redshift(conn=conn, table_name='interest_over_time', dataframe=iot)