In [13]:
import pandas as pd
import psycopg2

In [33]:
class DataSource:
    """Base class for data sources run as a fetch -> transform -> validate pipeline.

    The three step methods defined here only raise ``NotImplementedError``;
    subclasses are expected to override them. ``pipeline`` records the request
    parameters and then calls the three steps in order.

    Attributes:
        client: The object handed to ``__init__``, stored unchanged.
        column_mapping: ``None`` in the base class.
        data: ``None`` until a step method assigns something to it.
        ticker_info: Dict with the keys ``ticker_symbol``, ``interval``,
            ``start_date`` and ``end_date`` (all ``None`` initially).
    """

    def __init__(self, client):
        self.client = client
        self.column_mapping = None
        self.data = None
        self.ticker_info = {
            'ticker_symbol': None,
            'interval': None,
            'start_date': None,
            'end_date': None,
        }

    def __str__(self):
        """Return a short text form showing the client and the last request parameters."""
        return f"DataSource({self.client}), {self.ticker_info}"

    def fetch_data(self, ticker, interval, start_time=None, end_time=None):
        """Step 1 of the pipeline; must be overridden by subclasses."""
        raise NotImplementedError("Subclasses should implement this method")

    def transform_data(self):
        """Step 2 of the pipeline; must be overridden by subclasses."""
        raise NotImplementedError("Subclasses should implement the 'transform_data' method")

    def validate_data(self):
        """Step 3 of the pipeline; must be overridden by subclasses."""
        raise NotImplementedError("Subclasses should implement the 'validate_data' method")

    def pipeline(self, ticker, interval, start_time=None, end_time=None):
        """Record the request in ``ticker_info``, run the three steps, return ``self.data``.

        Args:
            ticker: Stored under ``ticker_info['ticker_symbol']`` and passed to ``fetch_data``.
            interval: Stored under ``ticker_info['interval']`` and passed to ``fetch_data``.
            start_time: Stored under ``ticker_info['start_date']`` and passed to ``fetch_data``.
            end_time: Stored under ``ticker_info['end_date']`` and passed to ``fetch_data``.

        Returns:
            Whatever ``self.data`` holds after ``validate_data`` has run.
        """
        # Remember what was requested before any step runs.
        self.ticker_info.update(
            ticker_symbol=ticker,
            interval=interval,
            start_date=start_time,
            end_date=end_time,
        )

        self.fetch_data(ticker, interval, start_time, end_time)
        self.transform_data()
        self.validate_data()
        return self.data


class YFinanceSource(DataSource):
    """DataSource that downloads price bars through the ``yfinance`` module.

    ``column_mapping`` translates the column names of the downloaded frame
    into the lower-case names used downstream (``ts``, ``open``, ``high``,
    ``low``, ``close``, ``adj_close``, ``volume``).
    """

    def __init__(self):
        # Imported here, so yfinance is only required once an instance is created.
        import yfinance as yf
        super().__init__(yf)
        self.column_mapping = {
            # yfinance names the index 'Date' for daily-or-longer bars and
            # 'Datetime' for intraday bars (e.g. the "5m" default of
            # fetch_data); both must end up as 'ts' after reset_index().
            'Date': 'ts',
            'Datetime': 'ts',
            'Open': 'open',
            'High': 'high',
            'Low': 'low',
            'Close': 'close',
            'Adj Close': 'adj_close',
            'Volume': 'volume'
        }

    def fetch_data(self, ticker, interval="5m", start_time=None, end_time=None):
        """Download bars for ``ticker`` and keep them in ``self.data``.

        Args:
            ticker: Symbol passed to ``yfinance.download``.
            interval: Bar interval passed to ``yfinance.download`` (default ``"5m"``).
            start_time: Passed as ``start``; may be ``None``.
            end_time: Passed as ``end``; may be ``None``.

        Returns:
            The object returned by ``yfinance.download`` (also stored in ``self.data``).
        """
        # Recorded here as well as in pipeline() so that a direct call to
        # fetch_data() leaves ticker_info describing the last request.
        self.ticker_info['ticker_symbol'] = ticker
        self.ticker_info['interval'] = interval
        self.ticker_info['start_date'] = start_time
        self.ticker_info['end_date'] = end_time

        # The client is the yfinance module itself.
        self.data = self.client.download(ticker, interval=interval, start=start_time, end=end_time)
        return self.data

    def transform_data(self):
        """Move the index into a column and rename columns via ``column_mapping``.

        Returns:
            The transformed frame (also stored in ``self.data``).
        """
        self.data = self.data.reset_index()
        self.data = self.data.rename(columns=self.column_mapping)
        return self.data

    def validate_data(self):
        """Validation hook; currently performs no checks."""
        # TODO: implement the row/column checks planned for this source.

    



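# TODO: validation checks still to be written (e.g. in validate_data):
#   - all of the requested data was returned
#   - the number of rows matches what is expected
#   - the number of columns matches what is expected
#   - the columns returned by the API are the expected ones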

In [40]:
# Create the yfinance-backed source; printing it shows the client module and
# the ticker_info dict, whose values are all None before any request is made.
yfin = YFinanceSource()
print(yfin)

DataSource(<module 'yfinance' from 'c:\\Users\\juanm\\anaconda3\\envs\\coder-de\\Lib\\site-packages\\yfinance\\__init__.py'>), {'ticker_symbol': None, 'interval': None, 'start_date': None, 'end_date': None}


In [37]:
# Run fetch -> transform -> validate for AAPL with interval "1D" and the given
# start/end dates; the returned frame is displayed as the cell output.
yfin.pipeline("AAPL", "1D", "2021-01-01", "2021-01-31")

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0,ts,open,high,low,close,adj_close,volume
0,2021-01-04,133.520004,133.610001,126.760002,129.410004,127.331688,143301900
1,2021-01-05,128.889999,131.740005,128.429993,131.009995,128.905975,97664900
2,2021-01-06,127.720001,131.050003,126.379997,126.599998,124.566811,155088000
3,2021-01-07,128.360001,131.630005,127.860001,130.919998,128.817429,109578200
4,2021-01-08,132.429993,132.630005,130.229996,132.050003,129.929291,105158200
5,2021-01-11,129.190002,130.169998,128.5,128.979996,126.908577,100384500
6,2021-01-12,128.5,129.690002,126.860001,128.800003,126.731491,91951100
7,2021-01-13,128.759995,131.449997,128.490005,130.889999,128.787903,88636800
8,2021-01-14,130.800003,131.0,128.759995,128.910004,126.839722,90221800
9,2021-01-15,128.779999,130.220001,127.0,127.139999,125.09816,111598500


In [21]:
# NOTE(review): same call as the previous cell. The TypeError recorded in this
# cell's output presumably comes from an earlier definition of validate_data
# that was missing `self` — re-run after Restart & Run All to confirm.
yfin.pipeline("AAPL", "1D", "2021-01-01", "2021-01-31")

[*********************100%%**********************]  1 of 1 completed


TypeError: YFinanceSource.validate_data() takes 0 positional arguments but 1 was given

# EXTRACT

In [15]:
import yfinance as yf

# Create an instance of YFinanceSource. Its __init__ takes no arguments (it
# passes the yfinance module to the base class itself), so no client is given.
yfinance_source = YFinanceSource()

# Fetch data for the VOO ticker with a daily interval (start/end left as None).
# NOTE(review): the variable is still called `apple_data` although it holds VOO bars.
apple_data = yfinance_source.fetch_data(ticker="VOO", interval="1d")

# Display the first rows of the downloaded frame
print(apple_data.head())

[*********************100%%**********************]  1 of 1 completed
                  Open        High         Low       Close  Adj Close  Volume
Date                                                                         
2010-09-09  102.500000  102.500000  101.139999  101.320000  79.068352   26500
2010-09-10  101.680000  101.860001  101.300003  101.779999  79.427299    8600
2010-09-13  102.959999  103.139999  102.500000  103.059998  80.426216   33750
2010-09-14  102.839996  103.480003  102.379997  103.040001  80.410591   59400
2010-09-15  102.620003  103.379997  102.400002  103.300003  80.613510    9250


# TRANSFORM

In [None]:
class BaseLoader:
    """Base class for loaders; keeps the connection parameters for subclasses to use."""

    def __init__(self, connection_params):
        # Stored as given; this class does not open any connection itself.
        self.connection_params = connection_params

    def load_data(self, data):
        """Load ``data`` into the destination; subclasses must override this."""
        message = "Subclasses should implement this method"
        raise NotImplementedError(message)

import psycopg2
class RedshiftLoader(BaseLoader):
    """Loader that connects to Amazon Redshift through psycopg2."""

    def load_data(self, data):
        """Load data into Amazon Redshift.

        Opens a connection from ``self.connection_params`` and a cursor on it.
        No statement is executed yet, so ``data`` is currently unused.

        Raises:
            psycopg2.Error: if the connection cannot be established.
        """
        conn = psycopg2.connect(**self.connection_params)
        try:
            # `with conn` only ends the transaction (commit on success,
            # rollback on error); it does not close the connection, hence the
            # explicit close() in the finally clause.
            with conn:
                with conn.cursor() as cur:
                    # Similar to the SQLLoader but might have Redshift-specific optimizations
                    # TODO: write `data` to the destination table.
                    pass
        finally:
            conn.close()


# LOAD

In [18]:
import psycopg2

# Read the Redshift password from a local file so it is not stored in the notebook.
with open("pwd_redshift.txt", 'r') as f:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise be sent as part of the password.
    pwd = f.read().strip()

connection_params = dict(
        host='data-engineer-cluster.cyhh5bfevlmn.us-east-1.redshift.amazonaws.com'
        , dbname='data-engineer-database'
        , user='juanmlacasa_coderhouse'
        , password=pwd
        , port='5439'
)

try:
    conn = psycopg2.connect(
        **connection_params
    )
    print("Connected to Redshift successfully!")

except Exception as e:
    print("Unable to connect to Redshift.")
    print(e)
    # Re-raise after reporting: otherwise `conn` is left undefined (or stale
    # from an earlier run) and later cells fail in a confusing way.
    raise

Connected to Redshift successfully!


In [None]:
from psycopg2.extras import execute_values

def _redshift_sql_type(dtype):
    """Return the Redshift column type used for a pandas/numpy ``dtype``."""
    types = pd.api.types
    if types.is_bool_dtype(dtype):
        return 'BOOLEAN'
    if types.is_integer_dtype(dtype):
        # INT is 32-bit in Redshift; wider pandas integers need BIGINT.
        return 'BIGINT' if dtype.itemsize > 4 else 'INT'
    if types.is_float_dtype(dtype):
        return 'FLOAT'
    if isinstance(dtype, pd.DatetimeTZDtype):
        return 'TIMESTAMPTZ'
    if types.is_datetime64_any_dtype(dtype):
        return 'TIMESTAMP'
    if types.is_object_dtype(dtype) or types.is_string_dtype(dtype):
        return 'VARCHAR(50)'
    raise KeyError(f"No Redshift type mapping for pandas dtype {dtype!s}")


def load_redshift(conn, table_name, dataframe):
    """Create ``table_name`` if it does not exist and insert ``dataframe`` into it.

    The column list of the CREATE TABLE statement is derived from the frame's
    dtypes (bool, integer, float, datetime and object/string columns are
    supported). The CREATE TABLE and the INSERT run in a single transaction on
    a connection in psycopg2's default (non-autocommit) mode.

    Args:
        conn: Open psycopg2 connection.
        table_name: Target table name; interpolated into the SQL text as-is,
            so it must come from trusted code, not from user input.
        dataframe: pandas DataFrame whose columns become the table columns.

    Raises:
        KeyError: if a column has a dtype with no Redshift mapping.
        Exception: any database error is re-raised after a rollback.
    """
    columns = [str(col) for col in dataframe.columns]

    # Define the SQL column definitions from the frame's dtypes
    column_defs = [
        f"{name} {_redshift_sql_type(dtype)}"
        for name, dtype in zip(columns, dataframe.dtypes)
    ]
    # Combine column definitions into the CREATE TABLE statement
    table_schema = f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            {', '.join(column_defs)}
        );
        """
    # define INSERT INTO statement
    insert_sql = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES %s"

    # itertuples() yields native Python scalars (and pandas Timestamps), which
    # psycopg2 can adapt; rows taken from to_numpy() may carry numpy integer
    # scalars, which it cannot.
    rows = list(dataframe.itertuples(index=False, name=None))

    try:
        # psycopg2 opens the transaction implicitly on the first execute(), so
        # no explicit BEGIN is needed; the cursor is closed by the with-block.
        with conn.cursor() as cur:
            cur.execute(table_schema)
            if rows:
                execute_values(cur, insert_sql, rows)
        conn.commit()
    except Exception:
        # Leave the connection usable instead of stuck in an aborted transaction.
        conn.rollback()
        raise
    print('Proceso terminado')

In [None]:
from psycopg2.extras import execute_values

def load_postgresql(conn, table_name, dataframe, schema=None):
    '''
    Load a pandas DataFrame into a PostgreSQL table, updating rows that already exist.

    Rows are inserted in batches with execute_values. When a row conflicts on
    (asset_id, source_id, ts), its open, high, low, close, adj_close and volume
    values are overwritten with the incoming ones.

    Args:
        conn: Open psycopg2 connection.
        table_name: Name of the target table (interpolated into the SQL as-is).
        dataframe: pandas DataFrame; its column names are used, double-quoted,
            as the INSERT column list.
        schema: Optional schema name. When given, the target is
            "<schema>.<table_name>"; when omitted the table name is used
            unqualified.

    Raises:
        Exception: any database error is re-raised after a rollback.
    '''

    # --- Convert DataFrame to List of Tuples ---
    # itertuples() yields native Python scalars (and pandas Timestamps), which
    # psycopg2 can adapt; rows taken from to_numpy() may carry numpy integers.
    values = list(dataframe.itertuples(index=False, name=None))

    # --- Format Column Names ---
    # Double-quoted column names separated by commas
    cols = ', '.join(f'"{col}"' for col in dataframe.columns)

    # --- Prepare SQL Queries ---
    # `schema` used to be read from an undefined name; it is now a parameter.
    target = f"{schema}.{table_name}" if schema else table_name
    insert_sql = f"INSERT INTO {target} ({cols}) VALUES %s"
    on_conflict_sql = """
        ON CONFLICT (asset_id, source_id, ts)
        DO UPDATE SET
            (open, high, low, close, adj_close, volume) =
            (EXCLUDED.open, EXCLUDED.high, EXCLUDED.low, EXCLUDED.close, EXCLUDED.adj_close, EXCLUDED.volume)
    """
    insert_sql = insert_sql + on_conflict_sql

    # --- Execute Transaction ---
    # Batch-insert, commit on success, roll back on any failure so the
    # connection is not left in an aborted transaction.
    try:
        with conn.cursor() as curs:
            execute_values(curs, insert_sql, values)
        conn.commit()
    except Exception:
        conn.rollback()
        raise

    print('Finished loading data into PostgreSQL.')


In [None]:
# Load the `iot` frame into the Redshift table 'interest_over_time' using
# load_redshift, which takes the same keyword arguments as the previously
# called (and no longer defined) cargar_en_redshift.
# NOTE(review): `iot` is not created in any visible cell — confirm it exists
# before this cell runs.
load_redshift(conn=conn, table_name='interest_over_time', dataframe=iot)