We read each CSV file, preprocess it, and then add it to a SQLite database as its own table.
- Historical prices
- Options Data
- Portfolio holdings
- News articles

For version 1, we use only the historical price data downloaded from barchart.com.

In [1]:
# Run as is
import numpy as np
import pandas as pd
# pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 15)

In [2]:
import os
import glob

def count_files_by_extension(path):
    """
    Tally the entries of a directory by file extension.

    Args:
        path (str): The path of the directory to scan.

    Returns:
        A dict mapping each extension (with its leading dot, or '' when
        an entry has no extension) to the number of entries carrying it.
    """
    counts = {}
    for entry in os.listdir(path):
        # splitext returns (root, ext); only the extension is of interest
        _, ext = os.path.splitext(entry)
        counts[ext] = counts.get(ext, 0) + 1
    return counts

def allfilesinpath(path):
    """
    List everything found directly inside a directory.

    Args:
        path (str): The path of the directory.

    Returns:
        A list with one path per entry matched by the "<path>/*" glob
        pattern, in the order glob yields them.
    """
    return glob.glob(path + "/*")

def separate_files_by_substrings_in_path(path, substr_list):
    """
    Group the file names found in a directory by the substrings they contain.

    Args:
        path (str): The path of the directory.
        substr_list (list): Substrings to look for in each file name.

    Returns:
        A dict with one key per substring; each value is the list of file
        names (base names, not full paths) containing that substring. A
        name matching several substrings is listed under each of them.
    """
    # Reduce every globbed path to its bare file name up front
    base_names = [os.path.basename(found) for found in glob.glob(path + "/*")]

    # Every requested substring gets a bucket, even if nothing matches it
    grouped = {key: [] for key in substr_list}
    for base_name in base_names:
        for key in substr_list:
            if key in base_name:
                grouped[key].append(base_name)

    return grouped


In [3]:
# Directory holding the raw CSV exports downloaded from barchart.com
path = 'Raw Data/Barchart/'
# Quick sanity check: how many entries of each extension are in there?
count_files_by_extension(path)

{'': 1, '.csv': 10}

In [4]:
# Glob-based listing of the entries under `path` (paths include the directory)
file_paths = allfilesinpath(path)
file_paths
# file_names

['Raw Data/Barchart/spy_daily_historical-data-04-24-2023.csv',
 'Raw Data/Barchart/hyg_daily_historical-data-04-24-2023.csv',
 'Raw Data/Barchart/vnq_options-overview-history-04-24-2023.csv',
 'Raw Data/Barchart/tlt_daily_historical-data-04-24-2023.csv',
 'Raw Data/Barchart/vnq_daily_historical-data-04-24-2023.csv',
 'Raw Data/Barchart/vnq_options-overview-history-04-24-2023-2.csv',
 'Raw Data/Barchart/vnq_options-overview-history-04-24-2023-3.csv',
 'Raw Data/Barchart/lqd_daily_historical-data-04-24-2023.csv',
 'Raw Data/Barchart/spy_options-overview-history-04-24-2023 copy.csv',
 'Raw Data/Barchart/spy_options-overview-history-04-24-2023-2.csv']

In [5]:
# Alternative grouping, one bucket per ticker:
# substrings = ['spy','hyg','vnq','tlt','lqd']
# Keep only the files whose name contains 'daily'
substrings = ['daily']
# names maps each substring to the list of matching file names (no directory)
names = separate_files_by_substrings_in_path(path,substrings)
names

{'daily': ['spy_daily_historical-data-04-24-2023.csv',
  'hyg_daily_historical-data-04-24-2023.csv',
  'tlt_daily_historical-data-04-24-2023.csv',
  'vnq_daily_historical-data-04-24-2023.csv',
  'lqd_daily_historical-data-04-24-2023.csv']}

In [6]:
# Function to read the CSV file
def readcsv(file):
    """
    Load a CSV into a DataFrame with pandas' default parsing options.

    Args:
        file: Whatever is handed straight to pd.read_csv (e.g. a file path).

    Returns:
        The pandas.DataFrame produced by pd.read_csv; no columns are
        parsed as dates at this stage.
    """
    return pd.read_csv(file)

# Function to perform preliminary preprocessing of raw etf data downloaded from Barchart.com
def preprocess_historical_etf_data(historical_etf_data):
    """
    Clean a raw historical-price DataFrame into the format stored in SQLite.

    Steps:
        1. Rename "Time" -> "date", "Change" -> "daily_change",
           "%Chg" -> "perct_chg" and lower-case every column header.
        2. Drop the trailing row when it is not a data row, i.e. when its
           date cell cannot be parsed as a date (the text footer of the
           export). A frame without such a footer keeps all of its rows.
        3. Strip the '%' sign from "perct_chg" and convert it to float.
        4. Convert "date" to datetime.

    Parameters:
        historical_etf_data (pandas.DataFrame): The raw frame as read from
            the CSV. It is not modified.

    Returns:
        pandas.DataFrame: A new, cleaned frame.

    Raises:
        KeyError: If the frame has no "Time"/"date" or "%Chg"/"perct_chg"
            column.
    """
    # rename() returns a new frame, so the caller's DataFrame stays untouched
    frame = historical_etf_data.rename(
        columns={"Time": "date", "Change": "daily_change", "%Chg": "perct_chg"}
    )

    # convert all column headers to lower case
    frame.columns = frame.columns.str.lower()

    # Drop the last row only if it really is the irrelevant text footer:
    # an unparseable date cell coerces to NaT, a real trading day does not.
    if len(frame) > 0:
        last_date = pd.to_datetime(frame["date"].iloc[-1], errors="coerce")
        if pd.isna(last_date):
            # copy() so the column assignments below act on an independent frame
            frame = frame.iloc[:-1].copy()

    # extract percentage from the %Chg string column ('%' is a literal, not a regex)
    frame["perct_chg"] = (
        frame["perct_chg"].str.replace("%", "", regex=False).astype(float)
    )

    # convert the date column to a datetime object
    frame["date"] = pd.to_datetime(frame["date"])

    return frame

import sqlite3

def write_dataframe_to_sqlite(df, db_file, table_name=None):
    '''
    Write a pandas dataframe to a table in a SQLite database.

    An existing table with the same name is replaced; the dataframe index
    is not written.

    Parameters:
        df (pandas.DataFrame): The dataframe to write to the database.
        db_file (str): The name of the SQLite database file.
        table_name (str): The name of the table in the database (optional).
                          If not specified, the string stored in the
                          dataframe's `name` attribute is used.

    Raises:
        ValueError: If table_name is not given and the dataframe carries no
                    usable string `name` attribute.
    '''
    # Resolve the table name before touching the database. A plain
    # `df.name` would raise an opaque AttributeError, or return a column
    # Series if the frame happens to have a column called "name".
    if table_name is None:
        table_name = getattr(df, "name", None)
        if not isinstance(table_name, str) or not table_name:
            raise ValueError(
                "table_name was not given and the dataframe has no string "
                "'name' attribute to fall back on"
            )

    # create a connection to the database
    conn = sqlite3.connect(db_file)
    try:
        # write the dataframe to the table, replacing any previous contents
        df.to_sql(name=table_name, con=conn, if_exists='replace', index=False)
    finally:
        # close the connection even if the write fails
        conn.close()


In [7]:
# Function to create the sqlite db out of the daily historical ETF files
def create_sqlite_db(names, data_dir=None, sqlite_db_name='etf_data.db'):
    '''
    Build a SQLite database with one table per daily historical ETF file.

    Each file listed under names['daily'] is read, preprocessed and written
    to a table named after the ticker, i.e. the part of the file name before
    the first underscore (e.g. 'spy_daily_historical-data-....csv' -> 'spy').

    Parameters:
        names (dict): Mapping that holds the list of file names (without
                      directory) under the key 'daily'.
        data_dir (str): Directory containing the files (optional). If not
                        specified, the notebook-level `path` variable is used.
        sqlite_db_name (str): The SQLite database file to write to
                              (optional, defaults to 'etf_data.db').

    Returns:
        str: The name of the SQLite database file.
    '''
    if data_dir is None:
        # fall back to the directory configured at notebook level
        data_dir = path

    # For all etfs
    for name in names['daily']:
        file_name = str(name)
        # Read the file; join() works whether or not data_dir ends with a slash
        df = readcsv(os.path.join(data_dir, file_name))
        # Do some preprocessing and get the df in our required format
        df = preprocess_historical_etf_data(df)
        # The ticker is the file-name prefix, so it need not be 3 letters long
        ticker = os.path.splitext(file_name)[0].split('_', 1)[0]
        # Write the formatted df as a table to the sqlite db
        write_dataframe_to_sqlite(df, sqlite_db_name, ticker)

    return sqlite_db_name

def view_table_contents(db_file, table_name):
    '''
    Print the date range covered by a table in a SQLite database.

    The smallest and largest values of the table's `date` column are
    printed on one line, followed by a blank line.

    Parameters:
        db_file (str): The name of the SQLite database file.
        table_name (str): The name of the table to inspect. It is treated
                          as a single identifier and quoted as such.

    Raises:
        sqlite3.OperationalError: If the query fails, e.g. because the
                                  table does not exist.
    '''
    # Table names cannot be bound as SQL parameters, so quote the identifier
    # (doubling embedded quotes) instead of splicing it in raw.
    quoted_name = '"' + str(table_name).replace('"', '""') + '"'
    query = f"SELECT MIN(date), MAX(date) FROM {quoted_name}"

    # create a connection to the database
    conn = sqlite3.connect(db_file)
    try:
        # Get the first and last dates for the table
        date_range = conn.execute(query).fetchone()
    finally:
        # close the connection even if the query fails
        conn.close()

    print(f"Date range for {table_name}: {date_range[0]} - {date_range[1]}\n")



In [8]:
# Build the database; keep the returned file name so the checks below
# read from the same file that was just written
sqlite_db_name = create_sqlite_db(names)

# etf_list is also the list of table names in the sqlite db
etf_list = ['spy', 'tlt', 'hyg', 'lqd', 'vnq']
for table_name in etf_list:
    view_table_contents(sqlite_db_name, table_name)

# We now have a sqlite database

Date range for spy: 2000-01-03 00:00:00 - 2023-04-24 00:00:00

Date range for tlt: 2002-07-29 00:00:00 - 2023-04-24 00:00:00

Date range for hyg: 2007-04-12 00:00:00 - 2023-04-24 00:00:00

Date range for lqd: 2002-07-29 00:00:00 - 2023-04-24 00:00:00

Date range for vnq: 2004-09-30 00:00:00 - 2023-04-24 00:00:00



In [None]:
2023-04-24
2023-04-24
2023-04-24