# Investments

This will be where we get all the data for our investments.

We will then run that data through our ETL pipeline until it is ready to be loaded into SQL Server for analysis

Brief explanation of pipeline:

1. Start by picking our stocks, etf, and forex, as well as an index that can look at the overall market

2. Extract the data by requesting it from polygon.io

3. Clean data (removing nulls etc)

4. Aggregate and add metrics that will be important to track

5. Format data into star schema

6. Load into SQL Server for analysis


In [6]:
# Picking our stocks, ETF, and forex
stock1 = {"ticker": "NVDA", "name": "Nvidia"}
stock2 = {"ticker": "AMZN", "name": "Amazon"}
stock3 = {"ticker": "JPM", "name": "JPMorgan"}
etf = {"ticker": "SPY", "name": "SPY"}
forex = {"ticker": "C:USDCAD", "name": "USDCAD"}

# Our way to look at the overall market
market = {"ticker": "I:NDX", "name": "NDX"}

# List for easy access
investments = [stock1, stock2, stock3, etf, forex, market]

In [7]:
import requests
import os
import csv
import pandas as pd
import numpy as np
from dotenv import load_dotenv

In [8]:
load_dotenv()
api_key = os.environ["api_key"]
params = {"apiKey": api_key}
base_url = "https://api.polygon.io"

In [9]:
# Creating a request to get information on each investment every day from 7/1/23 to 7/1/24

for investment in investments:
    response = requests.get(
        f"{base_url}/v2/aggs/ticker/{investment["ticker"]}/range/1/day/2023-07-01/2024-07-01?adjusted=true&sort=asc",
        params,
    )
    investment["data"] = response.json()["results"]
    print(investment)

# # Saving our data to a csv file so we don't have to keep requesting it
for investment in investments:
    keys = investment["data"][0].keys()
    with open(f"./Data/Uncleaned/{investment["name"]}.csv", "w", newline="") as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(investment["data"])

{'ticker': 'NVDA', 'name': 'Nvidia', 'data': [{'v': 198209190.0, 'vw': 42.5106, 'o': 42.517, 'c': 42.413, 'h': 42.898, 'l': 42.202, 't': 1688356800000, 'n': 323138}, {'v': 323617560.0, 'vw': 42.6645, 'o': 42.135, 'c': 42.317, 'h': 43.177, 'l': 42.085, 't': 1688529600000, 'n': 468057}, {'v': 304138790.0, 'vw': 41.8158, 'o': 41.844, 'c': 42.103, 'h': 42.179, 'l': 41.346, 't': 1688616000000, 'n': 448028}, {'v': 355851470.0, 'vw': 42.7642, 'o': 42.322, 'c': 42.503, 'h': 43.214, 'l': 42.18, 't': 1688702400000, 'n': 465971}, {'v': 353794110.0, 'vw': 42.1307, 'o': 42.657, 'c': 42.18, 'h': 42.811, 'l': 41.649, 't': 1688961600000, 'n': 501330}, {'v': 296815060.0, 'vw': 42.3753, 'o': 42.481, 'c': 42.405, 'h': 42.758, 'l': 42.067, 't': 1689048000000, 'n': 410728}, {'v': 481277020.0, 'vw': 43.5979, 'o': 43.033, 'c': 43.902, 'h': 43.935, 'l': 42.7773, 't': 1689134400000, 'n': 632871}, {'v': 477883890.0, 'vw': 45.1937, 'o': 44.518, 'c': 45.977, 'h': 46.155, 'l': 44.492, 't': 1689220800000, 'n': 6786

In [10]:
# Creating our Pandas dataframes from our saved csv file
for investment in investments:
    investment["df"] = pd.read_csv(f"./Data/Uncleaned/{investment["name"]}.csv")
    print(f"name: {investment["name"]} length: {len(investment["df"])}")

name: Nvidia length: 251
name: Amazon length: 251
name: JPMorgan length: 251
name: SPY length: 251
name: USDCAD length: 318
name: NDX length: 253


## Now that we have all the data in a pandas dataframe we can start with data cleaning

In [11]:
df_list = [stock1["df"], stock2["df"], stock3["df"], etf["df"], forex["df"], market["df"]]

# Check for null values
for df in df_list:
    print(f"Number of missing values: {df.isnull().values.sum()}")

Number of missing values: 0
Number of missing values: 0
Number of missing values: 0
Number of missing values: 0
Number of missing values: 0
Number of missing values: 0


In [12]:
# Remove duplicate rows if any
for df in df_list:
    df.drop_duplicates(inplace=True)

In [13]:
# Renaming columns to something we can understand

# Dictionary to understand what the columns will mean
keys = {
    "v": "volume",
    "vw": "volume_weighted_average",
    "o": "open_price",
    "c": "close_price",
    "h": "highest_price",
    "l": "lowest_price",
    "t": "timestamp",
    "n": "num_transactions",
}

for df in df_list:
    df.rename(columns=keys, inplace=True)

# Drop columns that we probably won't use and change volume dtype to int
for df in df_list:
    if "volume" in df:
        df["volume"] = df["volume"].astype(int)
    if "num_transactions" in df:
        df.drop(columns=["num_transactions"], inplace=True)


stock1["df"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   volume                   251 non-null    int64  
 1   volume_weighted_average  251 non-null    float64
 2   open_price               251 non-null    float64
 3   close_price              251 non-null    float64
 4   highest_price            251 non-null    float64
 5   lowest_price             251 non-null    float64
 6   timestamp                251 non-null    int64  
dtypes: float64(5), int64(2)
memory usage: 13.9 KB


In [14]:
# Converting the timestamp into a readable date
for df in df_list:
    df["date"] = pd.to_datetime(df["timestamp"], unit="ms").apply(
        lambda dt: dt.replace(hour=0, minute=0, second=0)
    )
    df.drop(columns=["timestamp"], inplace=True)
stock1["df"]

Unnamed: 0,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,date
0,198209190,42.5106,42.517,42.413,42.898,42.202,2023-07-03
1,323617560,42.6645,42.135,42.317,43.177,42.085,2023-07-05
2,304138790,41.8158,41.844,42.103,42.179,41.346,2023-07-06
3,355851470,42.7642,42.322,42.503,43.214,42.180,2023-07-07
4,353794110,42.1307,42.657,42.180,42.811,41.649,2023-07-10
...,...,...,...,...,...,...,...
246,411392492,123.5418,121.200,126.090,126.500,119.320,2024-06-25
247,355252504,125.0539,126.130,126.400,128.120,122.600,2024-06-26
248,247638444,124.3026,124.100,123.990,126.410,122.920,2024-06-27
249,309719295,124.8037,124.580,123.540,127.710,122.750,2024-06-28


### There are missing days in the data

In [15]:
# Filling in the missing data.

# Creating a date range for the year of 2023
date_range = pd.date_range(
    start="2023-01-01 05:00:00", end="2023-12-31 05:00:00", freq="D"
)

# Temporary set the date as the index
for df in df_list:
    df.set_index("date", inplace=True)

# Reindex our df based on the date_range we set up and fill in the missing data using the "nearest" method
# stock1["df"] = stock1["df"].reindex(date_range, method="nearest")
# stock2["df"] = stock2["df"].reindex(date_range, method="nearest")
# stock3["df"] = stock3["df"].reindex(date_range, method="nearest")
# etf["df"] = etf["df"].reindex(date_range, method="nearest")
# forex["df"] = forex["df"].reindex(date_range, method="nearest")

#  Reset our index back to normal
# stock1["df"].reset_index(names="date", inplace=True)
# stock2["df"].reset_index(names="date", inplace=True)
# stock3["df"].reset_index(names="date", inplace=True)
# etf["df"].reset_index(names="date", inplace=True)
# forex["df"].reset_index(names="date", inplace=True)

# df_list = [stock1["df"], stock2["df"], stock3["df"], etf["df"], forex["df"], market["df"]]
stock1["df"]

Unnamed: 0_level_0,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-07-03,198209190,42.5106,42.517,42.413,42.898,42.202
2023-07-05,323617560,42.6645,42.135,42.317,43.177,42.085
2023-07-06,304138790,41.8158,41.844,42.103,42.179,41.346
2023-07-07,355851470,42.7642,42.322,42.503,43.214,42.180
2023-07-10,353794110,42.1307,42.657,42.180,42.811,41.649
...,...,...,...,...,...,...
2024-06-25,411392492,123.5418,121.200,126.090,126.500,119.320
2024-06-26,355252504,125.0539,126.130,126.400,128.120,122.600
2024-06-27,247638444,124.3026,124.100,123.990,126.410,122.920
2024-06-28,309719295,124.8037,124.580,123.540,127.710,122.750


## Now that our data is clean and complete we can start adding in other features we want to track

In [16]:
# Create a column for the 10 and 100 day moving averages
for df in df_list:
    df["10_day_ma"] = df["close_price"].rolling(window="10D",).mean()
    df["100_day_ma"] = df["close_price"].rolling(window="100D").mean()

In [17]:
# Create a column that will track our daily return by finding the difference between the previous day's closing price
for df in df_list:
    df["daily_return"] = df["close_price"].pct_change()
    df.fillna(0, inplace=True)
    df["cumulative_return"] = (1 + df["daily_return"]).cumprod() - 1

In [18]:
# Calculating Volatility over a 30 days window
for df in df_list:
    df["volatility"] = df["daily_return"].rolling(window="30D").std() * (252**0.5)

In [19]:
# Calculating Sharpe Ratio 
for df in df_list:
    avg_return = df["daily_return"].mean()
    excess_return = avg_return - ((1 + 0.03)**(1/252) - 1)
    sharpe_ratio = excess_return / df["volatility"]
    df["sharpe_ratio"] = sharpe_ratio
    df.fillna(0, inplace=True)

In [20]:
# Reset the index so it doesn't use the date anymore
for df in df_list:
    df.reset_index(names="date", inplace=True)

In [21]:
# Calculating Beta
for df in df_list:
    merge_df = pd.merge(
        df[["date", "daily_return"]],
        market["df"][["date", "daily_return"]],
        on="date",
        suffixes=("_stock", "_market"),
    )
    # Calculate covariance matrix
    covariance_matrix = np.cov(
        merge_df["daily_return_stock"], merge_df["daily_return_market"]
    )

    # Extract covariance of the asset with the market
    covariance_stock_market = covariance_matrix[0, 1]

    # Calculate variance of the market
    variance_market = np.var(merge_df["daily_return_market"])

    df["beta"] = covariance_stock_market / variance_market

## Checking our data

In [22]:
print(stock1["name"])
stock1["df"]

Nvidia


Unnamed: 0,date,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,10_day_ma,100_day_ma,daily_return,cumulative_return,volatility,sharpe_ratio,beta
0,2023-07-03,198209190,42.5106,42.517,42.413,42.898,42.202,42.413000,42.413000,0.000000,0.000000,0.000000,0.000000,2.016622
1,2023-07-05,323617560,42.6645,42.135,42.317,43.177,42.085,42.365000,42.365000,-0.002263,-0.002263,0.025407,0.179885,2.016622
2,2023-07-06,304138790,41.8158,41.844,42.103,42.179,41.346,42.277667,42.277667,-0.005057,-0.007309,0.040213,0.113655,2.016622
3,2023-07-07,355851470,42.7642,42.322,42.503,43.214,42.180,42.334000,42.334000,0.009501,0.002122,0.100302,0.045566,2.016622
4,2023-07-10,353794110,42.1307,42.657,42.180,42.811,41.649,42.303200,42.303200,-0.007599,-0.005494,0.104348,0.043799,2.016622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,2024-06-25,411392492,123.5418,121.200,126.090,126.500,119.320,128.018333,98.807406,0.067564,1.972909,0.582905,0.007841,2.016622
247,2024-06-26,355252504,125.0539,126.130,126.400,128.120,122.600,127.787143,99.357333,0.002459,1.980218,0.568615,0.008038,2.016622
248,2024-06-27,247638444,124.3026,124.100,123.990,126.410,122.920,126.788571,99.858667,-0.019066,1.923396,0.530571,0.008614,2.016622
249,2024-06-28,309719295,124.8037,124.580,123.540,127.710,122.750,125.068571,100.339362,-0.003629,1.912786,0.531145,0.008605,2.016622


In [23]:
print(stock2["name"])
stock2["df"]

Amazon


Unnamed: 0,date,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,10_day_ma,100_day_ma,daily_return,cumulative_return,volatility,sharpe_ratio,beta
0,2023-07-03,28264785,130.8295,130.820,130.22,131.8500,130.0650,130.220000,130.220000,0.000000,0.000000,0.000000,0.000000,1.1871
1,2023-07-05,35895409,130.3199,130.240,130.38,131.4000,129.6400,130.300000,130.300000,0.001229,0.001229,0.013792,0.122502,1.1871
2,2023-07-06,40697848,128.2680,128.250,128.36,128.7300,127.3700,129.653333,129.653333,-0.015493,-0.014284,0.147950,0.011420,1.1871
3,2023-07-07,41992251,129.8568,128.590,129.78,130.9700,128.1294,129.685000,129.685000,0.011063,-0.003379,0.174226,0.009697,1.1871
4,2023-07-10,61889289,127.0830,129.070,127.13,129.2800,125.9150,129.174000,129.174000,-0.020419,-0.023729,0.205341,0.008228,1.1871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,2024-06-25,44586537,186.9700,186.810,186.34,188.8400,185.4200,185.660000,182.497826,0.004149,0.430963,0.188464,0.008965,1.1871
247,2024-06-26,61298487,192.1127,186.920,193.61,194.8000,186.2600,186.795714,182.775072,0.039015,0.486792,0.224832,0.007515,1.1871
248,2024-06-27,73974152,197.3166,195.005,197.85,199.8400,194.2000,188.765714,183.093188,0.021900,0.519352,0.233480,0.007236,1.1871
249,2024-06-28,71149090,194.7864,197.730,193.25,198.8500,192.5000,190.257143,183.312029,-0.023250,0.484027,0.251617,0.006715,1.1871


In [24]:
print(stock3["name"])
stock3["df"]

JPMorgan


Unnamed: 0,date,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,10_day_ma,100_day_ma,daily_return,cumulative_return,volatility,sharpe_ratio,beta
0,2023-07-03,8021622,146.7011,146.19,146.61,147.4800,146.0000,146.610000,146.610000,0.000000,0.000000,0.000000,0.000000,0.260405
1,2023-07-05,8935519,144.8360,144.94,144.64,145.4300,144.3600,145.625000,145.625000,-0.013437,-0.013437,0.150830,0.008526,0.260405
2,2023-07-06,9714100,142.6530,143.02,143.21,143.3900,141.4401,144.820000,144.820000,-0.009887,-0.023191,0.110535,0.011635,0.260405
3,2023-07-07,8981456,144.7871,143.91,144.34,145.8436,143.0000,144.700000,144.700000,0.007891,-0.015483,0.153640,0.008370,0.260405
4,2023-07-10,9100060,145.1726,144.50,145.15,145.9900,144.5000,144.790000,144.790000,0.005612,-0.009958,0.149076,0.008627,0.260405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,2024-06-25,6529272,198.5962,198.09,198.07,200.0700,197.7400,197.316667,195.457101,-0.004073,0.350999,0.181795,0.007074,0.260405
247,2024-06-26,7351224,197.2270,197.45,197.43,197.9400,196.2750,197.332857,195.526232,-0.003231,0.346634,0.177426,0.007248,0.260405
248,2024-06-27,7448593,198.9616,197.44,199.17,199.8600,196.9000,197.931429,195.604203,0.008813,0.358502,0.179275,0.007174,0.260405
249,2024-06-28,14155246,201.8693,200.01,202.26,202.6000,199.3018,198.682857,195.690145,0.015514,0.379578,0.185115,0.006947,0.260405


In [25]:
print(etf["name"])
etf["df"]

SPY


Unnamed: 0,date,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,10_day_ma,100_day_ma,daily_return,cumulative_return,volatility,sharpe_ratio,beta
0,2023-07-03,32593378,443.5281,442.92,443.79,444.080,442.630,443.790000,443.790000,0.000000,0.000000,0.000000,0.000000,0.653894
1,2023-07-05,58418432,443.3129,441.91,443.13,443.889,441.900,443.460000,443.460000,-0.001487,-0.001487,0.016694,0.043656,0.653894
2,2023-07-06,80658302,439.1438,439.42,439.66,440.100,437.060,442.193333,442.193333,-0.007831,-0.009306,0.066018,0.011039,0.653894
3,2023-07-07,85434178,439.9317,438.63,438.55,442.640,438.300,441.282500,441.282500,-0.002525,-0.011807,0.054100,0.013471,0.653894
4,2023-07-10,62443501,438.8872,438.18,439.66,439.840,437.585,440.958000,440.958000,0.002531,-0.009306,0.060952,0.011957,0.653894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,2024-06-25,38253579,543.9439,543.99,544.83,545.200,542.440,545.778333,521.951159,0.003851,0.227675,0.080603,0.009042,0.653894
247,2024-06-26,38408871,544.7822,543.69,545.51,546.240,543.030,545.740000,522.424348,0.001248,0.229208,0.078565,0.009276,0.653894
248,2024-06-27,34948595,545.7994,545.37,546.37,546.960,544.610,545.635714,522.868696,0.001577,0.231145,0.078520,0.009281,0.653894
249,2024-06-28,74580583,545.9113,547.16,544.22,550.280,542.950,545.025714,523.212754,-0.003935,0.226301,0.074985,0.009719,0.653894


In [26]:
print(forex["name"])
forex["df"]

USDCAD


Unnamed: 0,date,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,10_day_ma,100_day_ma,daily_return,cumulative_return,volatility,sharpe_ratio,beta
0,2023-07-02,2182,1.3246,1.32360,1.32500,1.32520,1.32239,1.325000,1.325000,0.000000,0.000000,0.000000,0.000000,-0.101947
1,2023-07-03,135235,1.3251,1.32494,1.32476,1.32740,1.32280,1.324880,1.324880,-0.000181,-0.000181,0.002033,0.000782,-0.101947
2,2023-07-04,105110,1.3233,1.32477,1.32232,1.32563,1.32026,1.324027,1.324027,-0.001842,-0.002023,0.016115,0.000099,-0.101947
3,2023-07-05,120465,1.3258,1.32232,1.32849,1.32880,1.32170,1.325142,1.325142,0.004666,0.002634,0.044383,0.000036,-0.101947
4,2023-07-06,140974,1.3324,1.32851,1.33647,1.33830,1.32730,1.327408,1.327408,0.006007,0.008657,0.054017,0.000029,-0.101947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,2024-06-26,96927,1.3682,1.36590,1.37047,1.37076,1.36480,1.369279,1.366511,0.003192,0.034317,0.038135,0.000042,-0.101947
314,2024-06-27,96736,1.3691,1.37046,1.36980,1.37123,1.36740,1.369102,1.366656,-0.000489,0.033811,0.038008,0.000042,-0.101947
315,2024-06-28,111975,1.3699,1.36995,1.36507,1.37340,1.36482,1.368368,1.366862,-0.003453,0.030242,0.036005,0.000044,-0.101947
316,2024-06-30,2451,1.3671,1.36705,1.36730,1.36745,1.36598,1.367955,1.367110,0.001634,0.031925,0.033888,0.000047,-0.101947


In [27]:
print(market["name"])
market["df"]

NDX


Unnamed: 0,date,open_price,close_price,highest_price,lowest_price,10_day_ma,100_day_ma,daily_return,cumulative_return,volatility,sharpe_ratio,beta
0,2023-07-03,15190.541432,15208.693294,15232.961349,15153.597133,15208.693294,15208.693294,0.000000,0.000000,0.000000,0.000000,1.003968
1,2023-07-05,15165.358588,15203.777659,15275.181485,15162.658117,15206.235477,15206.235477,-0.000323,-0.000323,0.003628,0.269232,1.003968
2,2023-07-06,15054.103308,15089.448692,15106.453580,14969.144206,15167.306549,15167.306549,-0.007520,-0.007841,0.067487,0.014474,1.003968
3,2023-07-07,15070.500987,15036.852033,15210.627342,15032.562111,15134.692920,15134.692920,-0.003486,-0.011299,0.055536,0.017588,1.003968
4,2023-07-10,15031.946743,15045.638655,15066.435403,14924.635834,15116.882067,15116.882067,0.000584,-0.010721,0.053865,0.018134,1.003968
...,...,...,...,...,...,...,...,...,...,...,...,...
248,2024-06-25,19555.839136,19701.128649,19712.415452,19523.506722,19740.013992,18383.875421,0.011631,0.295386,0.130260,0.007499,1.003968
249,2024-06-26,19671.068079,19751.047838,19763.545747,19654.183104,19741.590256,18409.470217,0.002534,0.298668,0.126964,0.007693,1.003968
250,2024-06-27,19740.515013,19789.027010,19850.810183,19701.336249,19725.344279,18434.931407,0.001923,0.301165,0.126930,0.007695,1.003968
251,2024-06-28,19817.004516,19682.870924,20017.714607,19665.848633,19693.060469,18455.841031,-0.005364,0.294186,0.125509,0.007783,1.003968


In [28]:
# # Saving our cleaned data to a csv file
for investment in investments:
    investment["df"].to_csv(f"./Data/Cleaned/{investment["name"]}.csv")

## Now that we have all of our stock, etf, forex, and market data ready we can put it into a star format and export to SQL Server

In [29]:
# Creating our dimension tables
investment_type = {
    "investment_type_id": [1,2,3,4],
    "type": ["Stock", "ETF", "Forex", "Market"],
}

investment_name = {
    "investment_name_id": [1,2,3,4,5,6],
    "name": ["Nvidia", "Amazon", "JPMorgan", "SPY", "USDCAD", "NDX"],
}

investment_sector = {
    "investment_sector_id": [1, 2, 3, 4, 5],
    "sector": ["Tech", "Consumer", "Finance", "Broad Market", "Currency"],
}

investment_type_df = pd.DataFrame(investment_type)
investment_name_df = pd.DataFrame(investment_name)
investment_sector_df = pd.DataFrame(investment_sector)

In [30]:
# Adding in the relationships to our dimensions
for i in investments:
    if i["ticker"] == "NVDA":
        i["df"]["investment_type"] = 1
        i["df"]["investment_name"] = 1
        i["df"]["investment_sector"] = 1
    elif i["ticker"] == "AMZN":
        i["df"]["investment_type"] = 1
        i["df"]["investment_name"] = 2
        i["df"]["investment_sector"] = 2
    elif i["ticker"] == "JPM":
        i["df"]["investment_type"] = 1
        i["df"]["investment_name"] = 3
        i["df"]["investment_sector"] = 3
    elif i["ticker"] == "SPY":
        i["df"]["investment_type"] = 2
        i["df"]["investment_name"] = 4
        i["df"]["investment_sector"] = 4
    elif i["ticker"] == "C:USDCAD":
        i["df"]["investment_type"] = 3
        i["df"]["investment_name"] = 5
        i["df"]["investment_sector"] = 5
    elif i["ticker"] == "I:NDX":
        i["df"]["investment_type"] = 4
        i["df"]["investment_name"] = 6
        i["df"]["investment_sector"] = 4

In [43]:
# Combine all of our dataframes into 1 fact table
fact_df = pd.concat([df for df in df_list])
fact_df.reset_index(drop=True, inplace=True)
fact_df.insert(0, "investment_id", range(1, 1 + len(fact_df)))
fact_df

Unnamed: 0,investment_id,date,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,10_day_ma,100_day_ma,daily_return,cumulative_return,volatility,sharpe_ratio,beta,investment_type,investment_name,investment_sector
0,1,2023-07-03,198209190.0,42.5106,42.517000,42.413000,42.898000,42.202000,42.413000,42.413000,0.000000,0.000000,0.000000,0.000000,2.016622,1,1,1
1,2,2023-07-05,323617560.0,42.6645,42.135000,42.317000,43.177000,42.085000,42.365000,42.365000,-0.002263,-0.002263,0.025407,0.179885,2.016622,1,1,1
2,3,2023-07-06,304138790.0,41.8158,41.844000,42.103000,42.179000,41.346000,42.277667,42.277667,-0.005057,-0.007309,0.040213,0.113655,2.016622,1,1,1
3,4,2023-07-07,355851470.0,42.7642,42.322000,42.503000,43.214000,42.180000,42.334000,42.334000,0.009501,0.002122,0.100302,0.045566,2.016622,1,1,1
4,5,2023-07-10,353794110.0,42.1307,42.657000,42.180000,42.811000,41.649000,42.303200,42.303200,-0.007599,-0.005494,0.104348,0.043799,2.016622,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1570,1571,2024-06-25,,,19555.839136,19701.128649,19712.415452,19523.506722,19740.013992,18383.875421,0.011631,0.295386,0.130260,0.007499,1.003968,4,6,4
1571,1572,2024-06-26,,,19671.068079,19751.047838,19763.545747,19654.183104,19741.590256,18409.470217,0.002534,0.298668,0.126964,0.007693,1.003968,4,6,4
1572,1573,2024-06-27,,,19740.515013,19789.027010,19850.810183,19701.336249,19725.344279,18434.931407,0.001923,0.301165,0.126930,0.007695,1.003968,4,6,4
1573,1574,2024-06-28,,,19817.004516,19682.870924,20017.714607,19665.848633,19693.060469,18455.841031,-0.005364,0.294186,0.125509,0.007783,1.003968,4,6,4


In [45]:
# Save our star schema files as csv
fact_df.to_csv("./Data/Star/investments.csv", index=False)
investment_type_df.to_csv("./Data/Star/investment_type.csv", index=False)
investment_name_df.to_csv("./Data/Star/investment_name.csv", index=False)
investment_sector_df.to_csv("./Data/Star/investment_sector.csv", index=False)