# Investments

This will be where we get all the data for our investments.

We will then run that data through our ETL pipeline until it is ready to be loaded into SQL Server for analysis.

Brief explanation of pipeline:

1. Start by picking our stocks, ETF, and forex pair, as well as an index that represents the overall market

2. Extract the data by requesting it from polygon.io

3. Clean data (removing nulls etc)

4. Aggregate and add metrics that will be important to track

5. Format data into star schema

6. Load into SQL Server for analysis


In [30]:
# Instruments tracked by the pipeline: three stocks, one ETF and one forex pair
stock1 = dict(ticker="NVDA", name="Nvidia")
stock2 = dict(ticker="AMZN", name="Amazon")
stock3 = dict(ticker="JPM", name="JPMorgan")
etf = dict(ticker="SPY", name="SPY")
forex = dict(ticker="C:USDCAD", name="USDCAD")

# Index used as our view of the overall market
market = dict(ticker="I:NDX", name="NDX")

# One list so later cells can loop over every instrument in a fixed order
investments = [stock1, stock2, stock3, etf, forex, market]

In [31]:
import requests
import os
import csv
import pandas as pd
from dotenv import load_dotenv

In [32]:
# Load variables from a local .env file into the environment, then read the API key
load_dotenv()
api_key = os.environ.get("api_key")
if not api_key:
    # Fail here with an actionable message; a missing or empty key would otherwise
    # only surface later, when a request comes back without usable data
    raise KeyError("api_key is not set; add `api_key=<your key>` to .env or the environment")
params = {"apiKey": api_key}  # passed as query parameters with every request
base_url = "https://api.polygon.io"

In [33]:
# Request one aggregate bar per day for every investment from 2023-05-01 to 2024-07-01.
# Only 2023-07-01 onward is analysed; the two extra months give the later
# forward-fill step earlier rows to draw on.
os.makedirs("./Data/Fresh", exist_ok=True)  # the target folder may not exist yet

for investment in investments:
    ticker = investment["ticker"]
    name = investment["name"]

    response = requests.get(
        f"{base_url}/v2/aggs/ticker/{ticker}/range/1/day/2023-05-01/2024-07-01?adjusted=true&sort=asc",
        params,
        timeout=30,  # never hang the notebook on a stalled connection
    )
    response.raise_for_status()  # surface HTTP errors instead of a KeyError on "results"

    rows = response.json().get("results")
    if not rows:
        raise ValueError(f"No results returned for {ticker}")
    investment["data"] = rows

    # Save the fresh data to a csv file so we don't have to keep requesting it.
    # The header is the union of keys over all rows (first-seen order), so a row that
    # carries a field the first row lacks cannot make DictWriter raise.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))
    with open(f"./Data/Fresh/{name}.csv", "w", newline="") as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(rows)

In [34]:
# Read each saved csv back into a DataFrame and report how many rows it holds
for investment in investments:
    name = investment["name"]
    frame = pd.read_csv(f"./Data/Fresh/{name}.csv")
    investment["df"] = frame
    print(f"name: {name} length: {len(frame)}")

name: Nvidia length: 294
name: Amazon length: 294
name: JPMorgan length: 294
name: SPY length: 294
name: USDCAD length: 375
name: NDX length: 296


## Now that we have all the data in a pandas dataframe we can start with data cleaning

In [35]:
# Frames in a fixed order: the three stocks, the ETF, the forex pair, then the market index
df_list = [holding["df"] for holding in (stock1, stock2, stock3, etf, forex, market)]

# Report the total number of null cells in each frame
for frame in df_list:
    missing = frame.isnull().values.sum()
    print(f"Number of missing values: {missing}")

Number of missing values: 0
Number of missing values: 0
Number of missing values: 0
Number of missing values: 0
Number of missing values: 0
Number of missing values: 0


In [36]:
# De-duplicate each frame in place (first occurrence kept) so the frames
# referenced from `investments` are updated as well
for frame in df_list:
    frame.drop_duplicates(keep="first", inplace=True)

In [37]:
# Give the abbreviated column names readable ones

# Short field name -> descriptive column name
keys = dict(
    v="volume",
    vw="volume_weighted_average",
    o="open_price",
    c="close_price",
    h="highest_price",
    l="lowest_price",
    t="timestamp",
    n="num_transactions",
)

for frame in df_list:
    frame.rename(columns=keys, inplace=True)

    # Volume becomes an integer column in the frames that have one
    if "volume" in frame.columns:
        frame["volume"] = frame["volume"].astype(int)
    # The transaction count probably won't be used, so drop it where present
    if "num_transactions" in frame.columns:
        frame.drop(columns="num_transactions", inplace=True)


stock1["df"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   volume                   294 non-null    int64  
 1   volume_weighted_average  294 non-null    float64
 2   open_price               294 non-null    float64
 3   close_price              294 non-null    float64
 4   highest_price            294 non-null    float64
 5   lowest_price             294 non-null    float64
 6   timestamp                294 non-null    int64  
dtypes: float64(5), int64(2)
memory usage: 16.2 KB


In [38]:
# Convert the millisecond epoch timestamp into a date with the time of day set to midnight
for df in df_list:
    # Vectorised truncation. Unlike a per-row replace(hour=0, minute=0, second=0) it
    # also clears any sub-second part, so every value lands exactly on midnight and
    # lines up with the midnight-based date range used when the frames are reindexed.
    df["date"] = pd.to_datetime(df["timestamp"], unit="ms").dt.normalize()
    df.drop(columns=["timestamp"], inplace=True)
stock1["df"]

Unnamed: 0,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,date
0,569428530,28.6238,27.8400,28.910,29.0584,27.780,2023-05-01
1,401220570,28.3550,28.6800,28.210,28.8360,28.083,2023-05-02
2,383387400,27.9082,27.8400,27.802,28.3670,27.472,2023-05-03
3,321848060,27.5710,27.6510,27.562,27.8590,27.240,2023-05-04
4,361479080,28.4083,27.8255,28.680,28.7550,27.731,2023-05-05
...,...,...,...,...,...,...,...
289,411392492,123.5418,121.2000,126.090,126.5000,119.320,2024-06-25
290,355252504,125.0539,126.1300,126.400,128.1200,122.600,2024-06-26
291,247638444,124.3026,124.1000,123.990,126.4100,122.920,2024-06-27
292,309719295,124.8037,124.5800,123.540,127.7100,122.750,2024-06-28


## Some tables have more rows than others, which will cause issues later, so let's fill in the data.

In [39]:
# One entry per calendar day from 5/1/23 to 7/1/24
date_range = pd.date_range(
    start="2023-5-01 00:00:00", end="2024-07-01 00:00:00", freq="D"
)

# Index each frame by date, reindex it onto the full calendar and forward-fill so
# days without a row repeat the most recent earlier row, then keep 2023-07-01 onward.
for i in investments:
    i["df"] = (
        i["df"]
        .set_index("date")
        .reindex(date_range, method="ffill")
        .loc["2023-07-01":]  # Drop the extra dates we don't need
        .copy()  # own the data: later cells add columns to this frame in place
    )

# Rebuild the list so it points at the newly created frames
df_list = [stock1["df"], stock2["df"], stock3["df"], etf["df"], forex["df"], market["df"]]

In [40]:
# Every frame should now report the same number of rows
for investment in investments:
    name, row_count = investment["name"], len(investment["df"])
    print(f"name: {name} length: {row_count}")

name: Nvidia length: 367
name: Amazon length: 367
name: JPMorgan length: 367
name: SPY length: 367
name: USDCAD length: 367
name: NDX length: 367


## Now that our data is clean and complete we can start adding in other metrics and aggregations we want to track

In [41]:
# daily_return: percentage change of the closing price versus the previous row
# cumulative_return: the daily returns compounded from the first row onward
for frame in df_list:
    returns = frame["close_price"].pct_change()
    frame["daily_return"] = returns
    frame["cumulative_return"] = returns.add(1).cumprod().sub(1)

In [42]:
def _mean_of_nonzero(window):
    """Return the mean of the values in `window` that are not equal to 0."""
    return window[window != 0].mean()


# 10 and 100 day moving averages of the daily return.
# Returns of exactly 0 are left out: forward-filled non-trading days repeat the
# previous close and therefore always show a daily return of 0.
for frame in df_list:
    for column, span in (("10_day_ma", "10D"), ("100_day_ma", "100D")):
        frame[column] = frame["daily_return"].rolling(window=span).apply(_mean_of_nonzero)

In [43]:
# Volatility: standard deviation of the daily return over a trailing 30 day window
for frame in df_list:
    trailing_30d = frame["daily_return"].rolling(window="30D")
    frame["volatility"] = trailing_30d.std()

In [44]:
# Per-period risk-free rate: a 3% annual rate spread over 252 compounding periods
risk_free_rate_daily = (1 + 0.03) ** (1 / 252) - 1

# sharpe_ratio: 30 day rolling mean of (daily return - risk-free rate) / volatility
for frame in df_list:
    excess_return = frame["daily_return"] - risk_free_rate_daily
    risk_adjusted = excess_return / frame["volatility"]
    frame["sharpe_ratio"] = risk_adjusted.rolling(window="30D").mean()



In [45]:
# Move the date index back into a regular "date" column, then replace every NaN with 0.
# Both steps run in place so `df_list` and `investments` keep pointing at the same frames.
for frame in df_list:
    frame.reset_index(names="date", inplace=True)
    frame.fillna(value=0, inplace=True)

In [46]:
# Beta of each investment against the market index:
# covariance(investment return, market return) / variance(market return)
market_returns = market["df"][["date", "daily_return"]]

for frame in df_list:
    # Pair the two return series on matching dates only
    paired = frame[["date", "daily_return"]].merge(
        market_returns,
        on="date",
        how="inner",
        suffixes=("_stock", "_market"),
    )

    stock_side = paired["daily_return_stock"]
    market_side = paired["daily_return_market"]
    # A single value per investment, written to every row of its frame
    frame["beta"] = stock_side.cov(market_side) / market_side.var()

## Checking our data to ensure everything looks correct

In [47]:
# Print the first stock's name, then show its finished frame as the cell output
print(stock1["name"])
stock1["df"]

Nvidia


Unnamed: 0,date,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,daily_return,cumulative_return,10_day_ma,100_day_ma,volatility,sharpe_ratio,beta
0,2023-07-01,501124950,42.2186,41.680,42.302,42.550,41.501,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.014113
1,2023-07-02,501124950,42.2186,41.680,42.302,42.550,41.501,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.014113
2,2023-07-03,198209190,42.5106,42.517,42.413,42.898,42.202,0.002624,0.002624,0.002624,0.002624,0.001855,1.350992,2.014113
3,2023-07-04,198209190,42.5106,42.517,42.413,42.898,42.202,0.000000,0.002624,0.002624,0.002624,0.001515,0.636781,2.014113
4,2023-07-05,323617560,42.6645,42.135,42.317,43.177,42.085,-0.002263,0.000355,0.000180,0.000180,0.001998,0.027331,2.014113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,2024-06-27,247638444,124.3026,124.100,123.990,126.410,122.920,-0.019066,1.931067,-0.006908,0.005285,0.027838,0.114585,2.014113
363,2024-06-28,309719295,124.8037,124.580,123.540,127.710,122.750,-0.003629,1.920429,-0.012444,0.005075,0.027849,0.100073,2.014113
364,2024-06-29,309719295,124.8037,124.580,123.540,127.710,122.750,0.000000,1.920429,-0.012444,0.004976,0.026790,0.145590,2.014113
365,2024-06-30,309719295,124.8037,124.580,123.540,127.710,122.750,0.000000,1.920429,-0.008618,0.004585,0.026708,0.155541,2.014113


In [48]:
# Print the second stock's name, then show its finished frame as the cell output
print(stock2["name"])
stock2["df"]

Amazon


Unnamed: 0,date,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,daily_return,cumulative_return,10_day_ma,100_day_ma,volatility,sharpe_ratio,beta
0,2023-07-01,54327974,130.2735,129.470,130.36,131.2450,128.950,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.182427
1,2023-07-02,54327974,130.2735,129.470,130.36,131.2450,128.950,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.182427
2,2023-07-03,28264785,130.8295,130.820,130.22,131.8500,130.065,-0.001074,-0.001074,-0.001074,-0.001074,0.000759,-1.568683,1.182427
3,2023-07-04,28264785,130.8295,130.820,130.22,131.8500,130.065,0.000000,-0.001074,-0.001074,-0.001074,0.000620,-0.878935,1.182427
4,2023-07-05,35895409,130.3199,130.240,130.38,131.4000,129.640,0.001229,0.000153,0.000077,0.000077,0.000941,-0.192312,1.182427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,2024-06-27,73974152,197.3166,195.005,197.85,199.8400,194.200,0.021900,0.517720,0.010531,0.001828,0.012359,0.291260,1.182427
363,2024-06-28,71149090,194.7864,197.730,193.25,198.8500,192.500,-0.023250,0.482433,0.008180,0.001297,0.013236,0.234912,1.182427
364,2024-06-29,71149090,194.7864,197.730,193.25,198.8500,192.500,0.000000,0.482433,0.008180,0.001297,0.012854,0.286504,1.182427
365,2024-06-30,71149090,194.7864,197.730,193.25,198.8500,192.500,0.000000,0.482433,0.006544,0.001257,0.012376,0.345147,1.182427


In [49]:
# Print the third stock's name, then show its finished frame as the cell output
print(stock3["name"])
stock3["df"]

JPMorgan


Unnamed: 0,date,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,daily_return,cumulative_return,10_day_ma,100_day_ma,volatility,sharpe_ratio,beta
0,2023-07-01,14212816,145.1738,144.60,145.44,146.00,143.6600,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.263023
1,2023-07-02,14212816,145.1738,144.60,145.44,146.00,143.6600,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.263023
2,2023-07-03,8021622,146.7011,146.19,146.61,147.48,146.0000,0.008045,0.008045,0.008045,0.008045,0.005688,1.393592,0.263023
3,2023-07-04,8021622,146.7011,146.19,146.61,147.48,146.0000,0.000000,0.008045,0.008045,0.008045,0.004645,0.684168,0.263023
4,2023-07-05,8935519,144.8360,144.94,144.64,145.43,144.3600,-0.013437,-0.005501,-0.002696,-0.002696,0.008907,-0.051147,0.263023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,2024-06-27,7448593,198.9616,197.44,199.17,199.86,196.9000,0.008813,0.369431,0.003080,0.000497,0.009379,0.002459,0.263023
363,2024-06-28,14155246,201.8693,200.01,202.26,202.60,199.3018,0.015514,0.390677,0.003816,0.000532,0.009696,0.075515,0.263023
364,2024-06-29,14155246,201.8693,200.01,202.26,202.60,199.3018,0.000000,0.390677,0.003816,0.000335,0.009643,0.057824,0.263023
365,2024-06-30,14155246,201.8693,200.01,202.26,202.60,199.3018,0.000000,0.390677,0.003040,0.000523,0.009156,0.011635,0.263023


In [50]:
# Print the ETF's name, then show its finished frame as the cell output
print(etf["name"])
etf["df"]

SPY


Unnamed: 0,date,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,daily_return,cumulative_return,10_day_ma,100_day_ma,volatility,sharpe_ratio,beta
0,2023-07-01,104964019,442.7440,441.44,443.28,444.300,441.11,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.650923
1,2023-07-02,104964019,442.7440,441.44,443.28,444.300,441.11,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.650923
2,2023-07-03,32593378,443.5281,442.92,443.79,444.080,442.63,0.001151,0.001151,0.001151,0.001151,0.000814,1.270024,0.650923
3,2023-07-04,32593378,443.5281,442.92,443.79,444.080,442.63,0.000000,0.001151,0.001151,0.001151,0.000664,0.546714,0.650923
4,2023-07-05,58418432,443.3129,441.91,443.13,443.889,441.90,-0.001487,-0.000338,-0.000168,-0.000168,0.001081,-0.130181,0.650923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,2024-06-27,34948595,545.7994,545.37,546.37,546.960,544.61,0.001577,0.232562,-0.000186,0.000858,0.004165,0.210794,0.650923
363,2024-06-28,74580583,545.9113,547.16,544.22,550.280,542.95,-0.003935,0.227712,-0.001111,0.000667,0.003995,0.218332,0.650923
364,2024-06-29,74580583,545.9113,547.16,544.22,550.280,542.95,0.000000,0.227712,-0.001111,0.000628,0.003725,0.262250,0.650923
365,2024-06-30,74580583,545.9113,547.16,544.22,550.280,542.95,0.000000,0.227712,-0.000844,0.000666,0.003431,0.202572,0.650923


In [51]:
# Print the forex pair's name, then show its finished frame as the cell output
print(forex["name"])
forex["df"]

USDCAD


Unnamed: 0,date,volume,volume_weighted_average,open_price,close_price,highest_price,lowest_price,daily_return,cumulative_return,10_day_ma,100_day_ma,volatility,sharpe_ratio,beta
0,2023-07-01,149588,1.3248,1.32510,1.30768,1.32840,1.30475,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.102335
1,2023-07-02,2182,1.3246,1.32360,1.32500,1.32520,1.32239,0.013245,0.013245,0.013245,0.013245,0.000000,0.000000,-0.102335
2,2023-07-03,135235,1.3251,1.32494,1.32476,1.32740,1.32280,-0.000181,0.013061,0.006532,0.006532,0.009494,-0.031436,-0.102335
3,2023-07-04,105110,1.3233,1.32477,1.32232,1.32563,1.32026,-0.001842,0.011195,0.003741,0.003741,0.008273,-0.134129,-0.102335
4,2023-07-05,120465,1.3258,1.32232,1.32849,1.32880,1.32170,0.004666,0.015914,0.003972,0.003972,0.006770,0.134532,-0.102335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,2024-06-27,96736,1.3691,1.37046,1.36980,1.37123,1.36740,-0.000489,0.047504,-0.000128,0.000110,0.002224,-0.007906,-0.102335
363,2024-06-28,111975,1.3699,1.36995,1.36507,1.37340,1.36482,-0.003453,0.043887,-0.000535,0.000157,0.002107,-0.121655,-0.102335
364,2024-06-29,111975,1.3699,1.36995,1.36507,1.37340,1.36482,0.000000,0.043887,-0.000494,0.000111,0.002062,-0.089648,-0.102335
365,2024-06-30,2451,1.3671,1.36705,1.36730,1.36745,1.36598,0.001634,0.045592,-0.000105,0.000063,0.001943,-0.009378,-0.102335


In [52]:
# Print the market index's name, then show its finished frame as the cell output
print(market["name"])
market["df"]

NDX


Unnamed: 0,date,open_price,close_price,highest_price,lowest_price,daily_return,cumulative_return,10_day_ma,100_day_ma,volatility,sharpe_ratio,beta
0,2023-07-01,15085.241800,15179.208385,15213.751646,15081.566103,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
1,2023-07-02,15085.241800,15179.208385,15213.751646,15081.566103,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
2,2023-07-03,15190.541432,15208.693294,15232.961349,15153.597133,0.001942,0.001942,0.001942,0.001942,0.001374,1.328810,1.0
3,2023-07-04,15190.541432,15208.693294,15232.961349,15153.597133,0.000000,0.001942,0.001942,0.001942,0.001121,0.612106,1.0
4,2023-07-05,15165.358588,15203.777659,15275.181485,15162.658117,-0.000323,0.001619,0.000810,0.000810,0.001036,0.266383,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
362,2024-06-27,19740.515013,19789.027010,19850.810183,19701.336249,0.001923,0.303693,-0.000794,0.001390,0.006726,0.265441,1.0
363,2024-06-28,19817.004516,19682.870924,20017.714607,19665.848633,-0.005364,0.296699,-0.001604,0.001145,0.006659,0.270762,1.0
364,2024-06-29,19817.004516,19682.870924,20017.714607,19665.848633,0.000000,0.296699,-0.001604,0.001097,0.006256,0.324562,1.0
365,2024-06-30,19817.004516,19682.870924,20017.714607,19665.848633,0.000000,0.296699,-0.000561,0.001098,0.006255,0.325125,1.0


## Now that we have all of our stock, ETF, forex, and market data ready we can put it into a star schema and export it to SQL Server

In [53]:
# Dimension tables: each one pairs a 1-based id with a label
investment_type = dict(
    investment_type_id=[1, 2, 3, 4],
    investment_type=["Stock", "ETF", "Forex", "Market"],
)

investment_name = dict(
    investment_name_id=[1, 2, 3, 4, 5, 6],
    investment_name=["Nvidia", "Amazon", "JPMorgan", "SPY", "USDCAD", "NDX"],
)

investment_sector = dict(
    investment_sector_id=[1, 2, 3, 4, 5],
    investment_sector=["Tech", "Consumer", "Finance", "Broad Market", "Currency"],
)

# One DataFrame per dimension, built from the dictionaries above
investment_type_df, investment_name_df, investment_sector_df = (
    pd.DataFrame(table)
    for table in (investment_type, investment_name, investment_sector)
)

In [54]:
# Time dimension: one row per calendar day, keyed by a 1-based time_id
calendar_days = pd.date_range(start="2023-07-01", end="2024-07-01", freq="D")

investment_time_df = (
    pd.DataFrame({"investment_date": calendar_days})
    .assign(
        investment_month=lambda d: d["investment_date"].dt.month,
        investment_quarter=lambda d: d["investment_date"].dt.quarter,
        investment_year=lambda d: d["investment_date"].dt.year,
        time_id=lambda d: d.index + 1,  # positional index shifted to start at 1
    )
    .set_index("time_id")
)

investment_time_df

Unnamed: 0_level_0,investment_date,investment_month,investment_quarter,investment_year
time_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2023-07-01,7,3,2023
2,2023-07-02,7,3,2023
3,2023-07-03,7,3,2023
4,2023-07-04,7,3,2023
5,2023-07-05,7,3,2023
...,...,...,...,...
363,2024-06-27,6,2,2024
364,2024-06-28,6,2,2024
365,2024-06-29,6,2,2024
366,2024-06-30,6,2,2024


In [55]:
# Foreign keys into the dimension tables, looked up by ticker:
# ticker -> (investment_type_id, investment_name_id, investment_sector_id)
dimension_ids = {
    "NVDA": (1, 1, 1),
    "AMZN": (1, 2, 2),
    "JPM": (1, 3, 3),
    "SPY": (2, 4, 4),
    "C:USDCAD": (3, 5, 5),
    "I:NDX": (4, 6, 4),
}

for i in investments:
    ids = dimension_ids.get(i["ticker"])
    if ids is None:
        # A ticker without an entry is left untouched
        continue
    type_id, name_id, sector_id = ids
    i["df"]["investment_type_id"] = type_id
    i["df"]["investment_name_id"] = name_id
    i["df"]["investment_sector_id"] = sector_id

In [56]:
# Stack every investment frame into one fact table and number the rows from 1
fact_df = pd.concat(df_list, ignore_index=True)
fact_df.insert(0, "investment_id", range(1, len(fact_df) + 1))

# Attach each row's time_id by matching on the date (left join keeps every fact row)
date_lookup = investment_time_df.reset_index()[["time_id", "investment_date"]]
fact_df = fact_df.rename(columns={"date": "investment_date"}).merge(
    date_lookup,
    on="investment_date",
    how="left",
)

# Drop data that we won't be analyzing
unused_columns = [
    "investment_date",
    "volume",
    "volume_weighted_average",
    "open_price",
    "highest_price",
    "lowest_price",
]
fact_df = fact_df.drop(columns=unused_columns)

# Keys first, then the measures
column_order = [
    "investment_id",
    "investment_name_id",
    "investment_type_id",
    "investment_sector_id",
    "time_id",
    "close_price",
    "daily_return",
    "cumulative_return",
    "10_day_ma",
    "100_day_ma",
    "volatility",
    "sharpe_ratio",
    "beta",
]
fact_df = fact_df[column_order]
fact_df

Unnamed: 0,investment_id,investment_name_id,investment_type_id,investment_sector_id,time_id,close_price,daily_return,cumulative_return,10_day_ma,100_day_ma,volatility,sharpe_ratio,beta
0,1,1,1,1,1,42.302000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.014113
1,2,1,1,1,2,42.302000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.014113
2,3,1,1,1,3,42.413000,0.002624,0.002624,0.002624,0.002624,0.001855,1.350992,2.014113
3,4,1,1,1,4,42.413000,0.000000,0.002624,0.002624,0.002624,0.001515,0.636781,2.014113
4,5,1,1,1,5,42.317000,-0.002263,0.000355,0.000180,0.000180,0.001998,0.027331,2.014113
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2197,2198,6,4,4,363,19789.027010,0.001923,0.303693,-0.000794,0.001390,0.006726,0.265441,1.000000
2198,2199,6,4,4,364,19682.870924,-0.005364,0.296699,-0.001604,0.001145,0.006659,0.270762,1.000000
2199,2200,6,4,4,365,19682.870924,0.000000,0.296699,-0.001604,0.001097,0.006256,0.324562,1.000000
2200,2201,6,4,4,366,19682.870924,0.000000,0.296699,-0.000561,0.001098,0.006255,0.325125,1.000000


In [57]:
# Summarise the fact table: row count, column dtypes and non-null counts
fact_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2202 entries, 0 to 2201
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   investment_id         2202 non-null   int64  
 1   investment_name_id    2202 non-null   int64  
 2   investment_type_id    2202 non-null   int64  
 3   investment_sector_id  2202 non-null   int64  
 4   time_id               2202 non-null   int64  
 5   close_price           2202 non-null   float64
 6   daily_return          2202 non-null   float64
 7   cumulative_return     2202 non-null   float64
 8   10_day_ma             2202 non-null   float64
 9   100_day_ma            2202 non-null   float64
 10  volatility            2202 non-null   float64
 11  sharpe_ratio          2202 non-null   float64
 12  beta                  2202 non-null   float64
dtypes: float64(8), int64(5)
memory usage: 223.8 KB


In [58]:
# Save our star schema files as csv
os.makedirs("./Data/Star", exist_ok=True)  # create the output folder if it is missing

fact_df.to_csv("./Data/Star/investments.csv", index=False)
investment_type_df.to_csv("./Data/Star/investment_type.csv", index=False)
investment_name_df.to_csv("./Data/Star/investment_name.csv", index=False)
investment_sector_df.to_csv("./Data/Star/investment_sector.csv", index=False)
# The index is written here because it holds time_id
investment_time_df.to_csv("./Data/Star/investment_time.csv")