# Data Aggregating/Cleaning

### Load ETF Data (KARS, DRIV, IDRV, LIT) from April 2019 to April 2025

In [14]:
from alpha_vantage.timeseries import TimeSeries
import pandas as pd

# Initialize API and TimeSeries object
api_key = "97OGHWS77V9ZH0G7"
ts = TimeSeries(key=api_key, output_format="pandas")

# List of ETF tickers
tickers = ['KARS', 'DRIV', 'IDRV', 'LIT'] 
etf_data = {}

# Fetch and process each ETF
for ticker in tickers:
    data, _ = ts.get_daily(symbol=ticker, outputsize='full')
    # Extract and sort closing prices
    closes = data['4. close'].sort_index()
    etf_data[ticker] = closes

etf_df = pd.DataFrame(etf_data)
etf_df.index = pd.to_datetime(etf_df.index) # Ensure index is datetime

# Sort by date
etf_df = etf_df.sort_index()
etf_df.head()

Unnamed: 0_level_0,KARS,DRIV,IDRV,LIT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-07-23,,,,16.09
2010-07-26,,,,16.456
2010-07-27,,,,16.39
2010-07-28,,,,16.33
2010-07-29,,,,16.82


In [15]:
etf_df = etf_df.dropna()
etf_df

Unnamed: 0_level_0,KARS,DRIV,IDRV,LIT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-04-18,22.5000,14.400,25.870,29.06
2019-04-22,22.5000,14.340,25.857,28.85
2019-04-23,22.3700,14.300,25.935,28.59
2019-04-24,22.3700,14.280,25.790,28.03
2019-04-25,21.9600,14.008,25.435,27.72
...,...,...,...,...
2025-04-02,21.4500,21.240,29.510,38.62
2025-04-03,20.4100,19.940,28.270,37.07
2025-04-04,19.2900,18.720,26.720,34.92
2025-04-07,18.0186,18.390,25.830,32.81


### Load company stock close price data

In [16]:
from alpha_vantage.timeseries import TimeSeries
import pandas as pd

# Initialize API and TimeSeries object
api_key = "97OGHWS77V9ZH0G7"
ts = TimeSeries(key=api_key, output_format="pandas")

# List of company tickers
stock_tickers = ['TSLA', 'RIVN', 'BYDDY', 'NIO Inc.', 'LCID', 'F', 'HYMTF']
stock_data = {}

# Fetch and process each company
for ticker in stock_tickers:
    data, _ = ts.get_daily(symbol=ticker, outputsize='full')
    # Extract and sort closing prices
    closes = data['4. close'].sort_index()
    stock_data[ticker] = closes

stock_df = pd.DataFrame(stock_data)
stock_df.index = pd.to_datetime(stock_df.index)
stock_df = stock_df.sort_index()

# Drop rows with any NaN values
stock_df_clean = stock_df.dropna()
stock_df_clean

ValueError: Invalid API call. Please retry or visit the documentation (https://www.alphavantage.co/documentation/) for TIME_SERIES_DAILY.

### Load Green Commodities Metals Data

In [17]:
metals = pd.read_csv("data/Commodities IMF Data/commodities_IMF.csv")
metals = metals.drop(columns=["Unnamed: 428", "Country Name", "Country Code", "Unit Code", "Attribute"],
                     errors="ignore")
metals = metals.drop_duplicates().reset_index(drop=True)

# Reshape Data to Long Format
metals = metals.melt(id_vars=["Commodity Name", "Commodity Code", "Unit Name"], 
                      var_name="Date", value_name="Value")

# Convert Date Column to datetime format
metals["Date"] = pd.to_datetime(metals["Date"].str.replace("M", "-", regex=False), format="%Y-%m")

metals = metals.dropna(subset=["Value"])
metals["Value"] = pd.to_numeric(metals["Value"], errors="coerce")
metals = metals.reset_index(drop=True)
metals

Unnamed: 0,Commodity Name,Commodity Code,Unit Name,Date,Value
0,Aluminum,PALUM,Index,1990-01-01,95.251042
1,Nickel,PNICK,Percent Change over Corresponding Period Previ...,1990-01-01,-60.192144
2,Nickel,PNICK,US Dollars,1990-01-01,7056.000000
3,Aluminum,PALUM,Percent Change over Previous Period,1990-01-01,-6.429884
4,Aluminum,PALUM,Percent Change over Corresponding Period Previ...,1990-01-01,-36.296876
...,...,...,...,...,...
3958,Aluminum,PALUM,Percent Change over Corresponding Period Previ...,2025-01-01,16.797308
3959,Aluminum,PALUM,US Dollars,2025-01-01,2571.370435
3960,Lithium,PLITH,US Dollars,2025-01-01,91267.271203
3961,Nickel,PNICK,Index,2025-01-01,160.226930
