## Imports

In [22]:
import numpy as np
import pandas as pd
import yfinance as yf
import os
import matplotlib.pyplot as plt
import seaborn as sns
import time

from scipy.stats import spearmanr
from talib import RSI, BBANDS, MACD, ATR

In [23]:
tickers = ['AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL', 'META']
tickers = [ticker.replace('.', '-') for ticker in tickers]  # Replace dots with hyphens for yfinance compatibility
print(tickers)

['AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL', 'META']


In [24]:
start_date = '2015-01-01'
end_date = '2025-01-01'

The following method is supposed to download data for each ticket.

In [25]:
def get_data(ticker, start_date, end_date):
    data = yf.download(ticker, start_date, end_date)
    data["Ticker"] = ticker
    return data

In [26]:
os.makedirs("data", exist_ok=True)

print(f"Processing data for {len(tickers)} stocks...")
for ticker in tickers:
    try:
        # Get enhanced data for this stock
        stock_data = get_data(ticker, start_date, end_date)
        
        # Reset index to make Date a column
        stock_data = stock_data.reset_index()

        print(f"Processed {ticker} data with {len(stock_data)} rows.")
        
        # Save individual stock data
        stock_data.to_csv(f"data/{ticker}.csv", index=False)
        
        # Add a small delay to avoid hitting API limits
        time.sleep(2)

    except Exception as e:
        print(f"Error processing {ticker}: {str(e)}")

  data = yf.download(ticker, start_date, end_date)
[*********************100%***********************]  1 of 1 completed

Processing data for 6 stocks...
Processed AAPL data with 2516 rows.



  data = yf.download(ticker, start_date, end_date)
[*********************100%***********************]  1 of 1 completed


Processed MSFT data with 2516 rows.


  data = yf.download(ticker, start_date, end_date)
[*********************100%***********************]  1 of 1 completed


Processed NVDA data with 2516 rows.


  data = yf.download(ticker, start_date, end_date)
[*********************100%***********************]  1 of 1 completed


Processed AMZN data with 2516 rows.


  data = yf.download(ticker, start_date, end_date)
[*********************100%***********************]  1 of 1 completed


Processed GOOGL data with 2516 rows.


  data = yf.download(ticker, start_date, end_date)
[*********************100%***********************]  1 of 1 completed


Processed META data with 2516 rows.


Let us analyze how does the dataframe for one stock look like.

In [27]:
files = os.listdir('data') #list of files in the data directory
files

['MSFT.csv', 'AMZN.csv', 'GOOGL.csv', 'NVDA.csv', 'META.csv', 'AAPL.csv']

In [28]:
example_file = 'MSFT.csv'
example_file_path = os.path.join('data', example_file)

In [29]:
msft_data = pd.read_csv(example_file_path)
msft_data.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume,Ticker
0,,MSFT,MSFT,MSFT,MSFT,MSFT,
1,2015-01-02,39.998714447021484,40.563281307143846,39.810527669053585,39.913175299136924,27913900,MSFT
2,2015-01-05,39.63088607788086,39.9730458684429,39.562452161903224,39.66509977276096,39673900,MSFT
3,2015-01-06,39.0492057800293,39.9901491609254,38.95511078931803,39.67365049834883,36447900,MSFT
4,2015-01-07,39.54534149169922,39.74208409457034,38.91234411952407,39.331490410781676,29114100,MSFT


As we see, there is a redundant second row(row with index 0), and moreover there is redundant indexes in the dataframe. We want to get rid of them and concatenate dataframes.  

In [32]:
dataframes = []
for file in files:
    file_path = os.path.join('data', file)     
    stock_data = pd.read_csv(file_path)
    stock_data.drop([0], inplace=True) # drop row with index 0
    dataframes.append(stock_data)
if dataframes:
    data = pd.concat(dataframes, ignore_index=True)

In [35]:
data

Unnamed: 0,Date,Close,High,Low,Open,Volume,Ticker
0,2015-01-02,39.998714447021484,40.563281307143846,39.810527669053585,39.913175299136924,27913900,MSFT
1,2015-01-05,39.63088607788086,39.9730458684429,39.562452161903224,39.66509977276096,39673900,MSFT
2,2015-01-06,39.0492057800293,39.9901491609254,38.95511078931803,39.67365049834883,36447900,MSFT
3,2015-01-07,39.54534149169922,39.74208409457034,38.91234411952407,39.331490410781676,29114100,MSFT
4,2015-01-08,40.708683013916016,40.845547545323676,39.964482328331094,39.990143408248834,29645200,MSFT
...,...,...,...,...,...,...,...
15091,2024-12-24,257.57867431640625,257.58862955018265,254.67565812568793,254.87518901917565,23234700,AAPL
15092,2024-12-26,258.39666748046875,259.47408555117545,257.0100282923795,257.56867823862046,27237100,AAPL
15093,2024-12-27,254.9749298095703,258.0774615569102,252.4510193654514,257.20952960207876,42355300,AAPL
15094,2024-12-30,251.5930938720703,252.8899685500744,250.14658624824915,251.62302046123375,35557500,AAPL


In [36]:
data.dtypes

Date      object
Close     object
High      object
Low       object
Open      object
Volume    object
Ticker    object
dtype: object

The columns should be converted into appropriate datatype. 

In [38]:
data['Date'] = pd.to_datetime(data['Date'])
numeric_columns = ['Close', 'High', 'Low', 'Open', 'Volume']
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

In [45]:
data.dtypes

date          datetime64[ns]
close                float64
high                 float64
low                  float64
open                 float64
volume                 int64
ticker                object
dollar_vol           float64
dtype: object

In [46]:
data.columns = data.columns.map(lambda x: x.lower())

#### Compute Rolling Average Dollar Volume

In [47]:
data['dollar_vol'] = data[['close', 'volume']].prod(axis=1)

In [48]:
data

Unnamed: 0,date,close,high,low,open,volume,ticker,dollar_vol
0,2015-01-02,39.998714,40.563281,39.810528,39.913175,27913900,MSFT,1.116520e+09
1,2015-01-05,39.630886,39.973046,39.562452,39.665100,39673900,MSFT,1.572312e+09
2,2015-01-06,39.049206,39.990149,38.955111,39.673650,36447900,MSFT,1.423262e+09
3,2015-01-07,39.545341,39.742084,38.912344,39.331490,29114100,MSFT,1.151327e+09
4,2015-01-08,40.708683,40.845548,39.964482,39.990143,29645200,MSFT,1.206817e+09
...,...,...,...,...,...,...,...,...
15091,2024-12-24,257.578674,257.588630,254.675658,254.875189,23234700,AAPL,5.984763e+09
15092,2024-12-26,258.396667,259.474086,257.010028,257.568678,27237100,AAPL,7.037976e+09
15093,2024-12-27,254.974930,258.077462,252.451019,257.209530,42355300,AAPL,1.079954e+10
15094,2024-12-30,251.593094,252.889969,250.146586,251.623020,35557500,AAPL,8.946021e+09


KeyError: 'ticker'