In [1]:
import pandas_datareader as web
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from yahoo_fin import stock_info as si
import yfinance as yf
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import warnings
from pprint import pprint
warnings.filterwarnings('ignore')

             requires requests_html, which is not installed.
             
             Install using: 
             pip install requests_html
             
             After installation, you may have to restart your Python session.


In [2]:
# Ten-year window ending now. `now()` is taken once so both bounds share the
# same reference instant.
end_date = dt.datetime.now()
start_date = end_date - dt.timedelta(days=365 * 10)
data = {}

companies_dow30 = si.tickers_dow()
stocks = list(companies_dow30)

# Static, per-company attributes copied onto every row of that ticker's history.
info_fields = ['industry', 'country', 'state', 'exchange', 'sector']

# Fetch the historical data and ticker specific data
for stock in stocks:
    ticker = yf.Ticker(stock)
    hist = ticker.history(start=start_date, end=end_date)

    # `ticker.info` does not depend on the date, so read it once per ticker and
    # broadcast each value to the whole column instead of writing it cell by
    # cell for every date. This also creates the columns when `hist` is empty,
    # which keeps the schema identical across tickers for the concat below.
    info = ticker.info
    for field in info_fields:
        hist[field] = info.get(field)

    data[stock] = hist

# Combine and create df: the dict keys become the outer index level.
all_data = pd.concat(data)

# basic df maintenance
# reset_index() names the unnamed outer level 'level_0'; expose it as 'ticker'.
fin_df = all_data.reset_index().rename(columns={'level_0': 'ticker'})
# Plain 'YYYY-MM-DD' string next to the timezone-aware 'Date' column.
fin_df['date'] = fin_df['Date'].dt.strftime('%Y-%m-%d')

In [3]:
def rolling_avgs(df):
    """Add per-ticker rolling means of ``Close`` over 60, 30 and 10 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ticker``, ``Date`` and ``Close``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns ``rolling_close_60``,
        ``rolling_close_30`` and ``rolling_close_10`` added (in that order).
        The frame is modified in place.

    Notes
    -----
    ``min_periods=1`` is used, so the first rows of each ticker hold the mean
    of however many closes are available rather than NaN.
    """
    # Work on the frame that was passed in (not a notebook global), sorted so
    # each ticker's window runs in date order. The sort produces a copy, and
    # the transform result is re-aligned to `df` through the row index.
    closes_by_ticker = df.sort_values(['ticker', 'Date']).groupby('ticker')['Close']

    for window in (60, 30, 10):
        df[f'rolling_close_{window}'] = closes_by_ticker.transform(
            lambda prices: prices.rolling(window, min_periods=1).mean()
        )

    return df


In [4]:
# Add the per-ticker rolling-mean columns of Close, then preview the first rows.
fin_df = rolling_avgs(fin_df)
fin_df.head()


Unnamed: 0,ticker,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,industry,country,state,exchange,sector,date,rolling_close_60,rolling_close_30,rolling_close_10
0,AAPL,2014-05-29 00:00:00-04:00,19.83412,20.119067,19.831593,20.071997,376474000,0.0,0.0,Consumer Electronics,United States,CA,NMS,Technology,2014-05-29,20.071997,20.071997,20.071997
1,AAPL,2014-05-30 00:00:00-04:00,20.154133,20.349678,19.867291,19.996813,564020800,0.0,0.0,Consumer Electronics,United States,CA,NMS,Technology,2014-05-30,20.034405,20.034405,20.034405
2,AAPL,2014-06-02 00:00:00-04:00,20.027145,20.054628,19.665116,19.859398,369350800,0.0,0.0,Consumer Electronics,United States,CA,NMS,Technology,2014-06-02,19.976069,19.976069,19.976069
3,AAPL,2014-06-03 00:00:00-04:00,19.853394,20.178145,19.84676,20.140236,292709200,0.0,0.0,Consumer Electronics,United States,CA,NMS,Technology,2014-06-03,20.017111,20.017111,20.017111
4,AAPL,2014-06-04 00:00:00-04:00,20.137082,20.467204,20.095066,20.37022,335482000,0.0,0.0,Consumer Electronics,United States,CA,NMS,Technology,2014-06-04,20.087733,20.087733,20.087733


In [5]:
def calculate_rsi(data, window_size=14):
    """Relative Strength Index per ticker, from simple rolling averages.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``ticker``, ``Date`` and ``Close``.
    window_size : int, default 14
        Number of consecutive price changes averaged for gains and losses.

    Returns
    -------
    pandas.Series
        Series named ``RSI``. It carries the row labels of ``data`` (ordered by
        ticker and date), so assigning it to a column of ``data`` aligns on the
        index. ``data`` itself is not modified.

    Notes
    -----
    The first row of each ticker has no previous close, so its price change is
    undefined. It is kept as NaN rather than counted as a zero move; the first
    ``window_size`` rows of every ticker are therefore NaN and each RSI value
    is based on ``window_size`` real price changes. A window with no losses
    yields 100; a window with neither gains nor losses yields NaN.
    """
    ordered = data.sort_values(by=['ticker', 'Date'])
    tickers = ordered['ticker']

    # Day-over-day change within each ticker (NaN on each ticker's first row).
    delta = ordered.groupby('ticker')['Close'].diff()

    # Split the change into non-negative gain and loss magnitudes; NaN stays NaN.
    gain = delta.clip(lower=0)
    loss = (-delta).clip(lower=0)

    def _rolling_mean(series):
        # Simple moving average that never crosses a ticker boundary.
        return series.groupby(tickers).transform(
            lambda s: s.rolling(window=window_size).mean()
        )

    rs = _rolling_mean(gain) / _rolling_mean(loss)
    rsi = 100 - (100 / (1 + rs))
    return rsi.rename('RSI')

def calculate_macd(data, short_window=7, long_window=30, signal_window=9):
    """MACD line and signal line per ticker.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``ticker``, ``Date`` and ``Close``.
    short_window, long_window : int
        Spans of the fast and slow exponential moving averages of ``Close``.
    signal_window : int
        Span of the exponential moving average applied to the MACD line.

    Returns
    -------
    tuple of pandas.Series
        ``(MACD, Signal_Line)``, named accordingly and carrying the row labels
        of ``data`` ordered by ticker and date. ``data`` is not modified.
    """
    def _ema_by_ticker(frame, column, span):
        # Recursive (adjust=False) EMA, restarted for every ticker.
        def smooth(values):
            return values.ewm(span=span, min_periods=1, adjust=False).mean()
        return frame.groupby('ticker')[column].transform(smooth)

    ordered = data.sort_values(by=['ticker', 'Date'])

    fast_ema = _ema_by_ticker(ordered, 'Close', short_window)
    slow_ema = _ema_by_ticker(ordered, 'Close', long_window)

    ordered['MACD'] = fast_ema - slow_ema
    ordered['Signal_Line'] = _ema_by_ticker(ordered, 'MACD', signal_window)

    return ordered['MACD'], ordered['Signal_Line']

def calculate_bollinger_bands(data, window_size=7, num_std=2):
    """Upper and lower Bollinger Bands of ``Close`` per ticker.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``ticker``, ``Date`` and ``Close``.
    window_size : int, default 7
        Rows in the rolling window used for both the mean and the standard
        deviation. Rows without a full window are NaN.
    num_std : float, default 2
        Number of rolling standard deviations between the mean and each band.

    Returns
    -------
    tuple of pandas.Series
        ``(Upper_Band, Lower_Band)``, named accordingly and carrying the row
        labels of ``data`` ordered by ticker and date. ``data`` is not modified.
    """
    ordered = data.sort_values(by=['ticker', 'Date'])
    closes = ordered.groupby('ticker')['Close']

    # Rolling centre line and dispersion, computed separately for each ticker.
    centre = closes.transform(lambda prices: prices.rolling(window=window_size).mean())
    spread = closes.transform(lambda prices: prices.rolling(window=window_size).std())

    band_offset = spread * num_std
    upper = (centre + band_offset).rename('Upper_Band')
    lower = (centre - band_offset).rename('Lower_Band')
    return upper, lower

In [6]:
# RSI - momentum oscillator that measures the speed and change of price movements.
# calculate_rsi derives it from the rolling average gain and average loss per ticker.
fin_df['rsi'] = calculate_rsi(fin_df, window_size=14)

# MACD - momentum indicator that shows the relationship between two moving averages of a security's price.
# calculate_macd is called with its default spans and returns the MACD line and its signal line.
macd_line, signal_line = calculate_macd(fin_df)
fin_df['MACD'] = macd_line
fin_df['Signal_Line'] = signal_line

# Bollinger Bands - two outer bands a number of standard deviations away from a rolling mean.
# They are used to measure volatility. The returned Series carry fin_df's row labels,
# so the assignments below align on the index.
upper_band, lower_band = calculate_bollinger_bands(fin_df)
fin_df['Upper_Band'] = upper_band
fin_df['Lower_Band'] = lower_band

In [7]:
# Preview the frame, now including the rsi, MACD and Bollinger Band columns.
fin_df.head()

Unnamed: 0,ticker,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,industry,...,sector,date,rolling_close_60,rolling_close_30,rolling_close_10,rsi,MACD,Signal_Line,Upper_Band,Lower_Band
0,AAPL,2014-05-29 00:00:00-04:00,19.83412,20.119067,19.831593,20.071997,376474000,0.0,0.0,Consumer Electronics,...,Technology,2014-05-29,20.071997,20.071997,20.071997,,0.0,0.0,,
1,AAPL,2014-05-30 00:00:00-04:00,20.154133,20.349678,19.867291,19.996813,564020800,0.0,0.0,Consumer Electronics,...,Technology,2014-05-30,20.034405,20.034405,20.034405,,-0.013945,-0.002789,,
2,AAPL,2014-06-02 00:00:00-04:00,20.027145,20.054628,19.665116,19.859398,369350800,0.0,0.0,Consumer Electronics,...,Technology,2014-06-02,19.976069,19.976069,19.976069,,-0.048993,-0.01203,,
3,AAPL,2014-06-03 00:00:00-04:00,19.853394,20.178145,19.84676,20.140236,292709200,0.0,0.0,Consumer Electronics,...,Technology,2014-06-03,20.017111,20.017111,20.017111,,-0.020702,-0.013764,,
4,AAPL,2014-06-04 00:00:00-04:00,20.137082,20.467204,20.095066,20.37022,335482000,0.0,0.0,Consumer Electronics,...,Technology,2014-06-04,20.087733,20.087733,20.087733,,0.04214,-0.002583,,


In [8]:
# Target column: the next row's Close for the same ticker (rows ordered by Date).
# Guarded on the column's existence so the cell is safe to re-run: without the
# guard, every re-run would recompute the shift on the already trimmed frame
# and drop one more trailing row per ticker.
if 'tomorrow_close' not in fin_df.columns:
    fin_df['tomorrow_close'] = (
        fin_df.sort_values(['ticker', 'Date'])
        .groupby('ticker')['Close']
        .shift(-1)
    )
    # The last row of each ticker has no following close, so it cannot be a
    # training example; drop it.
    fin_df = fin_df.dropna(subset=['tomorrow_close'])


In [9]:

# Model input columns: daily price/volume fields (Close itself is not included),
# the rolling means of Close, and the technical indicators.
features = ['Open', 'High', 'Low', 'Volume', 'Dividends', 'Stock Splits',
            'rolling_close_60', 'rolling_close_30', 'rolling_close_10',
            'rsi', 'MACD', 'Signal_Line', 'Upper_Band', 'Lower_Band']

# X: feature matrix; y: next row's close per ticker (presumably the prediction target).
# NOTE(review): 'rsi', 'Upper_Band' and 'Lower_Band' are NaN for the first rows of
# each ticker (rolling windows without min_periods), so X still contains missing values.
X = fin_df[features]
y = fin_df['tomorrow_close']


In [10]:
# Inspect the first 20 rows: indicator columns (NaN while their windows fill) and the tomorrow_close target.
fin_df.head(20)

Unnamed: 0,ticker,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,industry,...,date,rolling_close_60,rolling_close_30,rolling_close_10,rsi,MACD,Signal_Line,Upper_Band,Lower_Band,tomorrow_close
0,AAPL,2014-05-29 00:00:00-04:00,19.83412,20.119067,19.831593,20.071997,376474000,0.0,0.0,Consumer Electronics,...,2014-05-29,20.071997,20.071997,20.071997,,0.0,0.0,,,19.996813
1,AAPL,2014-05-30 00:00:00-04:00,20.154133,20.349678,19.867291,19.996813,564020800,0.0,0.0,Consumer Electronics,...,2014-05-30,20.034405,20.034405,20.034405,,-0.013945,-0.002789,,,19.859398
2,AAPL,2014-06-02 00:00:00-04:00,20.027145,20.054628,19.665116,19.859398,369350800,0.0,0.0,Consumer Electronics,...,2014-06-02,19.976069,19.976069,19.976069,,-0.048993,-0.01203,,,20.140236
3,AAPL,2014-06-03 00:00:00-04:00,19.853394,20.178145,19.84676,20.140236,292709200,0.0,0.0,Consumer Electronics,...,2014-06-03,20.017111,20.017111,20.017111,,-0.020702,-0.013764,,,20.37022
4,AAPL,2014-06-04 00:00:00-04:00,20.137082,20.467204,20.095066,20.37022,335482000,0.0,0.0,Consumer Electronics,...,2014-06-04,20.087733,20.087733,20.087733,,0.04214,-0.002583,,,20.450136
5,AAPL,2014-06-05 00:00:00-04:00,20.413807,20.51395,20.300398,20.450136,303805600,0.0,0.0,Consumer Electronics,...,2014-06-05,20.148133,20.148133,20.148133,,0.100374,0.018008,,,20.393902
6,AAPL,2014-06-06 00:00:00-04:00,20.53069,20.573654,20.359154,20.393902,349938400,0.0,0.0,Consumer Electronics,...,2014-06-06,20.183243,20.183243,20.183243,,0.129182,0.040243,20.633744,19.732742,20.720236
7,AAPL,2014-06-09 00:00:00-04:00,20.499102,20.76004,20.289026,20.720236,301660000,0.0,7.0,Consumer Electronics,...,2014-06-09,20.250367,20.250367,20.250367,,0.207841,0.073763,20.864849,19.686849,20.841866
8,AAPL,2014-06-10 00:00:00-04:00,20.948011,21.018773,20.691494,20.841866,251108000,0.0,0.0,Consumer Electronics,...,2014-06-10,20.316089,20.316089,20.316089,,0.282237,0.115457,21.060337,19.732804,20.755619
9,AAPL,2014-06-11 00:00:00-04:00,20.815324,20.95464,20.669377,20.755619,182724000,0.0,0.0,Consumer Electronics,...,2014-06-11,20.360042,20.360042,20.360042,,0.313884,0.155143,21.032208,20.016996,20.408434
