In [25]:
import warnings
warnings.filterwarnings('ignore')
import requests
import pandas as pd
from ta.momentum import RSIIndicator, StochasticOscillator
from ta.trend import EMAIndicator, MACD, ADXIndicator
from ta.volatility import BollingerBands
from sklearn.model_selection import train_test_split
from ta.volume import OnBalanceVolumeIndicator
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf

In [17]:
import sys
import os

# Put the parent directory of the current working directory on sys.path
# (once) so the FinanceLib import below can be resolved from there.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local helper module; used in this notebook as `fl`
# (e.g. fl.get_stock_quote_from_db).
import FinanceLib as fl

In [18]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load autoreload and enable mode 2 so edits to imported modules are picked up
# without restarting the kernel. The recorded output shows the extension was
# already loaded when this cell ran.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
def fetch_and_process_data(stock_symbol, source_type, start_date, end_date):
    """Load daily quotes for one symbol and add the label and indicator columns.

    Parameters
    ----------
    stock_symbol : str
        Ticker to load, e.g. ``"AAPL"``.
    source_type : str
        ``'yf_api'`` to download through ``yfinance`` or ``'db'`` to read
        through ``fl.get_stock_quote_from_db``.
    start_date, end_date : str
        Date range, forwarded unchanged to the selected data source.

    Returns
    -------
    pandas.DataFrame
        The quote columns, a ``symbol`` column, the binary ``Target`` column
        and the indicator columns (MA/EMA 50 and 200, Volatility,
        Daily_Return, RSI, Stochastic, ADX, OBV, MACD, MACD_Signal,
        MACD_Diff, BB_High, BB_Low). Rows containing any NaN are dropped,
        which includes the warm-up rows of the 200-row rolling windows.

    Raises
    ------
    ValueError
        If ``source_type`` is neither ``'yf_api'`` nor ``'db'``.

    Notes
    -----
    ``Target`` is 1 when the next row's close is higher than the current
    close. The last row has no next close, so its ``Target`` is always 0 and
    is not a real label; callers should exclude it from training.
    """
    # Column names returned by the DB helper (as seen in this notebook's
    # recorded output) mapped onto the yfinance names used below.
    db_to_yf_columns = {
        'stock': 'symbol',
        'openvalue': 'Open',
        'highvalue': 'High',
        'lowvalue': 'Low',
        'closevalue': 'Close',
        'adjclose': 'Adj Close',
        'volume': 'Volume',
    }

    # Fetch historical stock price data
    if source_type == 'yf_api':
        stock_data = yf.download(stock_symbol, start=start_date, end=end_date)
        # Some yfinance versions return (field, ticker) MultiIndex columns even
        # for a single ticker; keep only the field level so that
        # stock_data['Close'] is a Series rather than a one-column frame.
        if isinstance(stock_data.columns, pd.MultiIndex):
            stock_data.columns = stock_data.columns.get_level_values(0)
        stock_data["symbol"] = stock_symbol
    elif source_type == 'db':
        stock_data = fl.get_stock_quote_from_db(stock_symbol, "america", start_date, end_date, is_dt_index=1, is_stock_index=0)
        stock_data = stock_data.rename(columns=db_to_yf_columns)
        # NOTE(review): the recorded DB sample shows an empty volume column; if
        # that holds for the whole range, the dropna below removes every row.
    else:
        raise ValueError(
            f"Unsupported source_type {source_type!r}; expected 'yf_api' or 'db'"
        )

    # Forward-fill gaps in the raw quotes
    stock_data = stock_data.ffill()

    # Create the Target column (the comparison against the NaN produced by
    # shift(-1) is False, hence the last row is labelled 0)
    stock_data['Target'] = (stock_data['Close'].shift(-1) > stock_data['Close']).astype(int)
    stock_data = stock_data.dropna().copy()

    close = stock_data['Close']
    high = stock_data['High']
    low = stock_data['Low']

    # Feature Engineering
    stock_data['MA_50'] = close.rolling(window=50).mean()
    stock_data['MA_200'] = close.rolling(window=200).mean()
    stock_data['EMA_50'] = EMAIndicator(close, window=50).ema_indicator()
    stock_data['EMA_200'] = EMAIndicator(close, window=200).ema_indicator()
    stock_data['Volatility'] = close.rolling(window=50).std()
    stock_data['Daily_Return'] = close.pct_change()
    stock_data['RSI'] = RSIIndicator(close).rsi()
    stock_data['Stochastic'] = StochasticOscillator(high, low, close).stoch()
    stock_data['ADX'] = ADXIndicator(high, low, close).adx()
    stock_data['OBV'] = OnBalanceVolumeIndicator(close, stock_data['Volume']).on_balance_volume()
    macd = MACD(close)
    stock_data['MACD'] = macd.macd()
    stock_data['MACD_Signal'] = macd.macd_signal()
    stock_data['MACD_Diff'] = macd.macd_diff()
    bb = BollingerBands(close)
    stock_data['BB_High'] = bb.bollinger_hband()
    stock_data['BB_Low'] = bb.bollinger_lband()

    # TODO: merge fundamental data here once a loader exists; a left merge
    # would need a forward-fill of the merged columns afterwards.

    # Drop rows with NaN values generated by the indicator calculations
    return stock_data.dropna()

In [22]:
def get_stock_data(stock_symbols, start_date, end_date, source_type='yf_api'):
    """Build the per-symbol frames and one stacked training frame.

    Parameters
    ----------
    stock_symbols : iterable of str
        Tickers to load.
    start_date, end_date : str
        Date range passed to ``fetch_and_process_data`` for every symbol.
    source_type : str, optional
        Data source forwarded to ``fetch_and_process_data``
        (``'yf_api'`` or ``'db'``). Defaults to ``'yf_api'``.

    Returns
    -------
    tuple
        ``(stocks, final_stock_df)`` where ``stocks`` maps each symbol to its
        full processed frame and ``final_stock_df`` is the row-wise
        concatenation of every frame without its last row, keeping the
        original index.

    Raises
    ------
    ValueError
        If ``stock_symbols`` is empty.
    """
    stocks = {}
    stocks_list = []

    for symbol in stock_symbols:
        # fetch_and_process_data takes (symbol, source_type, start, end);
        # source_type must be passed explicitly.
        stock_data = fetch_and_process_data(symbol, source_type, start_date, end_date)
        stocks[symbol] = stock_data
        # The last row is kept in `stocks` (it is the row to predict from) but
        # excluded from the stacked frame used for training.
        stocks_list.append(stock_data.iloc[:-1])

    if not stocks_list:
        raise ValueError("stock_symbols is empty; nothing to load")

    final_stock_df = pd.concat(stocks_list, ignore_index=False)
    return stocks, final_stock_df

In [23]:
def fit_stock_predict_model(stock_df, features, target_name='Target', scoring='precision', n_splits=5, random_state=None):
    """Tune and fit an XGBoost classifier with a randomized search.

    Parameters
    ----------
    stock_df : pandas.DataFrame
        Training rows containing the ``features`` columns and ``target_name``.
    features : list of str
        Names of the columns used as model inputs.
    target_name : str, optional
        Name of the label column. Defaults to ``'Target'``.
    scoring : str, optional
        Scoring passed to ``RandomizedSearchCV``. Defaults to ``'precision'``.
    n_splits : int, optional
        Number of ``TimeSeriesSplit`` folds. Defaults to 5.
    random_state : int or None, optional
        Seed passed to both the classifier and the randomized search; ``None``
        leaves them unseeded.

    Returns
    -------
    tuple
        ``(random_search, report)`` where ``random_search`` is the fitted
        ``RandomizedSearchCV`` and ``report`` is the text classification
        report computed on the same rows the search was fitted on (an
        in-sample report, not a held-out evaluation).
    """
    # Imported locally so the function keeps working even if a notebook cell
    # rebinds the global name `classification_report` to a report string.
    from sklearn.metrics import classification_report as build_report

    # TimeSeriesSplit cuts folds by row position, so rows must be in time
    # order. A frame stacked symbol by symbol is not, which would let a fold
    # train on later dates than it validates on. Assumes the index holds the
    # quote dates; the stable sort keeps the per-date row order.
    final_stock_df = stock_df.sort_index(kind='stable')
    X = final_stock_df[features]
    y = final_stock_df[target_name]

    tscv = TimeSeriesSplit(n_splits=n_splits)

    # XGBoost Classifier
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=random_state)

    # Hyperparameter tuning
    param_distributions = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5, 6],
        'colsample_bytree': [0.3, 0.7],
        'subsample': [0.7, 0.9],
        'reg_alpha': [0, 0.1, 1],
        'reg_lambda': [1, 1.5, 2]
    }

    random_search = RandomizedSearchCV(
        model,
        param_distributions,
        n_iter=20,
        scoring=scoring,
        cv=tscv,
        verbose=1,
        n_jobs=-1,
        random_state=random_state,
    )

    # Fit model using TimeSeriesSplit
    random_search.fit(X, y)

    # Predict on the training rows themselves for the returned report
    y_pred = random_search.predict(X)
    return random_search, build_report(y, y_pred)

In [5]:
def get_probability_for_stock(stock_symbols, stocks, model, feature_names=None):
    """Predict the probability of class 1 from each symbol's latest row.

    Parameters
    ----------
    stock_symbols : iterable of str
        Symbols to score; each must be a key of ``stocks``.
    stocks : dict
        Maps a symbol to its processed DataFrame.
    model : object
        Fitted estimator exposing ``predict_proba``.
    feature_names : list of str, optional
        Columns to feed to the model. When omitted, the notebook-level
        ``features`` list is used, which keeps existing three-argument calls
        working.

    Returns
    -------
    dict
        Maps each symbol to ``model.predict_proba(...)[0][1]`` for its last
        row. The same value is also printed per symbol.
    """
    # Resolve the feature list once; the global lookup only happens when the
    # caller did not pass the columns explicitly.
    feature_cols = features if feature_names is None else list(feature_names)

    predictions = {}
    for symbol in stock_symbols:
        stock_data = stocks[symbol]
        # Select the feature columns before taking the last row: this keeps a
        # one-row DataFrame with the column names and numeric dtypes, instead
        # of an object array taken from a mixed-dtype row.
        latest_data = stock_data[feature_cols].iloc[[-1]]
        probability = model.predict_proba(latest_data)[0][1]
        predictions[symbol] = probability
        print(f'Predicted probability of stock price for date {stock_data.iloc[-1].name} increase for {symbol}: {probability}')
    return predictions

In [None]:
# Symbols, model input columns and date range for this run
stock_symbols = ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'TSLA']
features = ['MA_50', 'MA_200', 'EMA_50', 'EMA_200', 'Volatility', 'Daily_Return', 'RSI', 'Stochastic', 'ADX', 'OBV', 'MACD', 'MACD_Signal', 'MACD_Diff', 'BB_High', 'BB_Low']
start_date = '2010-01-01'
end_date = '2024-07-23'

# Per-symbol frames (used for prediction) and the stacked training frame
stocks, stock_df = get_stock_data(stock_symbols, start_date, end_date)

# Fit the model. The report text is bound to `train_report`, not
# `classification_report`: reusing that name would overwrite the function
# imported from sklearn.metrics and break any later call to it.
random_search_xgboost_model, train_report = fit_stock_predict_model(stock_df, features)
print(train_report)

# Probability of class 1 for the latest row of every symbol
predictions = get_probability_for_stock(stock_symbols, stocks, random_search_xgboost_model)

In [26]:
# Download a short AAPL sample through yfinance to inspect the raw frame layout.
stock_symbol = "AAPL"
stock_data1 = yf.download(stock_symbol, start="2024-07-01", end="2024-07-10")  
# Tag the rows with the ticker, as fetch_and_process_data does for this source.
stock_data1["symbol"] = stock_symbol

[*********************100%%**********************]  1 of 1 completed


In [27]:
# Preview the yfinance sample; the recorded output lists the columns
# Open, High, Low, Close, Adj Close and Volume, indexed by Date.
stock_data1.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-07-01,212.089996,217.509995,211.919998,216.75,216.75,60402900
2024-07-02,216.149994,220.380005,215.100006,220.270004,220.270004,58046200
2024-07-03,220.0,221.550003,219.029999,221.550003,221.550003,37369800
2024-07-05,221.649994,226.449997,221.649994,226.339996,226.339996,60412400
2024-07-08,227.089996,227.850006,223.25,227.820007,227.820007,59085900


In [28]:
# Load the same AAPL date range through the FinanceLib DB helper for comparison
# with the yfinance sample; the recorded output shows the call echoing a
# `select * from finance.f_get_quote (...)` statement.
stock_symbol = "AAPL"
stock_data2 = fl.get_stock_quote_from_db(stock_symbol, "america", "2024-07-01", "2024-07-10", is_dt_index=1, is_stock_index=0)

select * from finance.f_get_quote ('2024-07-01', '2024-07-10', 'AAPL', 'america')


In [29]:
# Preview the DB sample; the recorded output lists the columns stock,
# openvalue, highvalue, lowvalue, closevalue, adjclose and volume (indexed by
# dt), with the volume column empty, so the names differ from the yfinance frame.
stock_data2.head()

Unnamed: 0_level_0,stock,openvalue,highvalue,lowvalue,closevalue,adjclose,volume
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-07-01,AAPL,212.089996,217.509995,211.919998,216.75,216.75,
2024-07-02,AAPL,216.149994,220.380005,215.100006,220.270004,220.270004,
2024-07-03,AAPL,220.0,221.550003,219.029999,221.550003,221.550003,
2024-07-05,AAPL,221.649994,226.449997,221.649994,226.339996,226.339996,
2024-07-08,AAPL,227.089996,227.850006,223.25,227.820007,227.820007,
