In [51]:
!pip install TA_Lib-0.4.24-cp38-cp38-win_amd64.whl
# https://blog.quantinsti.com/install-ta-lib-python/
# see installation guide for details

Processing c:\users\kenny\onedrive - hkust connect\documents\projects\algo trade\algorithmic-trading-python\starter_files\ta_lib-0.4.24-cp38-cp38-win_amd64.whl
Installing collected packages: TA-Lib
Successfully installed TA-Lib-0.4.24



[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
# Feature functions (technical indicators)
import numpy as np

def calculate_sma(values, period):
    """
    Calculates the Simple Moving Average (SMA) for a given list of values and period.

    The first ``period - 1`` positions have no complete window and are filled
    with ``np.nan``, so the result lines up with ``values`` and keeps a float
    dtype that supports vectorised arithmetic.

    Args:
        values (list, np.ndarray, pd.Series): One-dimensional numeric values.
        period (int): Number of consecutive values averaged per window (>= 1).

    Returns:
        np.ndarray: Float array with the same length as ``values``.

    Raises:
        ValueError: If ``period`` is smaller than 1 or ``values`` holds fewer
            than ``period`` items.
    """
    if period < 1:
        raise ValueError("Period must be a positive integer.")
    if len(values) < period:
        raise ValueError("Number of values is less than the specified period.")

    prices = np.asarray(values, dtype=float)
    # 'valid' keeps only the positions where the averaging kernel fully overlaps the data.
    window_means = np.convolve(prices, np.ones(period) / period, mode='valid')
    # Pad with NaN (not None): None would turn the whole result into an object array.
    return np.concatenate((np.full(period - 1, np.nan), window_means))

import numpy as np

def calculate_ema(values, period):
    """
    Calculates the Exponential Moving Average (EMA) for a given list of values and period.

    The smoothing factor is ``2 / (period + 1)``. The first output slot is
    always ``None``. The average is seeded with the first non-``None`` input
    found at position 1 or later, and every following value is blended in with
    the smoothing factor. Leading ``None`` inputs therefore yield ``None``
    outputs.

    Args:
        values (list, np.ndarray, pd.Series): Sequence of values, read by position.
        period (int): Period used for the smoothing factor and the length check.

    Returns:
        np.ndarray: Array with one entry per input value (object dtype whenever
            it contains ``None`` placeholders).

    Raises:
        ValueError: If ``values`` holds fewer than ``period`` items.
    """
    if len(values) < period:
        raise ValueError("Number of values is less than the specified period.")

    alpha = 2 / (period + 1)
    # Materialise the input positionally. Integer subscripts on a pandas Series
    # are label lookups, so ``values[i]`` fails (or picks the wrong row) when
    # the index is not 0..n-1.
    samples = list(values)

    ema_values = [None]
    for sample in samples[1:]:
        previous = ema_values[-1]
        if previous is None:
            # No running average yet: seed it with the current sample.
            ema = sample
        else:
            ema = alpha * sample + (1 - alpha) * previous
        ema_values.append(ema)
    return np.array(ema_values)

import numpy as np
import pandas as pd
import math

def calculate_macd(values, short_period=12, long_period=26, signal_period=9):
    """
    Calculates the Moving Average Convergence Divergence (MACD) for a given list of values.

    The MACD line is the short EMA minus the long EMA, the signal line is the
    EMA of the MACD line, and the histogram is the MACD line minus the signal
    line. Any position where one of the two operands is ``None`` stays ``None``.

    Args:
        values (list, np.ndarray): List or numpy array of values.
        short_period (int): Period of the short EMA. Default is 12.
        long_period (int): Period of the long EMA. Default is 26.
        signal_period (int): Period of the signal-line EMA. Default is 9.

    Returns:
        pd.DataFrame: Columns 'macd', 'signal' and 'histogram', one row per input value.

    Raises:
        ValueError: If ``values`` holds fewer than ``long_period`` items.
    """
    if len(values) < long_period:
        raise ValueError("Number of values is less than the specified long period.")

    def _gap(left, right):
        # Element-wise left - right; None wherever either side is missing.
        return [
            None if (a is None or b is None) else a - b
            for a, b in zip(left, right)
        ]

    fast_ema = calculate_ema(values, short_period)
    slow_ema = calculate_ema(values, long_period)

    macd_line = _gap(fast_ema, slow_ema)
    signal_line = calculate_ema(macd_line, signal_period)
    histogram = _gap(macd_line, signal_line)

    return pd.DataFrame({'macd': macd_line, 'signal': signal_line, 'histogram': histogram})


def calculate_atr(data, period=14):
    """
    Calculates the Average True Range (ATR) for a given DataFrame of OHLCV data.

    The true range of a row is the largest of: high - low, |high - previous
    close| and |low - previous close|, where "previous" means the row directly
    above. The ATR is the plain rolling mean of the true range.

    Args:
        data (pd.DataFrame): Frame with 'high', 'low' and 'close' columns.
        period (int): Rolling window length. Default is 14.

    Returns:
        pd.Series: ATR values aligned with ``data``; NaN until a full window of
            true ranges is available.
    """
    prior_close = data['close'].shift(1)

    bar_range = data['high'] - data['low']
    gap_from_high = np.abs(data['high'] - prior_close)
    gap_from_low = np.abs(data['low'] - prior_close)

    # np.maximum propagates NaN, so the first row (no prior close) has no true range.
    true_range = np.maximum(np.maximum(bar_range, gap_from_high), gap_from_low)
    return true_range.rolling(window=period).mean()


def calculate_rsi(values, period=14):
    """
    Calculates the Relative Strength Index (RSI) for a given list of values.

    Gains and losses are the positive and negative parts of the one-step
    differences. Their simple moving averages over ``period`` differences give
    ``RS = avg_gain / avg_loss`` and ``RSI = 100 - 100 / (1 + RS)``.
    A window without losses yields 100. A window with neither gains nor losses
    yields NaN.

    Args:
        values (list, np.ndarray, pd.Series): One-dimensional numeric values.
        period (int): Number of differences averaged per window. Default is 14.

    Returns:
        pd.Series: RSI values on a default integer index, same length as
            ``values``; the first ``period`` entries are NaN.

    Raises:
        ValueError: If ``period`` is smaller than 1 or ``values`` does not hold
            at least ``period + 1`` items.
    """
    if period < 1:
        raise ValueError("Period must be a positive integer.")

    prices = np.asarray(values, dtype=float)
    # ``period`` differences need ``period + 1`` prices. With fewer, np.convolve
    # in 'valid' mode swaps its operands and returns meaningless output.
    if prices.size <= period:
        raise ValueError("Number of values must exceed the specified period.")

    deltas = np.diff(prices)
    gains = np.where(deltas > 0, deltas, 0.0)
    losses = np.where(deltas < 0, -deltas, 0.0)

    kernel = np.ones(period) / period
    avg_gain = np.convolve(gains, kernel, mode='valid')
    avg_loss = np.convolve(losses, kernel, mode='valid')

    # Division by a zero average loss is expected; silence the warnings and
    # let inf/NaN flow through the formula.
    with np.errstate(divide='ignore', invalid='ignore'):
        rs = avg_gain / avg_loss
        rsi = 100 - (100 / (1 + rs))

    # Pad the front so the result has the same length as the input.
    rsi = np.pad(rsi, (prices.size - rsi.size, 0), mode='constant', constant_values=np.nan)
    return pd.Series(rsi)


def calculate_adi(data):
    """
    Calculates the Accumulation/Distribution Index (ADI) for a given DataFrame of OHLCV data.

    Each row contributes ``CLV * volume`` where
    ``CLV = ((close - low) - (high - close)) / (high - low)``; the ADI is the
    running sum of those contributions. A row whose high equals its low has no
    defined CLV and is treated as a neutral contribution of 0.

    Args:
        data (pd.DataFrame): Frame with 'high', 'low', 'close' and 'volume' columns.

    Returns:
        pd.Series: Cumulative ADI values aligned with ``data``.
    """
    price_range = data['high'] - data['low']
    clv = ((data['close'] - data['low']) - (data['high'] - data['close'])) / price_range
    # Zero-range bars divide by zero; replace only those rows so genuinely
    # missing prices still surface as NaN.
    clv = clv.where(price_range != 0, 0.0)
    adi = (clv * data['volume']).cumsum()
    return adi

def calculate_rc(values, period=1):
    """
    Calculates the Rate of Change (RC) for a given list of values.

    Each entry is ``(value - value_period_rows_earlier) / value_period_rows_earlier``.

    Args:
        values (list, np.ndarray, pd.Series): One-dimensional numeric values.
        period (int): Number of rows to look back. Default is 1.

    Returns:
        np.ndarray: RC values, same length as ``values``; the first ``period``
            entries are NaN.

    Raises:
        ValueError: If ``values`` holds fewer than ``period`` items.
    """
    if len(values) < period:
        raise ValueError("Number of values is less than the specified period.")

    # Lists and arrays have no .shift(); wrap them so every documented input type works.
    series = values if isinstance(values, pd.Series) else pd.Series(values)
    earlier = series.shift(period)
    rc = (series - earlier) / earlier
    return rc.values

def calculate_bollinger_bands(values, window=20, k=2):
    """
    Calculates the Bollinger Bands for a given list of values.

    The middle band is the rolling mean over ``window`` values. The upper and
    lower bands lie ``k`` rolling standard deviations above and below it.

    Args:
        values (list, np.ndarray, pd.Series): One-dimensional numeric values.
        window (int): Window size for the rolling mean and standard deviation. Default is 20.
        k (float): Number of standard deviations between the middle band and each outer band. Default is 2.

    Returns:
        pd.DataFrame: Columns 'upper', 'middle' and 'lower'. A Series input
            keeps its index; other inputs get a default integer index.

    Raises:
        ValueError: If ``values`` holds fewer than ``window`` items.
    """
    if len(values) < window:
        raise ValueError("Number of values is less than the specified window size.")

    # Lists and arrays have no .rolling(); wrap them so every documented input type works.
    series = values if isinstance(values, pd.Series) else pd.Series(values)
    rolling_window = series.rolling(window=window)
    rolling_mean = rolling_window.mean()
    rolling_std = rolling_window.std()

    upper_band = rolling_mean + (k * rolling_std)
    lower_band = rolling_mean - (k * rolling_std)

    return pd.DataFrame({'upper': upper_band, 'middle': rolling_mean, 'lower': lower_band})




In [21]:
from alpha_vantage.timeseries import TimeSeries
from matplotlib import pyplot as plt


import os

# Read the Alpha Vantage API key from the environment so the secret never lives
# in the notebook. NOTE: a key that was previously committed in this cell should
# be treated as compromised and rotated.
ts = TimeSeries(key=os.environ['ALPHAVANTAGE_API_KEY'], output_format='pandas')
data, meta_data = ts.get_intraday(symbol='MSFT', interval='15min', outputsize='full')
# Alternative call that was tried for extended history:
# data, meta_data = ts.get_intraday_extended(symbol='MSFT', interval='15min', slice='year1month1')

# Replace the API's column labels (e.g. '2. high', '4. close') with plain OHLCV
# names; the assignment is positional and expects exactly five columns.
data.columns = ['open', 'high', 'low', 'close', 'volume']

In [22]:
# Name the row index 'datetime'
data = data.rename_axis(index='datetime')

In [23]:
# Preview the first rows of the renamed OHLCV frame.
data.head()

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-04-28 20:00:00,307.2,307.5,307.2,307.5,7816.0
2023-04-28 19:45:00,307.15,307.35,307.15,307.26,1445.0
2023-04-28 19:30:00,307.2,307.2,307.1,307.12,824.0
2023-04-28 19:15:00,307.1,307.1899,307.1,307.1,814.0
2023-04-28 19:00:00,307.35,307.35,307.2,307.2,3007.0


In [24]:
import pandas as pd


# df2 = pd.read_json(data)
# print(df2)

def estimate_market_metrics(data):
    """
    Adds technical-indicator columns to an OHLCV DataFrame.

    The frame is modified in place and also returned. New columns: sma_10,
    sma_30, ema_10, ema_30, ema_95, macd, macd_signal, macd_histogram, atr,
    adi, rc, rsi, bollinger_upper, bollinger_middle and bollinger_lower.

    Args:
        data (pd.DataFrame): Frame with 'high', 'low', 'close' and 'volume' columns.

    Returns:
        pd.DataFrame: The same ``data`` object with the indicator columns added.
    """
    close = data['close']

    # Moving averages of the close
    data['sma_10'] = calculate_sma(close, period=10)
    data['sma_30'] = calculate_sma(close, period=30)
    data['ema_10'] = calculate_ema(close, period=10)
    data['ema_30'] = calculate_ema(close, period=30)
    data['ema_95'] = calculate_ema(close, period=95)

    # calculate_macd returns a frame on a default integer index; go through
    # plain lists so the values are assigned by position, not by index label.
    macd = calculate_macd(close)
    data['macd'] = macd['macd'].tolist()
    data['macd_signal'] = macd['signal'].tolist()
    data['macd_histogram'] = macd['histogram'].tolist()

    data['atr'] = calculate_atr(data)
    data['adi'] = calculate_adi(data)
    data['rc'] = calculate_rc(close)
    # Same positional assignment as for MACD: the RSI series has its own integer index.
    data['rsi'] = calculate_rsi(close).tolist()

    bb = calculate_bollinger_bands(close)
    data['bollinger_upper'] = bb['upper']
    data['bollinger_middle'] = bb['middle']
    data['bollinger_lower'] = bb['lower']

    return data

In [25]:
# Compute the indicator columns and show the last ten rows
metrics = estimate_market_metrics(data)
metrics.tail(n=10)

Unnamed: 0_level_0,open,high,low,close,volume,sma_10,sma_30,ema_10,ema_30,ema_95,macd,macd_signal,macd_histogram,atr,adi,rc,rsi,bollinger_upper,bollinger_middle,bollinger_lower
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2023-03-20 06:30:00,279.99,280.58,279.47,279.65,9172.0,279.121,274.69091,278.773924,275.940544,273.832385,2.06834,1.844621,0.223718,1.822771,60611810.0,-0.00125,86.090058,283.572967,276.36068,269.148393
2023-03-20 06:15:00,280.52,280.7,279.91,280.25,26393.0,279.263,274.984243,279.042301,276.218574,273.966085,2.062066,1.88811,0.173956,1.474557,60608140.0,0.002146,83.421892,283.860122,276.78275,269.705378
2023-03-20 06:00:00,280.48,280.9,280.12,280.5,7502.0,279.428,275.298577,279.307338,276.494795,274.102208,2.053595,1.921207,0.132387,1.247414,60607950.0,0.000892,77.226277,283.898365,277.27975,270.661135
2023-03-20 05:45:00,280.38,280.98,280.38,280.59,10013.0,279.669,275.618243,279.540549,276.759002,274.237371,2.030734,1.943113,0.087622,1.1417,60604940.0,0.000321,72.67951,283.675352,277.79725,271.919148
2023-03-20 05:30:00,280.5,280.5,279.97,279.97,6947.0,279.796,275.92391,279.618631,276.966163,274.3568,1.940223,1.942535,-0.002312,1.085986,60598000.0,-0.00221,59.85267,283.380784,278.21675,273.052716
2023-03-20 05:15:00,279.97,281.19,279.97,280.42,32536.0,280.008,276.237243,279.764334,277.188991,274.483117,1.883096,1.930647,-0.047551,1.146429,60589460.0,0.001607,63.683305,283.131681,278.60625,274.080819
2023-03-20 05:00:00,279.27,279.8,279.27,279.8,9010.0,280.088,276.525577,279.770819,277.357443,274.593885,1.767419,1.898001,-0.130582,1.164286,60598470.0,-0.002211,57.410296,282.116331,279.0185,275.920669
2023-03-20 04:45:00,277.52,279.43,277.51,279.11,7723.0,280.036,276.812243,279.65067,277.470511,274.687971,1.601606,1.838722,-0.237116,1.09,60603620.0,-0.002466,57.231726,281.258953,279.276,277.293047
2023-03-20 04:30:00,278.6,278.6,276.99,277.94,7922.0,279.823,277.042077,279.339639,277.500801,274.755722,1.36011,1.743,-0.38289,1.2,60605050.0,-0.004192,44.632768,281.058259,279.3345,277.610741
2023-03-20 04:15:00,275.83,279.99,275.83,278.8,34412.0,279.703,277.269787,279.241523,277.58462,274.839977,1.224007,1.639201,-0.415194,1.427143,60619770.0,0.003094,53.31565,281.00233,279.3745,277.74667


In [139]:
import os

# Binance API credentials, read from the environment (empty string when unset)
# so that real secrets are never stored in the notebook.
API_KEY = os.environ.get("BINANCE_API_KEY", "")
SECRET_KEY = os.environ.get("BINANCE_SECRET_KEY", "")


In [42]:
import requests
import json
import pandas as pd
import datetime as dt

In [190]:
import requests
import time

def fetch_binance_historical_data(symbol, interval, start_time, end_time):
    """
    Fetches historical k-line data from Binance API for a given symbol, interval, start time, and end time.

    The range is fetched page by page. Each request asks for up to 500 rows
    starting at the current cursor, and the cursor then moves to one
    millisecond past the open time of the last row received. Pagination
    therefore does not depend on parsing ``interval`` and neither skips nor
    repeats rows at page boundaries.

    :param symbol: str, trading pair symbol (e.g., 'BTCUSDT' for Bitcoin/USDT)
    :param interval: str, timeframe for the historical data (e.g., '1d' for daily)
    :param start_time: int, start timestamp in milliseconds
    :param end_time: int, end timestamp in milliseconds
    :return: list of k-line rows as returned by the API (each row is a list
        whose first element is the open time in milliseconds); empty when
        ``start_time >= end_time``
    :raises requests.HTTPError: if the API answers with an error status
    :raises ValueError: if the response body is not a list of rows
    """
    endpoint = 'https://api.binance.com/api/v3/klines'
    page_size = 500          # rows requested per call
    request_timeout = 30     # seconds before a stalled request is abandoned
    pause_seconds = 1        # delay between pages to stay clear of rate limits

    historical_data = []
    cursor = start_time

    while cursor < end_time:
        params = {
            'symbol': symbol,
            'interval': interval,
            'startTime': cursor,
            'endTime': end_time,
            'limit': page_size,
        }

        response = requests.get(endpoint, params=params, timeout=request_timeout)
        response.raise_for_status()
        batch = response.json()

        # An error payload is a dict; extending the result with it would
        # silently append its keys as if they were rows.
        if not isinstance(batch, list):
            raise ValueError(f"Unexpected response from Binance: {batch!r}")
        if not batch:
            break

        historical_data.extend(batch)

        # A short page means the requested range is exhausted.
        if len(batch) < page_size:
            break

        next_cursor = int(batch[-1][0]) + 1
        if next_cursor <= cursor:
            # Defensive: never loop without making progress.
            break
        cursor = next_cursor

        time.sleep(pause_seconds)

    return historical_data


In [218]:
# Hourly AXSUSDT k-lines from 2023-03-01 up to 2023-05-06.
# (An earlier run of this cell stopped at 2023-04-13.)
# The naive datetimes are converted with .timestamp(), i.e. interpreted in the
# machine's local time zone.
hist_data = fetch_binance_historical_data(
    'AXSUSDT',
    '1h',
    int(dt.datetime(2023, 3, 1).timestamp() * 1000),
    int(dt.datetime(2023, 5, 6).timestamp() * 1000),
)

In [219]:
# Number of k-line rows returned by the fetch above.
len(hist_data)

1584

In [220]:
# Turn the raw k-line rows into a frame and label its twelve columns
data = pd.DataFrame(hist_data)
data.columns = [
    'datetime', 'open', 'high', 'low', 'close', 'volume',
    'close_time', 'qav', 'num_trades',
    'taker_base_vol', 'taker_quote_vol', 'ignore',
]
# Index by open time: millisecond timestamps converted to local-time datetimes
data.index = [dt.datetime.fromtimestamp(millis / 1000.0) for millis in data['datetime']]
# Cast every column to float
data = data.astype(float)
data.tail()

Unnamed: 0,datetime,open,high,low,close,volume,close_time,qav,num_trades,taker_base_vol,taker_quote_vol,ignore
2023-05-05 20:00:00,1683288000000.0,7.62,7.64,7.47,7.55,44559.89,1683292000000.0,336678.8271,1578.0,14863.18,112301.7937,0.0
2023-05-05 21:00:00,1683292000000.0,7.55,7.62,7.53,7.61,16389.72,1683295000000.0,124179.4459,484.0,9290.71,70441.7859,0.0
2023-05-05 22:00:00,1683295000000.0,7.62,7.69,7.61,7.68,36715.2,1683299000000.0,280721.0257,975.0,20826.63,159304.9533,0.0
2023-05-05 23:00:00,1683299000000.0,7.68,7.73,7.67,7.71,30138.36,1683302000000.0,232078.7388,757.0,20587.24,158573.0048,0.0
2023-05-06 00:00:00,1683302000000.0,7.72,7.81,7.71,7.77,70995.83,1683306000000.0,551780.6399,1755.0,44215.81,343750.4254,0.0


In [222]:
import pandas as pd


# df2 = pd.read_json(data)
# print(df2)

def extract_features(data):
    """
    Adds technical-indicator feature columns to an OHLCV DataFrame.

    The frame is modified in place and also returned. New columns: sma_10,
    sma_30, ema_10, ema_30, ema_95, macd, macd_signal, macd_histogram, atr,
    adi, rc, rsi, bollinger_upper, bollinger_middle and bollinger_lower.

    Args:
        data (pd.DataFrame): Frame with 'high', 'low', 'close' and 'volume' columns.

    Returns:
        pd.DataFrame: The same ``data`` object with the feature columns added.
    """
    close = data['close']

    # Moving averages of the close
    data['sma_10'] = calculate_sma(close, period=10)
    data['sma_30'] = calculate_sma(close, period=30)
    data['ema_10'] = calculate_ema(close, period=10)
    data['ema_30'] = calculate_ema(close, period=30)
    data['ema_95'] = calculate_ema(close, period=95)

    # calculate_macd returns a frame on a default integer index; go through
    # plain lists so the values are assigned by position, not by index label.
    macd = calculate_macd(close)
    data['macd'] = macd['macd'].tolist()
    data['macd_signal'] = macd['signal'].tolist()
    data['macd_histogram'] = macd['histogram'].tolist()

    data['atr'] = calculate_atr(data)
    data['adi'] = calculate_adi(data)
    data['rc'] = calculate_rc(close)
    # Same positional assignment as for MACD: the RSI series has its own integer index.
    data['rsi'] = calculate_rsi(close).tolist()

    bb = calculate_bollinger_bands(close)
    data['bollinger_upper'] = bb['upper']
    data['bollinger_middle'] = bb['middle']
    data['bollinger_lower'] = bb['lower']

    return data

In [223]:
# Add the feature columns to the Binance frame and show the last five rows
new_data = extract_features(data)
new_data.tail(n=5)

Unnamed: 0,datetime,open,high,low,close,volume,close_time,qav,num_trades,taker_base_vol,...,macd,macd_signal,macd_histogram,atr,adi,rc,rsi,bollinger_upper,bollinger_middle,bollinger_lower
2023-05-05 20:00:00,1683288000000.0,7.62,7.64,7.47,7.55,44559.89,1683292000000.0,336678.8271,1578.0,14863.18,...,0.000187,0.003292,-0.003105,0.054286,1277341.0,-0.009186,50.0,7.660561,7.5915,7.522439
2023-05-05 21:00:00,1683292000000.0,7.55,7.62,7.53,7.61,16389.72,1683295000000.0,124179.4459,484.0,9290.71,...,0.001141,0.002862,-0.001721,0.058571,1290088.0,0.007947,56.666667,7.66187,7.594,7.52613
2023-05-05 22:00:00,1683295000000.0,7.62,7.69,7.61,7.68,36715.2,1683299000000.0,280721.0257,975.0,20826.63,...,0.007459,0.003781,0.003678,0.062143,1317625.0,0.009198,64.864865,7.675185,7.6005,7.525815
2023-05-05 23:00:00,1683299000000.0,7.68,7.73,7.67,7.71,30138.36,1683302000000.0,232078.7388,757.0,20587.24,...,0.014717,0.005969,0.008749,0.059286,1327671.0,0.003906,60.606061,7.693915,7.6085,7.523085
2023-05-06 00:00:00,1683302000000.0,7.72,7.81,7.71,7.77,70995.83,1683306000000.0,551780.6399,1755.0,44215.81,...,0.025023,0.009779,0.015243,0.060714,1341870.0,0.007782,65.789474,7.727754,7.619,7.510246


In [None]:
# Feature Engineering
# https://www.relataly.com/feature-engineering-for-multivariate-time-series-models-with-python/1813/
# Indexing Batches
# Work on a copy sorted by the 'Date' column
train_df = df.sort_values(by=['Date']).copy()

# Split the index timestamps into zero-padded day / month / year string columns
d = pd.to_datetime(train_df.index)
train_df = train_df.assign(
    Day=d.strftime("%d"),
    Month=d.strftime("%m"),
    Year=d.strftime("%Y"),
)
train_df


def createFeatures(df):
    """
    Derives price-based feature columns and fills every missing value.

    Expects 'Adj Close', 'Close', 'Low' and 'High' columns. Adds, in this order:
    the one-row difference of 'Adj Close'; simple moving averages of 'Close'
    (200/100/50/26/20/12 rows) and differences between them and against
    'Close'; rolling minima of 'Low' and maxima of 'High' (200 and 14 rows);
    the 20-row standard deviation of 'Close'; exponential moving averages
    (spans 12/20/26/100/200); the 'Close' of the following one and two rows;
    Bollinger bands at two standard deviations around MA20; 'K-ratio' (position
    of 'Close' inside the 14-row low/high range, in percent) and its 3-row mean
    stored as 'RSI'; and 'MACD' as EMA12 - EMA26. Finally every NaN in the
    frame is replaced by the 'Close' found at the largest index label.

    Args:
        df: DataFrame (or anything ``pd.DataFrame`` accepts) with the columns above.

    Returns:
        pd.DataFrame: The frame with the feature columns added and NaNs filled.
    """
    df = pd.DataFrame(df)
    close = df['Close']

    df['Close_Diff'] = df['Adj Close'].diff()

    # Simple moving averages of the close
    for window in (200, 100, 50, 26, 20, 12):
        df[f'MA{window}'] = close.rolling(window=window).mean()

    # Gaps between moving averages, then between moving averages and the close
    for slow, fast in (('MA200', 'MA50'), ('MA200', 'MA100')):
        df[f'DIFF-{slow}-{fast}'] = df[slow] - df[fast]
    for average in ('MA200', 'MA100', 'MA50'):
        df[f'DIFF-{average}-CLOSE'] = df[average] - close

    # Rolling extremes of low/high and the 20-row standard deviation of the close
    lows, highs = df['Low'], df['High']
    df['MA200_low'] = lows.rolling(window=200).min()
    df['MA14_low'] = lows.rolling(window=14).min()
    df['MA200_high'] = highs.rolling(window=200).max()
    df['MA14_high'] = highs.rolling(window=14).max()
    df['MA20dSTD'] = close.rolling(window=20).std()

    # Exponential moving averages of the close
    for span in (12, 20, 26, 100, 200):
        df[f'EMA{span}'] = close.ewm(span=span, adjust=False).mean()

    # Close of the following row(s): a negative shift pulls later rows upward
    for offset in (1, 2):
        df[f'close_shift-{offset}'] = close.shift(-offset)

    # Bollinger bands: MA20 plus/minus two standard deviations
    band_half_width = df['MA20dSTD'] * 2
    df['Bollinger_Upper'] = df['MA20'] + band_half_width
    df['Bollinger_Lower'] = df['MA20'] - band_half_width

    # Position of the close inside the 14-row range, and its 3-row mean (stored as 'RSI')
    range_low, range_high = df['MA14_low'], df['MA14_high']
    df['K-ratio'] = 100 * ((close - range_low) / (range_high - range_low))
    df['RSI'] = df['K-ratio'].rolling(window=3).mean()

    # MACD line
    df['MACD'] = df['EMA12'] - df['EMA26']

    # Fill every remaining NaN with the close at the largest index label
    nareplace = df.at[df.index.max(), 'Close']
    df.fillna((nareplace), inplace=True)

    return df


# https://kernc.github.io/backtesting.py/doc/examples/Trading%20with%20Machine%20Learning.html

# TODO: build extract_features from the market data and also derive the 48-step-ahead price target used to train an ML model

def BBANDS(data, n_lookback, n_std):
    """Bollinger bands around the typical price.

    The typical price is the mean of High, Low and Close. The bands lie
    ``n_std`` rolling standard deviations above and below its rolling mean
    over ``n_lookback`` rows.

    Returns:
        tuple: ``(upper, lower)`` band series.
    """
    typical_price = (data.High + data.Low + data.Close) / 3
    window = typical_price.rolling(n_lookback)
    centre = window.mean()
    spread = n_std * window.std()
    return centre + spread, centre - spread


# Build the model's feature columns (all prefixed 'X_') on `data`.
# NOTE(review): this section reads capitalised columns (Close/High/Low), while the
# frames built earlier in the notebook use lowercase names — confirm which
# frame `data` is expected to be here.
# NOTE(review): `SMA` is not defined or imported anywhere in the visible notebook;
# presumably a simple-moving-average helper from the linked tutorial — confirm.
close = data.Close.values
sma10 = SMA(data.Close, 10)
sma20 = SMA(data.Close, 20)
sma50 = SMA(data.Close, 50)
sma100 = SMA(data.Close, 100)
upper, lower = BBANDS(data, 20, 2)

# Design matrix / independent features:

# Price-derived features: distance of the close from each moving average,
# expressed as a fraction of the close
data['X_SMA10'] = (close - sma10) / close
data['X_SMA20'] = (close - sma20) / close
data['X_SMA50'] = (close - sma50) / close
data['X_SMA100'] = (close - sma100) / close

# Gaps between consecutive moving averages, as a fraction of the close
data['X_DELTA_SMA10'] = (sma10 - sma20) / close
data['X_DELTA_SMA20'] = (sma20 - sma50) / close
data['X_DELTA_SMA50'] = (sma50 - sma100) / close

# Indicator features
# Two-row percentage change of the close
data['X_MOM'] = data.Close.pct_change(periods=2)
# Band distances and band width, as a fraction of the close
data['X_BB_upper'] = (upper - close) / close
data['X_BB_lower'] = (lower - close) / close
data['X_BB_width'] = (upper - lower) / close
# True for rows whose index lies OUTSIDE 2017-09-27..2017-12-14 (note the `~`).
# NOTE(review): the hard-coded 2017 window looks carried over from the tutorial —
# confirm it is meaningful for the data used in this notebook.
data['X_Sentiment'] = ~data.index.to_series().between('2017-09-27', '2017-12-14')

# Some datetime features for good measure (requires a datetime-like index)
data['X_day'] = data.index.dayofweek
data['X_hour'] = data.index.hour

# Drop every row that contains a NaN and cast all columns to float
data = data.dropna().astype(float)

import numpy as np


def get_X(data):
    """Return model design matrix X.

    Selects every column whose label contains 'X' and returns the values as a
    NumPy array.
    """
    feature_columns = data.filter(like='X')
    return feature_columns.values


def get_y(data):
    """Return dependent variable y.

    y is the direction of the Close return 48 rows ahead: 1 for a rise, -1 for
    a fall, and 0 when the return lies within +/-0.4%. Rows without a value 48
    rows ahead stay NaN.
    """
    forward_return = data.Close.pct_change(48).shift(-48)  # return realised 48 rows later
    negligible = forward_return.between(-.004, .004)        # moves of at most 0.4% are ignored
    # Zero out the negligible moves, then keep only the sign of what remains
    return np.sign(forward_return.where(~negligible, 0))


def get_clean_Xy(df):
    """Return (X, y) cleaned of NaN values.

    Rows whose target is NaN are removed from both the design matrix and the
    target vector.
    """
    features = get_X(df)
    target = get_y(df).values
    has_target = ~np.isnan(target)
    return features[has_target], target[has_target]

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Split the cleaned samples 50/50 into train and test sets (fixed seed)
X, y = get_clean_Xy(data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Classify each sample from its 7 nearest training examples
clf = KNeighborsClassifier(n_neighbors=7)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Overlay true and predicted labels, then report the share of exact matches
_ = pd.DataFrame({'y_true': y_test, 'y_pred': y_pred}).plot(figsize=(15, 2), alpha=0.7)
print('Classification accuracy: ', (y_test == y_pred).mean())

In [None]:
# Plot line charts
df_plot = df.copy()

ncols = 2
n_series = df_plot.shape[1]
# Ceiling division: enough rows for every column, including an odd one out.
# round() would drop the last column or request a non-existent one.
nrows = max(1, -(-n_series // ncols))

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex=True, figsize=(14, 7), squeeze=False)
for i, ax in enumerate(axes.flat):
    if i >= n_series:
        # Spare cell of the grid (odd number of columns): hide it
        ax.set_visible(False)
        continue
    sns.lineplot(data=df_plot.iloc[:, i], ax=ax)
    ax.tick_params(axis="x", rotation=30, labelsize=10, length=0)
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
fig.tight_layout()
plt.show()

# TODO: inspect the close series and decide whether noise filtering (e.g. a Kalman or Gaussian filter) is needed

In [None]:
import pandas as pd
import numpy as np
import random

# Sketch of tabular Q-learning for buy / sell / hold decisions.
# NOTE(review): as written this cell does not look runnable — see the notes below.

# Load historical stock data with technical indicators as features
df = pd.read_csv('stock_data.csv')

# Define the state space
window_size = 10  # Number of days to consider for each state
n_features = 10  # Number of technical indicators as features
state_size = window_size * n_features  # Total number of values in each state
n_actions = 3  # Number of possible actions (buy, sell, or hold)

# Define the Q-learning algorithm
n_episodes = 1000
alpha = 0.1  # learning rate used in the Q update
gamma = 0.99  # discount factor applied to the best next-state value
epsilon = 0.1  # probability of picking a random action
# One row per state-vector position, one column per action.
Q = np.zeros((state_size, n_actions))

# NOTE(review): no random seed is set, so runs are not reproducible.
for episode in range(n_episodes):
    state = np.zeros(state_size)  # Start with an empty state
    done = False
    while not done:
        # Choose an action based on the current state and exploration strategy
        if np.random.uniform(0, 1) < epsilon:
            action = np.random.randint(0, n_actions)
        else:
            # NOTE(review): `state` is a float vector of length state_size, not an
            # integer state id; NumPy rejects float arrays as indices, so this
            # lookup presumably raises IndexError — confirm and redesign the
            # state encoding.
            action = np.argmax(Q[state, :])

        # Calculate the reward for the action taken and the new state
        # Drops the oldest n_features entries and appends every column but the
        # first of row `window_size`.
        # NOTE(review): the length stays state_size only if the CSV has
        # n_features + 1 columns — confirm.
        # NOTE(review): window_size never changes, so the same row is read on
        # every step and time never advances.
        next_state = np.concatenate([state[n_features:], df.iloc[window_size][1:].values])
        reward = 0
        if action == 0:  # Buy
            reward = df.iloc[window_size]['Close'] - df.iloc[window_size-1]['Close']
        elif action == 1:  # Sell
            reward = df.iloc[window_size-1]['Close'] - df.iloc[window_size]['Close']
        else:  # Hold
            reward = 0

        # Update the Q-value for the current state-action pair
        # NOTE(review): same vector-as-index problem as above for Q[state, ...] and Q[next_state, ...].
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state, :]) - Q[state, action])

        # Update the state
        state = next_state

        # Check if done
        # Ends when the position of the largest state entry reaches
        # len(df) - window_size, or with 5% probability per step.
        # NOTE(review): np.argmax(state) is a position inside the state vector,
        # not a time index — confirm the intended stop condition.
        if len(df) - window_size <= np.argmax(state) or np.random.random() < 0.05:
            done = True

# Use the learned Q-values to determine the best trading actions
state = np.zeros(state_size)  # Start with an empty state
done = False
while not done:
    # NOTE(review): same float-vector indexing issue as in the training loop.
    action = np.argmax(Q[state, :])
    if action == 0:  # Buy
        print(f"Buy at {df.iloc[window_size]['Close']}")
    elif action == 1:  # Sell
        print(f"Sell at {df.iloc[window_size]['Close']}")
    else:  # Hold
        print("Hold")
    next_state = np.concatenate([state[n_features:], df.iloc[window_size][1:].values])
    state = next_state
    if len(df) - window_size <= np.argmax(state) or np.random.random() < 0.05:
        done = True
