In [None]:
# Final Revision of Capstone Key Performance Indicator Calculation

In [None]:
# pip ta installation line:

# library for financial calculations

# (don't need to do this every time)


In [None]:
pip install ta

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=ba55bbd9a909e2eda960980fc57c21ad1f257162a07e3c35c8d68f2cbfa5e2d3
  Stored in directory: /root/.cache/pip/wheels/a1/d7/29/7781cc5eb9a3659d032d7d15bdd0f49d07d2b24fec29f44bc4
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0


In [None]:
# KPI Calculation Preprocessing File Notes
# File Sequence #4

# The purpose of this file is to generate KPI calculations based on stock data

# input:
# 1. merged stock and econ datasets
# output:
# 1. merged dataset with KPI calculations attached


In [None]:
# Mount Google Drive so the shared project folder is reachable under /content/drive.
from google.colab import drive
# Flush and unmount any existing mount first so the mount below starts fresh
# (on a fresh runtime this just reports that nothing was mounted).
drive.flush_and_unmount()
# mount/remount at the location the notebook's folder paths are rooted in
drive.mount('/content/drive')

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [None]:
# imports

# data handling and calculations:
import pandas as pd
# calculations
import numpy as np
# financial calculation functions
import ta
# file handling
import os
# group file handling
import glob

In [None]:
# define primary notebook objects:
# Drive folder that is searched for the input CSVs
shared_folder = '/content/drive/MyDrive/Capstone_Docs_Shared'

# glob pattern for viable input files; it is joined onto shared_folder when searching
string_format = 'source_data_merged_*.csv'

# number of raw input files to combine before KPI calcs
# (the first N matching paths are the ones loaded):
num_combined_files = 2
# (2 is usually enough, yields approx 6.5 yrs)

# define name of column from which KPI calcs will be derived:
# (feature which will form basis of signals and predictions)
calc_col_name = 'Close'



In [None]:
# find viable filepaths based on naming convention

def find_project_paths(shared_direct, string_format):
  """Return the paths inside `shared_direct` whose names match `string_format`.

  Args:
    shared_direct: directory to search.
    string_format: glob-style filename pattern (e.g. 'source_data_merged_*.csv').

  Returns:
    A list of matching paths in ascending (lexicographic) order; empty when
    nothing matches.
  """
  # join directory and pattern into one glob expression
  pattern = os.path.join(shared_direct, string_format)
  # glob returns matches in arbitrary, filesystem-dependent order; sort them so
  # that callers taking "the first N files" get the same files on every run
  return sorted(glob.glob(pattern))


In [None]:
# function to load data based on defined number of source files to extract

def load_and_combine_files(num_files, file_list):
  """Load the first `num_files` CSVs from `file_list` and stack them into one frame.

  Each file must contain a 'Date' column. It is converted to datetime, and a
  'Year' column derived from it is added to the data. The years found in each
  file are printed as a quick range check.

  Args:
    num_files: how many paths to take from the start of `file_list`.
    file_list: list of CSV file paths.

  Returns:
    One DataFrame with the rows of all selected files and a fresh 0..n-1 index.

  Raises:
    ValueError: if no files are selected (empty list or num_files of 0).
  """
  # filter file list based on number of selected files:
  selected_files = file_list[:num_files]

  # fail early with an actionable message; pd.concat on an empty list would
  # otherwise raise a ValueError that does not point at the real cause
  if not selected_files:
    raise ValueError(
      'No input files selected: check the shared folder path, the filename '
      'pattern and the number of files requested.')

  # loaded frames, in the order they were read
  df_list = []

  for df_number, file in enumerate(selected_files, start=1):
    df = pd.read_csv(file)

    # ensure that 'Date' field is in datetime format:
    df['Date'] = pd.to_datetime(df['Date'])

    # output date range for each df in list:
    print('df #: ', df_number)
    df['Year'] = df['Date'].dt.year
    print(df['Year'].unique())
    print("")

    df_list.append(df)

  # concatenate all selected dfs under a single continuous index
  return pd.concat(df_list, ignore_index=True)


In [None]:
# collect every path in the shared folder that matches the input naming pattern
intended_files = find_project_paths(shared_folder, string_format)

# report how many candidate files were found
num_files_found = len(intended_files)
print("Number of Valid Files Found:", num_files_found, sep="\n")

Number of Valid Files Found:
10


In [None]:
# load the first `num_combined_files` viable files and stack them into one frame
combined_years_df = load_and_combine_files(
    num_files=num_combined_files,
    file_list=intended_files,
)

df #:  1
[1992 1993 1994 1995]

df #:  2
[1995 1996 1997 1998]



In [None]:
# Build the output filename from the date range covered by the combined data

first_date = combined_years_df['Date'].min()
last_date = combined_years_df['Date'].max()

print('First Date:', first_date)
print('Last Date:', last_date)

# format each boundary date as YYYY_MMDD for use in the filename
first_str = first_date.strftime('%Y_%m%d')
last_str = last_date.strftime('%Y_%m%d')

print(first_str)
print(last_str)

# define output filename based on date strings
output_filename = f'kpi_calc_{first_str}_thru_{last_str}.csv'

print(output_filename)

# output folder = shared drive folder; reuse the configured path rather than
# repeating the literal, so changing shared_folder moves inputs and outputs together
output_folder = shared_folder

# string together full path using os
full_output_path = os.path.join(output_folder, output_filename)

print(full_output_path)

First Date: 1992-06-01 00:00:00
Last Date: 1998-11-30 00:00:00
1992_0601
1998_1130
kpi_calc_1992_0601_thru_1998_1130.csv
/content/drive/MyDrive/Capstone_Docs_Shared/kpi_calc_1992_0601_thru_1998_1130.csv


In [None]:
# function to add SMA (Simple Moving Averages) features:

def sma_indicators(df,col_name,windows=[5, 10, 20, 50, 75, 100]):
  """Add one `SMA_<window>` column per window, computed separately for each ticker.

  Args:
    df: frame with 'Ticker' and 'Date' columns plus the column named by `col_name`.
    col_name: column the moving average is taken over.
    windows: window sizes passed to `ta.trend.sma_indicator`.

  Returns:
    A copy of `df` sorted by ticker then date, with the new columns appended.
    The input frame is left unchanged.
  """
  # copy first so the caller's frame is untouched, then order rows within each ticker by date
  ordered = df.copy().sort_values(['Ticker', 'Date'])
  ticker_keys = ordered['Ticker']

  for span in windows:
    # run the SMA over each ticker's own series so windows never straddle two tickers
    ordered[f'SMA_{span}'] = ordered[col_name].groupby(ticker_keys).transform(
      lambda prices: ta.trend.sma_indicator(prices, window=span))

  return ordered


In [None]:
# function to add EMA (Exponential Moving Averages) features:

def ema_indicators(df,col_name,windows=[5, 10, 20, 50, 75, 100]):
  """Add one `EMA_<window>` column per window, computed separately for each ticker.

  Args:
    df: frame with 'Ticker' and 'Date' columns plus the column named by `col_name`.
    col_name: column the exponential moving average is taken over.
    windows: window sizes passed to `ta.trend.ema_indicator`.

  Returns:
    A copy of `df` sorted by ticker then date, with the new columns appended.
    The input frame is left unchanged.
  """
  # take a copy of df
  df = df.copy()

  # sort data by ticker and date
  df = df.sort_values(['Ticker', 'Date'])

  # calculate exponential moving averages
  for window in windows:
    ema_column = f'EMA_{window}'
    # use the exponential indicator here: calling the simple moving average
    # would make the EMA_* columns exact duplicates of the SMA_* columns
    df[ema_column] = df.groupby('Ticker')[col_name].transform(lambda x: ta.trend.ema_indicator(x, window=window))

  return df


In [None]:
# function to add Relative Strength Index (RSI) features:

def rsi_indicators(df,col_name,windows=[5, 7, 9, 14, 21, 30, 50]):
  """Add one `RSI_<window>` column per window, computed separately for each ticker.

  Args:
    df: frame with 'Ticker' and 'Date' columns plus the column named by `col_name`.
    col_name: column the RSI is derived from.
    windows: window sizes passed to `ta.momentum.rsi`.

  Returns:
    A sorted (ticker, then date) copy of `df` with the new columns appended.
  """
  result = df.sort_values(by=['Ticker', 'Date']).copy()

  def _rsi_for(window):
    # RSI of each ticker's own series for a single window size
    return result.groupby('Ticker')[col_name].transform(
      lambda series: ta.momentum.rsi(series, window=window))

  for window in windows:
    result[f'RSI_{window}'] = _rsi_for(window)

  return result


In [None]:
# function to add Bollinger Band features:

def bollinger_band_indicators(df,col_name,windows=[10, 14, 30, 50, 100], window_dev=2):
  """Add middle, upper and lower Bollinger Band columns per window, per ticker.

  For every window three columns are appended, in this order:
  `Bollinger_Mid_<window>`, `Bollinger_Upper_<window>`, `Bollinger_Lower_<window>`.

  Args:
    df: frame with 'Ticker' and 'Date' columns plus the column named by `col_name`.
    col_name: column the bands are derived from.
    windows: window sizes passed to the `ta.volatility` Bollinger functions.
    window_dev: deviation factor forwarded to the upper and lower band functions.

  Returns:
    A sorted (ticker, then date) copy of `df` with the new columns appended.
  """
  banded = df.sort_values(['Ticker', 'Date']).copy()

  for span in windows:
    # (band label, per-ticker calculation); tuple order fixes the column order
    band_funcs = (
      ('Mid', lambda s: ta.volatility.bollinger_mavg(s, window=span)),
      ('Upper', lambda s: ta.volatility.bollinger_hband(s, window=span, window_dev=window_dev)),
      ('Lower', lambda s: ta.volatility.bollinger_lband(s, window=span, window_dev=window_dev)),
    )
    for band, compute in band_funcs:
      banded[f'Bollinger_{band}_{span}'] = banded.groupby('Ticker')[col_name].transform(compute)

  return banded


In [None]:
# function to add Stochastic Oscillator features:

def stochastic_oscillator(df, col_name, windows=[5, 7, 14, 30]):
  """Add one `Stoch_%K_<window>` column per window, computed separately for each ticker.

  %K = (value - rolling lowest Low) / (rolling highest High - rolling lowest Low) * 100,
  where "value" is the column named by `col_name`. The calculation is done
  directly with pandas rolling windows rather than through `ta`.

  Args:
    df: frame with 'Ticker', 'Date', 'High' and 'Low' columns plus `col_name`.
    col_name: column used as the numerator value (e.g. the close).
    windows: rolling window sizes.

  Returns:
    A sorted (ticker, then date) copy of `df` with the new columns appended.
    When the rolling high equals the rolling low the denominator is zero, so
    the result for that row is inf or NaN.
  """
  frame = df.sort_values(['Ticker', 'Date']).copy()
  by_ticker = frame['Ticker']

  for lookback in windows:
    # rolling extremes are taken inside each ticker so windows never span two tickers
    rolling_low = frame['Low'].groupby(by_ticker).transform(
      lambda lows: lows.rolling(lookback).min())
    rolling_high = frame['High'].groupby(by_ticker).transform(
      lambda highs: highs.rolling(lookback).max())

    price_range = rolling_high - rolling_low
    frame[f'Stoch_%K_{lookback}'] = (frame[col_name] - rolling_low) / price_range * 100

  return frame


In [None]:
# function to add Average Directional Index (ADX) features:

def adx_indicators(df, col_name, windows=[7, 10, 14, 20, 30, 50]):
  """Add one `ADX_<window>` column per window, computed separately for each ticker.

  Args:
    df: frame with 'Ticker', 'Date', 'High' and 'Low' columns plus `col_name`.
    col_name: column passed to `ta.trend.adx` as the close series.
    windows: window sizes passed to `ta.trend.adx`.

  Returns:
    A sorted (ticker, then date) copy of `df` with the new columns appended.
    Rows of a ticker stay NaN for a window when the ticker has fewer rows than
    the window, or when `ta` raises for it (a message is printed in that case).
  """
  # sort data by ticker and date
  df = df.sort_values(['Ticker', 'Date']).copy()

  # split the frame by ticker once; re-filtering the whole frame for every
  # (ticker, window) pair is very slow on large inputs
  ticker_groups = [
    (ticker, group.index, group['High'], group['Low'], group[col_name])
    for ticker, group in df.groupby('Ticker')
  ]

  for window in windows:
    adx_col = f'ADX_{window}'

    # start from an all-NaN column; only tickers that compute cleanly are filled
    df[adx_col] = np.nan

    for ticker, row_index, high, low, close in ticker_groups:
      # need at least `window` rows before attempting the calculation
      if len(row_index) < window:
        continue

      # ta can still raise for short histories that pass the check above, so a
      # failure for one ticker is reported and skipped instead of aborting the run
      try:
        adx = ta.trend.adx(high, low, close, window=window)
        df.loc[row_index, adx_col] = adx.values
      except Exception as e:
        print(f"skipping {ticker} for window {window} due to error: {e}")

  return df

In [None]:
# function to add Rate of Change (RoC) features:

def roc_indicators(df, col_name, windows=[7, 14, 30]):
  """Add one `ROC_<window>` (rate of change) column per window, per ticker.

  Args:
    df: frame with 'Ticker' and 'Date' columns plus the column named by `col_name`.
    col_name: column the rate of change is derived from.
    windows: window sizes passed to `ta.momentum.roc`.

  Returns:
    A sorted (ticker, then date) copy of `df` with the new columns appended.
  """
  out = df.sort_values(['Ticker', 'Date']).copy()

  for lookback in windows:
    # rate of change over each ticker's own series
    per_ticker = out.groupby('Ticker')[col_name]
    out[f'ROC_{lookback}'] = per_ticker.transform(
      lambda closes: ta.momentum.roc(closes, window=lookback))

  return out


In [None]:
# function to add William Percent Range (williams_r) feature:

def williams_r_indicator(df, col_name, lookback_periods=[14]):
  """Add one `Williams_R_<period>` column per lookback period, per ticker.

  NOTE: the column is named after the individual period (e.g. 'Williams_R_14').
  Earlier output files carried the name 'Williams_R_[14]' because the whole
  list was interpolated into the name; update any consumer that reads that name.

  Args:
    df: frame with 'Ticker', 'Date', 'High' and 'Low' columns plus `col_name`.
    col_name: column passed to `ta.momentum.williams_r` as the close series.
    lookback_periods: lookback lengths passed to `ta.momentum.williams_r` as `lbp`.

  Returns:
    A sorted (ticker, then date) copy of `df` with the new columns appended.
    Rows of a ticker stay NaN for a period when the ticker has fewer rows than
    that period.
  """
  # sort by ticker and date
  df = df.sort_values(['Ticker', 'Date']).copy()

  # split the frame by ticker once instead of re-filtering it for every ticker and period
  ticker_groups = [
    (group.index, group['High'], group['Low'], group[col_name])
    for _, group in df.groupby('Ticker')
  ]

  for period in lookback_periods:
    # name the column after this period, not after the whole list
    wr_col = f'Williams_R_{period}'

    # start from an all-NaN column; tickers with too little data are left as NaN
    df[wr_col] = np.nan

    for row_index, high, low, close in ticker_groups:
      if len(row_index) >= period:
        wr = ta.momentum.williams_r(
          high=high,
          low=low,
          close=close,
          lbp=period
          )
        df.loc[row_index, wr_col] = wr.values

  # return only after every period has been processed
  return df


In [None]:
# function to add Ichimoku Cloud features:

def ichimoku_indicators(df):
  """Add `Ichimoku_A` and `Ichimoku_B` columns, computed separately for each ticker.

  Args:
    df: frame with 'Ticker', 'Date', 'High' and 'Low' columns.

  Returns:
    A sorted (ticker, then date) copy of `df` with both columns appended. The
    values come from `ta.trend.ichimoku_a` / `ta.trend.ichimoku_b` called with
    their default settings.
  """
  df = df.sort_values(['Ticker', 'Date']).copy()

  # placeholders that are filled ticker by ticker below
  for cloud_col in ('Ichimoku_A', 'Ichimoku_B'):
    df[cloud_col] = np.nan

  for symbol in df['Ticker'].unique():
    ticker_rows = df.loc[df['Ticker'] == symbol]
    highs, lows = ticker_rows['High'], ticker_rows['Low']

    span_a = ta.trend.ichimoku_a(high=highs, low=lows)
    span_b = ta.trend.ichimoku_b(high=highs, low=lows)

    # write results back positionally onto this ticker's rows
    df.loc[ticker_rows.index, 'Ichimoku_A'] = span_a.values
    df.loc[ticker_rows.index, 'Ichimoku_B'] = span_b.values

  return df


In [None]:
# Begin all indicator calculations based on selected y feature
# (work on a copy so combined_years_df keeps the data exactly as loaded)

indicator_df = combined_years_df.copy()

# ensuing cells are broken up so users can "batch in" desired indicators:
# each cell takes indicator_df, adds one indicator family and stores the result back

In [None]:
# Add SMA Indicators:
# build on indicator_df (not the raw combined frame) like every other indicator
# cell, so re-running this cell does not discard indicators already added
indicator_df = sma_indicators(indicator_df,col_name=calc_col_name)
indicator_df = indicator_df.copy()


In [None]:
# Add EMA Indicators:
# (the trailing copy() is kept from the original cell -- presumably it consolidates
#  the frame after many column inserts; TODO confirm)
indicator_df = ema_indicators(indicator_df, col_name=calc_col_name).copy()


In [None]:
# Add RSI Indicators:
# (the trailing copy() is kept from the original cell -- presumably it consolidates
#  the frame after many column inserts; TODO confirm)
indicator_df = rsi_indicators(indicator_df, col_name=calc_col_name).copy()


In [None]:
# Add Bollinger Bands Indicators:
# (the trailing copy() is kept from the original cell -- presumably it consolidates
#  the frame after many column inserts; TODO confirm)
indicator_df = bollinger_band_indicators(indicator_df, col_name=calc_col_name).copy()

In [None]:
# Add Stochastic Oscillator Indicators:
# (the trailing copy() is kept from the original cell -- presumably it consolidates
#  the frame after many column inserts; TODO confirm)
indicator_df = stochastic_oscillator(indicator_df, col_name=calc_col_name).copy()

In [None]:
# Add ADX Indicators:
# (tickers that ta cannot process are reported as "skipping ..." and left as NaN;
#  the trailing copy() is kept from the original cell)
indicator_df = adx_indicators(indicator_df, col_name=calc_col_name).copy()

skipping EVF for window 14 due to error: index 14 is out of bounds for axis 0 with size 10
skipping PB for window 14 due to error: index 14 is out of bounds for axis 0 with size 11
skipping TIMB for window 14 due to error: index 14 is out of bounds for axis 0 with size 7
skipping VIV for window 14 due to error: index 14 is out of bounds for axis 0 with size 7
skipping DDT for window 20 due to error: index 20 is out of bounds for axis 0 with size 14
skipping EVF for window 20 due to error: index 20 is out of bounds for axis 0 with size 4
skipping GIB for window 20 due to error: index 20 is out of bounds for axis 0 with size 19
skipping PB for window 20 due to error: index 20 is out of bounds for axis 0 with size 5
skipping TIMB for window 20 due to error: index 20 is out of bounds for axis 0 with size 1
skipping TVC for window 20 due to error: index 20 is out of bounds for axis 0 with size 14
skipping VIV for window 20 due to error: index 20 is out of bounds for axis 0 with size 1
skipp

In [None]:
# Add RoC Indicators:
# (the trailing copy() is kept from the original cell -- presumably it consolidates
#  the frame after many column inserts; TODO confirm)
indicator_df = roc_indicators(indicator_df, col_name=calc_col_name).copy()

In [None]:
# Add williams_r Indicator:
# (the trailing copy() is kept from the original cell -- presumably it consolidates
#  the frame after many column inserts; TODO confirm)
indicator_df = williams_r_indicator(indicator_df, col_name=calc_col_name).copy()

In [None]:
# Add Ichimoku Cloud Indicators:
# (this one takes no col_name: it is computed from the High and Low columns only;
#  the trailing copy() is kept from the original cell)
indicator_df = ichimoku_indicators(indicator_df).copy()

In [None]:
# output check post-calculation:
# prints the frame as text (pandas abbreviates large frames with "...")
print(indicator_df)

              Date Ticker       Open       High        Low      Close  \
552     1992-06-01     AA  14.600807  14.835925  14.483249  14.812413   
1427    1992-06-02     AA  14.812414  14.929973  14.788901  14.835925   
2369    1992-06-03     AA  14.835926  15.024020  14.741880  15.024020   
2511    1992-06-04     AA  15.024010  15.165080  14.976986  14.976986   
3629    1992-06-05     AA  14.976993  14.976993  14.788899  14.859435   
...            ...    ...        ...        ...        ...        ...   
1706606 1998-11-25    ZTR   2.275102   2.306921   2.259192   2.306921   
1707069 1998-11-27    ZTR   2.270409   2.301943   2.270409   2.301943   
1707190 1998-11-27    ZTR   2.291011   2.322831   2.291011   2.322831   
1708408 1998-11-30    ZTR   2.301942   2.301942   2.254642   2.254642   
1708756 1998-11-30    ZTR   2.322831   2.322831   2.275102   2.275102   

            Volume  UMCSENT   DGORDER       GDP  ...     ADX_14     ADX_20  \
552      1933916.0     80.4  122834.0  6470.7

In [None]:
# Handle nans -- fill with 0 & Reindex
# (every NaN in every column becomes 0; the index is then renumbered 0..n-1
#  because sorting left the rows carrying their pre-sort labels)
indicator_df = indicator_df.fillna(0).reset_index(drop=True)

print(indicator_df)

              Date Ticker       Open       High        Low      Close  \
0       1992-06-01     AA  14.600807  14.835925  14.483249  14.812413   
1       1992-06-02     AA  14.812414  14.929973  14.788901  14.835925   
2       1992-06-03     AA  14.835926  15.024020  14.741880  15.024020   
3       1992-06-04     AA  15.024010  15.165080  14.976986  14.976986   
4       1992-06-05     AA  14.976993  14.976993  14.788899  14.859435   
...            ...    ...        ...        ...        ...        ...   
1709176 1998-11-25    ZTR   2.275102   2.306921   2.259192   2.306921   
1709177 1998-11-27    ZTR   2.270409   2.301943   2.270409   2.301943   
1709178 1998-11-27    ZTR   2.291011   2.322831   2.291011   2.322831   
1709179 1998-11-30    ZTR   2.301942   2.301942   2.254642   2.254642   
1709180 1998-11-30    ZTR   2.322831   2.322831   2.275102   2.275102   

            Volume  UMCSENT   DGORDER       GDP  ...     ADX_14     ADX_20  \
0        1933916.0     80.4  122834.0  6470.7

In [None]:
# Handle infinity and negative infinity values
# (result of div/0 errors...)
# (strategy: replace inf with NaN, then fill those NaNs with the column mean)

# work on a copy so indicator_df keeps the unmodified values
ind_stats_df = indicator_df.copy()

# count +/-inf per column and keep only the columns that contain any
inf_counts = ind_stats_df.isin([np.inf, -np.inf]).sum()
inf_present = inf_counts[inf_counts>0]
print("Infinite Values Present After Calculations: ")
print(inf_present)

# swap +/-inf for NaN so they are left out of the means computed below
ind_stats_df = ind_stats_df.replace([np.inf, -np.inf], np.nan)

# fill the NaNs of each affected column with that column's mean
for col in inf_present.index:
    ind_stats_df[col] = ind_stats_df[col].fillna(ind_stats_df[col].mean())

# total NaNs left in the whole frame after the fill
print("Remaining Inf. Values After Replacement Strategy: ")
print(ind_stats_df.isna().sum().sum())


Infinite Values Present After Calculations: 
Stoch_%K_5         597
Stoch_%K_7         436
Stoch_%K_14        245
Stoch_%K_30        105
Williams_R_[14]    245
dtype: int64
Remaining Inf. Values After Replacement Strategy: 
0


In [None]:
# output KPI calculated df to csv:
# (written to the path assembled from the data's date range; the row index is not saved)

ind_stats_df.to_csv(full_output_path, index=False)