In [None]:
# %pip installs into the environment of the running kernel; a bare !pip can target a different Python.
%pip install stockstats

In [None]:
import pandas as pd
import numpy as np
from stockstats import StockDataFrame as Sdf
from sklearn.preprocessing import MinMaxScaler

## 1. Function definitions

In [None]:
def load_dataset(*, file_name: str) -> pd.DataFrame:
    """
    Read a CSV file into a dataframe and discard fully duplicated rows.

    :param file_name: (str) location of the CSV file, forwarded to ``pd.read_csv``
    :return: (df) pandas dataframe holding the first occurrence of every distinct row;
        the surviving rows keep their original index labels
    """
    return pd.read_csv(file_name).drop_duplicates()

In [None]:
def add_technical_indicator(df):
    """
    Calculate technical indicators using stockstats package and add them to the dataframe.

    Indicators are computed separately for every ticker and written back to the
    exact rows they were computed from, so rows of different tickers may be
    interleaved (for instance a frame sorted by date and then by ticker).
    Rows that belong to the same ticker are processed in the order in which
    they appear, so they should already be in chronological order.

    Added columns (stockstats source in parentheses):
    - macd      (macdh)
    - rsi       (rsi)
    - bandwidth ((boll_ub - boll_lb) / close_20_sma)
    - ichimoku  (ichimoku)
    - stoch_k   (kdjk_14)
    - roc       (close_12_roc)
    - vr        (vr)
    - atr_14    (atr_14)

    :param df: (df) pandas dataframe with a ticker column named 'tic' (column
        names are lower-cased by stockstats before lookup) plus the price and
        volume columns stockstats needs for the indicators above
    :return: (df) the same dataframe object that was passed in, with the
        indicator columns added
    """
    # Output column -> stockstats column, for indicators that are copied as-is
    direct_indicators = {
        'macd': 'macdh',
        'rsi': 'rsi',
        'ichimoku': 'ichimoku',
        'stoch_k': 'kdjk_14',
        'roc': 'close_12_roc',
        'vr': 'vr',
        'atr_14': 'atr_14',
    }
    # Order in which the new columns are appended to df
    output_columns = ['macd', 'rsi', 'bandwidth', 'ichimoku',
                      'stoch_k', 'roc', 'vr', 'atr_14']

    stock = Sdf.retype(df.copy())

    # One full-length buffer per indicator; each ticker fills only its own rows
    results = {name: np.full(len(df), np.nan) for name in output_columns}

    for ticker in stock['tic'].unique():
        # Positional mask of this ticker's rows; used both to slice the input
        # and to write the results back, which keeps values and rows aligned
        rows = (stock['tic'] == ticker).to_numpy()
        ticker_stock = Sdf.retype(stock[rows].copy())

        for name, source in direct_indicators.items():
            results[name][rows] = ticker_stock[source].to_numpy()

        ## Bollinger BandWidth: band distance relative to the 20-period SMA
        upper_band = ticker_stock['boll_ub'].to_numpy()
        lower_band = ticker_stock['boll_lb'].to_numpy()
        middle_band = ticker_stock['close_20_sma'].to_numpy()
        results['bandwidth'][rows] = (upper_band - lower_band) / middle_band

    for name in output_columns:
        df[name] = results[name]

    return df

## 2. Preprocessing data

In [None]:
# Mount Google Drive at /content/drive so the files stored there can be reached by path.
# The google.colab import is only available when running inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import sys

# Put the project folder on the module search path
sys.path.append('/content/drive/My Drive/Codis_TFM/')

# Load the daily data, switch to lower-case column names with the adjusted
# close taking the role of 'close' (the raw 'Close' column is dropped), and
# order the rows by date and then ticker with a fresh 0..n-1 index.
df = (
    pd.read_csv('/content/drive/My Drive/Codis_TFM/datos_1d.csv')
    .rename(columns={
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Adj Close': 'close',
        'Volume': 'volume',
        'Dividends': 'dividends',
    })
    .drop(columns=['Close'])
    .sort_values(['datadate', 'tic'])
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the loaded frame after renaming and sorting.
df.head()

In [None]:
# add_technical_indicator writes its columns into the frame it receives, so
# hand it a copy and keep the raw `df` untouched.
df_preprocess = add_technical_indicator(df.copy())

# Back-fill missing values ticker by ticker. The rows of different tickers are
# interleaved (sorted by date, then ticker), so a plain frame-wide back-fill
# would copy values from another ticker's row into the gap.
# dropna=False keeps rows whose ticker is missing from being blanked out.
fill_columns = [col for col in df_preprocess.columns if col not in ('dividends', 'tic')]
df_preprocess[fill_columns] = (
    df_preprocess.groupby('tic', dropna=False)[fill_columns].bfill()
)

# A missing dividend means no dividend was paid. Assign the result back rather
# than calling fillna(inplace=True) on the selected column, which does not
# update the frame under pandas Copy-on-Write.
df_preprocess['dividends'] = df_preprocess['dividends'].fillna(0)

In [None]:
# Preview the first rows now that the indicator columns are present and gaps are filled.
df_preprocess.head()

In [None]:
# Summary statistics of the preprocessed frame.
df_preprocess.describe()

In [None]:
# Keep the rows dated 2014-01-01 through 2023-12-31 (both bounds inclusive),
# then order by date and ticker with a fresh 0..n-1 index.
# NOTE(review): the bounds are strings, so this presumably relies on
# 'datadate' holding ISO-formatted (YYYY-MM-DD) date strings; confirm.
df_final = (
    df_preprocess
    .loc[df_preprocess['datadate'].between('2014-01-01', '2023-12-31')]
    .sort_values(['datadate', 'tic'])
    .reset_index(drop=True)
)
df_final.head()

In [None]:
# Summary statistics of the date-filtered frame.
df_final.describe()

In [None]:
# Min-max normalization of the macroeconomic columns; every other column of
# the frame is carried over unchanged.
macro_vars = [
    'GDP_growth_developed', 'GDP_growth_emerging', 'GDP_growth_us',
    'inflation_developed', 'inflation_emerging', 'inflation_us',
]

# Work on a copy so df_final keeps the unscaled values
data_normalized = df_final.copy()
macro_data = data_normalized[macro_vars]

# Fit the scaler on the macro columns, then apply it to those same rows
scaler = MinMaxScaler().fit(macro_data)
macro_data_scaled = scaler.transform(macro_data)
data_normalized[macro_vars] = macro_data_scaled

In [None]:
# Summary statistics after scaling the macroeconomic columns.
data_normalized.describe()

In [None]:
# Save the normalized frame to Drive as CSV; index=False leaves the row index out of the file.
data_normalized.to_csv("/content/drive/My Drive/Codis_TFM/preprocessed_data_1d.csv", index=False)

In [None]:
# List the column names of the exported frame.
data_normalized.columns