# FTSE100 Dataset Preparation for ML Prediction

This notebook prepares the dataset by calculating the following features:
- `sma_crossover`
- `price_sma_ratio`
- `rsi`
- `macd`
- `macd_hist`
- `adx`
- `obv`

The final dataset will be saved for use in prediction tasks.

In [None]:
# 1. Import Required Libraries
import pandas as pd
import numpy as np

In [None]:
# 2. Load Raw Dataset
# The first CSV column becomes the index and is parsed as dates.
df = pd.read_csv(
    'backend/data/dataset.csv',
    index_col=0,
    parse_dates=True,
)
# Last expression of the cell: shows the first rows as a quick sanity check.
df.head()

In [None]:
# 3. Calculate Technical Indicators

def calculate_sma(series, window):
    """Return the simple moving average of *series* over *window* rows.

    ``min_periods=1`` means the leading rows are averaged over however many
    observations are available instead of being left as NaN.
    """
    rolling_view = series.rolling(window=window, min_periods=1)
    return rolling_view.mean()

def calculate_rsi(series, window=14):
    """Return a Relative Strength Index based on simple rolling means.

    Gains and losses are averaged with a plain rolling mean over *window*
    rows (``min_periods=1``), not with exponential smoothing.

    Args:
        series: Price series.
        window: Number of rows in the rolling averages.

    Returns:
        Series of RSI values on the same index as *series*. The first row has
        no previous price, so it counts as "no gain, no loss" and yields 0.
    """
    change = series.diff()

    # Split moves into up and down components; NaN and opposite-sign moves
    # become 0.
    up_moves = change.where(change > 0, 0)
    down_moves = -change.where(change < 0, 0)

    avg_gain = up_moves.rolling(window=window, min_periods=1).mean()
    avg_loss = down_moves.rolling(window=window, min_periods=1).mean()

    # A zero average loss would divide by zero; a tiny epsilon pushes the RSI
    # towards 100 instead.
    avg_loss = avg_loss.replace(0, 1e-6)

    relative_strength = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + relative_strength))

    # Defensive fallbacks: any remaining NaN/inf maps to the neutral value 50.
    return rsi.fillna(50).replace([np.inf, -np.inf], 50)

def calculate_macd(series, fast=12, slow=26, signal=9):
    """Return the MACD line, its signal line and the histogram.

    Args:
        series: Price series.
        fast: Span of the fast exponential moving average.
        slow: Span of the slow exponential moving average.
        signal: Span of the EMA applied to the MACD line.

    Returns:
        Tuple ``(macd_line, signal_line, macd_hist)`` of Series on the same
        index as *series*, with NaN replaced by 0.
    """
    def _ema(values, span):
        # Recursive EMA (adjust=False).
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, fast) - _ema(series, slow)
    signal_line = _ema(macd_line, signal)
    histogram = macd_line - signal_line

    return tuple(part.fillna(0) for part in (macd_line, signal_line, histogram))

def calculate_adx(high, low, close, window=14):
    """Return an Average Directional Index based on simple rolling means.

    True range, directional movement and DX are all smoothed with a plain
    rolling mean over *window* rows (``min_periods=1``).

    Args:
        high: Series of period highs.
        low: Series of period lows.
        close: Series of period closes, on the same index as *high*/*low*.
        window: Number of rows in each rolling average.

    Returns:
        Series of ADX values on the same index as the inputs. NaN/inf values
        are replaced by 25.
    """
    prev_close = close.shift(1)
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1)

    up_move = high.diff()
    down_move = low.shift(1) - low

    # Series.where keeps the caller's index. Building these with
    # np.where + pd.Series(...) would give them a fresh RangeIndex, and the
    # division by `atr` below would then align on mismatched labels and
    # produce only NaN (i.e. a constant 25 after the final fillna).
    dm_plus = up_move.where((up_move > down_move) & (up_move > 0), 0.0)
    dm_minus = down_move.where((down_move > up_move) & (down_move > 0), 0.0)

    # Epsilon instead of 0 avoids division by zero on flat stretches.
    atr = true_range.rolling(window=window, min_periods=1).mean().replace(0, 1e-6)
    di_plus = 100 * (dm_plus.rolling(window=window, min_periods=1).mean() / atr)
    di_minus = 100 * (dm_minus.rolling(window=window, min_periods=1).mean() / atr)

    di_sum = (di_plus + di_minus).replace(0, 1e-6)
    dx = 100 * (di_plus - di_minus).abs() / di_sum
    adx = dx.rolling(window=window, min_periods=1).mean()
    return adx.fillna(25).replace([np.inf, -np.inf], 25)

def calculate_obv(close, volume):
    """Return the On-Balance Volume series.

    Starting from 0, each row adds that row's volume when the close rose
    versus the previous row, subtracts it when the close fell, and leaves the
    running total unchanged otherwise.

    Args:
        close: Series of closing prices.
        volume: Series of volumes with the same length as *close*; values are
            paired with *close* by position.

    Returns:
        Float Series of cumulative OBV on ``close.index``. Empty input yields
        an empty Series.
    """
    # +1 on up days, -1 on down days, 0 when flat. The first row has no
    # previous close (NaN diff) and is treated as flat, so OBV starts at 0.
    direction = np.sign(close.diff()).fillna(0)

    # Pair by position rather than by label.
    signed_volume = direction * volume.to_numpy()

    # Rows without a price move contribute exactly 0, even if their volume is
    # missing.
    signed_volume = signed_volume.where(direction != 0, 0)

    return signed_volume.cumsum()

# Calculate indicators
# All price-based indicators are computed from the 'Adj Close' column.
# 50- and 200-row simple moving averages; kept in notebook-level variables
# and only attached to df after the other indicator columns.
sma_50 = calculate_sma(df['Adj Close'], 50)
sma_200 = calculate_sma(df['Adj Close'], 200)
df['rsi'] = calculate_rsi(df['Adj Close'])
# calculate_macd returns (macd line, signal line, histogram) in that order.
df['macd'], df['macd_signal'], df['macd_hist'] = calculate_macd(df['Adj Close'])
# NOTE(review): ADX combines 'High'/'Low' with 'Adj Close'; presumably the
# high/low columns are unadjusted -- confirm the mix is intended.
df['adx'] = calculate_adx(df['High'], df['Low'], df['Adj Close'])
df['obv'] = calculate_obv(df['Adj Close'], df['Volume'])
df['sma_50'] = sma_50
df['sma_200'] = sma_200

In [None]:
# 4. Create Derived Features
# Gap between the 50-row and 200-row moving averages.
df['sma_crossover'] = df['sma_50'].sub(df['sma_200'])

# Price relative to the 200-row average. A zero average is turned into NaN
# first so the division cannot blow up; undefined ratios fall back to 1.0.
df['price_sma_ratio'] = (
    df['Adj Close']
    .div(df['sma_200'].replace(0, np.nan))
    .fillna(1.0)
)

In [None]:
# 5. Handle Missing Values and Clean Data
features = ['sma_crossover', 'price_sma_ratio', 'rsi', 'macd', 'macd_hist', 'adx', 'obv']
for col in features:
    if col not in df.columns:
        continue

    # Missing values -> column median (0 when the column has no usable
    # median, e.g. it is entirely NaN).
    nan_count = df[col].isnull().sum()
    if nan_count > 0:
        fill_value = df[col].median()
        if pd.isna(fill_value):
            fill_value = 0
        df[col] = df[col].fillna(fill_value)

    # +inf / -inf -> 99th / 1st percentile of the *finite* values.
    # The bounds must not be computed on the raw column: with an infinity
    # present, the interpolated quantile is itself inf (or NaN) and the
    # replacement would leave the infinities in place.
    inf_count = np.isinf(df[col]).sum()
    if inf_count > 0:
        finite_values = df[col][np.isfinite(df[col])]
        upper_bound = finite_values.quantile(0.99)
        lower_bound = finite_values.quantile(0.01)
        # No finite values at all: fall back to 0, like the NaN branch.
        if pd.isna(upper_bound):
            upper_bound = 0
        if pd.isna(lower_bound):
            lower_bound = 0
        df[col] = df[col].replace([np.inf, -np.inf], [upper_bound, lower_bound])