# 02 — Feature Engineering
## Nifty 50 Return Direction Forecasting
This notebook loads the raw Nifty 50 data and creates ML-ready features
including returns, moving averages, RSI, volatility, and lag features.

**Input:** data/nifty50_raw.csv  
**Output:** data/nifty50_features.csv  
**Features Created:** 9 input features + 1 target variable

In [1]:
import pandas as pd
import numpy as np

# Load the saved data
df = pd.read_csv('../data/nifty50_raw.csv', header=[0,1], index_col=0)

# Flatten multi-level columns
df.columns = ['Open', 'High', 'Low', 'Close', 'Volume']
df.index = pd.to_datetime(df.index)

print(df.shape)
print(df.head())

(2747, 5)
                   Open         High          Low        Close  Volume
Date                                                                  
2015-01-02  8288.700195  8410.599609  8288.700195  8395.450195  101900
2015-01-05  8407.950195  8445.599609  8363.900391  8378.400391  118200
2015-01-06  8325.299805  8327.849609  8111.350098  8127.350098  172800
2015-01-07  8118.649902  8151.200195  8065.450195  8102.100098  164100
2015-01-08  8191.399902  8243.500000  8167.299805  8234.599609  143800


In [2]:
# 1. Daily Returns
df['Return'] = df['Close'].pct_change()

# 2. Moving Averages
df['MA_5'] = df['Close'].rolling(window=5).mean()
df['MA_20'] = df['Close'].rolling(window=20).mean()
df['MA_50'] = df['Close'].rolling(window=50).mean()

# 3. Volatility (rolling std of returns)
df['Volatility_10'] = df['Return'].rolling(window=10).std()

# 4. RSI (Relative Strength Index)
delta = df['Close'].diff()
gain = delta.where(delta > 0, 0).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
rs = gain / loss
df['RSI'] = 100 - (100 / (1 + rs))

# 5. MA Crossover signal
df['MA_Cross'] = (df['MA_5'] > df['MA_20']).astype(int)

# 6. Target Variable
df['Target'] = (df['Return'].shift(-1) > 0).astype(int)

# 7. Additional features  ← NEW
df['MA5_20_ratio'] = df['MA_5'] / df['MA_20']
df['MA20_50_ratio'] = df['MA_20'] / df['MA_50']
df['Return_lag1'] = df['Return'].shift(1)
df['Return_lag2'] = df['Return'].shift(2)
df['Return_lag3'] = df['Return'].shift(3)

# Drop NaN rows
df.dropna(inplace=True)

print(df.shape)
print(df.head())

(2698, 18)
                   Open         High          Low        Close  Volume  \
Date                                                                     
2015-03-17  8689.099609  8742.549805  8630.799805  8723.299805  177000   
2015-03-18  8742.900391  8747.250000  8664.000000  8685.900391  156400   
2015-03-19  8749.450195  8788.200195  8614.650391  8634.650391  163500   
2015-03-20  8627.900391  8627.900391  8553.000000  8570.900391  174600   
2015-03-23  8591.549805  8608.349609  8540.549805  8550.900391  150700   

              Return         MA_5        MA_20        MA_50  Volatility_10  \
Date                                                                         
2015-03-17  0.010442  8696.030078  8799.122412  8669.166973       0.009786   
2015-03-18 -0.004287  8693.220117  8792.949951  8674.975977       0.009476   
2015-03-19 -0.005900  8664.950195  8781.227490  8680.100977       0.009377   
2015-03-20 -0.007383  8649.580273  8765.007520  8688.971982       0.009286   
20

In [4]:
# Save featured data
df.to_csv('../data/nifty50_features.csv')
print("Saved! Shape:", df.shape)
# Quick sanity check
print("Target distribution:")
print(df['Target'].value_counts())
print(f"\nUp days: {df['Target'].mean()*100:.1f}%")
print(f"Down days: {(1-df['Target'].mean())*100:.1f}%")

Saved! Shape: (2698, 18)
Target distribution:
Target
1    1446
0    1252
Name: count, dtype: int64

Up days: 53.6%
Down days: 46.4%
