In [10]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yfinance as yf

# Load STT stock data
# Download parameters are kept in one place so they are easy to change.
TICKER = "STT"
START_DATE = "2015-01-01"
END_DATE = "2024-01-01"

# auto_adjust is pinned explicitly: its default has differed between yfinance
# releases, and the feature values below depend on whether prices are adjusted.
# True matches the output shown here (no separate "Adj Close" column).
df = yf.download(TICKER, start=START_DATE, end=END_DATE, auto_adjust=True)

# Discard any rows with missing values before feature engineering.
# NOTE(review): the displayed output shows (Price, Ticker) MultiIndex columns;
# they are left as-is so the exported CSV layout stays the same.
df = df.dropna()
df

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,STT,STT,STT,STT,STT
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2015-01-02,58.966736,59.689238,58.485068,59.621506,1421300
2015-01-05,57.988350,58.883948,57.785143,58.567855,3265100
2015-01-06,56.332615,57.920616,56.084253,57.800202,3254800
2015-01-07,56.859428,57.145417,56.460542,57.024997,2158000
2015-01-08,58.063606,58.176498,57.363682,57.484096,1590100
...,...,...,...,...,...
2023-12-22,73.267914,73.932343,72.907217,73.438768,1536500
2023-12-26,73.770981,73.951329,73.154006,73.362829,1608600
2023-12-27,74.008278,74.236081,73.353335,73.695044,1154800
2023-12-28,74.435410,74.577789,74.169638,74.558809,1394300


# 🧠 Feature Engineering for STT Stock Data

This section outlines the creation of new predictive features for the State Street Corporation (STT) stock dataset, with a focus on technical indicators and time-based transformations.

---

## 📈 1. Simple Moving Averages (SMA)
- **SMA_10**: 10-day average of the closing price.
- **SMA_30**: 30-day average of the closing price.
- Purpose: Smooths short-term fluctuations to highlight longer-term trends.

---


In [11]:
# 1. Simple Moving Averages (SMA)
# One trailing-mean column of the closing price per window length (10 and 30 rows)
close_prices = df['Close']
for window in (10, 30):
    df[f'SMA_{window}'] = close_prices.rolling(window=window).mean()

df


Price,Close,High,Low,Open,Volume,SMA_10,SMA_30
Ticker,STT,STT,STT,STT,STT,Unnamed: 6_level_1,Unnamed: 7_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2015-01-02,58.966736,59.689238,58.485068,59.621506,1421300,,
2015-01-05,57.988350,58.883948,57.785143,58.567855,3265100,,
2015-01-06,56.332615,57.920616,56.084253,57.800202,3254800,,
2015-01-07,56.859428,57.145417,56.460542,57.024997,2158000,,
2015-01-08,58.063606,58.176498,57.363682,57.484096,1590100,,
...,...,...,...,...,...,...,...
2023-12-22,73.267914,73.932343,72.907217,73.438768,1536500,72.862604,69.098750
2023-12-26,73.770981,73.951329,73.154006,73.362829,1608600,73.158752,69.418944
2023-12-27,74.008278,74.236081,73.353335,73.695044,1154800,73.429272,69.764449
2023-12-28,74.435410,74.577789,74.169638,74.558809,1394300,73.601075,70.064392



## 🚀 2. Momentum
- **Momentum_10**: Difference between the current closing price and the closing price 10 trading days earlier.
- Purpose: Measures the strength and direction of recent price movement.

---


In [12]:

# 2. Momentum (Close - Close_n_days_ago)
# A 10-period difference is the current close minus the close 10 rows earlier
df['Momentum_10'] = df['Close'].diff(periods=10)
df


Price,Close,High,Low,Open,Volume,SMA_10,SMA_30,Momentum_10
Ticker,STT,STT,STT,STT,STT,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2015-01-02,58.966736,59.689238,58.485068,59.621506,1421300,,,
2015-01-05,57.988350,58.883948,57.785143,58.567855,3265100,,,
2015-01-06,56.332615,57.920616,56.084253,57.800202,3254800,,,
2015-01-07,56.859428,57.145417,56.460542,57.024997,2158000,,,
2015-01-08,58.063606,58.176498,57.363682,57.484096,1590100,,,
...,...,...,...,...,...,...,...,...
2023-12-22,73.267914,73.932343,72.907217,73.438768,1536500,72.862604,69.098750,3.056396
2023-12-26,73.770981,73.951329,73.154006,73.362829,1608600,73.158752,69.418944,2.961487
2023-12-27,74.008278,74.236081,73.353335,73.695044,1154800,73.429272,69.764449,2.705193
2023-12-28,74.435410,74.577789,74.169638,74.558809,1394300,73.601075,70.064392,1.718033



## 📊 3. Relative Strength Index (RSI)
- **RSI_14**: Momentum oscillator that measures the speed and change of price movements over a 14-day period.
- Formula:

  $$
  RSI = 100 - \frac{100}{1 + \frac{\text{Average Gain}}{\text{Average Loss}}}
  $$
- Purpose: Identifies overbought (>70) and oversold (<30) conditions.

---


In [13]:

# 3. RSI (Relative Strength Index) - simplified 14-day
# "Simplified": average gain/loss are plain rolling means over the window.
RSI_WINDOW = 14

# Row-over-row change in the closing price (the first row is NaN: no prior close)
delta = df['Close'].diff()

# Split the changes into gains and losses, both as non-negative magnitudes.
# clip() keeps the leading NaN (unlike where(..., 0), which would turn it into
# a fake zero change), so the first RSI value is only produced once the window
# holds RSI_WINDOW real price changes.
gain = delta.clip(lower=0).rolling(window=RSI_WINDOW).mean()
loss = (-delta.clip(upper=0)).rolling(window=RSI_WINDOW).mean()

# Relative strength (RS). A window with gains but no losses gives RS = inf;
# a completely flat window gives 0 / 0 = NaN.
rs = gain / loss

# Map RS onto the 0-100 RSI scale (RS = inf becomes exactly 100)
df['RSI_14'] = 100 - (100 / (1 + rs))
df


Price,Close,High,Low,Open,Volume,SMA_10,SMA_30,Momentum_10,RSI_14
Ticker,STT,STT,STT,STT,STT,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2015-01-02,58.966736,59.689238,58.485068,59.621506,1421300,,,,
2015-01-05,57.988350,58.883948,57.785143,58.567855,3265100,,,,
2015-01-06,56.332615,57.920616,56.084253,57.800202,3254800,,,,
2015-01-07,56.859428,57.145417,56.460542,57.024997,2158000,,,,
2015-01-08,58.063606,58.176498,57.363682,57.484096,1590100,,,,
...,...,...,...,...,...,...,...,...,...
2023-12-22,73.267914,73.932343,72.907217,73.438768,1536500,72.862604,69.098750,3.056396,60.736674
2023-12-26,73.770981,73.951329,73.154006,73.362829,1608600,73.158752,69.418944,2.961487,66.025703
2023-12-27,74.008278,74.236081,73.353335,73.695044,1154800,73.429272,69.764449,2.705193,76.099127
2023-12-28,74.435410,74.577789,74.169638,74.558809,1394300,73.601075,70.064392,1.718033,73.351157



## 🌪️ 4. Rolling Volatility
- **Rolling_STD_20**: 20-day rolling standard deviation of the closing price.
- Purpose: Captures the stock's recent price volatility and risk.

---


In [14]:

# 4. Rolling Standard Deviation (Volatility)
# Standard deviation of the closing price over a trailing 20-row window
close_window_20 = df['Close'].rolling(window=20)
df['Rolling_STD_20'] = close_window_20.std()
df

Price,Close,High,Low,Open,Volume,SMA_10,SMA_30,Momentum_10,RSI_14,Rolling_STD_20
Ticker,STT,STT,STT,STT,STT,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2015-01-02,58.966736,59.689238,58.485068,59.621506,1421300,,,,,
2015-01-05,57.988350,58.883948,57.785143,58.567855,3265100,,,,,
2015-01-06,56.332615,57.920616,56.084253,57.800202,3254800,,,,,
2015-01-07,56.859428,57.145417,56.460542,57.024997,2158000,,,,,
2015-01-08,58.063606,58.176498,57.363682,57.484096,1590100,,,,,
...,...,...,...,...,...,...,...,...,...,...
2023-12-22,73.267914,73.932343,72.907217,73.438768,1536500,72.862604,69.098750,3.056396,60.736674,2.483310
2023-12-26,73.770981,73.951329,73.154006,73.362829,1608600,73.158752,69.418944,2.961487,66.025703,2.315673
2023-12-27,74.008278,74.236081,73.353335,73.695044,1154800,73.429272,69.764449,2.705193,76.099127,2.122698
2023-12-28,74.435410,74.577789,74.169638,74.558809,1394300,73.601075,70.064392,1.718033,73.351157,2.050978



## ⏮️ 5. Lag Features
- **Lag_1**, **Lag_3**, **Lag_7**: Closing prices from 1, 3, and 7 trading days earlier.
- Purpose: Provides past price points as predictors for future price behavior.

---


In [15]:
# 5. Lag Features
# One column per lag: the closing price shifted down by that many rows
for lag in (1, 3, 7):
    df[f'Lag_{lag}'] = df['Close'].shift(lag)
df


Price,Close,High,Low,Open,Volume,SMA_10,SMA_30,Momentum_10,RSI_14,Rolling_STD_20,Lag_1,Lag_3,Lag_7
Ticker,STT,STT,STT,STT,STT,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2015-01-02,58.966736,59.689238,58.485068,59.621506,1421300,,,,,,,,
2015-01-05,57.988350,58.883948,57.785143,58.567855,3265100,,,,,,58.966736,,
2015-01-06,56.332615,57.920616,56.084253,57.800202,3254800,,,,,,57.988350,,
2015-01-07,56.859428,57.145417,56.460542,57.024997,2158000,,,,,,56.332615,58.966736,
2015-01-08,58.063606,58.176498,57.363682,57.484096,1590100,,,,,,56.859428,57.988350,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,73.267914,73.932343,72.907217,73.438768,1536500,72.862604,69.098750,3.056396,60.736674,2.483310,73.097061,73.733009,72.717377
2023-12-26,73.770981,73.951329,73.154006,73.362829,1608600,73.158752,69.418944,2.961487,66.025703,2.315673,73.267914,72.185829,74.558807
2023-12-27,74.008278,74.236081,73.353335,73.695044,1154800,73.429272,69.764449,2.705193,76.099127,2.122698,73.770981,73.097061,73.685547
2023-12-28,74.435410,74.577789,74.169638,74.558809,1394300,73.601075,70.064392,1.718033,73.351157,2.050978,74.008278,73.267914,73.267914



## 🧼 Data Cleanup
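- Rolling and shifted features are `NaN` in the first rows of the series, where a full window or lagged value is not yet available (the 30-day SMA has the longest warm-up: the first 29 rows).
- Rows containing any `NaN` are dropped to ensure the dataset is ready for modeling.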

---


In [16]:
# Remove every row that contains at least one NaN (the warm-up rows of the
# rolling and lagged features)
df = df.dropna(axis=0, how='any')
df

Price,Close,High,Low,Open,Volume,SMA_10,SMA_30,Momentum_10,RSI_14,Rolling_STD_20,Lag_1,Lag_3,Lag_7
Ticker,STT,STT,STT,STT,STT,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2015-02-13,57.898041,58.515178,57.694839,58.138875,2536700,56.966312,56.297242,4.079132,65.994928,1.561393,58.244247,58.153923,55.519810
2015-02-17,58.153923,58.266815,56.490663,56.686342,3007900,57.289180,56.270148,3.228676,71.480686,1.625802,57.898041,57.943195,56.212200
2015-02-18,57.386269,58.236712,57.250799,58.063616,2122900,57.411856,56.250079,1.226765,73.073520,1.628442,58.153923,58.244247,57.288429
2015-02-19,57.604523,57.890511,57.070173,57.122855,2082100,57.620328,56.292476,2.084713,72.486014,1.655923,57.386269,57.898041,57.318527
2015-02-20,58.424858,58.470018,57.145429,57.393791,2418000,57.841594,56.344657,2.212658,76.984247,1.680074,57.604523,58.153923,58.153923
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,73.267914,73.932343,72.907217,73.438768,1536500,72.862604,69.098750,3.056396,60.736674,2.483310,73.097061,73.733009,72.717377
2023-12-26,73.770981,73.951329,73.154006,73.362829,1608600,73.158752,69.418944,2.961487,66.025703,2.315673,73.267914,72.185829,74.558807
2023-12-27,74.008278,74.236081,73.353335,73.695044,1154800,73.429272,69.764449,2.705193,76.099127,2.122698,73.770981,73.097061,73.685547
2023-12-28,74.435410,74.577789,74.169638,74.558809,1394300,73.601075,70.064392,1.718033,73.351157,2.050978,74.008278,73.267914,73.267914



## ✅ Final Feature Set Preview
The following features are now available for model training:
- `Close`
- `SMA_10`, `SMA_30`
- `Momentum_10`
- `RSI_14`
- `Rolling_STD_20`
- `Lag_1`, `Lag_3`, `Lag_7`


In [17]:

# Preview the first rows of the engineered feature columns
feature_columns = [
    'Close',
    'SMA_10', 'SMA_30',
    'Momentum_10',
    'RSI_14',
    'Rolling_STD_20',
    'Lag_1', 'Lag_3', 'Lag_7',
]
df[feature_columns].head()


Price,Close,SMA_10,SMA_30,Momentum_10,RSI_14,Rolling_STD_20,Lag_1,Lag_3,Lag_7
Ticker,STT,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2015-02-13,57.898041,56.966312,56.297242,4.079132,65.994928,1.561393,58.244247,58.153923,55.51981
2015-02-17,58.153923,57.28918,56.270148,3.228676,71.480686,1.625802,57.898041,57.943195,56.2122
2015-02-18,57.386269,57.411856,56.250079,1.226765,73.07352,1.628442,58.153923,58.244247,57.288429
2015-02-19,57.604523,57.620328,56.292476,2.084713,72.486014,1.655923,57.386269,57.898041,57.318527
2015-02-20,58.424858,57.841594,56.344657,2.212658,76.984247,1.680074,57.604523,58.153923,58.153923


In [18]:
# Export the final feature-engineered dataset to CSV
# Single source of truth for the destination, used by both the write and the message.
output_path = "../data/STT_features.csv"

# Create the target directory if it is missing so to_csv does not fail
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=True writes the DataFrame index (the dates) as the first CSV column
df.to_csv(output_path, index=True)

print(f"✅ Feature-engineered dataset saved to: {output_path}")


✅ Feature-engineered dataset saved to: ../data/STT_features.csv'
