# Installing the necessary libraries

In [None]:
%pip install yfinance pandas matplotlib sklearn


# I am looking at taking the quarterly data from yahoo finance. Doing it from the perspective of a investor. This can be edited into daily data depending on the user's preferences

This part will call for the download of data from the Yahoo Finance API. For this, I have fetched both daily and quarterly to allow me to have a clearer picture of the data, and what I can choose to use later on. Currently set ticker to show AAPL only, but user can edit it to fit other companies.

Additional thing to note is that we can just use the daily data and convert it to quarterly using the below

Resample daily data to quarter-end frequency, taking the last available price of each quarter
data_quarterly_alt = data_daily.resample('Q').last()


In [1]:
import yfinance as yf

# Fetch daily historical data for the past 10 years
ticker = "AAPL"  # Apple Inc. as an example
data_daily = yf.download(ticker, start="2015-01-01", end="2025-01-01", interval="1d")
print("Daily data shape:", data_daily.shape)
print(data_daily.head())

# Fetch quarterly historical data for the past 10 years
data_quarterly = yf.download(ticker, start="2015-01-01", end="2025-01-01", interval="3mo")
print("Quarterly data shape:", data_quarterly.shape)
print(data_quarterly.head())


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Daily data shape: (2516, 5)
Price           Close       High        Low       Open     Volume
Ticker           AAPL       AAPL       AAPL       AAPL       AAPL
Date                                                             
2015-01-02  24.320433  24.789802  23.879981  24.778679  212818400
2015-01-05  23.635281  24.169160  23.448424  24.089078  257142000
2015-01-06  23.637512  23.897778  23.274918  23.699798  263188400
2015-01-07  23.968964  24.069065  23.735391  23.846616  160423600
2015-01-08  24.889900  24.947738  24.180285  24.298185  237458000
Quarterly data shape: (40, 5)
Price           Close       High        Low       Open       Volume
Ticker           AAPL       AAPL       AAPL       AAPL         AAPL
Date                                                               
2015-01-01  27.679419  29.719284  23.274913  24.778674  14321762800
2015-04-01  28.011990  30.046503  27.491636  27.875760  11315577200
2015-07-01  24.735939  29.819925  20.631970  28.458664  15486588000
2015-1




# Data Cleaning and preprocessing 

I will drop any rows with missing values and maintain data integrity. Additionally, I only want to make use of adjusted close and volume as it is the simplest feature to understand as a start. This can be edited in the future

In [3]:
# Drop any rows with missing data (if any)
data = data_daily.dropna().copy()

# Use only the adjusted close and volume for simplicity in feature creation
# safer: check if 'Adj Close' exists, otherwise fallback to 'Close'
if 'Adj Close' in data.columns:
    data = data[['Adj Close', 'Volume']].rename(columns={'Adj Close': 'AdjClose'})
else:
    data = data[['Close', 'Volume']].rename(columns={'Close': 'AdjClose'})




I am labelling the feature to get what I want to see at the end. 1 is for price going up and 0 is price is going down or maintain the same. Practically binary classification.

In [4]:
# Create target label: 1 if next period's price is higher than current, else 0
data['Target'] = (data['AdjClose'].shift(-1) > data['AdjClose']).astype(int)

print(data[['AdjClose', 'Target']].tail(5))


Price         AdjClose Target
Ticker            AAPL       
Date                         
2024-12-24  257.916443      1
2024-12-26  258.735504      0
2024-12-27  255.309296      0
2024-12-30  251.923019      0
2024-12-31  250.144974      0


Now will be the crucial part, which is the train-test split. This allows for the model to train on a certain amount of data from the whole dataset, and then test what it has trained on the unseen data from the dataset. In this situation, I put 80% of the data for training.

In [5]:
# Assume data is sorted by date ascending
train_size = int(len(data) * 0.8)
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:]

print("Training period:", train_data.index[0], "to", train_data.index[-1])
print("Testing period:", test_data.index[0], "to", test_data.index[-1])


Training period: 2015-01-02 00:00:00 to 2022-12-28 00:00:00
Testing period: 2022-12-29 00:00:00 to 2024-12-31 00:00:00


# This is the feature engineering portion. 

I will be crafting a few features from the current available data from the AAPL ticker data that was downloaded earlier. 

Feature 1: Recent Price Change (Momentum) –today’s return (percentage change from previous close)

Return is the momentum of the current day, and Return_1, Return_2, etc. are the momentum from previous days. We shift them so that on any given day, we only use information from that day or earlier as features (avoiding peeking into the future).

In [6]:
# Daily percentage return as a feature
train_data['Return'] = train_data['AdjClose'].pct_change() * 100  # percent change
# Lagged returns for last 3 days as features (to capture short-term momentum)
train_data['Return_1'] = train_data['Return'].shift(1)
train_data['Return_2'] = train_data['Return'].shift(2)
train_data['Return_3'] = train_data['Return'].shift(3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Return'] = train_data['AdjClose'].pct_change() * 100  # percent change
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Return_1'] = train_data['Return'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Return_2'] = train_data['Return'].shift(2)
A value is t

Feature 2: Moving Averages (Trend) –  5-day and 20-day moving averages of closing price, and their difference

If MA_gap is positive, the short-term average is above the long-term average (an indication of upward momentum, often a “buy” crossover signal in technical analysis). If negative, it might indicate a downtrend.

In [7]:
train_data['MA_5'] = train_data['AdjClose'].rolling(window=5).mean()
train_data['MA_20'] = train_data['AdjClose'].rolling(window=20).mean()
train_data['MA_gap'] = train_data['MA_5'] - train_data['MA_20']  # difference between short and long MA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['MA_5'] = train_data['AdjClose'].rolling(window=5).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['MA_20'] = train_data['AdjClose'].rolling(window=20).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['MA_gap'] = train_data['MA_5'] - train_data['MA_20']  

Feature 3: Volume Change – e.g., volume of previous day or volume percent change

I included Volume_change (how much volume changed from yesterday, as a percentage) or simply the previous day’s volume Vol_prev as a feature. The idea is to capture unusual volume spikes.

In [8]:
train_data['Volume_change'] = train_data['Volume'].pct_change() * 100
train_data['Vol_prev'] = train_data['Volume'].shift(1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Volume_change'] = train_data['Volume'].pct_change() * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Vol_prev'] = train_data['Volume'].shift(1)


After creating these features, remove any rows with NaN (the first few days where moving averages or lagged returns can’t be computed). 

In [9]:
train_data = train_data.dropna()


# In this portion, we will be using logistic regression, decision tree and random forest as out 3 algorithm to train the model

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Prepare training data matrices
X_train = train_data[['Return_1', 'Return_2', 'Return_3', 'MA_gap', 'Vol_prev']]  # using a subset of features for example
y_train = train_data['Target']

X_test = test_data[['Return_1', 'Return_2', 'Return_3', 'MA_gap', 'Vol_prev']]
y_test = test_data['Target']

# Initialize models
log_reg = LogisticRegression(max_iter=1000)
tree = DecisionTreeClassifier(max_depth=5, random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0)

# Train the models
log_reg.fit(X_train, y_train)
tree.fit(X_train, y_train)
rf.fit(X_train, y_train)


KeyError: "['Return_1' 'Return_2' 'Return_3' 'MA_gap' 'Vol_prev'] not in index"

# Training and validating the model

In [None]:
from sklearn.metrics import accuracy_score

# Make predictions on the test set
pred_log = log_reg.predict(X_test)
pred_tree = tree.predict(X_test)
pred_rf = rf.predict(X_test)

# Calculate accuracy
acc_log = accuracy_score(y_test, pred_log)
acc_tree = accuracy_score(y_test, pred_tree)
acc_rf = accuracy_score(y_test, pred_rf)

print(f"Logistic Regression Accuracy: {acc_log:.2%}")
print(f"Decision Tree Accuracy: {acc_tree:.2%}")
print(f"Random Forest Accuracy: {acc_rf:.2%}")


# Generating Buy/Hold/Sell Signals from Predictions

In [None]:
# Predict probabilities for class 1 (up) on the test set
probs = log_reg.predict_proba(X_test)[:, 1]  # probability of class 1

signals = []
for p in probs:
    if p > 0.6:
        signals.append("Buy")   # model very confident the stock will go up
    elif p < 0.4:
        signals.append("Sell")  # model very confident the stock will go down
    else:
        signals.append("Hold")  # model isn't sure, so we take no action
