#  Data Acquisition and Feature Engineering

In [15]:
# Import libraries for data handling and stock data retrieval
import yfinance as yf
import pandas as pd
import numpy as np

# Download historical stock data for Apple Inc.
ticker = "AAPL"
data = yf.download(ticker, start="2020-01-01", end="2025-04-01")

# Print available columns to help diagnose the structure
print("Available columns:", data.columns)

# Use 'Adj Close' if available; otherwise, fall back to 'Close'
price_col = 'Adj Close' if 'Adj Close' in data.columns else 'Close'
if price_col == 'Close':
    print("Warning: 'Adj Close' not found. Using 'Close' instead.")

# Feature Engineering:
# Calculate daily returns
data['Return'] = data[price_col].pct_change()

# Compute a 20-day moving average and 20-day volatility
data['MA20'] = data[price_col].rolling(window=20).mean()
data['Volatility'] = data['Return'].rolling(window=20).std()

# Drop any rows with missing values due to rolling calculations
data.dropna(inplace=True)

# Define the target: bullish if daily return > 0, bearish otherwise
data['Trend'] = np.where(data['Return'] > 0, 'bullish', 'bearish')

# Display the first few rows of the dataset
print(data.tail())


[*********************100%***********************]  1 of 1 completed

Available columns: MultiIndex([( 'Close', 'AAPL'),
            (  'High', 'AAPL'),
            (   'Low', 'AAPL'),
            (  'Open', 'AAPL'),
            ('Volume', 'AAPL')],
           names=['Price', 'Ticker'])
Price            Close        High         Low        Open    Volume  \
Ticker            AAPL        AAPL        AAPL        AAPL      AAPL   
Date                                                                   
2025-03-25  223.750000  224.100006  220.080002  220.770004  34493600   
2025-03-26  221.529999  225.020004  220.470001  223.509995  34466100   
2025-03-27  223.850006  224.990005  220.559998  221.389999  37094800   
2025-03-28  217.899994  223.809998  217.679993  221.669998  39818600   
2025-03-31  222.130005  225.619995  216.229996  217.009995  65299300   

Price         Return        MA20 Volatility    Trend  
Ticker                                                
Date                                                  
2025-03-25  0.013682  225.542500   0.019




# Baseline Model Creation

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Define features and target variable
features = ['MA20', 'Volatility']
X = data[features]
y = data['Trend']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Initialize and train the baseline Random Forest model
baseline_model = RandomForestClassifier(n_estimators=100, random_state=42)
baseline_model.fit(X_train, y_train)

# Make predictions and evaluate the baseline model
baseline_predictions = baseline_model.predict(X_test)
print("Baseline Model Accuracy:", accuracy_score(y_test, baseline_predictions))
print("Baseline Classification Report:\n", classification_report(y_test, baseline_predictions))


Baseline Model Accuracy: 0.48
Baseline Classification Report:
               precision    recall  f1-score   support

     bearish       0.44      0.48      0.46       148
     bullish       0.52      0.48      0.50       177

    accuracy                           0.48       325
   macro avg       0.48      0.48      0.48       325
weighted avg       0.48      0.48      0.48       325



# Pipeline with Feature Scaling

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Create a pipeline that scales the features and then applies the Random Forest classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions using the pipeline and evaluate
pipeline_predictions = pipeline.predict(X_test)
print("Pipeline Model Accuracy:", accuracy_score(y_test, pipeline_predictions))
print("Pipeline Classification Report:\n", classification_report(y_test, pipeline_predictions))


Pipeline Model Accuracy: 0.48
Pipeline Classification Report:
               precision    recall  f1-score   support

     bearish       0.44      0.48      0.46       148
     bullish       0.52      0.48      0.50       177

    accuracy                           0.48       325
   macro avg       0.48      0.48      0.48       325
weighted avg       0.48      0.48      0.48       325



# Hyperparameter Tuning

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Define a parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Setup GridSearchCV using the pipeline with cross-validation
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring=make_scorer(accuracy_score),
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Retrieve and display the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Evaluate the optimized model on the test set
optimized_model = grid_search.best_estimator_
optimized_predictions = optimized_model.predict(X_test)
print("Optimized Model Accuracy:", accuracy_score(y_test, optimized_predictions))
print("Optimized Classification Report:\n", classification_report(y_test, optimized_predictions))


Best Hyperparameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Optimized Model Accuracy: 0.5076923076923077
Optimized Classification Report:
               precision    recall  f1-score   support

     bearish       0.46      0.48      0.47       148
     bullish       0.55      0.53      0.54       177

    accuracy                           0.51       325
   macro avg       0.51      0.51      0.51       325
weighted avg       0.51      0.51      0.51       325

