In [None]:
# Install necessary libraries
!pip install pandas_ta xgboost plotly scikit-learn optuna imbalanced-learn

# Import necessary libraries
import pandas as pd
import numpy as np
import yfinance as yf
import pandas_ta as ta
from sqlalchemy import create_engine
import plotly.graph_objs as go
import plotly.express as px
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, precision_recall_curve, roc_curve, auc
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import optuna



In [None]:
# Download stock data for a specific ticker (e.g., Apple)
ticker = 'AAPL'
df = yf.download(ticker, start='2022-01-01', end='2023-01-01')

# A failed or empty download would otherwise only surface later as a confusing
# error inside the indicator calculations, so stop here with a clear message.
if df.empty:
    raise ValueError(f"No price data was downloaded for {ticker!r}")

# Some yfinance releases return (field, ticker) MultiIndex columns even for a
# single ticker; keep only the field level so df['Close'] is a plain Series.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)

[*********************100%%**********************]  1 of 1 completed


In [None]:
# Technical indicators computed with pandas_ta, all derived from the closing price
close = df['Close']

# Moving averages over 50 and 200 rows, plus a 14-row RSI
df['SMA_50'] = ta.sma(close, length=50)
df['SMA_200'] = ta.sma(close, length=200)
df['RSI'] = ta.rsi(close, length=14)

# Bollinger Bands (length 20): copy the upper / middle / lower series into df
bbands = ta.bbands(close, length=20)
for column, source in (
    ('upper_band', 'BBU_20_2.0'),
    ('middle_band', 'BBM_20_2.0'),
    ('lower_band', 'BBL_20_2.0'),
):
    df[column] = bbands[source]

# MACD line, signal line and histogram
macd = ta.macd(close)
for column, source in (
    ('MACD', 'MACD_12_26_9'),
    ('MACD_signal', 'MACDs_12_26_9'),
    ('MACD_hist', 'MACDh_12_26_9'),
):
    df[column] = macd[source]

# Day-over-day percentage change and its compounded growth factor
df['Daily_Return'] = close.pct_change()
df['Cumulative_Return'] = df['Daily_Return'].add(1).cumprod()

In [None]:
# Connect to SQLite database (the file trade_data.db is created if it does not exist)
engine = create_engine('sqlite:///trade_data.db')

# Turn the date index into a regular 'Date' column exactly once. An unconditional
# in-place reset would add a stray 'index' column every time this cell is re-run.
if 'Date' not in df.columns:
    df = df.reset_index()

# Replace any table left over from a previous run so the data always matches df
df.to_sql('trades', engine, if_exists='replace', index=False)

# Query the data using SQLAlchemy
query = """
SELECT Date, Open, High, Low, Close, Volume, SMA_50, SMA_200, RSI, upper_band, middle_band, lower_band, MACD, MACD_signal, MACD_hist, Daily_Return, Cumulative_Return
FROM trades
"""
# SQLite has no native datetime type, so ask pandas to parse Date back into
# datetimes instead of leaving it as text.
df_summary = pd.read_sql(query, engine, parse_dates=['Date'])

In [None]:
# Feature Engineering for Machine Learning
df_ml = df[['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_50', 'SMA_200', 'RSI', 'upper_band', 'lower_band', 'MACD', 'MACD_signal', 'Daily_Return']].copy()
df_ml['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)  # 1 if next day close is higher, else 0

# Drop NaN values
df_ml.dropna(inplace=True)

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(df_ml.drop('Target', axis=1))
y = df_ml['Target']

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Helper: fit a model on the training split and score it on the test split
def get_classification_report(model, X_train, X_test, y_train, y_test):
    """Train ``model`` and return its test-set classification report.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place on the training data.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: True labels for the test features.

    Returns:
        The dict produced by ``classification_report(..., output_dict=True)``.
        ``zero_division=0`` is passed through to that call.
    """
    model.fit(X_train, y_train)
    return classification_report(
        y_test,
        model.predict(X_test),
        output_dict=True,
        zero_division=0,
    )

In [None]:
# Baseline classifiers: default hyperparameters, fixed random_state
base_models = dict([
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("Support Vector Machine", SVC(random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
])

# Fit every baseline and keep its test-set classification report, keyed by model name
base_reports = {}
for model_name in base_models:
    model = base_models[model_name]
    base_reports[model_name] = get_classification_report(model, X_train, X_test, y_train, y_test)

In [None]:
# Hyperparameter tuning with Optuna for XGBoost
def objective(trial):
    """Score one sampled XGBoost configuration for Optuna.

    Args:
        trial: Optuna trial used to sample n_estimators, max_depth,
            learning_rate (log scale), subsample and colsample_bytree.

    Returns:
        Mean weighted F1 over 3-fold cross-validation on the module-level
        ``X_train`` / ``y_train``.
    """
    candidate = XGBClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 3, 9),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        random_state=42,
    )
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=3, scoring='f1_weighted')
    return fold_scores.mean()

# Run the hyperparameter search. The sampler is seeded so the 100 trials, and
# therefore best_params, are reproducible across runs, matching the fixed
# random_state=42 used for every model in this notebook.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)
best_params = study.best_params

# Refit XGBoost on the full training set with the best parameters found
xgb_best_model = XGBClassifier(**best_params, random_state=42)
xgb_best_model.fit(X_train, y_train)

[I 2024-07-12 02:15:12,407] A new study created in memory with name: no-name-e7379d3d-0fee-4caf-af08-132937324202
[I 2024-07-12 02:15:12,690] Trial 0 finished with value: 0.5147832559597266 and parameters: {'n_estimators': 154, 'max_depth': 6, 'learning_rate': 0.20094522270510476, 'subsample': 0.989555268500383, 'colsample_bytree': 0.6155542880521065}. Best is trial 0 with value: 0.5147832559597266.
[I 2024-07-12 02:15:16,285] Trial 1 finished with value: 0.5435897435897435 and parameters: {'n_estimators': 161, 'max_depth': 7, 'learning_rate': 0.02946386822305972, 'subsample': 0.7828814147156365, 'colsample_bytree': 0.8701945286410235}. Best is trial 1 with value: 0.5435897435897435.
[I 2024-07-12 02:15:18,660] Trial 2 finished with value: 0.5263736263736264 and parameters: {'n_estimators': 189, 'max_depth': 3, 'learning_rate': 0.02020436530430398, 'subsample': 0.603505561063834, 'colsample_bytree': 0.5215824102599774}. Best is trial 1 with value: 0.5435897435897435.
[I 2024-07-12 02:1

In [None]:
# Evaluate the tuned model(s) with the same helper used for the baselines
# (the helper refits each model before predicting)
tuned_models = {"XGBoost (Tuned)": xgb_best_model}

tuned_reports = {}
for model_name, model in tuned_models.items():
    tuned_reports[model_name] = get_classification_report(model, X_train, X_test, y_train, y_test)

# Baseline reports first, tuned reports after them
all_reports = dict(base_reports)
all_reports.update(tuned_reports)

# Row label -> key path inside a classification_report dict
metric_paths = {
    'Class 0 Precision': ('0', 'precision'),
    'Class 0 Recall': ('0', 'recall'),
    'Class 0 F1-Score': ('0', 'f1-score'),
    'Class 1 Precision': ('1', 'precision'),
    'Class 1 Recall': ('1', 'recall'),
    'Class 1 F1-Score': ('1', 'f1-score'),
    'Accuracy': ('accuracy',),
    'Macro Avg F1-Score': ('macro avg', 'f1-score'),
    'Weighted Avg F1-Score': ('weighted avg', 'f1-score'),
}


def _lookup(report, path):
    """Follow ``path`` (a tuple of keys) down into the nested ``report`` dict."""
    value = report
    for key in path:
        value = value[key]
    return value


# One column per model, one row per metric
comparison_table = pd.DataFrame(
    {
        model_name: [_lookup(report, path) for path in metric_paths.values()]
        for model_name, report in all_reports.items()
    },
    index=list(metric_paths),
)

print(comparison_table)

                       Random Forest  Logistic Regression  \
Class 0 Precision           0.700000             0.857143   
Class 0 Recall              0.700000             0.600000   
Class 0 F1-Score            0.700000             0.705882   
Class 1 Precision           0.625000             0.636364   
Class 1 Recall              0.625000             0.875000   
Class 1 F1-Score            0.625000             0.736842   
Accuracy                    0.666667             0.722222   
Macro Avg F1-Score          0.662500             0.721362   
Weighted Avg F1-Score       0.666667             0.719642   

                       Support Vector Machine   XGBoost  XGBoost (Tuned)  
Class 0 Precision                    0.500000  0.833333         1.000000  
Class 0 Recall                       0.200000  0.500000         0.700000  
Class 0 F1-Score                     0.285714  0.625000         0.823529  
Class 1 Precision                    0.428571  0.583333         0.727273  
Class 1 Recall

In [None]:
# Data visualization using Plotly
# Closing price with both moving averages (solid) and the Bollinger Bands (dashed)
price_traces = [
    ('Close', 'Close Price', {}),
    ('SMA_50', 'SMA 50', {}),
    ('SMA_200', 'SMA 200', {}),
    ('upper_band', 'Upper Band', {'line': dict(dash='dash')}),
    ('middle_band', 'Middle Band', {'line': dict(dash='dash')}),
    ('lower_band', 'Lower Band', {'line': dict(dash='dash')}),
]

fig = go.Figure()
for column, label, extra in price_traces:
    fig.add_trace(
        go.Scatter(x=df_summary['Date'], y=df_summary[column], mode='lines', name=label, **extra)
    )
fig.update_layout(
    title='Daily Closing Price with SMA and Bollinger Bands',
    xaxis_title='Date',
    yaxis_title='Price (USD)',
)
fig.show()

In [None]:
# Daily trading volume as a bar chart
fig = px.bar(
    data_frame=df_summary,
    x='Date',
    y='Volume',
    title='Daily Trading Volume',
)
axis_titles = dict(xaxis_title='Date', yaxis_title='Volume')
fig.update_layout(**axis_titles)
fig.show()

In [None]:
# RSI over time as a single line
rsi_trace = go.Scatter(
    x=df_summary['Date'],
    y=df_summary['RSI'],
    mode='lines',
    name='RSI',
)
fig = go.Figure(data=[rsi_trace])
fig.update_layout(
    title='Relative Strength Index (RSI)',
    xaxis_title='Date',
    yaxis_title='RSI',
)
fig.show()

In [None]:
# MACD and signal as lines, histogram as bars, all on one figure
dates = df_summary['Date']
macd_traces = [
    go.Scatter(x=dates, y=df_summary['MACD'], mode='lines', name='MACD'),
    go.Scatter(x=dates, y=df_summary['MACD_signal'], mode='lines', name='MACD Signal'),
    go.Bar(x=dates, y=df_summary['MACD_hist'], name='MACD Histogram'),
]
fig = go.Figure(data=macd_traces)
fig.update_layout(
    title='MACD (Moving Average Convergence Divergence)',
    xaxis_title='Date',
    yaxis_title='Value',
)
fig.show()

In [None]:
# Cumulative return over time as a single line
cumulative_trace = go.Scatter(
    x=df_summary['Date'],
    y=df_summary['Cumulative_Return'],
    mode='lines',
    name='Cumulative Return',
)
fig = go.Figure(data=[cumulative_trace])
fig.update_layout(
    title='Cumulative Return',
    xaxis_title='Date',
    yaxis_title='Cumulative Return',
)
fig.show()

In [None]:
# Precision-Recall Curve: one figure per tuned model
for model_name, model in tuned_models.items():
    # Second predict_proba column is used as the score for label 1
    positive_scores = model.predict_proba(X_test)[:, 1]
    precision, recall, _ = precision_recall_curve(y_test, positive_scores)

    pr_trace = go.Scatter(x=recall, y=precision, mode='lines', name=model_name)
    fig = go.Figure(data=[pr_trace])
    fig.update_layout(
        title=f'Precision-Recall Curve for {model_name}',
        xaxis_title='Recall',
        yaxis_title='Precision',
    )
    fig.show()

In [None]:
# ROC Curve: one figure per tuned model, with the AUC shown in the legend entry
for model_name, model in tuned_models.items():
    # Second predict_proba column is used as the score for label 1
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    auc_score = auc(fpr, tpr)

    roc_trace = go.Scatter(
        x=fpr,
        y=tpr,
        mode='lines',
        name=f'{model_name} (AUC = {auc_score:.2f})',
    )
    fig = go.Figure(data=[roc_trace])
    fig.update_layout(
        title=f'ROC Curve for {model_name}',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
    )
    fig.show()

In [None]:
# Close the database connection: dispose() releases the connections held by the
# engine created for trade_data.db, now that all reads and writes are done.
engine.dispose()