In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyodbc
import seaborn as sns
from keras.layers import LSTM, Dense
from keras.models import Sequential
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sqlalchemy import create_engine
from tensorflow.keras.layers import LSTM, Input, Dense
from tqdm import tqdm
from xgboost import XGBRegressor

In [None]:
########### CONNECT TO ThalesStockPredictor SQLServer DB

# Connection string: trusted (Windows) authentication against the
# ThalesStockPredictor database on server "MSI" via ODBC Driver 17.
connStr = 'mssql+pyodbc://@MSI/ThalesStockPredictor?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server'

# Create SQLAlchemy engine
engine = create_engine(connStr)

# Pull every row of the combined modelling view, newest date first
query = """
SELECT * FROM vw_COMBINED_MODEL
ORDER BY FK_DT_Date desc
"""

# Run the query into a pandas DataFrame. The engine is disposed in `finally`
# so pooled SQL Server connections are released even when the query fails.
try:
    Model_Data = pd.read_sql(query, engine)
finally:
    engine.dispose()

In [None]:
# View the data: display the frame loaded from vw_COMBINED_MODEL
Model_Data

In [None]:
# Copy original data frame for normalization, so Model_Data keeps the
# untouched query result while Model_ML is transformed in later cells
Model_ML = Model_Data.copy()

In [None]:
# Display the working copy
Model_ML

In [None]:
# Rebuild the working frame from the untouched query result so this cell can
# be re-run safely (re-converting an already numeric date column would corrupt it).
# Rows with NA values are removed first (equivalent to na.omit in R) and the
# index is renumbered; the result has to be assigned because reset_index
# returns a new frame.
Model_ML = Model_Data.dropna().reset_index(drop=True)

# Convert 'FK_DT_Date' to UNIX epoch seconds (numeric date).
# Subtracting the epoch and floor-dividing by one second does not depend on
# the datetime resolution of the column.
# NOTE(review): assumes FK_DT_Date is timezone-naive - confirm against the view.
Model_ML['FK_DT_Date'] = (
    (pd.to_datetime(Model_ML['FK_DT_Date']) - pd.Timestamp("1970-01-01"))
    // pd.Timedelta("1s")
)

Model_ML

In [None]:
# Seed shared by both splits so the partition is reproducible
seed = 123

# Share of rows used for training; the remaining rows form a hold-out pool,
# of which val_prop goes to the testing set and the rest to validation
train_prop = 0.8
val_prop = 0.5

# First cut: training rows vs. hold-out pool
# NOTE(review): train_test_split shuffles rows by default; for date-ordered
# data this mixes past and future observations - confirm this is intended.
train_data, initial_test_data = train_test_split(
    Model_ML,
    test_size=1-train_prop,
    random_state=seed,
)

# Second cut: hold-out pool into validation and testing sets
validate_data, test_data = train_test_split(
    initial_test_data,
    test_size=val_prop,
    random_state=seed,
)

# Renumber each subset from 0 so the shuffled original index is discarded
train_data = train_data.reset_index(drop=True)
validate_data = validate_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [None]:
# Feature Importance for Un-Normalized train_data dataframe
# Separate features (every column except the target) and the target variable
features = train_data.drop("THA_NextDay_Close", axis=1)
target = train_data["THA_NextDay_Close"]

# Random Forest regressor with 500 trees. random_state reuses the notebook
# seed so the fitted forest, and the importances read from it, are the same
# on every run.
RFmodel = RandomForestRegressor(n_estimators=500, random_state=seed)

# Train the model
RFmodel.fit(features, target)

In [None]:
# Get feature importances and rank them from most to least important
importance = RFmodel.feature_importances_
feature_names = features.columns
sorted_idx = importance.argsort()[::-1]  # Indices in descending importance order
sorted_features = feature_names[sorted_idx]
sorted_importance = importance[sorted_idx]

# Print the ranking. The loop variables use their own names so the
# `importance` array above is not overwritten by a single float.
print("Feature Importances (without scientific notation):\n")
for feature_name, feature_importance in zip(sorted_features, sorted_importance):
    print(f"{feature_name}: {feature_importance:.8f}")

# Set a threshold for minimum importance (adjust as needed)
importance_threshold = 0.00001500

# Keep only features whose importance reaches the threshold
keep_mask = sorted_importance >= importance_threshold
filtered_features = sorted_features[keep_mask]
filtered_importance = sorted_importance[keep_mask]

# Print the retained features (plotted in the next cell)
print("Sorted features by importance:\n", filtered_features)


In [None]:
# Horizontal bar chart of the un-normalized feature importances that passed
# the threshold filter
fig, ax = plt.subplots(figsize=(10, 12))
ax.barh(filtered_features, filtered_importance, color='skyblue')
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature Importance Visualization: Un-Normalized")
ax.invert_yaxis()  # Most important feature at the top
ax.grid(axis='x', linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()

In [None]:
# Define your own `normalize()` function
def normalize(x):
    """Min-max scale ``x`` to the range [0, 1].

    Args:
        x: A pandas Series (or DataFrame, scaled column by column).

    Returns:
        ``(x - x.min()) / (x.max() - x.min())``. Where the range is zero
        (all values equal) the result is 0 instead of the NaN that a 0/0
        division would produce.
    """
    lowest = x.min()
    span = x.max() - lowest
    # Replace a zero range by 1 so constant data maps to 0 rather than NaN.
    if np.ndim(span) == 0:
        if span == 0:
            span = 1
    else:
        span = span.where(span != 0, 1)
    return (x - lowest) / span

# Columns that keep their original scale (date key and prediction target)
exclude_columns = ["FK_DT_Date", "THA_NextDay_Close"]
feature_columns = [col for col in train_data.columns if col not in exclude_columns]

# Fit the min-max scaling on the TRAINING data only. Validation and testing
# rows are scaled with the same statistics so all three sets share one scale
# and nothing about the held-out rows influences the transformation.
# Held-out values may therefore fall slightly outside [0, 1].
train_min = train_data[feature_columns].min()
train_range = train_data[feature_columns].max() - train_min
train_range = train_range.where(train_range != 0, 1)  # constant columns map to 0, not NaN

# Normalize columns in the training dataset (excluding specified columns)
train_data[feature_columns] = (train_data[feature_columns] - train_min) / train_range

# Normalize columns in the validation dataset with the training statistics
validate_data[feature_columns] = (validate_data[feature_columns] - train_min) / train_range

# Normalize columns in the testing dataset with the training statistics
test_data[feature_columns] = (test_data[feature_columns] - train_min) / train_range

In [None]:
# Feature Importance for Normalized train_data dataframe
# Separate features (every column except the target) and the target variable
features_Norm = train_data.drop("THA_NextDay_Close", axis=1)
target_Norm = train_data["THA_NextDay_Close"]

# Random Forest regressor with 500 trees. random_state reuses the notebook
# seed so the fitted forest, and the importances read from it, are the same
# on every run.
RFmodel_Norm = RandomForestRegressor(n_estimators=500, random_state=seed)

# Train the model
RFmodel_Norm.fit(features_Norm, target_Norm)

In [None]:

# Get feature importances and rank them from most to least important
importance_Norm = RFmodel_Norm.feature_importances_
feature_names_Norm = features_Norm.columns
sorted_idx_Norm = importance_Norm.argsort()[::-1]  # Indices in descending importance order
sorted_features_Norm = feature_names_Norm[sorted_idx_Norm]
sorted_importance_Norm = importance_Norm[sorted_idx_Norm]

# Print the ranking. The loop variables use their own names so the
# `importance_Norm` array above is not overwritten by a single float.
print("Feature Importances (without scientific notation):\n")
for feature_name_Norm, feature_importance_Norm in zip(sorted_features_Norm, sorted_importance_Norm):
    print(f"{feature_name_Norm}: {feature_importance_Norm:.8f}")

# Set a threshold for minimum importance (adjust as needed)
importance_threshold = 0.00001500

# Keep only features whose importance reaches the threshold
keep_mask_Norm = sorted_importance_Norm >= importance_threshold
filtered_features_Norm = sorted_features_Norm[keep_mask_Norm]
filtered_importance_Norm = sorted_importance_Norm[keep_mask_Norm]

# Print the retained features (plotted in the next cell)
print("Sorted features by importance:\n", filtered_features_Norm)

In [None]:
# Horizontal bar chart of the normalized feature importances that passed the
# threshold filter
fig, ax = plt.subplots(figsize=(10, 12))
ax.barh(filtered_features_Norm, filtered_importance_Norm, color='skyblue')
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature Importance Visualization: Normalized")
ax.invert_yaxis()  # Most important feature at the top
ax.grid(axis='x', linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()

In [None]:
# Define features (X) and target (y): every column except the target and the date key
X = train_data.drop(columns=['THA_NextDay_Close', 'FK_DT_Date'])
y = train_data['THA_NextDay_Close']

# Split data into training and test sets (80/20, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models (excluding ARIMA).
# Estimators with internal randomness receive the notebook seed so the
# printed metrics can be reproduced from run to run.
models = {
    'Random Forest': RandomForestRegressor(random_state=seed),
    'k-Nearest Neighbors': KNeighborsRegressor(),
    'Linear Regression': LinearRegression(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=seed),
    'Decision Tree': DecisionTreeRegressor(random_state=seed),
    'Multi-layer Perceptron': MLPRegressor(max_iter=1000, random_state=seed),
    'XGBoost': XGBRegressor(random_state=seed),
    'Support Vector Machine (Poly)': SVR(kernel='poly'),
    'Support Vector Machine (Radial)': SVR(kernel='rbf'),
    'Support Vector Machine (Linear)': SVR(kernel='linear')
}

# Function to calculate and print evaluation metrics
# NOTE: this helper is defined with the same body in several cells of this
# notebook; keep the copies identical.
def evaluate_model(model_name, model, X_test, y_test):
    """Print MSE, RMSE and MAE of a fitted model on a hold-out set.

    Args:
        model_name: Label printed in front of the metrics.
        model: Fitted estimator exposing ``predict``.
        X_test: Features passed to ``model.predict``.
        y_test: True target values the predictions are compared with.

    Returns:
        None. The metrics are only printed, each with four decimals.
    """
    predictions = model.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    absolute_error = mean_absolute_error(y_test, predictions)
    root_squared_error = np.sqrt(squared_error)

    print(f"{model_name}: MSE = {squared_error:.4f}, RMSE = {root_squared_error:.4f}, MAE = {absolute_error:.4f}")


# Loop through models, train, and evaluate
for model_name, model in models.items():
    model.fit(X_train, y_train)
    evaluate_model(model_name, model, X_test, y_test)


# Example LSTM implementation (adjust hyperparameters as needed)
# The LSTM is trained on the same X_train / X_test split as the models above,
# so test rows are never part of the training data and both arrays have the
# same feature columns.
n_features = X_train.shape[1]

# LSTM layers expect 3-D input (samples, timesteps, features); every row is
# fed as a single timestep holding all feature values.
X_train_LSTM = X_train.to_numpy().reshape(-1, 1, n_features)
X_test_LSTM = X_test.to_numpy().reshape(-1, 1, n_features)

# Define the LSTM model with dynamic input shape
model = Sequential()
model.add(Input(shape=(1, n_features)))
model.add(LSTM(units=50, return_sequences=True))
model.add(LSTM(units=50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train_LSTM, y_train, epochs=100, batch_size=32, validation_data=(X_test_LSTM, y_test))
evaluate_model('LSTM', model, X_test_LSTM, y_test)


In [None]:
# Define features (X) and target (y)
# X = train_data.drop(columns=['THA_NextDay_Close', 'FK_DT_Date'])
# y = train_data['THA_NextDay_Close']
# Training Models:   0%|          | 0/10 [00:00<?, ?it/s]
# Training Models:  10%|█         | 1/10 [00:23<03:33, 23.75s/it]
# Random Forest: MSE = 1.7345, RMSE = 1.3170, MAE = 0.7861
# Training Models:  20%|██        | 2/10 [00:23<01:19,  9.90s/it]
# k-Nearest Neighbors: MSE = 1.9726, RMSE = 1.4045, MAE = 0.9244
# Linear Regression: MSE = 1.6161, RMSE = 1.2713, MAE = 0.7594
# Training Models:  40%|████      | 4/10 [00:34<00:41,  6.97s/it]
# Gradient Boosting: MSE = 1.7452, RMSE = 1.3211, MAE = 0.8043
# Training Models:  50%|█████     | 5/10 [00:34<00:24,  4.93s/it]
# Decision Tree: MSE = 2.5008, RMSE = 1.5814, MAE = 1.0122
# Training Models:  60%|██████    | 6/10 [00:42<00:23,  5.81s/it]
# Multi-layer Perceptron: MSE = 2.1818, RMSE = 1.4771, MAE = 0.9099
# Training Models:  70%|███████   | 7/10 [00:44<00:13,  4.67s/it]
# XGBoost: MSE = 2.0275, RMSE = 1.4239, MAE = 0.8628
# Training Models:  80%|████████  | 8/10 [00:45<00:07,  3.59s/it]
# Support Vector Machine (Poly): MSE = 3.0388, RMSE = 1.7432, MAE = 1.1928
# Training Models:  90%|█████████ | 9/10 [00:46<00:02,  2.81s/it]
# Support Vector Machine (Radial): MSE = 5.2402, RMSE = 2.2892, MAE = 1.3272
# Training Models: 100%|██████████| 10/10 [00:47<00:00,  4.75s/it]
# Support Vector Machine (Linear): MSE = 2.1323, RMSE = 1.4602, MAE = 0.8534
# LSTM: MSE = 86.3594, RMSE = 9.2930, MAE = 7.6911

In [None]:
# Define features (every column except the target)
features = [col for col in train_data.columns if col != "THA_NextDay_Close"]

# Define target variable
target = "THA_NextDay_Close"

# Univariate feature scoring. The target is a continuous value, so the
# regression F-test (f_regression) is used; f_classif would treat every
# distinct target value as a separate class.
# k is capped at the number of available columns so SelectKBest cannot be
# asked for more features than exist.
selector = SelectKBest(f_regression, k=min(30, len(features)))
selector.fit(train_data[features], train_data[target])

# F-scores, one per entry of `features`
scores = selector.scores_

# Pair feature names with their scores, highest score first
feature_scores = pd.DataFrame({'feature': features, 'score': scores})
feature_scores = feature_scores.sort_values(by='score', ascending=False)

# Print top features
print("Top Features based on F-value:")
print(feature_scores.head(30))

In [None]:
# Pull the feature names and scores out of the ranking table
features = feature_scores['feature'].to_numpy()
scores = feature_scores['score'].to_numpy()

# Re-pair them and order by score, highest first
sorted_data = pd.DataFrame({'feature': features, 'score': scores}).sort_values(
    by='score', ascending=False
)

# Arrays used for the plot
sorted_features = sorted_data['feature'].to_numpy()
sorted_scores = sorted_data['score'].to_numpy()

# Elbow graph: score of each feature in descending order
fig, ax = plt.subplots()
ax.plot(sorted_features, sorted_scores)
ax.set_xlabel("Features")
ax.set_ylabel("Feature Importance Score")
ax.set_title("Elbow Method for Feature Selection")
ax.tick_params(axis='x', labelrotation=45)  # Rotate feature names for readability
ax.grid(True)

plt.show()

In [None]:
# Keep only the columns whose name starts with 'THA_' (this includes the target)
tha_features = [col for col in train_data.columns if col.startswith('THA_')]
df_filtered = train_data[tha_features]

# Define features (X) and target (y)
X = df_filtered.drop(columns=['THA_NextDay_Close'])
y = df_filtered['THA_NextDay_Close']

# Split data into training and test sets (80/20, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models (excluding ARIMA).
# Estimators with internal randomness receive the notebook seed so the
# printed metrics can be reproduced from run to run.
models = {
    'Random Forest': RandomForestRegressor(random_state=seed),
    'k-Nearest Neighbors': KNeighborsRegressor(),
    'Linear Regression': LinearRegression(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=seed),
    'Decision Tree': DecisionTreeRegressor(random_state=seed),
    'Multi-layer Perceptron': MLPRegressor(max_iter=1000, random_state=seed),
    'XGBoost': XGBRegressor(random_state=seed),
    'Support Vector Machine (Poly)': SVR(kernel='poly'),
    'Support Vector Machine (Radial)': SVR(kernel='rbf'),
    'Support Vector Machine (Linear)': SVR(kernel='linear')
}

# Function to calculate and print evaluation metrics
# NOTE: this helper is defined with the same body in several cells of this
# notebook; keep the copies identical.
def evaluate_model(model_name, model, X_test, y_test):
    """Print MSE, RMSE and MAE of a fitted model on a hold-out set.

    Args:
        model_name: Label printed in front of the metrics.
        model: Fitted estimator exposing ``predict``.
        X_test: Features passed to ``model.predict``.
        y_test: True target values the predictions are compared with.

    Returns:
        None. The metrics are only printed, each with four decimals.
    """
    predictions = model.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    absolute_error = mean_absolute_error(y_test, predictions)
    root_squared_error = np.sqrt(squared_error)

    print(f"{model_name}: MSE = {squared_error:.4f}, RMSE = {root_squared_error:.4f}, MAE = {absolute_error:.4f}")


# Loop through models, train, and evaluate
for model_name, model in models.items():
    model.fit(X_train, y_train)
    evaluate_model(model_name, model, X_test, y_test)


# Example LSTM implementation (adjust hyperparameters as needed)
# The LSTM is trained on the same X_train / X_test split as the models above,
# so the rows used for evaluation are never part of the training data.
n_features = X_train.shape[1]

# LSTM layers expect 3-D input (samples, timesteps, features); every row is
# fed as a single timestep holding all feature values.
X_train_LSTM = X_train.to_numpy().reshape(-1, 1, n_features)
X_test_LSTM = X_test.to_numpy().reshape(-1, 1, n_features)

# Define the LSTM model with dynamic input shape
model = Sequential()
model.add(Input(shape=(1, n_features)))
model.add(LSTM(units=50, return_sequences=True))
model.add(LSTM(units=50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train_LSTM, y_train, epochs=100, batch_size=32, validation_data=(X_test_LSTM, y_test))
evaluate_model('LSTM', model, X_test_LSTM, y_test)

In [None]:
# All Features except FK_DT_Date
# Random Forest: MSE = 1.7035, RMSE = 1.3052, MAE = 0.7885
# k-Nearest Neighbors: MSE = 2.2569, RMSE = 1.5023, MAE = 0.9749
# Linear Regression: MSE = 1.6422, RMSE = 1.2815, MAE = 0.7633
# Gradient Boosting: MSE = 1.7412, RMSE = 1.3195, MAE = 0.8053
# Decision Tree: MSE = 2.5059, RMSE = 1.5830, MAE = 1.0459
# Multi-layer Perceptron: MSE = 2.0624, RMSE = 1.4361, MAE = 0.8723
# XGBoost: MSE = 1.9694, RMSE = 1.4034, MAE = 0.8629
# Support Vector Machine (Poly): MSE = 14.9405, RMSE = 3.8653, MAE = 2.4632
# Support Vector Machine (Radial): MSE = 3.7796, RMSE = 1.9441, MAE = 1.1282
# Support Vector Machine (Linear): MSE = 2.2261, RMSE = 1.4920, MAE = 0.8778
# LSTM: MSE = 2.5670, RMSE = 1.6022, MAE = 1.1268

In [None]:
# Names of the 30 highest-scoring features from the F-value ranking
top_features = list(feature_scores.head(30)['feature'])  # Adjust the number of features (k) as needed

# Define features (X) for model training using selected features
X = train_data[top_features]

# Define target variable (y) for model training and evaluation
y = train_data['THA_NextDay_Close']

# Split data into training and test sets (80/20, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models (excluding ARIMA).
# Estimators with internal randomness receive the notebook seed so the
# printed metrics can be reproduced from run to run.
models = {
    'Random Forest': RandomForestRegressor(random_state=seed),
    'k-Nearest Neighbors': KNeighborsRegressor(),
    'Linear Regression': LinearRegression(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=seed),
    'Decision Tree': DecisionTreeRegressor(random_state=seed),
    'Multi-layer Perceptron': MLPRegressor(max_iter=1000, random_state=seed),
    'XGBoost': XGBRegressor(random_state=seed),
    'Support Vector Machine (Poly)': SVR(kernel='poly'),
    'Support Vector Machine (Radial)': SVR(kernel='rbf'),
    'Support Vector Machine (Linear)': SVR(kernel='linear')
}

# Function to calculate and print evaluation metrics
# NOTE: this helper is defined with the same body in several cells of this
# notebook; keep the copies identical.
def evaluate_model(model_name, model, X_test, y_test):
    """Print MSE, RMSE and MAE of a fitted model on a hold-out set.

    Args:
        model_name: Label printed in front of the metrics.
        model: Fitted estimator exposing ``predict``.
        X_test: Features passed to ``model.predict``.
        y_test: True target values the predictions are compared with.

    Returns:
        None. The metrics are only printed, each with four decimals.
    """
    predictions = model.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    absolute_error = mean_absolute_error(y_test, predictions)
    root_squared_error = np.sqrt(squared_error)

    print(f"{model_name}: MSE = {squared_error:.4f}, RMSE = {root_squared_error:.4f}, MAE = {absolute_error:.4f}")


# Loop through models, train, and evaluate
for model_name, model in models.items():
    model.fit(X_train, y_train)
    evaluate_model(model_name, model, X_test, y_test)


# Example LSTM implementation (adjust hyperparameters as needed)
# The LSTM is trained on the same X_train / X_test split as the models above
# (the selected top features), so the rows used for evaluation are never part
# of the training data.
n_features = X_train.shape[1]

# LSTM layers expect 3-D input (samples, timesteps, features); every row is
# fed as a single timestep holding all feature values.
X_train_LSTM = X_train.to_numpy().reshape(-1, 1, n_features)
X_test_LSTM = X_test.to_numpy().reshape(-1, 1, n_features)

# Define the LSTM model with dynamic input shape
model = Sequential()
model.add(Input(shape=(1, n_features)))
model.add(LSTM(units=50, return_sequences=True))
model.add(LSTM(units=50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train_LSTM, y_train, epochs=100, batch_size=32, validation_data=(X_test_LSTM, y_test))
evaluate_model('LSTM', model, X_test_LSTM, y_test)

In [None]:
# Using top 30 Features based on Fscore
# Random Forest: MSE = 1.7210, RMSE = 1.3119, MAE = 0.7858
# k-Nearest Neighbors: MSE = 1.9707, RMSE = 1.4038, MAE = 0.8105
# Linear Regression: MSE = 1.6077, RMSE = 1.2680, MAE = 0.7531
# Gradient Boosting: MSE = 1.7464, RMSE = 1.3215, MAE = 0.8050
# Decision Tree: MSE = 2.6304, RMSE = 1.6219, MAE = 1.0556
# Multi-layer Perceptron: MSE = 2.1446, RMSE = 1.4644, MAE = 0.8781
# XGBoost: MSE = 1.9582, RMSE = 1.3994, MAE = 0.8317
# Support Vector Machine (Poly): MSE = 49.0350, RMSE = 7.0025, MAE = 5.6358
# Support Vector Machine (Radial): MSE = 3.8672, RMSE = 1.9665, MAE = 1.1684
# Support Vector Machine (Linear): MSE = 2.1745, RMSE = 1.4746, MAE = 0.8863
# LSTM: MSE = 1069.6376, RMSE = 32.7053, MAE = 28.0060

In [None]:
# Define features (X) for model training: the features kept by the
# importance threshold of the normalized Random Forest ranking
X = train_data[filtered_features_Norm]

# Define target variable (y) for model training and evaluation
y = train_data['THA_NextDay_Close']

# Split data into training and test sets (80/20, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models (excluding ARIMA).
# Estimators with internal randomness receive the notebook seed so the
# printed metrics can be reproduced from run to run.
models = {
    'Random Forest': RandomForestRegressor(random_state=seed),
    'k-Nearest Neighbors': KNeighborsRegressor(),
    'Linear Regression': LinearRegression(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=seed),
    'Decision Tree': DecisionTreeRegressor(random_state=seed),
    'Multi-layer Perceptron': MLPRegressor(max_iter=1000, random_state=seed),
    'XGBoost': XGBRegressor(random_state=seed),
    'Support Vector Machine (Poly)': SVR(kernel='poly'),
    'Support Vector Machine (Radial)': SVR(kernel='rbf'),
    'Support Vector Machine (Linear)': SVR(kernel='linear')
}

# Function to calculate and print evaluation metrics
# NOTE: this helper is defined with the same body in several cells of this
# notebook; keep the copies identical.
def evaluate_model(model_name, model, X_test, y_test):
    """Print MSE, RMSE and MAE of a fitted model on a hold-out set.

    Args:
        model_name: Label printed in front of the metrics.
        model: Fitted estimator exposing ``predict``.
        X_test: Features passed to ``model.predict``.
        y_test: True target values the predictions are compared with.

    Returns:
        None. The metrics are only printed, each with four decimals.
    """
    predictions = model.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    absolute_error = mean_absolute_error(y_test, predictions)
    root_squared_error = np.sqrt(squared_error)

    print(f"{model_name}: MSE = {squared_error:.4f}, RMSE = {root_squared_error:.4f}, MAE = {absolute_error:.4f}")


# Fit every candidate model on the training split and report its metrics
for model_name, model in models.items():
    model.fit(X_train, y_train)
    evaluate_model(model_name, model, X_test, y_test)

# Example LSTM implementation (adjust hyperparameters as needed)
# Reshape both splits to the 3-D layout (samples, 1 timestep, features)
train_rows, feature_count = X_train.shape
X_train_LSTM = X_train.values.reshape((train_rows, 1, feature_count))
X_test_LSTM = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Two stacked LSTM layers followed by a single-unit dense output; the input
# shape is taken from the reshaped training array
model = Sequential()
model.add(Input(shape=X_train_LSTM.shape[1:]))
model.add(LSTM(units=50, return_sequences=True))
model.add(LSTM(units=50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(
    X_train_LSTM,
    y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_test_LSTM, y_test),
)
evaluate_model('LSTM', model, X_test_LSTM, y_test)