In [9]:
import os
import pandas as pd
import warnings
# Silence warnings for the rest of the kernel session. Called with only an action,
# filterwarnings() matches every warning category, message and module.
warnings.filterwarnings("ignore")
# NOTE(review): UserWarning is already covered by the blanket filter above, so this
# second filter looks redundant -- confirm before removing.
warnings.simplefilter("ignore", UserWarning)

In [10]:
from data.data_fetcher import get_stock_df
from data.indicator import add_rsi
import numpy as np
from feature.feature import create_batch_feature, feature_names
from data.label import label_feature

In [11]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

In [23]:
# Path of the training CSV, resolved relative to the current working directory.
csv_file = "data/stock_training_2023-01-01_2024-12-31.csv"

# Fail fast and name the missing path in the error, so a wrong working directory
# is distinguishable from data that was never downloaded.
if not os.path.exists(csv_file):
    raise FileNotFoundError(
        f"Data file not found: {csv_file}. "
        "Please run data_fetcher.py to download the data first.")

# Full dataset; later cells select feature/label columns from this frame.
df_all = pd.read_csv(csv_file)

In [24]:
# Keep the feature columns from index 9 onward plus the label column(s), then drop
# every row that has a missing label.
# NOTE(review): presumably the first nine entries of feature_names are not model
# inputs -- confirm against feature/feature.py.
df_train = (
    df_all
    .loc[:, feature_names[9:] + label_feature]
    .dropna(subset=label_feature)
)

In [25]:
# Balance the binary label by downsampling whichever class is larger.
# The previous version always downsampled class 0 to len(class_1), which raises
# ValueError in resample() (replace=False) whenever class 1 is the majority.
class_0 = df_train[df_train[label_feature[0]] == 0]
class_1 = df_train[df_train[label_feature[0]] == 1]

# Target size per class is the size of the minority class.
n_per_class = min(len(class_0), len(class_1))

# Class 0 is always passed through resample() so the result is identical to the
# original behaviour when class 0 is the majority (or the classes are equal).
class_0_downsampled = resample(class_0, replace=False, n_samples=n_per_class, random_state=42)

# Class 1 is only sampled when it is the larger class; otherwise it is kept as-is.
class_1_downsampled = (
    class_1
    if len(class_1) == n_per_class
    else resample(class_1, replace=False, n_samples=n_per_class, random_state=42)
)

# Rows are concatenated class-by-class (not shuffled here).
df_train = pd.concat([class_0_downsampled, class_1_downsampled])

In [26]:
# Target is the first label column; predictors are everything except the label column(s).
y = df_train[label_feature[0]]
X = df_train.drop(label_feature, axis="columns")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Base estimator: binary logistic objective evaluated with log loss.
# verbosity=0 keeps XGBoost's own logging quiet.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    verbosity=0,
)

# Search space: 3 x 3 x 3 = 27 hyperparameter combinations.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 100, 200],
)

# Exhaustive search with 5-fold cross-validation, ranked by accuracy.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# With GridSearchCV's default refit=True, best_estimator_ has already been refit
# on the whole of X_train using the winning parameters -- no extra training needed.
best_model = grid_search.best_estimator_

# Score the held-out split.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report the chosen parameters and the hold-out metrics.
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', report)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Accuracy: 0.9144
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.82      0.90       190
         1.0       0.86      1.00      0.92       207

    accuracy                           0.91       397
   macro avg       0.93      0.91      0.91       397
weighted avg       0.93      0.91      0.91       397



In [19]:
# Show the container type of the predictions produced by the model's predict() call.
type(y_pred)

numpy.ndarray

In [20]:
# Build a two-column array from the predictions.
# NOTE(review): both columns are y_pred, so the two columns are identical -- this
# looks like it was meant to pair y_test with y_pred; confirm the intent.
result = np.column_stack((y_pred, y_pred))


In [22]:
# Display the predicted labels (NumPy abbreviates long arrays with "...").
y_pred

array([0, 0, 0, ..., 1, 0, 1])