In [None]:
# !pip install protobuf==3.20.1

In [None]:
# from sagemaker.xgboost.estimator import XGBoost
import xgboost as xgb
import pandas as pd
import numpy as np
import openpyxl
import matplotlib.pyplot as plt
from openpyxl import load_workbook
import os

from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, classification_report

### data + dates

In [None]:
# Location of the 1-minute stock-bar parquet file on S3.
BARS_PARQUET_URI = 's3://sisyphus-general-bucket/AthenaInsights/latest_data/model/data/stock_bars_1min.parquet'

# Load the whole dataset; every later cell works off this `df`.
df = pd.read_parquet(BARS_PARQUET_URI)

In [None]:
# Integer class id for each string label, plus the inverse lookup used to turn
# predicted class ids back into labels.
category_map = dict(zip(('A', 'B', 'C'), range(3)))
reverse_category_map = dict(zip(category_map.values(), category_map.keys()))

In [None]:
# Replace every missing value in the frame with 0.
df = df.fillna(0)

# Encode the labels through the shared `category_map` rather than a second,
# hand-copied literal, so the encoding cannot drift from `reverse_category_map`.
df['mapped_category'] = df['category'].map(category_map)

# NOTE(review): a row whose `category` was missing is 0 after the fillna above;
# 0 is not a key of `category_map`, so such a row maps to NaN. dropna=False keeps
# those rows visible in the class counts instead of silently hiding them.
df['mapped_category'].value_counts(dropna=False)

### train test split

In [None]:
from datetime import datetime, timedelta

In [None]:
# One entry per calendar day, from the fixed start date through today (inclusive).
start_date = '2024-10-01'
end_date = f'{datetime.today():%Y-%m-%d}'
date_series = pd.date_range(start_date, end_date, freq='D')
print(date_series)

In [None]:
# Excel workbook that collects every log entry of this run (read by `log`).
# NOTE: the workbook is recreated, i.e. overwritten, each time this cell runs.
# TODO: auto-increment the numeric suffix (results_1.xlsx, ...) instead of overwriting.
file_name = "results_0.xlsx"

# `category` and `mapped_category` are the labels: get_train_test_split drops them
# from the feature matrices, so they must not be reported as features.
label_columns = ['category', 'mapped_category']
data = pd.DataFrame({'features used': [col for col in df.columns if col not in label_columns]})

sheet_name = 'InitialSheet'

# Write the feature list to its own sheet; the context manager saves and closes the file.
with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
    data.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
def get_dates(dt):
    """Derive the date strings that delimit the train/test windows around `dt`.

    Parameters
    ----------
    dt : str or datetime
        The test day, either as a 'YYYY-MM-DD' string or as a `datetime`
        instance (a `pandas.Timestamp`, which is what iterating a
        `pd.date_range` yields, is a `datetime` subclass and is accepted).

    Returns
    -------
    tuple of str
        `(test_date, next_day, next_10_day, prev_day)`, each formatted as
        'YYYY-MM-DD'. `next_10_day` is `dt` + 9 days, i.e. the last day of a
        10-day window that starts on `dt`.

    Raises
    ------
    ValueError
        If `dt` is a string that does not match 'YYYY-MM-DD'.
    """
    fmt = '%Y-%m-%d'
    # strptime only accepts str; datetime-like inputs are used as they are.
    base = datetime.strptime(dt, fmt) if isinstance(dt, str) else dt

    def shifted(days):
        return (base + timedelta(days=days)).strftime(fmt)

    return shifted(0), shifted(1), shifted(9), shifted(-1)

In [None]:
def get_train_test_split(dt, data=None):
    """Split the bars into a training window and three test windows around `dt`.

    The windows are derived from `dt` itself instead of from module-level
    variables that happen to have been set by an earlier cell.

    Parameters
    ----------
    dt : str or datetime-like
        The test day ('YYYY-MM-DD' string, `datetime` or `pandas.Timestamp`).
    data : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level `df`. It must contain the
        `category` and `mapped_category` columns.
        NOTE(review): the date-string slicing below assumes a sorted
        DatetimeIndex — confirm against how `df` is built.

    Returns
    -------
    tuple
        `(X_train, y_train, X_test, y_test, X_test_full, y_test_full,
        X_test_only_next_day, y_test_only_next_day)` where

        * train      = every row up to and including the day before `dt`
        * test       = `dt` through `dt` + 9 days
        * test_full  = `dt` through the end of the data
        * only_next_day = the rows of `dt` itself

        The `X_*` frames have the two label columns removed; the `y_*` series
        are the `mapped_category` column. A window with no rows (for example a
        day without bars) comes back empty instead of raising `KeyError`.
    """
    frame = df if data is None else data

    fmt = '%Y-%m-%d'
    day = pd.Timestamp(dt).normalize()
    # Date *strings* are used as slice bounds so that each bound covers the whole
    # calendar day rather than a single timestamp.
    test_date = day.strftime(fmt)
    prev_day = (day - pd.Timedelta(days=1)).strftime(fmt)
    next_10_day = (day + pd.Timedelta(days=9)).strftime(fmt)

    features = frame.drop(columns=['category', 'mapped_category'])
    labels = frame['mapped_category']

    def window(start, stop):
        return features.loc[start:stop], labels.loc[start:stop]

    X_train, y_train = window(None, prev_day)
    X_test, y_test = window(test_date, next_10_day)
    X_test_full, y_test_full = window(test_date, None)
    # A slice (not a scalar lookup) so a day without rows yields an empty frame.
    X_test_only_next_day, y_test_only_next_day = window(test_date, test_date)

    return X_train, y_train, X_test, y_test, X_test_full, y_test_full, X_test_only_next_day, y_test_only_next_day

In [None]:
def _sheet_title(dt):
    """Turn `dt` into a string that is usable as an Excel worksheet title."""
    title = dt.strftime('%Y-%m-%d') if hasattr(dt, 'strftime') else str(dt)
    # Excel does not allow these characters in a sheet title and caps titles at 31 chars.
    for bad in '\\/?*[]:':
        title = title.replace(bad, '-')
    return title[:31]


def log(x, dt, image=None, add=None):
    """Append one entry to the sheet for `dt` in the results workbook (`file_name`).

    Only openpyxl is used, so the existing workbook content is preserved; the
    previous implementation referenced `dataframe_to_rows` and `xlsxwriter`
    without importing either.

    Parameters
    ----------
    x : object
        Payload to write into column A of the next free row. Strings are written
        as they are; any other non-None object is written as `str(x)`. Pass
        `None` together with `image=1` to log a picture instead.
    dt : str or datetime-like
        Selects the sheet. Datetime-like values are formatted as 'YYYY-MM-DD';
        the sheet is created if it does not exist yet.
    image : int, optional
        `1` means "embed the picture at path `add`". Ignored when `x` is a str.
    add : str, optional
        Path of the image file to embed (used only when `image == 1`).

    Raises
    ------
    ValueError
        If there is nothing to log (`x` is None and `image != 1`), or if
        `image == 1` but no `add` path was given.
    """
    # Decide what to write before touching the workbook, so bad calls fail early.
    if isinstance(x, str):
        text = x
    elif image == 1:
        if add is None:
            raise ValueError("log(..., image=1) needs the image path in `add`")
        text = None
    elif x is None:
        raise ValueError("log() needs a value to write, or image=1 with add=<path>")
    else:
        text = str(x)

    title = _sheet_title(dt)
    book = openpyxl.load_workbook(file_name)
    if title in book.sheetnames:
        sheet = book[title]
        next_row = sheet.max_row + 1  # first row after the existing content
    else:
        sheet = book.create_sheet(title)
        next_row = 1

    if text is not None:
        sheet.cell(row=next_row, column=1, value=text)
    else:
        # Imported here because only this branch needs it.
        from openpyxl.drawing.image import Image as SheetImage

        # The caption in column A makes `max_row` advance past the anchor row.
        # NOTE: the picture floats over the grid, so entries logged afterwards
        # can still end up underneath it.
        sheet.cell(row=next_row, column=1, value=str(add))
        sheet.add_image(SheetImage(add), f"B{next_row}")

    book.save(file_name)

In [None]:
def initialte_and_train(X_train, y_train, X_test, y_test, X_test_full, y_test_full, X_test_only_next_day, y_test_only_next_day):
    """Fit a 3-class XGBoost classifier and evaluate it on four data sets.

    Parameters
    ----------
    X_train, y_train
        Training features and integer class labels.
    X_test, y_test / X_test_full, y_test_full / X_test_only_next_day, y_test_only_next_day
        Evaluation sets. They are only scored after each boosting round; they do
        not influence training (no early stopping is configured).

    Returns
    -------
    xgboost.XGBClassifier
        The fitted classifier. `clf.evals_result()` holds the `merror` and
        `mlogloss` curves under the keys `validation_0` .. `validation_3`, in
        the order of `eval_sets` below (train, test, test_full, only_next_day).
    """
    # Order matters: position N becomes 'validation_N' in evals_result().
    eval_sets = [
        (X_train, y_train),
        (X_test, y_test),
        (X_test_full, y_test_full),
        (X_test_only_next_day, y_test_only_next_day),
    ]

    clf = xgb.XGBClassifier(
        n_estimators=100,
        # 'multi:softprob' is the objective whose native output is per-class
        # probabilities, which this notebook needs for `predict_proba`;
        # `predict` still returns class labels.
        # NOTE(review): 'multi:softmax' is reported to break `predict_proba` on
        # some xgboost releases — confirm against the installed version.
        objective='multi:softprob',
        n_jobs=-1,
        random_state=420,
        num_class=3,
        eval_metric=['merror', 'mlogloss'],
    )
    clf.fit(X_train, y_train, eval_set=eval_sets, verbose=1)

    return clf

In [None]:
def get_results(clf, dt):
    """Log the evaluation history of `clf` and one learning-curve plot per metric.

    Parameters
    ----------
    clf : fitted classifier exposing `evals_result()`
        Expected to carry `mlogloss` and `merror` histories under the keys
        `validation_0` .. `validation_3`.
    dt : str
        Sheet to log to; also embedded in the plot titles and PNG file names.

    Side effects
    ------------
    For each metric a PNG named 'GridSearchCV XGBoost <metric> - <dt>.png' is
    written to the working directory, shown inline and embedded via `log`.
    """
    results = clf.evals_result()
    epochs = len(results['validation_0']['mlogloss'])
    x_axis = range(epochs)

    log(f"results: {results}", dt)
    log(f"epochs: {epochs}", dt)
    log("\n\n", dt)

    # evals_result() key -> legend label, in the order the curves are drawn.
    curve_labels = {
        'validation_0': 'Train',
        'validation_1': 'Test',
        'validation_2': 'Test_full',
        'validation_3': 'Test_only_next_day',
    }

    for metric in ('mlogloss', 'merror'):
        # NOTE(review): the title says "GridSearchCV" although no grid search is
        # visible in this notebook; the text is kept because it names the PNGs.
        title = f'GridSearchCV XGBoost {metric} - {dt}'

        fig, ax = plt.subplots(figsize=(9, 5))
        for key, label in curve_labels.items():
            ax.plot(x_axis, results[key][metric], label=label)
        ax.legend()
        ax.set_ylabel(metric)
        ax.set_title(title)

        image_path = f'{title}.png'
        # Save before showing so the file never depends on backend behaviour.
        fig.savefig(image_path)
        plt.show()
        # This runs once per date; unclosed figures would pile up in memory.
        plt.close(fig)

        log(None, dt, image=1, add=image_path)
        log("\n\n", dt)

In [None]:
def generate_reports(X_test, y_test, clf, dt=None, fallback_class=2):
    """Log confusion matrices and classification metrics for one test set.

    Parameters
    ----------
    X_test, y_test
        Features and true integer labels of the set being scored.
    clf
        Fitted classifier providing `predict` and `predict_proba`.
    dt : str, optional
        Sheet to log to. When omitted, the notebook-level `dt` is used so that
        existing three-argument calls keep working.
    fallback_class : int, optional
        Class assigned in the thresholded matrices when no class reaches the
        threshold. Defaults to 2.
        NOTE(review): assumes class 2 ('C') is the neutral label — confirm.

    Raises
    ------
    NameError
        If `dt` is omitted and no notebook-level `dt` exists.
    """
    if dt is None:
        try:
            dt = globals()['dt']
        except KeyError:
            raise NameError("generate_reports: pass `dt` or define a global `dt`") from None

    log('## ---------- Model Classification Report ----------', dt)
    log('## get predictions and create model quality report', dt)

    y_pred = clf.predict(X_test)
    preds_probs = clf.predict_proba(X_test)

    log('\n------------------ Confusion Matrix -----------------\n', dt)
    # Matrices are stringified so `log` always receives plain text.
    log(str(confusion_matrix(y_test, y_pred)), dt)

    best_class = preds_probs.argmax(axis=1)
    best_prob = preds_probs.max(axis=1)
    for i in range(4, 10):
        threshold = i / 10
        log(f'threshold - {threshold}', dt)
        # `(probs >= t).argmax()` would return class 0 for every row in which no
        # class reaches the threshold; such rows now get `fallback_class`.
        thresholded = np.where(best_prob >= threshold, best_class, fallback_class)
        # Score against the labels of *this* test set (the original compared
        # with the global one-day labels, a length mismatch for other sets).
        log(str(confusion_matrix(y_test, thresholded)), dt)
        log('\n\n', dt)

    log('\n-------------------- Key Metrics --------------------', dt)
    log('\nAccuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)), dt)
    log('Balanced Accuracy: {:.2f}\n'.format(balanced_accuracy_score(y_test, y_pred)), dt)

    averages = ('micro', 'macro', 'weighted')
    for average in averages:
        name = average.capitalize()
        # Every group but the last ends with a blank line, as before.
        f1_suffix = '' if average == averages[-1] else '\n'
        log('{} Precision: {:.2f}'.format(name, precision_score(y_test, y_pred, average=average)), dt)
        log('{} Recall: {:.2f}'.format(name, recall_score(y_test, y_pred, average=average)), dt)
        log('{} F1-score: {:.2f}{}'.format(name, f1_score(y_test, y_pred, average=average), f1_suffix), dt)

    log('\n--------------- Classification Report ---------------\n', dt)
    log(classification_report(y_test, y_pred), dt)
    log('---------------------- XGBoost ----------------------', dt)

In [None]:
def plot_categorization(df, date, pred, field='close', ):
    """Plot one day's price with actual and predicted categories, then log the image.

    Parameters
    ----------
    df : pandas.DataFrame
        Bars containing `field` and a `category` column holding 'A'/'B'/'C'.
        NOTE(review): selecting a day by date string assumes a DatetimeIndex — confirm.
    date : str or datetime-like
        Day to plot. Datetime-like values are formatted as 'YYYY-MM-DD'.
    pred : iterable
        Predicted labels ('A'/'B'/'C'), one per row of that day, in row order.
    field : str, optional
        Price column to draw. Defaults to 'close'.

    Side effects
    ------------
    Writes 'plot for day - <day>.png' to the working directory, shows the figure
    and embeds it in the sheet for that day via `log`.
    """
    day = date.strftime('%Y-%m-%d') if hasattr(date, 'strftime') else str(date)

    # Copy, so adding the `preds` column never writes into a slice of the caller's frame.
    df_day = df.loc[day].copy()
    df_day['preds'] = list(pred)

    fig, ax = plt.subplots(figsize=(14, 7))
    ax.plot(df_day.index, df_day[field], label=f'{field.capitalize()} Price', color='gray', linewidth=2)

    # Actual categories; 'C' points get size 0, i.e. they are not drawn.
    for cat, color in zip(['A', 'B', 'C'], ['green', 'red', 'gray']):
        actual = df_day[df_day['category'] == cat]
        ax.scatter(actual.index, actual[field], color=color, label=f'Category {cat}', s=30 if cat != 'C' else 0)

    # Predicted categories, drawn as small squares.
    for cat, color in zip(['A', 'B', 'C'], ['blue', 'black', 'pink']):
        predicted = df_day[df_day['preds'] == cat]
        ax.scatter(predicted.index, predicted[field], color=color, label=f'Preds {cat}', s=20 if cat != 'C' else 10,
                   marker='s')

    ax.legend()
    ax.set_title(f'Price Categorization on {day}')
    ax.set_xlabel('Timestamp')
    ax.set_ylabel(f'{field.capitalize()} Price')

    image_path = f'plot for day - {day}.png'
    # Save BEFORE show: pyplot-level savefig after show() wrote an empty figure.
    fig.savefig(image_path)
    plt.show()
    plt.close(fig)

    # Log to the sheet of the plotted day (the original used the global `dt` here).
    log(None, day, image=1, add=image_path)
    log("\n\n", day)

In [None]:
# Walk-forward evaluation: for every calendar day, train on everything before it,
# evaluate on three windows starting at that day, and log the outcome to the
# day's sheet in the results workbook.
for day in date_series:
    # Sheets, file names and the date-string slicing of `df` all need a
    # 'YYYY-MM-DD' string, not a Timestamp. The name `dt` is kept at notebook
    # level because generate_reports falls back to it.
    dt = pd.Timestamp(day).strftime('%Y-%m-%d')

    # Calendar days without bars (weekends, holidays) cannot be tested.
    # NOTE(review): the slice lookup assumes `df` has a sorted DatetimeIndex — confirm.
    if df.loc[dt:dt].empty:
        print(f"skipping {dt}: no rows for this day")
        continue

    log(f"running for dt = {dt}", dt)
    log("\n\n", dt)
    # Kept at notebook level: other cells may read these names.
    test_date, next_day, next_10_day, prev_day = get_dates(dt)

    log(f"test_date: {test_date}, next_day: {next_day}, next_10_day: {next_10_day}, prev_day: {prev_day}", dt)
    log("\n", dt)

    X_train, y_train, X_test, y_test, X_test_full, y_test_full, X_test_only_next_day, y_test_only_next_day = get_train_test_split(dt)

    if len(X_train) == 0:
        log("skipped: no training rows before this day", dt)
        continue

    log(f"y_train.value_counts():\n{y_train.value_counts()}", dt)
    log(f"y_test.value_counts():\n{y_test.value_counts()}", dt)
    log(f"y_test_full.value_counts():\n{y_test_full.value_counts()}", dt)
    log(f"y_test_only_next_day.value_counts():\n{y_test_only_next_day.value_counts()}", dt)
    log("\n\n", dt)

    log("training the model", dt)
    clf = initialte_and_train(X_train, y_train, X_test, y_test, X_test_full, y_test_full, X_test_only_next_day, y_test_only_next_day)
    log("\n\n", dt)

    log("results", dt)
    get_results(clf, dt)
    log("\n\n", dt)

    log("reports", dt)

    log("10 day test", dt)
    generate_reports(X_test, y_test, clf)
    log("\n\n", dt)

    log("full day test", dt)
    generate_reports(X_test_full, y_test_full, clf)
    log("\n\n", dt)

    log("1 day test", dt)
    generate_reports(X_test_only_next_day, y_test_only_next_day, clf)
    log("\n\n", dt)

    log("feature importances", dt)
    fea_imp = pd.DataFrame(
        data=list(clf.feature_importances_),
        index=list(X_train.columns),
        columns=["score"],
    ).sort_values(by="score", ascending=True)
    # Logged as text so `log` always receives a plain string.
    log(fea_imp.to_string(), dt)

    preds_probs = clf.predict_proba(X_test_only_next_day)
    # `(probs >= 0.5).argmax()` would label every row in which no class reaches
    # 0.5 as class 0 ('A'). Such rows fall back to 'C' instead.
    # NOTE(review): assumes 'C' is the neutral class — confirm.
    confident = preds_probs.max(axis=1) >= 0.5
    preds_probs1 = np.where(confident, preds_probs.argmax(axis=1), category_map['C'])
    plot_categorization(df, dt, pd.Series(preds_probs1).map(reverse_category_map))

In [None]:
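# Scratch check (not executed): y_test.value_counts() gave category 2 -> 6809, 0 -> 523, 1 -> 348.
# Each sum below reproduces one of those per-class totals, so the three lines appear to be
# the rows of a confusion matrix, one per true class — TODO confirm which run produced them.
# 134 + 0 + 389  = 523   # class 0
# 0 + 24 + 324   = 348   # class 1
# 68 + 34 + 6707 = 6809  # class 2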