In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter


In [3]:

# Load data
df = pd.read_csv('data/features_filtered.csv')  


In [4]:

# Features and target
features = [
    'mid_iv', 
    'moneyness',
    'DTE',
    'mid_delta',
    'mid_gamma',
    'mid_theta',
    'mid_vega',
    'atm_iv',
    'skew',
    'curvature',
    'iv_slope',
    'iv_curvature',
    'term_spread'
]

mapped_labels = {-1: 0, 0: 1, 1: 2}

# Training label:
df['label_mapped'] = df['label'].map(mapped_labels)



In [5]:

X = df[features]
y = df['label_mapped']  # 0: LONG, 1: IGNORE, 2: SHORT

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handle class imbalance
class_counts = Counter(y_train)
max_class = max(class_counts.values())
scale_weights = {cls: max_class / count for cls, count in class_counts.items()}


In [6]:


model = XGBClassifier(
    objective='multi:softprob',
    num_class=3,  # 3 classes: LONG, IGNORE, SHORT
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42,
    scale_pos_weight=[scale_weights.get(i, 1) for i in range(3)]  # Adjust class weights
)


In [7]:

model.fit(X_train, y_train)


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [8]:

# Predict
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)


In [9]:

# Evaluate
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      5985
           1       1.00      1.00      1.00     55265
           2       1.00      1.00      1.00     54499

    accuracy                           1.00    115749
   macro avg       0.99      0.99      0.99    115749
weighted avg       1.00      1.00      1.00    115749

Confusion Matrix:
 [[ 5868   116     1]
 [   81 55072   112]
 [    0   131 54368]]


In [10]:
long_prob  = model.predict_proba(X)[:, 0]
short_prob = model.predict_proba(X)[:, 2]


In [11]:
# Long/Short confidence from predict_proba
long_prob  = model.predict_proba(X)[:, 0]  # LONG = class 0
short_prob = model.predict_proba(X)[:, 2]  # SHORT = class 2


In [12]:


preds = model.predict(X)


In [13]:
rows = []

for i in range(len(preds)):
    if preds[i] == 0 and long_prob[i] > 0.97:
        rows.append({
            'QUOTE_DATE': df.loc[i, 'QUOTE_DATE'],
            'EXPIRE_DATE': df.loc[i, 'EXPIRE_DATE'],
            'STRIKE': df.loc[i, 'STRIKE'],
            'DTE': df.loc[i, 'DTE'],
            'moneyness': df.loc[i, 'moneyness'],
            'mid_iv': df.loc[i, 'mid_iv'],
            'atm_iv': df.loc[i, 'atm_iv'],
            'skew': df.loc[i, 'skew'],
            'term_spread': df.loc[i, 'term_spread'],
            'action': 'BUY',
            'position_type': 'LONG_PUT',
            'confidence': long_prob[i]
        })

    elif preds[i] == 2 and short_prob[i] > 0.97:
        rows.append({
            'QUOTE_DATE': df.loc[i, 'QUOTE_DATE'],
            'EXPIRE_DATE': df.loc[i, 'EXPIRE_DATE'],
            'STRIKE': df.loc[i, 'STRIKE'],
            'DTE': df.loc[i, 'DTE'],
            'moneyness': df.loc[i, 'moneyness'],
            'mid_iv': df.loc[i, 'mid_iv'],
            'atm_iv': df.loc[i, 'atm_iv'],
            'skew': df.loc[i, 'skew'],
            'term_spread': df.loc[i, 'term_spread'],
            'action': 'SELL',
            'position_type': 'SHORT_PUT',
            'confidence': short_prob[i]
        })


In [14]:


signal_df = pd.DataFrame(rows)
signal_df.to_csv('data/signals_mispricing.csv', index=False)
