In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, gc 
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
SEED = 42
FOLDS = 5

In [3]:
df = pd.read_csv('../data/clinical_and_other_features_filtered.csv')
df.columns=df.columns.str.strip()
df.columns = [col.replace('[', '(').replace(']', ')') for col in df.columns]
df.replace('NP', np.nan, inplace=True)
df.replace('NC', np.nan, inplace=True)
df.replace('NA', np.nan, inplace=True)
df.drop('Oncotype score', axis=1, inplace=True)

In [4]:
near_complete_response = ['Overall Near-complete Response:  Looser Definition','Near-complete Response (Graded Measure)']
df.drop(near_complete_response, axis=1, inplace=True)
pathologic_response_to_neoadjuvant_therapy = ['Pathologic response to Neoadjuvant therapy: Pathologic stage (T) following neoadjuvant therapy',
       'Pathologic response to Neoadjuvant therapy:  Pathologic stage (N) following neoadjuvant therapy',
       'Pathologic response to Neoadjuvant therapy:  Pathologic stage (M) following neoadjuvant therapy']
df.drop(pathologic_response_to_neoadjuvant_therapy, axis=1, inplace=True)

In [5]:
X = df.drop("Overall Near-complete Response:  Stricter Definition", axis=1)
y = df["Overall Near-complete Response:  Stricter Definition"]

In [6]:
y = y.astype('int')

In [7]:
cont_columns = ['Date of Birth (Days)', 'Days to Surgery (from the date of diagnosis)', 'Age at last contact in EMR f/u(days)(from the date of diagnosis) ,last time patient known to be alive, unless age of death is reported(in such case the age of death',
'Age at mammo (days)', 'Days to distant recurrence(from the date of diagnosis)', 'Days to local recurrence (from the date of diagnosis)',
'Days to death (from the date of diagnosis)', 'Days to last local recurrence free assessment (from the date of diagnosis)', 
]
categorical_columns = list(set(X.columns) - set(cont_columns))


In [8]:
for col in cont_columns:
    X[col] = X[col].astype("float64", errors='ignore')
for categorical_column in categorical_columns:
    X[categorical_column] = X[categorical_column].astype("category", errors='ignore')

In [9]:
y.nunique()

4

In [10]:
X.dtypes

Date of Birth (Days)                                                      float64
Menopause (at diagnosis)                                                 category
Race and Ethnicity                                                       category
Metastatic at Presentation (Outside of Lymph Nodes)                      category
ER                                                                       category
                                                                           ...   
Number of Ovaries In Situ                                                category
Therapeutic or Prophylactic Oophorectomy as part of Endocrine Therapy    category
Neoadjuvant Anti-Her2 Neu Therapy                                        category
Adjuvant Anti-Her2 Neu Therapy                                           category
Received Neoadjuvant Therapy or Not                                      category
Length: 77, dtype: object

In [11]:
#  multiclass classification with XGBoost
results = xgb.cv(
    params={
        "objective": "multi:softmax",
        "num_class": 4,
        "eval_metric": "merror",
        "nthread": 4,
        "eta": 0.1,
        "max_depth": 6,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "silent": 1,
        "seed": SEED,
    },
    # enable_categorical=True to use categorical features
    dtrain=xgb.DMatrix(X, y, enable_categorical=True),
    num_boost_round=1000,
    nfold=FOLDS,
    stratified=True,
    early_stopping_rounds=20,
    verbose_eval=10,
    seed=SEED,
)

[0]	train-merror:0.06651+0.01674	test-merror:0.16959+0.04695
[10]	train-merror:0.03765+0.00540	test-merror:0.05771+0.00794
[20]	train-merror:0.02163+0.00823	test-merror:0.05771+0.00794
[22]	train-merror:0.01763+0.00651	test-merror:0.05771+0.00794


In [12]:
results.keys()


Index(['train-merror-mean', 'train-merror-std', 'test-merror-mean',
       'test-merror-std'],
      dtype='object')

In [13]:
results['test-merror-mean'].max()

0.1695852534562212

In [14]:
1-results['test-merror-mean'].max()

0.8304147465437788