# Thyroid Cancer Recurrence XGBoost

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import warnings

warnings.filterwarnings("ignore")


In [89]:
cancer = pd.read_csv('Thyroid_Diff.csv')
cancer.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


## Preprocessing

In [90]:
cancer = cancer.rename(columns={'Hx Radiothreapy':'Hx Radiotherapy'})

In [91]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder(categories=[['No','Yes']])
cancer['Smoking'] = encoder.fit_transform(cancer[['Smoking']]).astype(int)
cancer['Hx Smoking'] = encoder.fit_transform(cancer[['Hx Smoking']]).astype(int)
cancer['Hx Radiotherapy'] = encoder.fit_transform(cancer[['Hx Radiotherapy']]).astype(int)

In [92]:
cancer['Adenopathy'].value_counts()

No           277
Right         48
Bilateral     32
Left          17
Extensive      7
Posterior      2
Name: Adenopathy, dtype: int64

Thoughts: We immediately see two options here. We can transform this feature into a binary feature whereby we classify adenopathy as either 0 (No) or 1 (any other value). This does not necessarily preserve all data and if the type of adenopathy affects the recurrence of thyroid cancer then we would be better off using one hot encoding to preserve this information. We can revisit this using one hot encoding later to see if we get a better classifier.

In [93]:
cancer['Adenopathy'] = cancer['Adenopathy'].apply(lambda x: 0 if x == 'No' else 1)

In [94]:
cancer = pd.get_dummies(cancer, columns=['Thyroid Function','Physical Examination', 'Pathology'])

From here we would like to ordinally encode values with inherent ordering and turn other categorical variables into binary variables.

In [95]:
from sklearn.preprocessing import OrdinalEncoder

In [96]:
encoder = OrdinalEncoder(categories =[['Low','Intermediate','High']])
example = cancer['Risk'].copy()
cancer['Risk'] = encoder.fit_transform(cancer[['Risk']]).astype(int)

In [97]:
cancer['Tumor Size'] = cancer['T'].str.replace('^T1.*', '1', case=False, regex=True)
cancer['Tumor Size'] = cancer['Tumor Size'].str.replace('^T2.*', '2', case=False, regex=True)
cancer['Tumor Size'] = cancer['Tumor Size'].str.replace('^T3.*', '3', case=False, regex=True)
cancer['Tumor Size'] = cancer['Tumor Size'].str.replace('^T4.*', '4', case=False, regex=True)
cancer.drop('T', axis=1, inplace=True)
cancer['Tumor Size'] = cancer['Tumor Size'].astype(int)

In [98]:
cancer['Multi-Focal'] = cancer['Focality'].apply(lambda x: 1 if x=='Multi-Focal' else 0)
cancer.drop('Focality', axis=1, inplace=True)

In [99]:
cancer['Lymph Spread'] = cancer['N'].apply(lambda x: 0 if x=='N0' else 1)
cancer.drop('N',axis=1, inplace=True)

In [100]:
cancer['Metastatic'] = cancer['M'].apply(lambda x: 1 if x=='M1' else 0)
cancer.drop('M', axis=1, inplace=True)

In [101]:
cancer['Male'] = cancer['Gender'].apply(lambda x: 1 if x=='M' else 0)
cancer.drop('Gender', axis=1, inplace=True)

In [102]:
encoder2 = OrdinalEncoder(categories = [['Structural Incomplete','Indeterminate', 'Biochemical Incomplete', 'Excellent']])
cancer['Response'] = encoder2.fit_transform(cancer[['Response']]).astype(int)

In [103]:
cancer['Stage'] = cancer['Stage'].replace({'IVA':'IV','IVB':'IV'})

In [104]:
encoder3 = OrdinalEncoder(categories=[['I', 'II', 'III', 'IV']])
cancer['Stage'] = encoder3.fit_transform(cancer[['Stage']]).astype(int)

In [105]:
cancer['Recurred'] = cancer['Recurred'].apply(lambda x: 1 if x=='Yes' else 0)

## Model

In [106]:
X = cancer.drop('Recurred', axis=1)
y = cancer['Recurred']

In [107]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, mean_squared_error

### Feature Selection

In [108]:
model = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_depth=10)
sfs = SequentialFeatureSelector(estimator = model, n_features_to_select = 5, direction='forward')
sfs.fit(X,y)
features = sfs.get_feature_names_out()

## Scikit-learn RFC

In [109]:
model = RandomForestClassifier(n_jobs=-1, n_estimators=20, max_depth=10)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=42)
model.fit(X_train[features], y_train)

print(f'Accuracy: {model.score(X_test[features], y_test)}\n')

y_pred = model.predict(X_test[features])
conf = confusion_matrix(y_pred,y_test)
print(f'Confusion Matrix:\n {conf}')


Accuracy: 0.96875

Confusion Matrix:
 [[68  3]
 [ 0 25]]


### Grid Search 

In [None]:
n_estimators = [int(x) for x in np.linspace(1,200,21)]
max_depth = [int(x) for x in np.linspace(1,200,21)]
max_depth.append(None)
params = {'n_estimators': n_estimators, 'max_depth':max_depth}

grid = GridSearchCV(estimator=model, param_grid=params)
grid.fit(X_train[features],y_train)

In [None]:
best_params = grid.best_params_
best_params

### RFC Revisited with Hyperparameter Tuning

In [None]:
model = RandomForestClassifier(n_jobs=-1, n_estimators=110, max_depth=10)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=42)
model.fit(X_train[features], y_train)

print(f'Accuracy: {model.score(X_test[features], y_test)}\n')

y_pred = model.predict(X_test[features])
conf = confusion_matrix(y_pred,y_test)
print(f'Confusion Matrix:\n {conf}')


## Using XGBoost

In [None]:
dtrain_clf = xgb.DMatrix(X_train, y_train)
dtest_clf = xgb.DMatrix(X_test, y_test)

In [None]:
params = {"objective": "binary:logitraw", "tree_method": "hist"}

n = 1000
result = xgb.cv(
    params=params,
    dtrain=dtrain_clf,
    num_boost_round=n,
    nfold=5,
    early_stopping_rounds=20,
    metrics=["auc"]
)

result.head()

We see that the boosted tree ends on the 4th iteration, where we have a mean ROC AUC of 0.995, a significant improvement over the non boosted models.