In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from joblib import dump

In [None]:
# Load the EGFR feature-extraction table; the path is relative to the
# notebook's current working directory.
df_data = pd.read_csv(filepath_or_buffer='EGFR_Feature_Extraction.csv')

# Keep the preview as the cell's final expression so the notebook renders it.
df_data.head(n=5)

In [None]:
# Build the modelling frame without mutating `df_data`.
# The earlier `drop(..., inplace=True)` returned None (so its assignment to
# `df` was meaningless) and removed one more column from `df_data` each time
# this cell was re-run. The chained form below is idempotent.
df = (
    df_data
    .drop(columns=df_data.columns[-2])  # discard the second-to-last column
    .iloc[:, 2:]                        # skip the two leading columns
)

# Sanity checks: are there any missing values, and how are the classes distributed?
print("Null values: " + str(df.isnull().values.any()))
print(df['class'].value_counts())
df.head()

In [None]:
# Features are every column except the last; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Learn the label -> integer mapping, then replace `y` with the encoded values.
# The fitted encoder stays available as `le` (e.g. for `le.inverse_transform`).
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
# First split: hold out 20% of the samples as the final test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Second split on the remaining 80%: 0.125 * 0.8 = 0.10 of all samples become
# the validation set, i.e. roughly a 70/10/20 train/val/test partition.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, random_state=42, test_size=0.125
)

In [None]:
# Candidate hyper-parameters for the exhaustive search.
# 3*3*3*3*3*3*4*4 = 11,664 combinations, each evaluated with 3-fold CV.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 10],
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    reg_alpha=[1e-5, 1e-2, 0.1, 1],
    reg_lambda=[1e-5, 1e-2, 0.1, 1],
)

# Base estimator with library defaults; the search overrides the grid keys.
grid_model = XGBClassifier()

# Rank combinations by cross-validated accuracy, using every available core.
grid = GridSearchCV(
    grid_model,
    param_grid,
    cv=3,
    n_jobs=-1,
    scoring='accuracy',
)

# Only the training split is searched; validation and test data stay unseen.
grid.fit(X_train, y_train)

print("Best parameters found: ", grid.best_params_)
print("Best score found: ", grid.best_score_)

In [None]:
# Refit one classifier with the hyper-parameters selected by the grid search.
# `eval_metric` and `early_stopping_rounds` are given to the constructor:
# passing them to `fit()` was deprecated in XGBoost 1.6 and removed in 2.0,
# where it raises a TypeError.
model = XGBClassifier(
    **grid.best_params_,               # exactly the keys searched in the grid
    eval_metric=["error", "logloss"],  # both are recorded for every eval set
    early_stopping_rounds=10,          # stop after 10 rounds with no improvement
)

# Order matters: per the XGBoost docs, early stopping monitors the LAST metric
# ("logloss") on the LAST eval_set entry (the validation split). The entries
# are reported as 'validation_0' (train) and 'validation_1' (validation).
eval_set = [(X_train, y_train), (X_val, y_val)]

model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    verbose=True,  # print the metrics for each boosting round
)

In [None]:
# Predict on the held-out test split and print the per-class report.
y_pred = model.predict(X_test)
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

In [None]:
# Per-round metric history recorded during fit, keyed by eval set and metric.
results = model.evals_result()

# Number of recorded boosting rounds (every metric list has the same length).
epochs = len(results['validation_0']['error'])
x_axis = range(epochs)

# Plot the train and validation log-loss curves on one set of axes.
fig, ax = plt.subplots()
for eval_name, curve_label in (('validation_0', 'Train'), ('validation_1', 'Val')):
    ax.plot(x_axis, results[eval_name]['logloss'], label=curve_label)
ax.legend()
ax.set_ylabel('Log Loss')
ax.set_title('XGBoost Log Loss')
plt.show()

In [None]:
# Accuracy on the validation split. This split is also the second entry of
# the `eval_set` passed to `model.fit`, so it is not an untouched hold-out.
y_val_pred = model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print("Validation Accuracy:", accuracy)

In [None]:
# Serialise the fitted classifier with joblib; the file is written relative to
# the current working directory.
saved_model = "XGBClassifierEGFR.joblib"
dump(value=model, filename=saved_model)