In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.mixture import GaussianMixture
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, ConcatDataset

In [2]:
file_path = './data./Data Set for Chapter - Sheet1.csv'
df = pd.read_csv(file_path)
df = df.drop('S.no',axis = 1)
df = df.dropna()
df

Unnamed: 0,Age,Intimate Partners,"Protection Usage (0: Never, 1: Sometimes, 2: Always)",Symptoms,Location,Education,"STD Testing history (0: No, 1: Yes)",STD Status
0,45.0,4,0,0,2,1,1,1
1,43.0,4,0,0,2,1,1,1
2,39.0,5,0,0,3,1,1,1
3,35.0,5,2,0,3,1,0,1
4,32.0,1,1,1,2,1,0,0
...,...,...,...,...,...,...,...,...
540,36.0,2,0,1,2,0,1,1
541,45.0,4,0,0,2,0,1,1
542,43.0,4,0,0,3,0,0,0
543,39.0,5,0,0,3,0,0,0


In [3]:
train_df,test_df = train_test_split(df,test_size=0.2,random_state=42)

In [28]:
## Training Autoencoder Model

X = df.drop('STD Status',axis = 1)
y = df['STD Status']

X_scaled = MinMaxScaler().fit_transform(X)

X_infected = X_scaled[y==1]
X_uninfected = X_scaled[y==0]

class Autoencoder(nn.Module):
  def __init__(self):
    super(Autoencoder,self).__init__()
    self.encoder = nn.Sequential(
        nn.Linear(7,5),
        nn.Tanh(),
        nn.Linear(5,3),
    )
    self.decoder = nn.Sequential(
        nn.Linear(3,5),
        nn.Tanh(),
        nn.Linear(5,7),
        nn.ReLU()
    )
  def forward(self,x):
    x = self.encoder(x)
    x = self.decoder(x)
    return x

autoencoder = Autoencoder()

criterion = nn.MSELoss()
optimizer = optim.Adadelta(autoencoder.parameters())

batch_size = 16
epochs = 30

X_infected_tensor = torch.FloatTensor(X_infected)
X_uninfected_tensor = torch.FloatTensor(X_uninfected)

infected_dataset = TensorDataset(X_infected_tensor,X_infected_tensor)
uninfected_dataset = TensorDataset(X_uninfected_tensor,X_uninfected_tensor)

dataset = ConcatDataset([infected_dataset,uninfected_dataset])
loader = DataLoader(dataset,batch_size=batch_size,shuffle=True)
infected_loader = DataLoader(infected_dataset,batch_size=batch_size,shuffle=True)
uninfected_loader = DataLoader(uninfected_dataset,batch_size=batch_size,shuffle=True)
losses = []

for epoch in range(epochs):
  for data in loader:
    inputs,targets = data
    optimizer.zero_grad()
    outputs = autoencoder(inputs)
    loss = criterion(outputs,targets)
    loss.backward()
    optimizer.step()
    losses.append(loss.detach().numpy())


losses[-100:]

[array(0.05448578, dtype=float32),
 array(0.07968129, dtype=float32),
 array(0.06443986, dtype=float32),
 array(0.06701566, dtype=float32),
 array(0.05061757, dtype=float32),
 array(0.07657838, dtype=float32),
 array(0.07714188, dtype=float32),
 array(0.07496406, dtype=float32),
 array(0.05469698, dtype=float32),
 array(0.06153443, dtype=float32),
 array(0.08868213, dtype=float32),
 array(0.07938512, dtype=float32),
 array(0.09604077, dtype=float32),
 array(0.05290859, dtype=float32),
 array(0.09426009, dtype=float32),
 array(0.05382857, dtype=float32),
 array(0.04247431, dtype=float32),
 array(0.07982804, dtype=float32),
 array(0.05812027, dtype=float32),
 array(0.05419628, dtype=float32),
 array(0.06677362, dtype=float32),
 array(0.04367984, dtype=float32),
 array(0.05307752, dtype=float32),
 array(0.07378739, dtype=float32),
 array(0.07863349, dtype=float32),
 array(0.08956944, dtype=float32),
 array(0.10256601, dtype=float32),
 array(0.06550241, dtype=float32),
 array(0.08339465, d

In [32]:
def encoded(model,data_loader):
  hidden_rep = []
  with torch.no_grad():
    for data in data_loader:
      inputs,_=data
      hidden = model.encoder(inputs)
      hidden_rep.append(hidden)
  return torch.cat(hidden_rep,dim=0)

infected_encoded = encoded(autoencoder,infected_loader)
uninfected_encoded = encoded(autoencoder,uninfected_loader)

encoded_X = torch.cat([infected_encoded,uninfected_encoded],dim=0).numpy()
y_infected = np.ones(infected_encoded.shape[0])
y_uninfected = np.zeros(uninfected_encoded.shape[0])
encoded_y = np.append(y_infected,y_uninfected)

X_train,X_test,y_train,y_test = train_test_split(encoded_X,encoded_y,test_size=0.2)

In [33]:
gmm = GaussianMixture(n_components=2,random_state=42)
gmm.fit(X_train)
y_pred=gmm.predict(X_test)

conf_matrix = confusion_matrix(y_test,y_pred)
report = classification_report(y_test,y_pred)

In [34]:
print(report)

              precision    recall  f1-score   support

         0.0       0.91      0.84      0.88        51
         1.0       0.87      0.93      0.90        58

    accuracy                           0.89       109
   macro avg       0.89      0.89      0.89       109
weighted avg       0.89      0.89      0.89       109



In [35]:
rows_with_nan = df[df.isna().any(axis=1)]
rows_with_nan

Unnamed: 0,Age,Intimate Partners,"Protection Usage (0: Never, 1: Sometimes, 2: Always)",Symptoms,Location,Education,"STD Testing history (0: No, 1: Yes)",STD Status


In [36]:
#implementing xgboost
X_train, X_test, y_train, y_test = train_test_split(encoded_X,encoded_y,test_size=0.3,random_state=41) # 70% training and 30% test
X_cv,y_cv = X_test[:int(len(X_test)/2)],y_test[:int(len(y_test)/2)]
X_test,y_test = X_test[int(len(X_test)/2):],y_test[int(len(y_test)/2):]

In [42]:
xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eta=0.1,
    max_depth=5,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.9
)

# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)
param_grid = { 'eta': [0.01, 0.1, 0.2,0.5],
               'max_depth': [3, 5, 7, 9, 11, 15, 20],
               'min_child_weight': [1,3,5,7,10],
               'subsample': [0.7,0.8, 0.9],
               'colsample_bytree': [0.7,0.8, 0.9]
             }

In [43]:
random_search_xgboost_model = RandomizedSearchCV(
    xgb_clf, param_grid, cv=5, scoring='neg_log_loss', n_iter=100, verbose=1
)
# Fit the model on the training data
random_search_xgboost_model.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", random_search_xgboost_model.best_params_)

# Get the best model
best_random_search_xgb_model = random_search_xgboost_model.best_estimator_

# Evaluate the model with additional metrics
random_predictions = best_random_search_xgb_model.predict(X_cv)
print("Accuracy:", accuracy_score(y_cv, random_predictions))
print("Precision:", precision_score(y_cv, random_predictions))
print("Recall:", recall_score(y_cv, random_predictions))
report = classification_report(y_cv,random_predictions)
print(report)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameters: {'subsample': 0.9, 'min_child_weight': 5, 'max_depth': 20, 'eta': 0.1, 'colsample_bytree': 0.9}
Accuracy: 0.8780487804878049
Precision: 0.9142857142857143
Recall: 0.8205128205128205
              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89        43
         1.0       0.91      0.82      0.86        39

    accuracy                           0.88        82
   macro avg       0.88      0.88      0.88        82
weighted avg       0.88      0.88      0.88        82



In [44]:
best_params = random_search_xgboost_model.best_params_

# Re-run XGBoost CV with early stopping and additional metrics
xgb_random_search_tuned_results = xgb.cv(
    best_params, dtrain_clf,
    num_boost_round=1000,
    nfold=3,
    early_stopping_rounds=10,
    metrics=["auc", "error", "logloss"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(xgb_random_search_tuned_results.iloc[-1])

[0]	train-auc:0.93934+0.00719	train-error:0.22868+0.14456	train-logloss:0.62739+0.00570	test-auc:0.91724+0.03391	test-error:0.26382+0.18056	test-logloss:0.63770+0.01114
[1]	train-auc:0.95160+0.01100	train-error:0.12499+0.00648	train-logloss:0.57530+0.00826	test-auc:0.92349+0.02964	test-error:0.15806+0.04930	test-logloss:0.58984+0.01266
[2]	train-auc:0.95612+0.01052	train-error:0.10791+0.01003	train-logloss:0.52982+0.00994	test-auc:0.92418+0.02572	test-error:0.14486+0.03604	test-logloss:0.54859+0.01397
[3]	train-auc:0.96245+0.00845	train-error:0.10398+0.01849	train-logloss:0.49120+0.01185	test-auc:0.91868+0.02531	test-error:0.13959+0.03598	test-logloss:0.51574+0.01640
[4]	train-auc:0.97168+0.00560	train-error:0.09345+0.01318	train-logloss:0.45528+0.01305	test-auc:0.91911+0.02137	test-error:0.13163+0.02102	test-logloss:0.48814+0.01765
[5]	train-auc:0.97435+0.00481	train-error:0.09345+0.01669	train-logloss:0.42586+0.01435	test-auc:0.92006+0.01823	test-error:0.13165+0.02718	test-logloss:0.

In [45]:
# Create the grid search object
grid_search_xgboost_model = GridSearchCV(
    xgb_clf, param_grid, cv=5, scoring='neg_log_loss', verbose=5
)

# Fit the model on the training data
grid_search_xgboost_model.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", grid_search_xgboost_model.best_params_)

# Get the best model
best_grid_search_xgb_model = grid_search_xgboost_model.best_estimator_

# Evaluate the model with additional metrics
grid_predictions = best_grid_search_xgb_model.predict(X_cv)
print("Accuracy:", accuracy_score(y_cv, grid_predictions))
print("Precision:", precision_score(y_cv, grid_predictions))
print("Recall:", recall_score(y_cv, grid_predictions))
report = classification_report(y_cv,grid_predictions)
print(report)

Fitting 5 folds for each of 1260 candidates, totalling 6300 fits
[CV 1/5] END colsample_bytree=0.7, eta=0.01, max_depth=3, min_child_weight=1, subsample=0.7;, score=-0.438 total time=   0.0s
[CV 2/5] END colsample_bytree=0.7, eta=0.01, max_depth=3, min_child_weight=1, subsample=0.7;, score=-0.451 total time=   0.0s
[CV 3/5] END colsample_bytree=0.7, eta=0.01, max_depth=3, min_child_weight=1, subsample=0.7;, score=-0.434 total time=   0.0s
[CV 4/5] END colsample_bytree=0.7, eta=0.01, max_depth=3, min_child_weight=1, subsample=0.7;, score=-0.433 total time=   0.0s
[CV 5/5] END colsample_bytree=0.7, eta=0.01, max_depth=3, min_child_weight=1, subsample=0.7;, score=-0.468 total time=   0.0s
[CV 1/5] END colsample_bytree=0.7, eta=0.01, max_depth=3, min_child_weight=1, subsample=0.8;, score=-0.435 total time=   0.0s
[CV 2/5] END colsample_bytree=0.7, eta=0.01, max_depth=3, min_child_weight=1, subsample=0.8;, score=-0.450 total time=   0.0s
[CV 3/5] END colsample_bytree=0.7, eta=0.01, max_dept

In [41]:
# Get the best parameters from  grid search
best_params = grid_search_xgboost_model.best_params_

# Re-run XGBoost CV with early stopping and additional metrics
xgb_grid_search_tuned_results = xgb.cv(
    best_params, dtrain_clf,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=10,
    metrics=["logloss", "auc", "error"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(xgb_grid_search_tuned_results.iloc[-1])


[0]	train-logloss:0.63440+0.00236	train-auc:0.91648+0.00945	train-error:0.30263+0.12759	test-logloss:0.64000+0.00511	test-auc:0.90666+0.02056	test-error:0.32632+0.14732
[1]	train-logloss:0.58632+0.00295	train-auc:0.92770+0.00829	train-error:0.13026+0.00967	test-logloss:0.59717+0.00784	test-auc:0.91233+0.02173	test-error:0.15263+0.02440
[2]	train-logloss:0.54668+0.00461	train-auc:0.93293+0.00607	train-error:0.12763+0.01184	test-logloss:0.56017+0.00914	test-auc:0.91801+0.02154	test-error:0.14474+0.01177
[3]	train-logloss:0.51201+0.00675	train-auc:0.93494+0.00592	train-error:0.12566+0.01069	test-logloss:0.52774+0.00905	test-auc:0.91839+0.02135	test-error:0.14211+0.00985
[4]	train-logloss:0.48290+0.00769	train-auc:0.94002+0.00483	train-error:0.12171+0.01213	test-logloss:0.50162+0.00905	test-auc:0.91897+0.01606	test-error:0.14474+0.01177
[5]	train-logloss:0.45887+0.00842	train-auc:0.94039+0.00487	train-error:0.12039+0.00738	test-logloss:0.48045+0.01194	test-auc:0.92006+0.01552	test-error:0.