In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.mixture import GaussianMixture
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, ConcatDataset
import xgboost as xgb

In [4]:
df = pd.read_csv('./data/Data Set for Chapter - Sheet1.csv')
df = df.drop('S.no',axis = 1)
df = df.dropna()
df

Unnamed: 0,Age,Intimate Partners,"Protection Usage (0: Never, 1: Sometimes, 2: Always)",Symptoms,Location,Education,"STD Testing history (0: No, 1: Yes)",STD Status
0,45.0,4,0,0,2,1,1,1
1,43.0,4,0,0,2,1,1,1
2,39.0,5,0,0,3,1,1,1
3,35.0,5,2,0,3,1,0,1
4,32.0,1,1,1,2,1,0,0
...,...,...,...,...,...,...,...,...
540,36.0,2,0,1,2,0,1,1
541,45.0,4,0,0,2,0,1,1
542,43.0,4,0,0,3,0,0,0
543,39.0,5,0,0,3,0,0,0


In [5]:
train_df,test_df = train_test_split(df,test_size=0.2,random_state=42)

In [19]:
## Training Autoencoder Model

X = df.drop('STD Status',axis = 1)
y = df['STD Status']

X_scaled = MinMaxScaler().fit_transform(X)

X_infected = X_scaled[y==1]
X_uninfected = X_scaled[y==0]

class Autoencoder(nn.Module):
  def __init__(self):
    super(Autoencoder,self).__init__()
    self.encoder = nn.Sequential(
        nn.Linear(7,5),
        nn.Tanh(),
        nn.Linear(5,3),
    )
    self.decoder = nn.Sequential(
        nn.Linear(3,5),
        nn.Tanh(),
        nn.Linear(5,7),
        nn.ReLU()
    )
  def forward(self,x):
    x = self.encoder(x)
    x = self.decoder(x)
    return x

autoencoder = Autoencoder()

criterion = nn.MSELoss()
optimizer = optim.Adadelta(autoencoder.parameters())

batch_size = 16
epochs = 30

X_infected_tensor = torch.FloatTensor(X_infected)
X_uninfected_tensor = torch.FloatTensor(X_uninfected)

infected_dataset = TensorDataset(X_infected_tensor,X_infected_tensor)
uninfected_dataset = TensorDataset(X_uninfected_tensor,X_uninfected_tensor)

dataset = ConcatDataset([infected_dataset,uninfected_dataset])
loader = DataLoader(dataset,batch_size=batch_size,shuffle=True)
infected_loader = DataLoader(infected_dataset,batch_size=batch_size,shuffle=True)
uninfected_loader = DataLoader(uninfected_dataset,batch_size=batch_size,shuffle=True)
losses = []

for epoch in range(epochs):
  for data in loader:
    inputs,targets = data
    optimizer.zero_grad()
    outputs = autoencoder(inputs)
    loss = criterion(outputs,targets)
    loss.backward()
    optimizer.step()
    losses.append(loss.detach().numpy())


#losses[-100:]

In [21]:
def encoded(model,data_loader):
  hidden_rep = []
  with torch.no_grad():
    for data in data_loader:
      inputs,_=data
      hidden = model.encoder(inputs)
      hidden_rep.append(hidden)
  return torch.cat(hidden_rep,dim=0)

infected_encoded = encoded(autoencoder,infected_loader)
uninfected_encoded = encoded(autoencoder,uninfected_loader)

encoded_X = torch.cat([infected_encoded,uninfected_encoded],dim=0).numpy()
y_infected = np.ones(infected_encoded.shape[0])
y_uninfected = np.zeros(uninfected_encoded.shape[0])
encoded_y = np.append(y_infected,y_uninfected)

X_train,X_test,y_train,y_test = train_test_split(encoded_X,encoded_y,test_size=0.2)

In [22]:
gmm = GaussianMixture(n_components=2,random_state=42)
gmm.fit(X_train)
y_pred=gmm.predict(X_test)

conf_matrix = confusion_matrix(y_test,y_pred)
report = classification_report(y_test,y_pred)

In [23]:
print(report)

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        54
         1.0       0.14      0.16      0.15        55

    accuracy                           0.08       109
   macro avg       0.07      0.08      0.08       109
weighted avg       0.07      0.08      0.08       109



In [10]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification


pca = PCA(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_pca, y_train)
y_pred_pca = knn.predict(X_test_pca)


conf_matrix_pca = confusion_matrix(y_test, y_pred_pca)
report_pca = classification_report(y_test, y_pred_pca)

print("Confusion Matrix (PCA):")
print(conf_matrix_pca)

print("\nClassification Report (PCA):")
print(report_pca)


Confusion Matrix (PCA):
[[56  1]
 [ 7 45]]

Classification Report (PCA):
              precision    recall  f1-score   support

         0.0       0.89      0.98      0.93        57
         1.0       0.98      0.87      0.92        52

    accuracy                           0.93       109
   macro avg       0.93      0.92      0.93       109
weighted avg       0.93      0.93      0.93       109



In [11]:
torch.save(autoencoder.state_dict(), 'Autoencoder.pth')

In [7]:
## LOADING WEIGHTS ##
model = Autoencoder()
model.load_state_dict(torch.load('Autoencoder.pth'))

<All keys matched successfully>

Implementing XGBoost

In [None]:
#implementing xgboost
X_train, X_test, y_train, y_test = train_test_split(encoded_X,encoded_y,test_size=0.3,random_state=41) # 70% training and 30% test
X_cv,y_cv = X_test[:int(len(X_test)/2)],y_test[:int(len(y_test)/2)]
X_test,y_test = X_test[int(len(X_test)/2):],y_test[int(len(y_test)/2):]

In [None]:
xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eta=0.1,
    max_depth=5,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.9
)

# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)
param_grid = { 'eta': [0.01, 0.1, 0.2,0.5],
               'max_depth': [3, 5, 7, 9, 11, 15, 20],
               'min_child_weight': [1,3,5,7,10],
               'subsample': [0.7,0.8, 0.9],
               'colsample_bytree': [0.7,0.8, 0.9]
             }

In [None]:
random_search_xgboost_model = RandomizedSearchCV(
    xgb_clf, param_grid, cv=5, scoring='neg_log_loss', n_iter=100, verbose=1
)
# Fit the model on the training data
random_search_xgboost_model.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", random_search_xgboost_model.best_params_)

# Get the best model
best_random_search_xgb_model = random_search_xgboost_model.best_estimator_

# Evaluate the model with additional metrics
random_predictions = best_random_search_xgb_model.predict(X_cv)
print("Accuracy:", accuracy_score(y_cv, random_predictions))
print("Precision:", precision_score(y_cv, random_predictions))
print("Recall:", recall_score(y_cv, random_predictions))
report = classification_report(y_cv,random_predictions)
print(report)

In [None]:
best_params = random_search_xgboost_model.best_params_

# Re-run XGBoost CV with early stopping and additional metrics
xgb_random_search_tuned_results = xgb.cv(
    best_params, dtrain_clf,
    num_boost_round=1000,
    nfold=3,
    early_stopping_rounds=10,
    metrics=["auc", "error", "logloss"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(xgb_random_search_tuned_results.iloc[-1])

In [None]:
# Create the grid search object
grid_search_xgboost_model = GridSearchCV(
    xgb_clf, param_grid, cv=5, scoring='neg_log_loss', verbose=5
)

# Fit the model on the training data
grid_search_xgboost_model.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", grid_search_xgboost_model.best_params_)

# Get the best model
best_grid_search_xgb_model = grid_search_xgboost_model.best_estimator_

# Evaluate the model with additional metrics
grid_predictions = best_grid_search_xgb_model.predict(X_cv)
print("Accuracy:", accuracy_score(y_cv, grid_predictions))
print("Precision:", precision_score(y_cv, grid_predictions))
print("Recall:", recall_score(y_cv, grid_predictions))
report = classification_report(y_cv,grid_predictions)
print(report)

In [None]:
# Get the best parameters from  grid search
best_params = grid_search_xgboost_model.best_params_

# Re-run XGBoost CV with early stopping and additional metrics
xgb_grid_search_tuned_results = xgb.cv(
    best_params, dtrain_clf,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=10,
    metrics=["logloss", "auc", "error"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(xgb_grid_search_tuned_results.iloc[-1])
