In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.mixture import GaussianMixture
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, ConcatDataset

In [None]:
# Read the raw sheet, then discard the serial-number column and every row
# that contains a missing value. The cleaned frame is shown as the cell output.
df = (
    pd.read_csv('/content/Data Set for Chapter - Sheet1.csv')
    .drop(columns='S.no')
    .dropna()
)
df

Unnamed: 0,Age,Intimate Partners,"Protection Usage (0: Never, 1: Sometimes, 2: Always)",Symptoms,Location,Education,"STD Testing history (0: No, 1: Yes)",STD Status
0,45.0,4,0,0,2,1,1,1
1,43.0,4,0,0,2,1,1,1
2,39.0,5,0,0,3,1,1,1
3,35.0,5,2,0,3,1,0,1
4,32.0,1,1,1,2,1,0,0
...,...,...,...,...,...,...,...,...
540,36.0,2,0,1,2,0,1,1
541,45.0,4,0,0,2,0,1,1
542,43.0,4,0,0,3,0,0,0
543,39.0,5,0,0,3,0,0,0


In [None]:
# Hold out 20% of the cleaned rows; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 'S.no' is already removed from `df` when the data is loaded, so a strict
# drop here raises KeyError on a fresh "Restart & Run All".
# errors='ignore' makes the cell a harmless no-op when the column is gone,
# while still removing it if an upstream cell ever leaves it in place.
train_df = train_df.drop(columns='S.no', errors='ignore')
test_df = test_df.drop(columns='S.no', errors='ignore')
df = df.drop(columns='S.no', errors='ignore')

In [None]:
## Training Autoencoder Model

# Separate the binary target column from the feature columns.
y = df['STD Status']
X = df.drop(columns='STD Status')

# Min-max scale every feature column; the fitted scaler is kept so it can be
# inspected or reused later.
feature_scaler = MinMaxScaler()
X_scaled = feature_scaler.fit_transform(X)

# Positional boolean masks select the rows of each class from the scaled array.
infected_mask = (y == 1).to_numpy()
uninfected_mask = (y == 0).to_numpy()
X_infected = X_scaled[infected_mask]
X_uninfected = X_scaled[uninfected_mask]

class Autoencoder(nn.Module):
  """Small fully-connected autoencoder: input -> hidden -> latent -> hidden -> input.

  Args:
    input_dim: Number of input (and reconstructed) features. Defaults to 7.
    hidden_dim: Width of the intermediate layer in encoder and decoder.
      Defaults to 5.
    latent_dim: Size of the bottleneck representation returned by
      ``self.encoder``. Defaults to 3.

  The defaults reproduce the original fixed 7-5-3 architecture, so
  ``Autoencoder()`` builds exactly the same layers (and the same
  ``state_dict`` keys) as before.
  """

  def __init__(self, input_dim: int = 7, hidden_dim: int = 5, latent_dim: int = 3):
    super().__init__()
    # Encoder: compress the features down to the latent code (no activation
    # on the bottleneck itself, so the code is unbounded).
    self.encoder = nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        nn.Tanh(),
        nn.Linear(hidden_dim, latent_dim),
    )
    # Decoder: mirror of the encoder. The trailing ReLU clamps every
    # reconstructed feature to be >= 0.
    self.decoder = nn.Sequential(
        nn.Linear(latent_dim, hidden_dim),
        nn.Tanh(),
        nn.Linear(hidden_dim, input_dim),
        nn.ReLU()
    )

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Encode ``x`` to the latent space and return its reconstruction.

    Args:
      x: Tensor whose last dimension equals ``input_dim``.

    Returns:
      Tensor of the same shape as ``x`` with non-negative entries.
    """
    latent = self.encoder(x)
    return self.decoder(latent)

# Seed PyTorch before building the model so that weight initialisation and
# DataLoader shuffling are repeatable across "Restart & Run All".
torch.manual_seed(42)

autoencoder = Autoencoder()

# Reconstruction objective and optimiser (Adadelta with its default settings).
criterion = nn.MSELoss()
optimizer = optim.Adadelta(autoencoder.parameters())

batch_size = 16
epochs = 30

X_infected_tensor = torch.FloatTensor(X_infected)
X_uninfected_tensor = torch.FloatTensor(X_uninfected)

# An autoencoder reconstructs its own input, so each sample is its own target.
infected_dataset = TensorDataset(X_infected_tensor,X_infected_tensor)
uninfected_dataset = TensorDataset(X_uninfected_tensor,X_uninfected_tensor)

# `loader` mixes both classes for training; the per-class loaders are used
# afterwards to encode each class separately.
dataset = ConcatDataset([infected_dataset,uninfected_dataset])
loader = DataLoader(dataset,batch_size=batch_size,shuffle=True)
infected_loader = DataLoader(infected_dataset,batch_size=batch_size,shuffle=True)
uninfected_loader = DataLoader(uninfected_dataset,batch_size=batch_size,shuffle=True)

# One entry per optimisation step (mini-batch), stored as plain Python floats.
losses = []

for epoch in range(epochs):
  for inputs, targets in loader:
    optimizer.zero_grad()
    outputs = autoencoder(inputs)
    loss = criterion(outputs,targets)
    loss.backward()
    optimizer.step()
    # .item() yields a float instead of a 0-d NumPy array, which keeps the
    # history easy to print, plot and aggregate.
    losses.append(loss.item())


losses[-100:]

[array(0.06459816, dtype=float32),
 array(0.03669699, dtype=float32),
 array(0.03156625, dtype=float32),
 array(0.03121795, dtype=float32),
 array(0.03171251, dtype=float32),
 array(0.05076092, dtype=float32),
 array(0.05026991, dtype=float32),
 array(0.06674357, dtype=float32),
 array(0.04321745, dtype=float32),
 array(0.02933252, dtype=float32),
 array(0.05459446, dtype=float32),
 array(0.05669976, dtype=float32),
 array(0.05290964, dtype=float32),
 array(0.04407999, dtype=float32),
 array(0.05050448, dtype=float32),
 array(0.06251157, dtype=float32),
 array(0.05362901, dtype=float32),
 array(0.05543786, dtype=float32),
 array(0.03116007, dtype=float32),
 array(0.0329885, dtype=float32),
 array(0.05342618, dtype=float32),
 array(0.0418699, dtype=float32),
 array(0.06269699, dtype=float32),
 array(0.04563798, dtype=float32),
 array(0.05076758, dtype=float32),
 array(0.05433216, dtype=float32),
 array(0.05523357, dtype=float32),
 array(0.03305859, dtype=float32),
 array(0.05317362, dty

In [None]:
def encoded(model,data_loader):
  """Run every batch of ``data_loader`` through ``model.encoder``.

  Args:
    model: Object exposing a callable ``encoder`` attribute.
    data_loader: Iterable yielding ``(inputs, target)`` pairs; the second
      element of each pair is ignored.

  Returns:
    A single tensor with the encoder outputs of all batches concatenated
    along dimension 0, in the order the loader yielded them. Gradients are
    not tracked.
  """
  with torch.no_grad():
    latent_batches = [model.encoder(batch_inputs) for batch_inputs, _ in data_loader]
  return torch.cat(latent_batches, dim=0)

# Encode each class separately so the class label of every encoded row is
# known from the loader it came from (row order within a class is irrelevant).
infected_encoded = encoded(autoencoder,infected_loader)
uninfected_encoded = encoded(autoencoder,uninfected_loader)

# Stack the latent codes (infected first) and build the matching label vector:
# 1 for rows from the infected loader, 0 for rows from the uninfected loader.
encoded_X = torch.cat([infected_encoded,uninfected_encoded],dim=0).numpy()
y_infected = np.ones(infected_encoded.shape[0])
y_uninfected = np.zeros(uninfected_encoded.shape[0])
encoded_y = np.concatenate([y_infected,y_uninfected])

# A fixed random_state makes the hold-out split, and therefore the reported
# metrics, reproducible between runs (same seed as the other seeded steps).
X_train,X_test,y_train,y_test = train_test_split(
    encoded_X,encoded_y,test_size=0.2,random_state=42
)

In [None]:
# Fit an unsupervised two-component Gaussian mixture on the encoded training rows.
gmm = GaussianMixture(n_components=2,random_state=42)
gmm.fit(X_train)

# Mixture component indices are arbitrary: component 0 is not guaranteed to be
# the "uninfected" class. Map each component to the class label that is most
# common among the training rows assigned to it before scoring.
train_components = gmm.predict(X_train)
component_to_label = np.empty(gmm.n_components, dtype=float)
for component in range(gmm.n_components):
  member_labels = y_train[train_components == component]
  if member_labels.size == 0:
    # No training row landed in this component; fall back to its raw index.
    component_to_label[component] = component
  else:
    values, counts = np.unique(member_labels, return_counts=True)
    component_to_label[component] = values[np.argmax(counts)]

y_pred = component_to_label[gmm.predict(X_test)]

conf_matrix = confusion_matrix(y_test,y_pred)
report = classification_report(y_test,y_pred)

In [None]:
# Print the per-class precision / recall / F1 text table built by classification_report.
print(report)

              precision    recall  f1-score   support

         0.0       0.92      0.85      0.88        53
         1.0       0.87      0.93      0.90        56

    accuracy                           0.89       109
   macro avg       0.89      0.89      0.89       109
weighted avg       0.89      0.89      0.89       109



Unnamed: 0,Age,Intimate Partners,"Protection Usage (0: Never, 1: Sometimes, 2: Always)",Symptoms,Location,Education,"STD Testing history (0: No, 1: Yes)"
0,45.0,4,0,0,2,1,1
1,43.0,4,0,0,2,1,1
2,39.0,5,0,0,3,1,1
3,35.0,5,2,0,3,1,0
4,32.0,1,1,1,2,1,0
...,...,...,...,...,...,...,...
540,36.0,2,0,1,2,0,1
541,45.0,4,0,0,2,0,1
542,43.0,4,0,0,3,0,0
543,39.0,5,0,0,3,0,0


In [None]:
# `df` has already been passed through dropna() when it was loaded, so looking
# for missing values in it always yields an empty frame on a fresh run.
# Inspect a fresh read of the raw file instead, so the incomplete rows that
# were discarded are actually shown.
raw_df = (
    pd.read_csv('/content/Data Set for Chapter - Sheet1.csv')
    .drop(columns='S.no', errors='ignore')
)
rows_with_nan = raw_df[raw_df.isna().any(axis=1)]
rows_with_nan

Unnamed: 0,Age,Intimate Partners,"Protection Usage (0: Never, 1: Sometimes, 2: Always)",Symptoms,Location,Education,"STD Testing history (0: No, 1: Yes)",STD Status
64,,4,2,1,2,0,0,0
