In [None]:
#!/usr/bin/env python
# coding: utf-8

# Importing necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns
from matplotlib import gridspec
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler 

plt.style.use('fivethirtyeight')

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Loading the Dataset
# NOTE(review): absolute, machine-specific location -- point DATA_DIR at your own copy of the data.
DATA_DIR = 'C:/Users/MBBLABS/Desktop/Python/1. Models/3. Project/Data/less_feature'
# The 'Unnamed: 0' column is used as the row index (presumably the index saved when the CSVs were written).
train = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='Unnamed: 0')
test = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='Unnamed: 0')

In [None]:
# Show the first two values of every column, one column at a time
for column_name in train.columns:
    column_preview = train[column_name].head(2)
    display(column_preview)

In [None]:
# Distribution of the target column 'y'; dropna=False also counts rows whose label is missing
train['y'].value_counts(dropna=False)

In [None]:
# Rows with a missing 'y' are tagged with the sentinel value -1
labels_with_sentinel = train['y'].fillna(-1)
train['y'] = labels_with_sentinel
train['y'].value_counts()

In [None]:
# Data type of the target changed to int32
labels_as_int32 = train['y'].astype('int32')
train['y'] = labels_as_int32
train.info()

In [None]:
# Labeled rows (y != -1): every column but the last is a feature, the last column is the target
labeled_rows = train[train['y'] != -1]
x_train_labeled = labeled_rows.iloc[:, :-1]
y_train_labeled = labeled_rows.iloc[:, -1]

x_train_labeled.shape, y_train_labeled.shape

In [None]:
# Unlabeled rows (y == -1): keep only the feature columns (everything but the last column)
unlabeled_rows = train[train['y'] == -1]
x_train_unlabeled = unlabeled_rows.iloc[:, :-1]

x_train_unlabeled.shape

In [None]:
# Test data: all columns except the last are features, the last column is the target
x_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
x_test.shape, y_test.shape

In [None]:
# Plotting the per-class distribution of every feature
sns.set_style('darkgrid')
features = x_train_labeled.columns

# 4 plots per row, with only as many rows as the features need; 4 inches of
# height per row matches the per-plot size of the previous fixed 20x4 grid.
n_cols = 4
n_rows = max(1, (len(features) + n_cols - 1) // n_cols)  # ceiling division
fig = plt.figure(figsize=(20, 4 * n_rows))
gs = gridspec.GridSpec(n_rows, n_cols)
for i, c in enumerate(features):
    ax = fig.add_subplot(gs[i])
    # Blue: rows with y == 0, red: rows with y == 1; draw explicitly on this subplot
    sns.kdeplot(x=x_train_labeled[c][y_train_labeled==0], color='b', fill=True, ax=ax)
    sns.kdeplot(x=x_train_labeled[c][y_train_labeled==1], color='r', fill=True, ax=ax)

# Apply the layout once, after all subplots exist (it was previously referenced but never called)
fig.tight_layout()
plt.show()

In [None]:
# PyTorch pieces used to build and train the neural network
import torch
from torch import nn
import torch.nn.functional as F

# Seed both torch and numpy with the same value so runs are repeatable
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# cuDNN: request deterministic algorithms and switch off the benchmark auto-tuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Normalizing all three datasets
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# The scaler is fitted on the labeled training features only and then reused as-is.
# All three inputs are passed as plain arrays: fitting on a DataFrame but transforming
# an ndarray (as was done for x_test) makes scikit-learn warn about missing feature names.
x_train_labeled = scaler.fit_transform(np.asarray(x_train_labeled))
x_train_unlabeled = scaler.transform(np.asarray(x_train_unlabeled))
x_test = scaler.transform(np.asarray(x_test))

In [None]:
# Labeled training data as tensors: float32 features, int64 class labels
x_train_labeled = torch.from_numpy(x_train_labeled).float()
y_train_labeled = torch.from_numpy(y_train_labeled.to_numpy()).long()

In [None]:
# Test data as tensors: float32 features, int64 class labels
x_test = torch.from_numpy(x_test).float()
y_test = torch.from_numpy(y_test.to_numpy()).long()

In [None]:
# Wrapping the train & test tensors in TensorDatasets (the DataLoaders are built from these in a later cell)
# Note: `test` is rebound here from the test DataFrame to the test TensorDataset.
train_labeled = torch.utils.data.TensorDataset(x_train_labeled, y_train_labeled)
test = torch.utils.data.TensorDataset(x_test, y_test)

In [None]:
# Unlabeled features as a float32 tensor ...
train_unlabeled = torch.from_numpy(x_train_unlabeled).float()
# ... wrapped in a dataset that yields features only (no targets)
unlabeled = torch.utils.data.TensorDataset(train_unlabeled)

In [None]:
# The dataset is highly unbalanced, so a WeightedRandomSampler is used to make each batch more balanced
# Creating the Weighted Random Sampler for train set
# np.unique gives, in one pass, the size of every class and, for each sample, the position
# of its class in that sorted class list. Indexing the weights by that position (rather than
# by the raw label value) stays correct even if the labels are not exactly 0..K-1.
_, class_position, class_sample_count = np.unique(
    np.asarray(y_train_labeled), return_inverse=True, return_counts=True
)
print(class_sample_count)
weight = 1. / class_sample_count # Weight is reciprocal of the sample count of each class
print(weight)
# One weight per sample, looked up in a single vectorized step, then converted to a tensor
samples_weight = torch.from_numpy(weight[class_position.reshape(-1)])
# Draw len(samples_weight) indices per epoch using the per-sample weights
sampler_labeled = torch.utils.data.sampler.WeightedRandomSampler(samples_weight.double(), len(samples_weight))

In [None]:
# DataLoaders for the labeled-train, unlabeled and test datasets.
# Options common to all three loaders are kept in one place.
shared_loader_options = dict(shuffle=False, num_workers=4)
train_loader = torch.utils.data.DataLoader(train_labeled, batch_size=64, sampler=sampler_labeled, **shared_loader_options)
unlabeled_loader = torch.utils.data.DataLoader(unlabeled, batch_size=128, **shared_loader_options)
test_loader = torch.utils.data.DataLoader(test, batch_size=64, **shared_loader_options)

In [None]:
#Architecture of the Neural Network
class Net(nn.Module):
    """Fully connected classifier: 20 inputs -> 16 -> 18 -> 20 -> 24 -> 2 outputs.

    ReLU follows each hidden layer, dropout (p=0.25) is applied after the
    second hidden layer, and a sigmoid is applied to the two outputs.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(20, 16)
        self.fc2 = nn.Linear(16, 18)
        self.fc3 = nn.Linear(18, 20)
        self.fc4 = nn.Linear(20, 24)
        self.fc5 = nn.Linear(24, 2)

    def forward(self, x):
        """Map a batch whose last dimension is 20 to two per-class scores in (0, 1)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # F.dropout defaults to training=True, so without this flag dropout would stay
        # active after model.eval() and make evaluation/pseudo-labelling random.
        x = F.dropout(x, p=0.25, training=self.training)
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        # NOTE(review): this file feeds the output to nn.CrossEntropyLoss, which normally
        # takes raw scores; the sigmoid is kept so existing numbers stay comparable.
        return torch.sigmoid(self.fc5(x))

net = Net()

In [None]:
# Initializing weights and bias
def weights_init(m):
    """Initializer meant for ``Module.apply``: re-initializes ``nn.Linear`` layers in place.

    Linear weights get Kaiming-normal values and the bias (when the layer has
    one) is set to zero. Any other module type is left untouched.
    """
    if not isinstance(m, nn.Linear):
        return
    torch.nn.init.kaiming_normal_(m.weight)
    # Layers created with bias=False have m.bias set to None
    if m.bias is not None:
        torch.nn.init.zeros_(m.bias)

In [None]:
# Run weights_init over `net` so its Linear layers get the custom initialization
net.apply(weights_init)

In [None]:
# Creating the loss function (cross-entropy); it is read as a global by evaluate and both training functions
loss_function = nn.CrossEntropyLoss()

In [None]:
# Importing Classification Report to evaluate the result
from sklearn.metrics import classification_report

In [None]:
# Evaluate the network on a loader: classification report, loss value, true labels and predictions
def evaluate(model, test_loader):
    """Score ``model`` on every batch of ``test_loader`` with gradients disabled.

    The model is switched to eval mode and is NOT switched back; callers that
    keep training must call ``model.train()`` themselves.

    Returns a 4-tuple ``(report, loss, labels, predictions)``:
    the text classification report, the summed batch losses divided by the
    number of batches, and the flat arrays of true and predicted classes.
    """
    model.eval()
    total_loss = 0
    # Starting from empty float arrays keeps the joined results float64, exactly as
    # repeated np.append onto np.array([]) does.
    prediction_chunks = [np.array([])]
    label_chunks = [np.array([])]

    with torch.no_grad():
        for features, targets in test_loader:
            scores = model(features)
            # Predicted class = index of the largest score in each row
            _, predicted_classes = torch.max(scores, 1)
            total_loss += loss_function(scores, targets).item()  # accumulated per batch

            prediction_chunks.append(predicted_classes.cpu().detach().numpy().ravel())
            label_chunks.append(targets.cpu().detach().numpy().ravel())

    all_predictions = np.concatenate(prediction_chunks)
    all_labels = np.concatenate(label_chunks)
    mean_batch_loss = total_loss / len(test_loader)
    return classification_report(all_labels, all_predictions), mean_batch_loss, all_labels, all_predictions

In [None]:
# First, training the model on the labeled set only
from tqdm.notebook import tqdm as tqdm_notebook  # same progress bar, imported from its non-deprecated location
def train_supervised(model, train_loader, test_loader, epochs=50, lr=0.001):
    """Train ``model`` on labeled batches and report test metrics after every epoch.

    Args:
        model: network to optimize in place.
        train_loader: iterable of ``(features, targets)`` batches.
        test_loader: loader handed to ``evaluate`` at the end of each epoch.
        epochs: number of passes over ``train_loader`` (default 50).
        lr: Adam learning rate (default 0.001).

    Prints, per epoch, the mean training loss per batch, the classification
    report and the test loss. Returns nothing.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Average over batches (each loss value is already a batch mean); guard against an empty loader
    n_batches = max(len(train_loader), 1)
    model.train()
    for epoch in tqdm_notebook(range(epochs)):
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            output = model(X_batch) # Generating the output
            labeled_loss = loss_function(output, y_batch) # Calculating loss on labeled set
            # Now, doing Backpropagation
            optimizer.zero_grad() # Resetting gradients
            labeled_loss.backward() # Backward pass
            optimizer.step() # Updating weights
            running_loss += labeled_loss.item() # Accumulating the loss of every batch

        # evaluate returns (report, loss, labels, predictions); only the first two are needed here
        report, test_loss = evaluate(model, test_loader)[:2]
        print('\n Epoch: {} | Train Loss : {:.7f} \n Classification Report :\n  {} \n Test Loss : {:.7f} \n'.format(epoch, running_loss / n_batches, report, test_loss))
        model.train() # evaluate leaves the model in eval mode
    

In [None]:
# Now, running the function: supervised training of `net` on the labeled loader
train_supervised(net, train_loader, test_loader)

In [None]:
# Printing the test-set result of the model as it stands after supervised training;
# lbl / pred (true and predicted classes) are reused by the confusion-matrix cell below
report, test_loss,lbl,pred = evaluate(net, test_loader)
print('Classification Report : \n  {} \n Test Loss : {:.7f} '.format(report, test_loss))

In [None]:
# Confusion matrix for lbl (true classes) vs pred (predicted classes)
confusion = metrics.confusion_matrix(lbl, pred)
heatmap_ax = sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title('Confusion Matrix', size=15)
heatmap_ax.set_xlabel('Predictions', size=15)
heatmap_ax.set_ylabel('True Values', size=15)

In [None]:
# alpha_weight controls how much the unlabeled (pseudo-label) loss contributes to training.
T1 = 15  # below this step the unlabeled loss is switched off
T2 = 30  # above this step the unlabeled loss has its full weight
af = 3   # full weight
def alpha_weight(epoch):
    """Return the unlabeled-loss weight for a given step.

    0.0 before T1, ``af`` after T2, and a linear ramp from 0 to ``af`` for
    steps between T1 and T2 (both ends included).
    """
    if epoch < T1:
        return 0.0
    if epoch > T2:
        return af
    ramp_fraction = (epoch - T1) / (T2 - T1)
    return ramp_fraction * af

In [None]:
# Now, training the model on both the labeled set and the (pseudo-labeled) unlabeled set
import tqdm.notebook  # binds `tqdm` and makes sure the `notebook` submodule is actually loaded
# Creating logs for alpha_weight, test_loss & classification report (one entry per epoch)
alpha_log = []
test_loss_log = []
report_log=[]

def semisup_train(model, train_loader, unlabeled_loader, test_loader,
                  epochs=50, lr=0.001, start_step=10, labeled_every=2):
    """Pseudo-label training that alternates unlabeled and labeled updates.

    For every unlabeled batch the model first predicts pseudo labels (eval
    mode, no gradients), then takes one optimizer step on the cross-entropy
    against them, scaled by ``alpha_weight(step)``. After every
    ``labeled_every``-th unlabeled batch a full pass over ``train_loader`` is
    trained and ``step`` is increased by one.

    Args:
        model: network to optimize in place.
        train_loader: iterable of labeled ``(features, targets)`` batches.
        unlabeled_loader: iterable of 1-tuples ``(features,)``.
        test_loader: loader handed to ``evaluate`` after each epoch.
        epochs: number of passes over ``unlabeled_loader`` (default 50).
        lr: Adam learning rate (default 0.001).
        start_step: initial value of the counter fed to ``alpha_weight`` (default 10).
        labeled_every: run a labeled pass on every n-th unlabeled batch (default 2).

    Side effects: prints per-epoch metrics and appends to the module-level
    ``alpha_log``, ``report_log`` and ``test_loss_log`` lists. Returns nothing.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # A separate "step" counter (not the epoch number) drives alpha_weight
    step = start_step
    model.train()
    for epoch in tqdm.notebook.tqdm(range(epochs)):
        for batch_idx, unlabeled_batch in enumerate(unlabeled_loader):
            x_unlabeled = unlabeled_batch[0]

            # Forward pass in eval mode to get the pseudo labels; the labels are
            # plain class indices, so no autograd graph is needed for this pass.
            model.eval()
            with torch.no_grad():
                _, pseudo_labeled = torch.max(model(x_unlabeled), 1)
            model.train()

            # Unlabeled loss against the pseudo labels, scaled by the current alpha
            output = model(x_unlabeled)
            unlabeled_loss = alpha_weight(step) * loss_function(output, pseudo_labeled)

            optimizer.zero_grad() # Resetting gradients
            unlabeled_loss.backward() # Backward pass
            optimizer.step() # Updating weights

            # On every `labeled_every`-th unlabeled batch, train one full pass on labeled data
            if batch_idx % labeled_every == 0:
                for X_batch, y_batch in train_loader:
                    output = model(X_batch) # Generating output of the labeled data
                    labeled_loss = loss_function(output, y_batch) # Calculating loss of the labeled data
                    optimizer.zero_grad()
                    labeled_loss.backward()
                    optimizer.step()

                # Now step is increased by 1
                step += 1

        # evaluate returns (report, loss, labels, predictions); only the first two are needed here
        report, test_loss = evaluate(model, test_loader)[:2]
        current_alpha = alpha_weight(step)
        print('\n Epoch: {} | Alpha Weight : {:.5f} | \n Classification Report :\n  {} \n Test Loss : {:.7f} \n'.format(epoch, current_alpha, report, test_loss))

        # LOGGING VALUES
        alpha_log.append(current_alpha)
        report_log.append(report)
        test_loss_log.append(test_loss)

        model.train() # evaluate leaves the model in eval mode

In [None]:
# Now, running the function: continues training the same `net` with labeled + unlabeled data
semisup_train(net, train_loader, unlabeled_loader, test_loader)

In [None]:
# Printing the test-set result of the model as it stands after semi-supervised training
report, test_loss,_,_ = evaluate(net, test_loader)
print('Classification Report : \n {} | Test Loss : {:.7f} '.format(report, test_loss))

# Saving the weights
import os
# torch.save does not create missing parent directories, so make sure the folder exists first
os.makedirs('Saved_models', exist_ok=True)
torch.save(net.state_dict(), 'Saved_models/Semi_supervised_weight_with_ST.pt')

In [None]:
# Checking the saved weights by loading the state dict from the checkpoint file back into `net`
# NOTE(review): only load checkpoint files you trust -- confirm torch.load's safety options for your torch version
net.load_state_dict(torch.load('Saved_models/Semi_supervised_weight_with_ST.pt'))

In [None]:
# Classification report of `net` on the test set after reloading the saved weights;
# lbl1 / pred1 (true and predicted classes) are reused by the confusion-matrix cell below
report, test_loss,lbl1,pred1 = evaluate(net, test_loader)
print('Classification Report : \n {} | Test Loss : {:.7f} '.format(report, test_loss))

In [None]:
# Confusion matrix for lbl1 (true classes) vs pred1 (predicted classes)
confusion = metrics.confusion_matrix(lbl1, pred1)
heatmap_ax = sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title('Confusion Matrix', size=15)
heatmap_ax.set_xlabel('Predictions', size=15)
heatmap_ax.set_ylabel('True Values', size=15)