In [None]:
# define Class Imputeddatasets
class ImputedDatasets(Dataset):
    """Dataset yielding ``k`` randomly imputed copies of each incomplete row.

    The incomplete table is read from an RDS file and converted to a pandas
    DataFrame; the donor pools are taken from element ``2`` of the object read
    from ``HD_file``.  A cell is considered missing when it is NaN or equal to
    ``-2147483648`` (presumably R's integer NA surviving the conversion --
    TODO confirm).

    Attributes:
        HD: object returned by ``readRDS(HD_file)``.
        incplt_data: the incomplete data as a pandas DataFrame.
        donors: one donor pool per missing cell, in the order produced by
            ``donors_gen``.
        whichna: DataFrame with one row per missing cell; column 0 is the row
            position, column 1 the column position and column 2 the list of
            donors for that cell.
        k: number of imputed copies drawn per row.
    """

    # Value treated as "missing" in addition to NaN.
    _NA_SENTINEL = -2147483648

    def __init__(self, HD_file='HD_file.rds', incplt_data_filename='df.rds', k=2):
        """Load both RDS files and pre-compute the missing-cell/donor table.

        Args:
            HD_file: path of the RDS file holding the donor information.
            incplt_data_filename: path of the RDS file holding the incomplete data.
            k: number of imputed samples returned for each row.
        """
        super().__init__()
        self.HD = readRDS(HD_file)
        self.incplt_data_filename = incplt_data_filename
        self.k = k
        # Order matters: whichna_mis() reads both incplt_data and donors.
        self.incplt_data = self.incplt_data_rtopy()
        self.donors = self.donors_gen()
        self.whichna = self.whichna_mis()

    def incplt_data_rtopy(self):
        """Read the incomplete data from RDS and convert it to a pandas DataFrame."""
        r_frame = readRDS(self.incplt_data_filename)
        with localconverter(robjects.default_converter + pandas2ri.converter):
            return robjects.conversion.rpy2py(r_frame)

    def donors_gen(self):
        """Return the donor pools stored in ``self.HD[2]`` as a Python list.

        Entry ``i`` holds the donors of missing value number ``i``.
        """
        hd_donors = self.HD[2]
        # Index-based access is kept because HD is an R object, not a Python list.
        return [hd_donors[i] for i in range(len(hd_donors))]

    @staticmethod
    def _is_missing(value):
        """Tell whether a single cell value counts as missing."""
        return bool(pd.isna(value)) or value == ImputedDatasets._NA_SENTINEL

    def whichna_mis(self):
        """Build the table that maps every missing cell to its donor pool.

        Missing cells are enumerated column by column (and top to bottom inside
        a column); donor pool number ``n`` is attached to the ``n``-th missing
        cell in that order.

        Returns:
            DataFrame with columns ``0`` (row position), ``1`` (column
            position) and ``2`` (list of donors).
        """
        data = self.incplt_data
        missing = (data.isna() | data.eq(self._NA_SENTINEL)).to_numpy()
        # nonzero() on the transposed mask walks the cells column-major, which
        # is the enumeration order the donor pools are aligned with.
        col_idx, row_idx = np.nonzero(missing.T)
        table = [
            row_idx.tolist(),
            col_idx.tolist(),
            [list(pool) for pool in self.donors],
        ]
        return pd.DataFrame(table).transpose()

    def sampling_row(self, i):
        """Draw ``self.k`` imputed versions of row ``i``.

        Every missing cell is replaced by one donor picked uniformly at random
        (via the ``random`` module) from its pool; observed cells are copied
        unchanged.

        Args:
            i: positional index of the row in ``self.incplt_data``.

        Returns:
            float32 tensor of shape ``(k, 1, n_columns)``.
        """
        # Explicit positional access: plain ``row[j]`` on a label-indexed
        # Series is deprecated in recent pandas versions.
        values = self.incplt_data.iloc[i].tolist()

        # The donor pool of a cell does not depend on the draw, so resolve it
        # once per missing column instead of once per imputation.
        in_row = self.whichna.iloc[:, 0] == i
        donor_pools = {}
        for j, value in enumerate(values):
            if self._is_missing(value):
                match = self.whichna[in_row & (self.whichna.iloc[:, 1] == j)]
                donor_pools[j] = match.iloc[0, 2]

        draws = []
        for _ in range(self.k):
            sample = list(values)
            # Columns are visited in increasing order inside each imputation,
            # so the sequence of random draws is the same as a cell-by-cell loop.
            for j, pool in donor_pools.items():
                sample[j] = random.choices(pool)[0]
            # (n_columns,) -> (1, n_columns)
            draws.append(t.tensor(sample, dtype=t.float32).unsqueeze(0))
        return t.stack(draws)  # (k, 1, n_columns)

    def __len__(self):
        """Number of rows of the incomplete data."""
        return len(self.incplt_data)

    def __getitem__(self, idx):
        """Return the ``(k, 1, n_columns)`` tensor of imputed copies of row ``idx``."""
        return self.sampling_row(idx)

# defining functions
## Training functions

###RO-training
def max_train(epochs,train_loader_imp,model,criterion,optimizer,device):
    """
    Train ``model`` on the largest loss found along dimension 0 of each batch.

    For every batch the last feature is the target and the remaining features
    are the model input.  ``criterion`` is applied element-wise, the maximum
    over dimension 0 is taken (presumably the imputation axis -- TODO confirm)
    and the mean of those maxima is back-propagated.

    Parameters:
        epochs: number of passes over ``train_loader_imp``
        train_loader_imp: iterable of batches (tensors), must support ``len``
        model: callable mapping the inputs to predictions
        criterion: loss returning un-reduced values (it is reduced here)
        optimizer: optimizer updating the parameters of ``model``
        device: device the batch slices are moved to
    """
    for epoch in range(epochs):
        running_total = 0
        for batch in train_loader_imp:
            features = batch[..., :-1].to(device=device)
            targets = batch[..., [-1]].to(device=device)
            element_losses = criterion(model(features), targets)
            # t.max returns (values, indices); only the values are needed.
            worst_case, _ = t.max(element_losses, dim=0)
            loss = worst_case.mean()
            running_total += loss.item()
            loss.backward()        # compute gradients
            optimizer.step()       # update weights and biases
            optimizer.zero_grad()  # reset gradients for the next batch
        # `loss` is the loss of the last batch; the last value is the mean per batch.
        summary = (epoch, loss, running_total, running_total / len(train_loader_imp))
        print(" epoch = %4d  loss = %0.4f  epoch_loss = %0.4f  training_loss=%0.4f " % summary)
    print("Done ")

### Classic training
def train_model(model,criterion,optimizer,train_loader,val_loader,epochs,device):
    """
    Train ``model`` with the mean batch loss and track a validation loss.

    In every batch the last feature is the target and the remaining features
    are the input.  After each epoch the mean training loss per batch and the
    mean validation loss per batch are recorded; once training is finished
    both curves are plotted with matplotlib.

    Parameters:
        model: network to train (``model.train()`` is called once at the start)
        criterion: loss function; its output is averaged with ``t.mean``
        optimizer: optimizer updating the parameters of ``model``
        train_loader: iterable of training batches, must support ``len``
        val_loader: iterable of validation batches, must support ``len``
        epochs: number of epochs
        device: device the batches are moved to

    Returns:
        None.  Progress is printed every 100 epochs and a figure is shown.
    """
    model.train()
    train_loss = []
    val_loss = []
    for epoch in range(epochs):
        epoch_loss = 0
        for data in train_loader:
            X = data[..., :-1].to(device=device)   # inputs
            Y = data[..., [-1]].to(device=device)  # targets
            oupt = model(X)                        # predictions
            loss = t.mean(criterion(oupt, Y))      # average loss in the batch
            epoch_loss += loss.item()
            loss.backward()        # compute gradients
            optimizer.step()       # update weights
            optimizer.zero_grad()
        train_loss.append(epoch_loss / len(train_loader))

        # Validation phase.
        # NOTE(review): the model is left in training mode here (no
        # model.eval()), so dropout/batch-norm layers, if any, stay active.
        valid_loss = 0
        for data in val_loader:
            # Validation batches must live on the same device as the model,
            # exactly like the training batches.
            X = data[..., :-1].to(device=device)
            Y = data[..., [-1]].to(device=device)
            with t.no_grad():
                oupt = model(X)
                loss = t.mean(criterion(oupt, Y))
            valid_loss += loss.item()
        val_loss.append(valid_loss / len(val_loader))

        if epoch % 100 == 0:
            # `loss` is the most recently computed batch loss (the last
            # validation batch when val_loader is not empty).
            print(" epoch = %4d  loss = %0.4f  epoch_loss = %0.4f  training_loss=%0.4f Val_loss=%0.4f"  % \
                (epoch, loss, epoch_loss, epoch_loss/len(train_loader),valid_loss / len(val_loader)))
    print("Done ")

    plt.plot(train_loss,'-o')
    plt.plot(val_loss,'-o')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['Train','Valid'])
    plt.title('Train vs Valid Loss')
    # show must be *called*; a bare attribute reference displays nothing.
    plt.show()

## define testing functions 

def testing_loss(model,test_loader,device,imp=False):
    """ 
    """
    #percentage=1-
    test_loss=0
    for data in test_loader:
        (X,Y)=(data[...,:-1].to(device=device),data[...,[-1]].to(device=device))
        with t.no_grad():
            pred = model(X)
        loss=criterion(pred,Y)
        if imp:
            loss=t.mean(loss,dim=0)
        summed_loss= t.sum(loss)
        test_loss+=summed_loss.item()
    test_loss=test_loss/len(test_loader.dataset)
    if imp:
        test_loss=test_loss/0.4 #percentage
    return test_loss


def testing_max_loss(model,test_loader,device):# remove imp=false
    """
    """
    #percentage= 1-
    test_loss=0
    for data in test_loader:
    # ... is a numpy/torch notation for any dimension that is not selected explicitely
    #it allows me to work if the dataset is the imputed dataset
        (X,Y)=(data[...,:-1].to(device=device),data[...,[-1]].to(device=device))
        with t.no_grad():
            pred=model(X)
        loss=criterion(pred,Y)
        loss=t.max(loss,dim=0)[0]
        #changed for sum rather than mean
        summed_loss=t.sum(loss)
        test_loss+=summed_loss.item()
    test_loss=test_loss/len(test_loader.dataset)*0.4 # the percentage of the test_imputed data 
    return test_loss

In [None]:
def instanciate_train_test_batch_sampler(length_dataset,percentage,batch_size,shuffle_train=True,shuffle_test=False,drop_last=False):
    """
    Create one batch sampler for training and one for testing.

    The positions ``0 .. length_dataset-1`` are randomly permuted with NumPy's
    global random generator; the first ``int(percentage*length_dataset)``
    positions go to the training sampler and the rest to the testing sampler.

    Parameters:
        length_dataset: int, length of the dataset to sample
        percentage: fraction of the dataset assigned to the training set
        batch_size: number of indices per batch
        shuffle_train: bool, whether the training indices are re-shuffled on each pass
        shuffle_test: bool, whether the testing indices are re-shuffled on each pass
        drop_last: bool, whether to drop a final incomplete batch
    Returns:
        tuple ``(train_batch_sampler, test_batch_sampler)``
    """
    permuted = np.random.choice(length_dataset,size=length_dataset,replace=False)
    cut = int(percentage*length_dataset)

    def as_sampler(subset, reshuffle):
        # Without re-shuffling the fixed index array itself acts as the sampler.
        return t.utils.data.SubsetRandomSampler(subset) if reshuffle else subset

    train_batches = t.utils.data.BatchSampler(as_sampler(permuted[:cut], shuffle_train), batch_size, drop_last)
    test_batches = t.utils.data.BatchSampler(as_sampler(permuted[cut:], shuffle_test), batch_size, drop_last)
    return train_batches, test_batches

In [None]:
%%capture 
# DATA AFTER IMPUTATION
# This cell builds four imputed versions of the data (random donor draws,
# mean, KNN, MissForest) and the DataLoaders used for training/evaluation.
# NOTE(review): `incplt_data` and `data1T` are not defined in this cell;
# they are assumed to come from an earlier cell -- TODO confirm.

## RO-imputed data: every row is returned as k_train randomly imputed copies
k_train=1000
results = ImputedDatasets(HD_file='HD_data1_MAR.rds',incplt_data_filename='scaled_data1_MAR.rds',k=k_train) 
# Disabled alternative: a separate imputed dataset dedicated to testing.
# random.seed(4)
# k_test=100
# results_test = ImputedDatasets(HD_file='HD_data1.rds',incplt_data_filename='scaled_data1.rds',k=k_test)
## Mean-imputation data: NaNs replaced by the column means, then cast to float32
mean_imp=incplt_data.fillna(incplt_data.mean())     
mean_imp=t.tensor(mean_imp.values).to(t.float32)
## KNN-imputed data (3 neighbours), cast to a float32 tensor
imputer = KNNImputer(n_neighbors=3)
data1_KNN= imputer.fit_transform(incplt_data)
knn_data1=pd.DataFrame(data1_KNN)
KNN_imp=t.tensor(knn_data1.values).to(t.float32)
## MissForest-imputed data (default settings), cast to a float32 tensor
imputer = MissForest()
data1_MF= imputer.fit_transform(incplt_data)
MF_data=pd.DataFrame(data1_MF)
MF_imp=t.tensor(MF_data.values).to(t.float32)


# TRAIN/TEST LOADERS
# Split sizes: 60% train, 20% validation, remainder test (based on len(data1T)).
trsize=np.around(len(data1T)*0.6)
valsize=np.around(len(data1T)*0.2)
testsize=len(data1T)-trsize-valsize 
## train/val/test loaders for the complete data
# NOTE(review): each random_split call below draws its own random partition,
# so the complete/mean/KNN/MF splits are not guaranteed to contain the same rows.
train,val,test_cplt=random_split(data1T,[int(trsize),int(valsize),int(testsize)])
batch_size=64
# NOTE(review): shuffle=False here and for the MF train loader, but
# shuffle=True for the mean and KNN train loaders -- confirm this is intended.
train_loader_cplt=DataLoader(train,batch_size,shuffle=False)
val_loader_cplt=DataLoader(val,batch_size,shuffle=False)
test_loader_cplt=DataLoader(test_cplt,batch_size,shuffle=False)

## train/test loaders for the RO-imputed data
# Each dataset item is one row's stack of imputed copies; the items of a
# batch are concatenated along dimension 1.
collate_fn = lambda x: t.cat(x,dim=1)
# 60% of the rows for training, batches of 64 indices; no validation part here.
train_batch_sampler,test_batch_sampler=instanciate_train_test_batch_sampler(len(results),0.6,64,shuffle_train=True,shuffle_test=False,drop_last=False)
train_loader_imp = DataLoader(results, batch_sampler=train_batch_sampler,shuffle=False, collate_fn=collate_fn)
# Seeds Python's `random` module, which ImputedDatasets uses to draw donors.
# NOTE(review): the draws happen when a loader is iterated, not when it is
# built, so this seed affects whichever loader is iterated next -- confirm.
random.seed(4)
test_loader_imp = DataLoader(results, batch_sampler=test_batch_sampler, shuffle=False, collate_fn=collate_fn)
## train/val/test loaders for the mean-imputed data
# `train` and `val` are rebound by every split; only the test subsets keep distinct names.
train,val,test_mean=random_split(mean_imp,[int(trsize),int(valsize),int(testsize)])
batch_size=64
mean_train_loader=DataLoader(train,batch_size,shuffle=True)
mean_val_loader=DataLoader(val,batch_size,shuffle=False)
mean_test_loader=DataLoader(test_mean,batch_size,shuffle=False)

## train/val/test loaders for the KNN-imputed data
train,val,test_knn=random_split(KNN_imp,[int(trsize),int(valsize),int(testsize)])
batch_size=64
KNN_train_loader=DataLoader(train,batch_size,shuffle=True)
KNN_val_loader=DataLoader(val,batch_size,shuffle=False)
KNN_test_loader=DataLoader(test_knn,batch_size,shuffle=False)

## train/val/test loaders for the MissForest-imputed data
train,val,test_mf=random_split(MF_imp,[int(trsize),int(valsize),int(testsize)])
batch_size=64
MF_train_loader=DataLoader(train,batch_size,shuffle=False)
MF_val_loader=DataLoader(val,batch_size,shuffle=False)
MF_test_loader=DataLoader(test_mf,batch_size,shuffle=False)

Want to learn more?
Contact me :)