In [378]:
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import random
import pickle

In [379]:
# Load the data
with open('all_data.pkl', 'rb') as f:
    all_data = pickle.load(f)

# shuffle the data
random.shuffle(all_data)

# split the data into train and test
train_data, test_data = train_test_split(all_data, test_size=0.2)




In [380]:
# some encodings

with open('./kdd21-MLVis-main/data/1k/tmp/meta_variable_mapping.pkl', 'rb') as f:
    meta_variable_mapping = pickle.load(f)

with open('./kdd21-MLVis-main/data/1k/tmp/wide-and-deep-config2id.pkl','rb') as f:
    config2id = pickle.load(f)

with open('./kdd21-MLVis-main/data/1k/tmp/wide-and-deep-dataset2id.pkl','rb') as f:
    dataset2id = pickle.load(f)

# one_hot_encoding of 60 numerical features
one_hot_c = np.eye(60)
# # print(config2id.values())

In [381]:

# feature extraction
def get_sparse_features(feature,n):
    # normalize the feature to [0,1], by substracting minimum and dividing by range
    # print(type(feature[0]))
    # convert the feature into a a numpy array of numbers
    feature = np.array(feature)
    # print(type(feature[0]))

    feature = (feature - feature.min())/(feature.max() - feature.min())

    # # print(feature)
    # divide the range[0,1] into n bins
    bins = np.linspace(0,1,n+1)
    # print(bins)
    # get the index of the bin that each feature belongs to
    feature = np.digitize(feature,bins)
    # print(feature)
    # convert the index to a sparse feature
    feature = np.eye(n+2)[feature]
    # make it to a 1D array
    # print(feature[0][0])
    feature = feature.reshape(-1)
    return feature
    




In [382]:
def generate_s1(n,b=4):
    # return an array of 4 vectors of length n
    # each vector is a sparse feature of a numerical feature
    # and has 0 and 1 as values with 0.8 probability and 0.2 probability respectively
    s1 = np.random.choice([0,1],size=(b,n),p=[0.8,0.2])
    return torch.tensor(s1).double()


In [383]:
# defining the model

s1_len = 1006
dc_len = 10
dx_len = 2*1006
sc_len = 60
sx_len = 2*1006*5

wide_len = sx_len + sc_len + s1_len
deep_len = dc_len + dx_len

class Model(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()

        # self.wide_len = wide_len
        # self.deep_len = deep_len
        
        # matrix to extract dense features
        self.dense_c = nn.Linear(sc_len, dc_len).double()

        # wide model parameters
        self.wide_w = nn.Linear(wide_len,100).double()
        self.wide_b = nn.Parameter(torch.zeros(100)).double()

        # deep model parameters
        # deep model is a 3 layer MLP, with 2*1006+dc_len neurons in the first layer, 1006 neurons in the second layer, and 1 neuron in the third layer, 
        # with relu activation function
        self.deep1_w = nn.Linear(deep_len,500).double()
        self.deep1_b = nn.Parameter(torch.zeros(500)).double()
        self.deep2_w = nn.Linear(500,100).double()
        self.deep2_b = nn.Parameter(torch.zeros(100)).double()
        # ReLU activation function
        self.relu = nn.ReLU()

        # final score parameters
        self.final_w = nn.Linear(100,1).double()
        self.final_d = nn.Linear(100,1).double()
        self.final_b = nn.Parameter(torch.zeros(1)).double()

        self.sigmoid = nn.Sigmoid()
                


    def forward(self,dense,sparse,config):

        dx = dense
        sx = sparse
        
        sc = np.eye(60)[config]
        dx = torch.tensor(dx)
        # sc = np.eye(60)[sc]
        sc = torch.tensor(sc)
        sx = torch.tensor(sx)

        dc = self.dense_c(sc)
        
        s1 = generate_s1(s1_len)
        # concatanete dc and dx to get d
        try:
            d = torch.cat((dc,dx),1)
            s = torch.cat((sx,sc),1)
            

        except:
            # print(dc.shape)
            # print(dx.shape)
            d = torch.cat((dc,dx))
            s = torch.cat((sx,sc))
            s1 = generate_s1(s1_len,1)
            s1 = s1.reshape(-1)
            # print(dc)
            # print(dx)
        # try:
        # s1 = torch.tens(s1)
        try:    
            wide_s = torch.cat((s,s1),1)
        except:
            # print(s.shape)
            # print(s1.shape)
            wide_s = torch.cat((s,s1))
        wide_s = torch.tensor(wide_s)
        
        wide = self.wide_w(wide_s) + self.wide_b
        
        d = torch.tensor(d)
        
    
        # print("error at deep encoding")
    
    
        deep = self.deep1_w(d) + self.deep1_b
        deep = self.relu(deep)
        deep = self.deep2_w(deep) + self.deep2_b
        deep = self.relu(deep)
        
        wide = torch.tensor(wide)
        deep = torch.tensor(deep)
        final = self.final_w(wide) + self.final_d(deep) + self.final_b
        
        final = torch.tensor(final)
        final = self.sigmoid(final)
        
        return final

In [384]:
# training the model

train_data = np.array(train_data)

def get_dense_x(data):

    if(len(data) == 5):
        var1 = data[1]
        var2 = data[2]
    if(len(data) == 4):
        var1 = data[1]
    
    d1 = meta_variable_mapping[var1]
    if(len(data) == 5):
        d2 = meta_variable_mapping[var2]
        
        # concatenate d1 and d2
        dx = torch.cat((d1,d2))
    else:
        dx = d1
    
    # concatenate d1 and d2 to get dx
    # dx = torch.cat((d1,d2),1)

    return dx

def get_sparse_c(data):
    config_id = config2id[data[-2]]
    # get the one hot encoding of the config id
    sc = one_hot_c[config_id]

    return sc



  train_data = np.array(train_data)


In [385]:
modified_data = []
sparse_vactors = []
print(len(all_data))
for data in all_data:

    dataset = data[0]
    try:
        dataset = dataset2id[dataset]
        # print(dataset)
    except:
        continue

    var1 = data[1]
    try:    
        var1 = meta_variable_mapping[var1]
        dense = var1
    except:
        continue
    if(len(data) == 5):
        var2 = data[2]
        try:
            var2 = meta_variable_mapping[var2]
            dense = np.concatenate((var1,var2))
            
        except:
            continue
    if(len(data) == 4):
        var2 = torch.zeros(1006)
        try:
            dense = np.concatenate((var1,var2))
        except:
            continue
        
    config = data[-2]
    try:
        config = config2id[config]
    except:
        continue
    try:
        label = int(data[-1])
    except:
        continue
    
    # dense = np.concatenate((var1,var2))
    var1 = get_sparse_features(var1,3)
    if(len(data) == 5):
        var2 = get_sparse_features(var2,3)
        var1 = np.concatenate((var1,var2))
    if(len(data) == 4):
        var2 = np.zeros(1006*5)
        var1 = np.concatenate((var1,var2))

    # make a list of the modified data
    sparse = var1
    modified_data.append([dataset,dense,sparse,config,label])

import torch.nn as nn
# import Dataloder utility from pytorch
from torch.utils.data import DataLoader

batch_size = 4
epochs = 3

train_data , test_data = train_test_split(modified_data,test_size=0.2,random_state=42)

# divide train into batches of 4 using dataloader
train_batches = DataLoader(train_data,batch_size=batch_size,shuffle=True)

# divide test into batches of 4 using dataloader
# test_test = DataLoader(test_data,batch_size=batch_size,shuffle=True)


8531


In [386]:
def train():
    model = Model()
    # define loss function
    loss_fn = nn.BCELoss()
    # define optimizer
    optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
    # train the model
    for epoch in range(epochs):
        for batch in train_batches:
            
            if(len(batch) != 4):
                continue
            # make gradients zero
            optimizer.zero_grad()

            dense = torch.tensor(batch[1])
            sparse = torch.tensor(batch[2])
            config = (batch[3])
            label = batch[4]

            
            
            pred = model(dense,sparse,config)
            try:
                loss = loss_fn(pred,label)
                loss.backward()
                optimizer.step()
            except:
                continue
        try:    
            print("epoch: ",epoch," loss: ",loss.item())
        except:
            print()
    return model
        
            
    

In [387]:
# training the model
import time
print(time.ctime())
model = train()

# separate data based on label 0 and 1
def separate_data(data):
    data_0 = []
    data_1 = []
    for d in data:
        if(d[-1] == 0):
            data_0.append(d)
        else:
            data_1.append(d)
    return data_0,data_1

# get the data with label 0 and 1
data_0,data_1 = separate_data(test_data)

# predict the labels

score = 0

for data in data_0:
    dense = torch.tensor(data[1])
    sparse = torch.tensor(data[2])
    config = data[3]
    label = data[4]
    pred = model(dense,sparse,config)
    pred = pred.item()
    if(pred < 0.5):
        score += 1

print("score_0: ",score/len(data_0))

score = 0
for data in data_1:
    dense = torch.tensor(data[1])
    sparse = torch.tensor(data[2])
    config = data[3]
    label = data[4]
    pred = model(dense,sparse,config)
    pred = pred.item()
    if(pred < 0.5):
        score += 1

print("score_1: ",score/len(data_1))

        



Fri Dec  2 01:25:02 2022





  dx = torch.tensor(dx)
  sx = torch.tensor(sx)
  wide_s = torch.tensor(wide_s)
  d = torch.tensor(d)
  wide = torch.tensor(wide)
  deep = torch.tensor(deep)
  final = torch.tensor(final)


score_0:  0.8871951219512195
score_1:  0.9691629955947136


In [400]:
import pickle

# save the model
with open('model.pkl','wb') as f:
    pickle.dump(model,f)

with open('model.pkl','rb') as f:
    model = pickle.load(f)
    print(model)

# save model parameters
torch.save(model.state_dict(),'model_params.pkl')

with open('model_params.pkl','rb') as f:
    model.load_state_dict(torch.load(f))
    print(model)
weights = []
j = 0
for i in model.parameters():
    weights.append(i)
    print(i)
    j += 1
print(j)

with open('weights.pkl','wb') as f:
    pickle.dump(weights,f)

# print(len(model.parameters()))

Model(
  (dense_c): Linear(in_features=60, out_features=10, bias=True)
  (wide_w): Linear(in_features=11126, out_features=100, bias=True)
  (deep1_w): Linear(in_features=2022, out_features=500, bias=True)
  (deep2_w): Linear(in_features=500, out_features=100, bias=True)
  (relu): ReLU()
  (final_w): Linear(in_features=100, out_features=1, bias=True)
  (final_d): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)
Model(
  (dense_c): Linear(in_features=60, out_features=10, bias=True)
  (wide_w): Linear(in_features=11126, out_features=100, bias=True)
  (deep1_w): Linear(in_features=2022, out_features=500, bias=True)
  (deep2_w): Linear(in_features=500, out_features=100, bias=True)
  (relu): ReLU()
  (final_w): Linear(in_features=100, out_features=1, bias=True)
  (final_d): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)
Parameter containing:
tensor([[ 0.0526,  0.0421,  0.0063, -0.0086, -0.0531, -0.0808, -0.0919, -0.1165,
          0.053

In [388]:
model = Model()
data = None
for i in train_batches:
    data = i
print(data)

dense = torch.tensor(data[1])
sparse = torch.tensor(data[2])
config = data[3]
label = data[4]


dx = dense
sx = sparse

sc = np.eye(60)[config]
dx = torch.tensor(dx)


# sc = np.eye(60)[sc]
sc = torch.tensor(sc)
sx = torch.tensor(sx)
dc = torch.randn((4,20))
print("dx:",dx.shape)
print("sc",sc.shape)
print("sx",sx.shape)
print("dc",dc.shape)
# dc = Model.dense_c(sc)

# s1 = generate_s1(s1_len)
# concatanete dc and dx to get d
d = torch.cat((dc,dx),1)

s = torch.cat((sx,sc),1)
print("d",d.shape)
print("s",s.shape)
# s1 = torch.tens(s1)
# wide_s = torch.cat((s,s1),1)
# wide_s = torch.tensor(wide_s)

[tensor([344, 642]), tensor([[0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.]], dtype=torch.float64), tensor([[0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.]], dtype=torch.float64), tensor([22, 32]), tensor([1, 1])]
dx: torch.Size([2, 2012])
sc torch.Size([2, 60])
sx torch.Size([2, 10060])
dc torch.Size([4, 20])


  dense = torch.tensor(data[1])
  sparse = torch.tensor(data[2])
  dx = torch.tensor(dx)
  sx = torch.tensor(sx)


RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 4 but got size 2 for tensor number 1 in the list.