In [1]:
# Change the kernel's working directory to the FSR project folder.
# NOTE(review): absolute, machine-specific path -- adjust before running on another machine.
%cd D:\Joe\Acads\Sem8\DDP\Code files\FSR

D:\Joe\Acads\Sem8\DDP\Code files\FSR


In [27]:
import torch
from torch import nn 
from nn_config import NN_config

# Select the first CUDA device when PyTorch reports one, otherwise the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
	device = torch.device("cuda:0")
else:
	device = torch.device("cpu")

class NN_Large_Predictor(nn.Sequential):
	'''
	Autoencoder-based predictor with optional side-feature autoencoders.

	The main input is encoded to a latent code and decoded back to a
	sigmoid-activated reconstruction. When enabled by the config, categorical
	and numeric side features each get their own encoder/decoder pair. The
	latent codes of all enabled branches are concatenated and passed through a
	feed-forward predictor head.

	NOTE(review): the class derives from nn.Sequential but overrides forward();
	the base class is left as it was.
	'''

	@staticmethod
	def _make_mlp(dims):
		'''Stack Linear layers over consecutive widths in `dims`, with an in-place ReLU between layers.'''
		layers = []
		for position, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
			if position > 0:
				layers.append(nn.ReLU(True))
			layers.append(nn.Linear(in_dim, out_dim))
		return nn.Sequential(*layers)

	def make_encoder(self, enc_dims, dec_dims):
		'''
		Build an encoder/decoder pair of fully connected stacks.

		:param enc_dims: layer widths of the encoder, input width first
		:param dec_dims: layer widths of the decoder, input width first
		:return: (encoder, decoder), two nn.Sequential modules
		'''
		# The encoder is created before the decoder, as in the original loops,
		# so seeded weight initialisation consumes random numbers in the same order.
		return self._make_mlp(enc_dims), self._make_mlp(dec_dims)

	def __init__(self, **config):
		'''
		:param config: keyword configuration.
			Always read: input_dim, encode_fc1_dim, encode_fc2_dim,
			decode_fc1_dim, decode_fc2_dim, cat_features_size,
			num_features_size, predict_dim, predict_out_dim, dropout,
			ffn_num_layers.
			Read when cat_features_size > 0: cat_encode_fc1_dim,
			cat_encode_fc2_dim, cat_decode_fc1_dim, cat_decode_fc2_dim.
			Read when num_features_size > 0: extra_features and, if it is
			truthy, num_encode_fc1_dim, num_encode_fc2_dim,
			num_decode_fc1_dim, num_decode_fc2_dim.
		'''
		super(NN_Large_Predictor, self).__init__()
		self.input_dim = config['input_dim']
		self.encode_fc1_dim = config['encode_fc1_dim']
		self.encode_fc2_dim = config['encode_fc2_dim']
		self.decode_fc1_dim = config['decode_fc1_dim']
		self.decode_fc2_dim = config['decode_fc2_dim']
		self.cat_flag = False
		self.num_flag = False

		# Width of the predictor input: the main latent code plus the latent
		# code of every enabled side-feature branch.
		self.ffn_input = config['encode_fc2_dim']

		if config['cat_features_size'] > 0:
			self.cat_flag = True
			self.cat_input_dim = config['cat_features_size']
			self.cat_encode_fc1_dim = config['cat_encode_fc1_dim']
			self.cat_encode_fc2_dim = config['cat_encode_fc2_dim']
			self.cat_decode_fc1_dim = config['cat_decode_fc1_dim']
			self.cat_decode_fc2_dim = config['cat_decode_fc2_dim']
			cat_enc_dims = [self.cat_input_dim, self.cat_encode_fc1_dim, self.cat_encode_fc2_dim]
			cat_dec_dims = [self.cat_encode_fc2_dim, self.cat_decode_fc1_dim, self.cat_decode_fc2_dim]
			self.cat_encoder, self.cat_decoder = self.make_encoder(cat_enc_dims, cat_dec_dims)
			self.ffn_input += cat_enc_dims[-1]

		# Truthiness test, matching how the notebook itself checks 'extra_features'.
		if config['num_features_size'] > 0 and config['extra_features']:
			self.num_flag = True
			self.num_input_dim = config['num_features_size']
			self.num_encode_fc1_dim = config['num_encode_fc1_dim']
			self.num_encode_fc2_dim = config['num_encode_fc2_dim']
			self.num_decode_fc1_dim = config['num_decode_fc1_dim']
			self.num_decode_fc2_dim = config['num_decode_fc2_dim']
			num_enc_dims = [self.num_input_dim, self.num_encode_fc1_dim, self.num_encode_fc2_dim]
			num_dec_dims = [self.num_encode_fc2_dim, self.num_decode_fc1_dim, self.num_decode_fc2_dim]
			self.num_encoder, self.num_decoder = self.make_encoder(num_enc_dims, num_dec_dims)
			self.ffn_input += num_enc_dims[-1]

		self.predict_dim = config['predict_dim']
		self.predict_out_dim = config['predict_out_dim']

		self.dropout_ratio = config['dropout']
		self.ffn_num_layers = config['ffn_num_layers']
		print("input dim:", self.input_dim, "encode dim:", self.encode_fc1_dim)

		enc_dims = [self.input_dim, self.encode_fc1_dim, self.encode_fc2_dim]
		dec_dims = [self.encode_fc2_dim, self.decode_fc1_dim, self.decode_fc2_dim]
		self.encoder, self.decoder = self.make_encoder(enc_dims, dec_dims)

		self.create_predictor()

	def create_predictor(self):
		'''
		Build the feed-forward head and store it as `self.predictor`.

		With ffn_num_layers == 1 the head is dropout followed by one Linear
		layer. For any other value it is
		Linear(ffn_input, predict_dim) -> Linear(predict_dim, predict_dim//2)
		-> (ffn_num_layers - 3) x Linear(predict_dim//2, predict_dim//2)
		-> Linear(predict_dim//2, predict_out_dim),
		each Linear preceded by dropout and, except the first, by ReLU.

		NOTE(review): ffn_num_layers values 2 and 3 both produce three Linear
		layers; kept as is so the architecture does not change.
		'''
		dropout = nn.Dropout(self.dropout_ratio)
		activation = nn.ReLU()
		half_dim = self.predict_dim // 2

		if self.ffn_num_layers == 1:
			predictor = [dropout, nn.Linear(self.ffn_input, self.predict_out_dim)]
		else:
			predictor = [
				dropout,
				nn.Linear(self.ffn_input, self.predict_dim),
				activation,
				dropout,
				nn.Linear(self.predict_dim, half_dim),
			]
			for _ in range(self.ffn_num_layers - 3):
				predictor.extend([activation, dropout, nn.Linear(half_dim, half_dim)])
			predictor.extend([activation, dropout, nn.Linear(half_dim, self.predict_out_dim)])

		# Create predictor model
		self.predictor = nn.Sequential(*predictor)

	def forward(self, v_D, cat_features=None, num_features=None):
		'''
		:param v_D: main input batch (original note: batch_size x eta, multi-hot vector)
		:param cat_features: categorical feature batch; required when the
			categorical branch is enabled (self.cat_flag)
		:param num_features: numeric feature batch; required when the numeric
			branch is enabled (self.num_flag)
		:return: (recon, cat_recon, num_recon, score, Z_D) where recon,
			cat_recon and num_recon are sigmoid reconstructions (None for a
			disabled branch), score is the predictor output and Z_D is the
			concatenated latent code fed to the predictor
		'''
		# Move inputs to wherever the model's weights live instead of relying
		# on a module-level global that may disagree with the model's device.
		target_device = next(self.parameters()).device

		Z_D = self.encoder(v_D.to(target_device))
		recon = torch.sigmoid(self.decoder(Z_D))

		cat_recon = None
		num_recon = None

		if self.cat_flag:
			assert cat_features is not None, "cat_features is required when the categorical branch is enabled"
			cat_e = self.cat_encoder(cat_features.to(target_device))
			cat_recon = torch.sigmoid(self.cat_decoder(cat_e))
			Z_D = torch.cat([Z_D, cat_e], dim=1)

		# The numeric branch goes through its autoencoder, so the code width
		# matches self.ffn_input as computed in __init__.
		if self.num_flag:
			assert num_features is not None, "num_features is required when the numeric branch is enabled"
			num_e = self.num_encoder(num_features.to(target_device))
			num_recon = torch.sigmoid(self.num_decoder(num_e))
			Z_D = torch.cat([Z_D, num_e], dim=1)

		score = self.predictor(Z_D)
		return recon, cat_recon, num_recon, score, Z_D

In [28]:
# Build the configuration object from the project's nn_config module.
# Later cells read and write it by key and unpack it as keyword arguments.
config = NN_config()

In [29]:
import os 
import numpy as np
from chemprop.features import load_features
import pickle

# Resolve dataset locations. A 'main_dir' entry in the config overrides the
# default of the current working directory.
main_dir = '.\\'
if 'main_dir' in config.keys():
    main_dir = config['main_dir']
data_dir = os.path.join(main_dir, 'Data')
data_path = os.path.join(data_dir, config['file_name'])

# Index of the feature column that is log-scaled because of its huge range.
LOG_SCALED_COLUMN = 32

if config['extra_features']:
    features_path = os.path.join(data_dir, config['features_filename'])
    # np.array copies, so the in-place scaling below works on our own buffer.
    features_data = np.array(load_features(features_path))
    features_data[:, LOG_SCALED_COLUMN] = np.log(features_data[:, LOG_SCALED_COLUMN] + 1)

    categorical_idx = config['feature_categorical_columns']
    if categorical_idx is not None:
        # Build the column mask once and use it (and its negation) for the split.
        is_categorical = np.array(
            [i in categorical_idx for i in range(features_data.shape[1])], dtype=bool
        )
        numeric = features_data[:, ~is_categorical]
        categorical = features_data[:, is_categorical]

        # TODO: save the OHE from a large corpus
        # The fitted encoders are looked up under data_dir rather than at a
        # hard-coded absolute path, so the notebook follows 'main_dir'.
        ohe_path = os.path.join(data_dir, 'features', 'ohe_list')
        # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
        with open(ohe_path, "rb") as f:
            ohe_list = pickle.load(f)

        n_categorical = categorical.shape[1]
        if len(ohe_list) < n_categorical:
            raise ValueError(
                "ohe_list holds %d encoders but there are %d categorical columns"
                % (len(ohe_list), n_categorical)
            )

        # Encode each categorical column with its own encoder, then join once.
        encoded_blocks = []
        for col in range(n_categorical):
            encoded = ohe_list[col].transform(categorical[:, col].reshape(-1, 1))
            # Encoders configured for sparse output return a matrix with
            # .toarray(); densify it so the blocks can be concatenated.
            if hasattr(encoded, 'toarray'):
                encoded = encoded.toarray()
            encoded_blocks.append(np.asarray(encoded))
        if encoded_blocks:
            cat_encoded = np.concatenate(encoded_blocks, axis=1)
        else:
            cat_encoded = np.empty((categorical.shape[0], 0))

        features_data = [cat_encoded, numeric]
        config['cat_features_size'] = cat_encoded.shape[1]
        config['cat_decode_fc2_dim'] = config['cat_features_size']
    else:
        # NOTE(review): without categorical columns there is nothing to
        # one-hot encode; the size is set to 0 here instead of unpacking the
        # shape of a single row, which raised -- confirm this is the intended fallback.
        config['cat_features_size'] = 0

In [30]:
# Instantiate the network from the config mapping (unpacked as keyword arguments).
model = NN_Large_Predictor(**config)

input dim: 2586 encode dim: 1000


In [31]:
# Print every registered (sub)module with a running index.
named_submodules = list(model.named_modules())
for idx, (name, item) in enumerate(named_submodules):
    print(idx, name, item)
# Leave idx at the number of modules printed, as a manual counter would.
idx = len(named_submodules)

0  NN_Large_Predictor(
  (cat_encoder): Sequential(
    (0): Linear(in_features=2153, out_features=300, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=300, out_features=20, bias=True)
  )
  (cat_decoder): Sequential(
    (0): Linear(in_features=20, out_features=300, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=300, out_features=2153, bias=True)
  )
  (num_encoder): Sequential(
    (0): Linear(in_features=92, out_features=70, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=70, out_features=30, bias=True)
  )
  (num_decoder): Sequential(
    (0): Linear(in_features=30, out_features=70, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=70, out_features=92, bias=True)
  )
  (encoder): Sequential(
    (0): Linear(in_features=2586, out_features=1000, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=1000, out_features=200, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=200, out