## Multiomics GRN inference evaluation
## VAE-SEM model
### by Jalil Nourisa

In [1]:
!pip install anndata category_encoders

Collecting anndata
  Using cached anndata-0.10.7-py3-none-any.whl.metadata (6.6 kB)
Collecting category_encoders
  Using cached category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting array-api-compat!=1.5,>1.4 (from anndata)
  Using cached array_api_compat-1.7-py3-none-any.whl.metadata (1.5 kB)
Collecting h5py>=3.1 (from anndata)
  Using cached h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.5 kB)
Collecting natsort (from anndata)
  Using cached natsort-8.4.0-py3-none-any.whl.metadata (21 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Using cached statsmodels-0.14.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.2 kB)
Collecting patsy>=0.5.1 (from category_encoders)
  Using cached patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Using cached anndata-0.10.7-py3-none-any.whl (122 kB)
Using cached category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
Using cached array_api_compat-1.7-py3-none-any.

In [1]:
import torch 
import anndata as ad
import pandas as pd
import numpy as np
import tqdm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error 
import os
import random 
import category_encoders 


# Environment variable consumed by cuBLAS; it is set before any CUDA work starts.
# NOTE(review): presumably required for deterministic GPU runs - confirm.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

SEED = 0xCAFE

# Ask for the GPU, then downgrade to CPU when CUDA is not available.
USE_GPU = True
USE_GPU = USE_GPU and torch.cuda.is_available()
print('using device: cuda' if USE_GPU else 'using device: cpu')

def seed_everything(seed: int = 42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and,
    when available, all CUDA devices), and switches cuDNN to deterministic,
    non-benchmarking mode.

    Args:
        seed: Seed value. Defaults to 42, the value that used to be hard-coded
            here, so ``seed_everything()`` behaves exactly as before. Note that
            the notebook-level ``SEED`` constant is not picked up automatically;
            pass it explicitly if it should be used.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TOKENIZERS_PARALLELISM'] = 'true'
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # trade speed for run-to-run reproducibility of cuDNN kernels
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    print('-----Seed Set!-----')
seed_everything()

def sign_grn(grn):
    """Return a copy of an edge list whose ``weight`` column holds only the sign.

    Positive weights become 1, negative weights become -1 and zero (or missing)
    weights become 0. Previously every non-positive weight, including an exact
    zero, was reported as -1.

    Args:
        grn: DataFrame with a ``weight`` column. It is not modified.

    Returns:
        A new DataFrame with the same rows and columns and an integer ``weight``
        column in {-1, 0, 1}.
    """
    grn_sign = grn.copy()
    weights = grn_sign['weight']
    # comparisons are False for NaN, so missing weights end up as 0
    grn_sign['weight'] = (weights > 0).astype(int) - (weights < 0).astype(int)
    return grn_sign
def shuffle_grn(grn, random_state=None):
    """Return a copy of an edge list with sources and targets permuted independently.

    The ``source`` and ``target`` columns are each shuffled on their own, every
    other column keeps its row order, and rows that end up with an already-seen
    (source, target) pair are dropped.

    Args:
        grn: DataFrame with ``source`` and ``target`` columns. It is not modified.
        random_state: Optional integer seed. ``None`` (default) draws from
            NumPy's global generator, as before.

    Returns:
        The shuffled DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If duplicated (source, target) pairs remain after
            de-duplication (defensive check).
    """
    # A single RandomState is shared by both draws: handing the same integer
    # seed to each sample() call would permute both columns identically and
    # leave every original pair intact.
    rng = None if random_state is None else np.random.RandomState(random_state)

    grn_s = grn.copy()
    for column in ('source', 'target'):
        # to_numpy() assigns by position; assigning the re-indexed Series would
        # align on labels and produce NaN whenever grn lacks a 0..n-1 index
        grn_s[column] = grn_s[column].sample(frac=1, random_state=rng).to_numpy()

    pair_columns = ['source', 'target']
    grn_s = grn_s.drop_duplicates(subset=pair_columns).reset_index(drop=True)
    if grn_s.duplicated(subset=pair_columns).any():
        raise ValueError('shuffled network still contains duplicated source-target pairs')
    return grn_s

def mrrmse(x: np.ndarray, y: np.ndarray) -> float:
    """Mean row-wise root-mean-squared error.

    The squared differences are averaged along axis 1, the square root is taken
    per row, and the resulting values are averaged into a single float.
    """
    squared_error = np.square(x - y)
    rowwise_rmse = np.sqrt(np.mean(squared_error, axis=1))
    return float(np.mean(rowwise_rmse))

using device: cuda
-----Seed Set!-----


## Process train/test data

In [2]:
# Choose how the train/test tables are built:
#   True  -> re-split the bulk AnnData (held-out compound x cell-type pairs plus
#            a held-out donor/compound subset)
#   False -> use the prepared differential-expression train/test files
USE_BULK_SPLIT = False

if USE_BULK_SPLIT:
    adata = ad.read_h5ad('../output/preprocess/bulk_adata_f.h5ad')
    hvgs = np.loadtxt('../output/hvgs.txt', dtype=str)

    # .copy() turns the column subset into a standalone object, so the
    # obs['split'] assignment below does not write into a view
    adata = adata[:, adata.var_names.isin(hvgs)].copy()
    # adata.X = adata.layers['X_norm_pearson']
    controls3 = ['Dabrafenib', 'Belinostat', 'Dimethyl Sulfoxide']

    # compound-cell type based split
    sm_names = adata.obs.sm_name.unique()
    non_controls = np.setdiff1d(sm_names, controls3)
    # replace=False: np.random.choice samples WITH replacement by default, which
    # would return duplicated compounds and hold out fewer than requested
    test_sm_names_1 = np.random.choice(non_controls, min(30, len(non_controls)), replace=False)
    test_cell_types = ['B cells', 'Myeloid cells']
    test_mask_1 = adata.obs.sm_name.isin(test_sm_names_1) & adata.obs.cell_type.isin(test_cell_types)  # cell type compound
    print(test_mask_1.sum())

    # donor based split: one random donor, half of the non-control compounds
    # (the control compounds are not added to this list)
    test_donor = np.random.choice(adata.obs.donor_id.unique())
    test_sm_names_2 = list(np.random.choice(non_controls, len(non_controls) // 2, replace=False))
    test_mask_2 = adata.obs.donor_id.eq(test_donor) & adata.obs.sm_name.isin(test_sm_names_2)  # donor
    print(test_mask_2.sum())

    # actual split: a row is a test row when either mask selects it
    features_X = ['sm_name', 'cell_type', 'donor_id']
    adata.obs['split'] = 'train'
    adata.obs.loc[test_mask_1 | test_mask_2, 'split'] = 'test'

    gene_names = adata.var_names
    n_genes = len(gene_names)
    adata_train = adata[adata.obs.split == 'train', :]
    df_train = pd.DataFrame(adata_train.X, columns=adata.var_names, index=pd.MultiIndex.from_frame(adata_train.obs[features_X]))
    adata_test = adata[adata.obs.split == 'test', :]
    df_test = pd.DataFrame(adata_test.X, columns=adata.var_names, index=pd.MultiIndex.from_frame(adata_test.obs[features_X]))
else:
    features_X = ['sm_name', 'cell_type']
    de_train = ad.read_h5ad('../resources/neurips-2023-data/de_train.h5ad')
    de_test = ad.read_h5ad('../resources/neurips-2023-data/de_test.h5ad')

    # use the 'sign_log10_adj_pval' layer as the modelling target
    de_train.X = de_train.layers['sign_log10_adj_pval']
    de_test.X = de_test.layers['sign_log10_adj_pval']

    gene_names = de_train.var_names
    n_genes = len(gene_names)

    # rows are indexed by the feature columns so later cells can read them back
    # with index.get_level_values(...)
    df_train = pd.DataFrame(de_train.X, columns=de_train.var_names, index=pd.MultiIndex.from_frame(de_train.obs[features_X]))
    df_test = pd.DataFrame(de_test.X, columns=de_test.var_names, index=pd.MultiIndex.from_frame(de_test.obs[features_X]))

    

## Process GRN network

In [3]:
# Alternative prior network, kept for reference:
# grn_net_df = pd.read_csv("https://github.com/pablormier/omnipath-static/raw/main/op/collectri-26.09.2023.zip")

grn_net_df = pd.read_csv("../output/benchmark/grn_models/scenicplus.csv")
# Optional control on the edge list:
# grn_net_df =  shuffle_grn(grn_net_df)

# Keep the edges whose target is one of the modelled genes, then pivot the edge
# list into a target x source weight table (pairs without an edge become 0).
grn_net_df = grn_net_df.loc[grn_net_df['target'].isin(gene_names)].reset_index(drop=True)
grn_net = grn_net_df.pivot(index='target', columns='source', values='weight').fillna(0)

# fraction of zero entries in the pivoted table
sparsity = (grn_net.values == 0).sum() / grn_net.size
print('sparsity:', sparsity)

net_genes = grn_net.index.unique()
shared_genes = np.intersect1d(net_genes, gene_names)
missing_genes = np.setdiff1d(gene_names, shared_genes)
tfs_n = len(grn_net.columns.unique())

# Modelled genes that are absent from the network get a random row drawn from
# {0, -1, 1}, with 0 as frequent as in the real table and the remainder split
# evenly between -1 and 1.
ratios = [sparsity, (1 - sparsity) / 2, (1 - sparsity) / 2]
shape = (missing_genes.shape[0], tfs_n)
X_random = np.random.choice([0, -1, 1], size=shape, p=ratios)
random_rows = pd.DataFrame(X_random, columns=grn_net.columns, index=missing_genes)
# concat actual net with random net
grn_net = pd.concat([grn_net, random_rows])

# Put the rows in the same gene order as the expression tables and keep only
# the raw array.
grn_net = grn_net.reindex(gene_names).values

sparsity: 0.9468663664012258


In [52]:
# Permutes the rows of grn_net in place (np.random.shuffle works along the
# first axis), so afterwards the rows no longer follow the order of gene_names.
# Running this cell again shuffles the array again.
# NOTE(review): presumably the "scenicplus (shuffled)" control from the recorded
# results - confirm whether it is meant to run on a plain Run All.
np.random.shuffle(grn_net)

## NN

### Encode data

In [6]:
class MultiOutputTargetEncoder:
    """Leave-one-out target encoding with one encoder per output column.

    ``fit`` trains an independent ``LeaveOneOutEncoder`` for every column of
    ``y``; ``transform`` applies all of them and stacks the results so that each
    sample gets one encoded feature vector per output column.
    """

    def __init__(self):
        # fitted encoders, one per column of y, in column order
        self.encoders: list = []

    @staticmethod
    def new_encoder() -> category_encoders.leave_one_out.LeaveOneOutEncoder:
        """Create an unfitted encoder that returns NumPy arrays rather than DataFrames."""
        return category_encoders.leave_one_out.LeaveOneOutEncoder(return_df=False)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit one encoder per column of ``y``, replacing any earlier fit.

        Args:
            X: Categorical features, one row per sample.
            y: Targets with one column per output; column ``j`` is used to fit
                encoder ``j``.
        """
        self.encoders = []
        for j in tqdm.tqdm(range(y.shape[1]), desc='fit LOO encoders'):
            encoder = MultiOutputTargetEncoder.new_encoder()
            encoder.fit(X, y[:, j])
            self.encoders.append(encoder)

    def transform(self, X: np.ndarray, y: np.ndarray = None) -> np.ndarray:
        """Encode ``X`` with every fitted encoder.

        Args:
            X: Categorical features, one row per sample.
            y: Optional targets with the same column layout as in ``fit``. When
                given, column ``j`` is forwarded to encoder ``j`` so the encoder
                can leave each row's own target out (intended for training
                rows). ``None`` (default) keeps the previous behaviour of
                calling ``encoder.transform(X)`` only.

        Returns:
            Array with the encoder axis moved to position 1, i.e. indexed as
            (sample, output column, feature).
        """
        Z = []
        for j, encoder in enumerate(tqdm.tqdm(self.encoders, desc='transform LOO encoders')):
            target = None if y is None else y[:, j]
            Z.append(encoder.transform(X, target))
        Z = np.asarray(Z)  # stacked as (output column, sample, feature)
        return np.transpose(Z, (1, 0, 2))
# Build the multi-output encoder used by the following cells.
encoder = MultiOutputTargetEncoder()

# Features: one column per entry of features_X, read back from the MultiIndex
# levels of df_train. Targets: the values of df_train, one column each.
encoder.fit(np.asarray([df_train.index.get_level_values(var) for var in features_X]).T, df_train.values)

fit LOO encoders: 100%|██████████| 5317/5317 [00:47<00:00, 111.42it/s]


In [7]:
# Encode the feature columns of the train and test tables with the fitted encoders.
# NOTE(review): transform() is called without targets for the training rows as
# well; presumably this means each row's own target contributes to its
# encoding - confirm against the category_encoders documentation.
X = encoder.transform(np.asarray([df_train.index.get_level_values(var) for var in features_X]).T)
X_submit = encoder.transform(np.asarray([df_test.index.get_level_values(var) for var in features_X]).T)

transform LOO encoders: 100%|██████████| 5317/5317 [00:20<00:00, 253.32it/s]
transform LOO encoders: 100%|██████████| 5317/5317 [00:20<00:00, 261.40it/s]


In [8]:
X.shape

(402, 5317, 2)

### NN design

In [9]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs ``features[idx]`` with ``labels[idx]``."""

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        # the dataset length follows the features container
        return len(self.features)

    def __getitem__(self, idx):
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target
# Training set: encoded features X as inputs, the values of df_train as labels,
# both converted to float32 tensors.
dataset = Dataset(features=torch.tensor(X, dtype=torch.float32), labels=torch.tensor(df_train.values, dtype=torch.float32))

In [10]:
def background_noise(
    *size: int,
    cutoff: float = 0.05,
    device: str = 'cuda',
    generator: torch.Generator = None) -> torch.Tensor:
    """Draw random signed log10 values.

    Each entry is ``sign * log10(cutoff + u * (1 - cutoff))`` with ``u`` uniform
    on [0, 1) and ``sign`` drawn uniformly from {-1, 1}; the magnitude is
    therefore at most ``-log10(cutoff)``.

    Args:
        *size: Shape of the returned tensor.
        cutoff: Lower bound of the value passed to ``log10``.
        device: Device on which the tensor is created.
        generator: Optional generator. It now drives both the sign and the
            magnitude draw; before, the signs always came from the global
            generator, so a seeded generator did not reproduce the output.

    Returns:
        Tensor of shape ``size`` on ``device``.
    """
    sign = 2 * torch.randint(0, 2, size, generator=generator, device=device) - 1
    uniform = torch.rand(*size, generator=generator, device=device)
    magnitude = torch.log10(cutoff + uniform * (1. - cutoff))
    return sign * magnitude

In [11]:
grn_net.shape

(5317, 942)

In [47]:
import torch.nn as nn 
from sklearn.metrics import roc_auc_score

# Switch read by the training loop: when True, scaled background_noise is added
# to the labels of every batch.
antoine_model = False

class Scaler(torch.nn.Module):
    """Learnable affine map ``a * X + b`` with one (a, b) pair per column.

    ``a`` starts at one and ``b`` at zero, so a new instance is the identity.
    """

    def __init__(self, m: int) -> None:
        super().__init__()
        self.m: int = m
        # both parameters have shape (1, m) and broadcast over the rows of X
        self.a: torch.Tensor = torch.nn.Parameter(torch.ones((1, m)))
        self.b: torch.Tensor = torch.nn.Parameter(torch.zeros((1, m)))

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        scaled = self.a * X
        return scaled + self.b

class NN(torch.nn.Module):
    """Per-gene regressor that sees a fixed gene x TF prior network.

    ``forward`` takes encoded features indexed as (sample, gene, feature) and

    1. passes the flattened features of a sample through ``mlp1``, which returns
       a tensor of the same (gene, feature) layout,
    2. concatenates, for every gene, the ``mlp1`` output, the original features
       and that gene's row of the prior ``A``,
    3. applies the small shared ``mlp2`` to each gene vector to obtain one value
       per gene.

    Args:
        n_genes: Number of genes (rows of ``grn_net``).
        grn_net: Prior network, one row per gene and one column per TF. It is
            stored as a fixed, non-trainable buffer ``A``.
        n_nodes_hidden: Width of the two wide hidden layers of ``mlp1``.
        n_features: Number of encoded features per gene. ``None`` (default)
            falls back to ``len(features_X)`` from the notebook namespace, as
            before.
    """

    def __init__(self, n_genes: int, grn_net: np.ndarray, n_nodes_hidden: int = 120, n_features: int = None):
        torch.nn.Module.__init__(self)
        self.n_genes = n_genes
        self.encode = len(features_X) if n_features is None else n_features
        # Registered as a non-persistent buffer: it follows model.to(device)
        # instead of being pinned to 'cuda', is not returned by parameters()
        # and is left out of state_dict().
        self.register_buffer('A', torch.tensor(grn_net, dtype=torch.float32), persistent=False)
        self.n_tfs = self.A.shape[1]

        self.mlp1 = nn.Sequential(
            nn.Linear(n_genes * self.encode, n_nodes_hidden),
            nn.LeakyReLU(0.2),
            nn.Linear(n_nodes_hidden, 16),
            nn.LeakyReLU(0.2),
            # widened back to n_nodes_hidden; this was a literal 120, which only
            # matched the next layer for the default n_nodes_hidden
            nn.Linear(16, n_nodes_hidden),
            nn.LeakyReLU(0.2),
            nn.Linear(n_nodes_hidden, n_genes * self.encode)
        )
        # input per gene: mlp1 output + original features + prior row
        self.mlp2 = nn.Sequential(
            nn.Linear(2 * self.encode + self.n_tfs, 16),
            nn.LeakyReLU(0.2),
            nn.Linear(16, 16),
            nn.LeakyReLU(0.2),
            nn.Linear(16, 1)
        )

    def reparametrize(self, mu, log_var):
        """Sample ``mu + std * eps`` with ``eps ~ N(0, 1)`` and ``std = exp(log_var / 2)``.

        Not called by ``forward``.
        """
        # exp(0.5 * log_var) equals sqrt(exp(log_var)) but avoids overflowing
        # the intermediate exp for large log_var
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(log_var)
        return mu + std * eps

    def forward(self, x: torch.Tensor):
        """Predict one value per gene.

        Args:
            x: Tensor indexed as (sample, gene, feature).

        Returns:
            Tuple ``(prediction, None, None)``; ``prediction`` is indexed as
            (sample, gene). The two ``None`` entries keep the three-value
            return that callers unpack as ``x_pred, mu, log_var``.
        """
        n_samples, n_genes, n_feats = x.shape

        # sample-level pass over all genes at once
        refined = self.mlp1(x.reshape(n_samples, -1)).reshape(n_samples, n_genes, n_feats)

        # the same prior for every sample; expand() avoids materialising copies
        prior = self.A.unsqueeze(0).expand(n_samples, -1, -1)

        # add gene-tf weights to the encode
        per_gene = torch.cat((refined, x, prior), dim=2)
        out = self.mlp2(per_gene.reshape(-1, 2 * n_feats + self.n_tfs))
        return out.reshape(n_samples, n_genes), None, None


In [19]:
# NOTE(review): `model` is only created by the training cell further down, so
# this cell depends on that cell having been run first.
model.A.shape

torch.Size([5317, 76])

###  Train

In [57]:
from torch.utils.data import DataLoader

# Follow the device decision of the setup cell instead of assuming CUDA.
device = 'cuda' if USE_GPU else 'cpu'

# roughly 20 batches per epoch, never less than one sample per batch
batch_size = max(1, int(df_train.shape[0]/20))
n_epoch = 100
n_models = 10  # independently initialised models whose predictions are collected

# summed squared error over all entries of the batch
loss_func = lambda y_pred, y_true: torch.sum(torch.square(y_pred-y_true))

# `models` was appended to without being created, which fails on a fresh kernel
models = []
y_pred_submit_all = []
for i in range(n_models):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=1)
    model = NN(n_genes, grn_net=grn_net)
    model = model.to(device)
    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, eps=1e-8)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.9,
                patience=5, threshold_mode='rel', threshold=0.0001, cooldown=5, min_lr=1e-10, eps=1e-8)

    # One generator per model run. Re-creating it with the same seed inside the
    # batch loop would replay identical random draws for every batch.
    noise_generator = torch.Generator(device=device).manual_seed(32) if antoine_model else None

    pbar = tqdm.tqdm(range(n_epoch))
    for i_epoch in pbar:
        rel_loss_store = []
        Y_pred_stack = []
        Y_true_stack = []
        for data_batch, label_batch in dataloader:

            optimizer.zero_grad()
            data_batch = data_batch.to(device)
            label_batch = label_batch.to(device)

            if antoine_model:
                label_batch = label_batch + 0.2 * background_noise(
                    *label_batch.size(), device=device, generator=noise_generator)  #TODO: optimize the weight

            x_pred, mu, log_var = model(data_batch)  # forward

            Y_true_stack.append(label_batch.cpu().detach().numpy())
            Y_pred_stack.append(x_pred.cpu().detach().numpy())

            loss_x = loss_func(x_pred, label_batch)

            # TODO: the KL term is not part of the loss; the model returns
            # mu/log_var as None, so only the reconstruction error is optimised
            loss = loss_x
            loss.backward()
            optimizer.step()

            # baseline pred: the per-gene mean of the batch labels; the loss is
            # reported relative to this baseline
            with torch.no_grad():
                Y_pred_mean = torch.mean(label_batch, dim=0)
                loss_baseline = loss_func(label_batch, Y_pred_mean)
                rel_loss = loss_x/loss_baseline
            rel_loss_store.append(rel_loss.item())

        mean_rel_loss = np.mean(rel_loss_store)
        scheduler.step(mean_rel_loss)

        y_pred = np.concatenate(Y_pred_stack, axis=0)
        y_true = np.concatenate(Y_true_stack, axis=0)

        r2 = r2_score(y_true, y_pred, multioutput='variance_weighted')

        pbar.set_description(f'Rel loss: {mean_rel_loss:.3f}, R2:{r2:.3f}')
    # predict; no_grad avoids building an autograd graph for inference
    model.eval()
    with torch.no_grad():
        y_pred_submit, mu, log_var = model(torch.tensor(X_submit, dtype=torch.float32, device=device))
    y_pred_submit = y_pred_submit.cpu().numpy()
    y_pred_submit_all.append(y_pred_submit)

    print('r2:', r2_score(df_test, y_pred_submit, multioutput='variance_weighted'))
    models.append(model)

Rel loss: 0.163, R2:0.930: 100%|██████████| 100/100 [00:25<00:00,  3.91it/s]


r2: 0.20594417146551347


Rel loss: 0.315, R2:0.911: 100%|██████████| 100/100 [00:25<00:00,  3.91it/s]


r2: 0.06666554083018042


Rel loss: 0.249, R2:0.929: 100%|██████████| 100/100 [00:25<00:00,  3.93it/s]


r2: 0.08273153517792112


Rel loss: 0.177, R2:0.938: 100%|██████████| 100/100 [00:25<00:00,  3.94it/s]


r2: 0.22634402500679096


Rel loss: 0.139, R2:0.951: 100%|██████████| 100/100 [00:25<00:00,  3.90it/s]


r2: 0.19364236123920395


Rel loss: 0.187, R2:0.954: 100%|██████████| 100/100 [00:25<00:00,  3.93it/s]


r2: 0.2472775487196935


Rel loss: 0.354, R2:0.851: 100%|██████████| 100/100 [00:25<00:00,  3.91it/s]


r2: 0.2251267031293495


Rel loss: 0.114, R2:0.955: 100%|██████████| 100/100 [00:25<00:00,  3.93it/s]


r2: 0.23930045531338306


Rel loss: 0.161, R2:0.893: 100%|██████████| 100/100 [00:25<00:00,  3.96it/s]


r2: 0.2587423359229923


Rel loss: 0.181, R2:0.943: 100%|██████████| 100/100 [00:25<00:00,  3.92it/s]

r2: 0.26904458454488905





- baseline:
r2: 0.28898735855873964
mse: 1.6771364452118458
mrrmse: 0.7764429858370391

- scenicplus:
r2: 0.3008508432154561
mse: 1.6491528604971506
mrrmse: 0.7628936673821836

- scenicplus (shuffled):
r2: 0.2857228691302343
mse: 1.6848367220795772
mrrmse: 0.7788572802129669


### Predict

In [56]:
# Ensemble prediction: element-wise mean over the per-model test predictions
# collected by the training cell, scored against df_test.
n_collected = len(y_pred_submit_all)
print('number of models:', n_collected)

y_pred_submit_mean = np.asarray(y_pred_submit_all).mean(axis=0)
y_true_submit = df_test.values

print('r2:', r2_score(df_test, y_pred_submit_mean, multioutput='variance_weighted'))
print('mse:', mean_squared_error(y_pred_submit_mean, y_true_submit))
print('mrrmse:', mrrmse(y_pred_submit_mean, y_true_submit))

number of models: 10
r2: 0.29235347369238274
mse: 1.669196453655622
mrrmse: 0.7747873985104607


In [None]:
r2: 0.24227989212421114
mse: 1.7873100056454099
mrrmse: 0.8062594076816958

## Baseline RF

In [None]:
# NOTE(review): `aaa` is not defined anywhere in the visible notebook, so this
# cell raises NameError and stops a Run All here - presumably a deliberate
# manual stop before the baseline cells; confirm.
aaa

In [None]:

from sklearn.decomposition import TruncatedSVD, KernelPCA    
from sklearn.ensemble import  RandomForestRegressor


def encode(df_train, df_test, feature_space):
    """Embed each categorical feature as the mean projection of its training rows.

    For every feature in ``feature_space`` the rows of ``df_train`` are
    projected with a linear KernelPCA (at most 50 components, fewer when the
    feature has fewer distinct values) and averaged per feature value. Each row
    of ``df_train`` / ``df_test`` is then represented by the concatenated
    embeddings of its feature values.

    Reads the module-level ``random_state`` at call time.

    Args:
        df_train: Training table whose index carries one level per feature.
        df_test: Test table; only its index levels are used.
        feature_space: Names of the index levels to encode.

    Returns:
        Tuple ``(X, X_submit)`` of arrays with one row per row of ``df_train``
        and ``df_test`` respectively. Empty lists when ``feature_space`` is
        empty.

    Raises:
        KeyError: If ``df_test`` contains a feature value without an embedding.
    """
    if len(feature_space) == 0:
        return [], []

    # encode each factor
    x_encoded_dict = {}
    for feature in feature_space:
        index = df_train.index.get_level_values(feature)
        n_com = min([len(index.unique()), 50])
        var_x = KernelPCA(n_components=n_com, kernel='linear', random_state=random_state).fit_transform(df_train)
        x_encoded = pd.DataFrame(var_x, index=index).reset_index()
        # observed=True: for categorical levels only the values that occur in
        # the training rows get an embedding row (no all-NaN rows), and the
        # pandas warning about the changing default is avoided
        x_encoded_dict[feature] = x_encoded.groupby(feature, observed=True).mean()

    def _lookup(df):
        # one label-based lookup per feature instead of one .loc call per row
        blocks = [
            x_encoded_dict[feature].loc[list(df.index.get_level_values(feature))].values
            for feature in feature_space
        ]
        return np.concatenate(blocks, axis=1)

    # create X and X_submit
    return _lookup(df_train), _lookup(df_test)


# random_state is read as a global inside encode() and also seeds the forest
# and the SVD below; n_components is the size of the reduced target space.
random_state = 32
n_components = 50
# Per-feature embeddings computed from the training table, looked up for the
# training and test rows.
X_rf, X_submit_rf = encode(df_train, df_test, features_X)
emb_model = RandomForestRegressor(n_estimators=100, random_state=random_state)
reducer = TruncatedSVD(n_components=n_components, n_iter=12, random_state=random_state)
# Compress the training targets; the forest is fitted on this reduced representation.
Y = reducer.fit_transform(df_train)

emb_model.fit(X_rf, Y)
# Predict in the reduced space and map the prediction back to the original columns.
y_pred_submit = reducer.inverse_transform(emb_model.predict(X_submit_rf))

print('r2:', r2_score(df_test, y_pred_submit, multioutput='variance_weighted'))
print('mse:', mean_squared_error(y_pred_submit, df_test.values))
print('mrrmse:', mrrmse(y_pred_submit, df_test.values))

In [None]:
r2: 0.24227989212421114
mse: 1.7873100056454099
mrrmse: 0.8062594076816958

# TF activity encoding 

In [15]:

from sklearn.decomposition import TruncatedSVD, KernelPCA    
from sklearn.ensemble import  RandomForestRegressor


def encode(df_train, df_test, feature_space):
    """Embed each categorical feature as the mean projection of its training rows.

    For every feature in ``feature_space`` the rows of ``df_train`` are
    projected with a linear KernelPCA (at most 50 components, fewer when the
    feature has fewer distinct values) and averaged per feature value. Each row
    of ``df_train`` / ``df_test`` is then represented by the concatenated
    embeddings of its feature values.

    Reads the module-level ``random_state`` at call time.

    Args:
        df_train: Training table whose index carries one level per feature.
        df_test: Test table; only its index levels are used.
        feature_space: Names of the index levels to encode.

    Returns:
        Tuple ``(X, X_submit)`` of arrays with one row per row of ``df_train``
        and ``df_test`` respectively. Empty lists when ``feature_space`` is
        empty.

    Raises:
        KeyError: If ``df_test`` contains a feature value without an embedding.
    """
    if len(feature_space) == 0:
        return [], []

    # encode each factor
    x_encoded_dict = {}
    for feature in feature_space:
        index = df_train.index.get_level_values(feature)
        n_com = min([len(index.unique()), 50])
        var_x = KernelPCA(n_components=n_com, kernel='linear', random_state=random_state).fit_transform(df_train)
        x_encoded = pd.DataFrame(var_x, index=index).reset_index()
        # observed=True: for categorical levels only the values that occur in
        # the training rows get an embedding row (no all-NaN rows), and the
        # pandas warning about the changing default is avoided
        x_encoded_dict[feature] = x_encoded.groupby(feature, observed=True).mean()

    def _lookup(df):
        # one label-based lookup per feature instead of one .loc call per row
        blocks = [
            x_encoded_dict[feature].loc[list(df.index.get_level_values(feature))].values
            for feature in feature_space
        ]
        return np.concatenate(blocks, axis=1)

    # create X and X_submit
    return _lookup(df_train), _lookup(df_test)


# random_state is read as a global inside encode() and also seeds the forest
# and the SVD below; n_components is the size of the reduced target space.
random_state = 32
n_components = 50

# Work on a copy so the notebook-level grn_net is left untouched, then permute
# its rows in place (np.random.shuffle works along the first axis).
# NOTE(review): with the shuffle active the TF activities are computed from a
# row-permuted network - presumably a control; confirm it should stay enabled.
grn_net_shuffled = grn_net.copy()
np.random.shuffle(grn_net_shuffled)
# Project the training values onto the network columns: rows of df_train times
# the (gene x TF) matrix gives one value per TF for every training row.
tf_activities = df_train.values.dot(grn_net_shuffled)
tf_activities = pd.DataFrame(tf_activities, index=df_train.index)

# Same encoding as the baseline, but fitted on the TF activities instead of df_train.
X_rf, X_submit_rf = encode(tf_activities, df_test, features_X)

emb_model = RandomForestRegressor(n_estimators=100, random_state=random_state)
reducer = TruncatedSVD(n_components=n_components, n_iter=12, random_state=random_state)
# Compress the training targets; the forest is fitted on this reduced representation.
Y = reducer.fit_transform(df_train)

emb_model.fit(X_rf, Y)
# Predict in the reduced space and map the prediction back to the original columns.
y_pred_submit = reducer.inverse_transform(emb_model.predict(X_submit_rf))

print('r2:', r2_score(df_test, y_pred_submit, multioutput='variance_weighted'))
print('mse:', mean_squared_error(y_pred_submit, df_test.values))
print('mrrmse:', mrrmse(y_pred_submit, df_test.values))

  x_encoded = x_encoded.groupby(feature).mean()
  x_encoded = x_encoded.groupby(feature).mean()


r2: 0.29187043844764465
mse: 1.6703358370730952
mrrmse: 0.7474491838704398
