In [3]:
import os
import gzip
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import matplotlib.pyplot as plt
import tqdm
import scipy
import decoupler as dc
import seaborn as sns
import json
from scipy.sparse import csr_matrix
from venn import venn # https://github.com/LankyCyril/pyvenn/blob/master/pyvenn-demo.ipynb

surragate_names = {'collectRI':'CollectRI', 'collectRI_sign':'CollectRI-signs',
                   'sp_grn':'Scenic+', 'sp_ct':'Scenic+', 'sp_grn_sign': 'Scenic+-signs',
                   'co_grn':'CellOracle', 'co_ct':'CellOracle', 'co_grn_sign':'CellOracle-signs',
                   'figr_grn':'FigR', 'figr_ct':'FigR', 'figr_grn_sign':'FigR-signs',
                   'baseline':'Baseline'
                   }

%matplotlib inline
work_dir = '../output'
kaggle_data_dir = '../input/kaggle/input'
os.makedirs(f'{work_dir}', exist_ok=True)

train_cell_types = ['T cells CD4+', 'NK cells', 'T regulatory cells', 'T cells CD8+']
test_celltypes = ['B cells', 'Myeloid cells'] 
agg_type = ['T regulatory cells', 'T cells CD8+', 'T cells CD4+']
all_cell_types = ['T cells CD4+', 'NK cells', 'T regulatory cells', 'T cells CD8+', 'B cells', 'Myeloid cells']

def calculate_p_values(genes, df):
    '''
    We conduct a 1-sample Kolmogorov-Smirnov (KS) test to detemine p-values for each given gene versus a uniform set.
    Taken from Antoine Passiemier.
    '''
    p_values = []
    for k, gene in tqdm.tqdm(enumerate(genes)):  # Perform test for each gene
        v = df.iloc[:, k].values
        v_valid = v[~np.isnan(v)]
        if len(v_valid)<10:
            raise ValueError('shouldnt be')
        else:
            v_valid = 10 ** (-np.abs(v_valid))  # Transform DE values to p-values (assumed to be uniformly-distributed)
            res = scipy.stats.mstats.ks_1samp(v_valid, scipy.stats.uniform.cdf)  # Kolmogorov-Smirnov test
            p_value = res[1]
        p_values.append(p_value)
    p_values = np.asarray(p_values)
    return p_values



In [4]:
de_train = pd.read_parquet(f'{kaggle_data_dir}/open-problems-single-cell-perturbations/de_train.parquet')
data_df = de_train.iloc[:,5:]
data_df['cell_type'] = de_train['cell_type']
data_df['sm_name'] = de_train['sm_name']
data_df = data_df.set_index(['cell_type','sm_name'])
data = data_df.values

# Regression analysis 

In [7]:
original_data = True

cv_scheme = 'sm_name_10' # 'cell_type', 'sm_name', '10cv', 'sm_name_10'
metric_type = 'r2'

## Training data

In [8]:
if original_data:
    df_main_reg = pd.read_csv('../output/postprocess/EDA/data_df.csv', index_col=0).set_index(['cell_type', 'sm_name'])
else:
    df_main_reg = pd.read_csv('../output/postprocess/EDA/data_df_f.csv', index_col=0).set_index(['cell_type', 'sm_name'])
real_values_mask = ~df_main_reg.isna()
print('ratio of sparsity:', 1-real_values_mask.sum().sum()/real_values_mask.size)
sig_mask = sig_mask = 10 ** (-np.abs(df_main_reg)) < 0.05
print('ratio of sig:', sig_mask.sum().sum()/sig_mask.size)
# set those nans to zero
df_main_reg.isna().sum().sum()/df_main_reg.size
df_main_reg.fillna(0, inplace=True)

ratio of sparsity: 0.0
ratio of sig: 0.12872298430075105


## Encoder model and enrichment scheme 

In [10]:
def enrich_tfs_single(df_main_c, net):
    tf_act, p_values = dc.run_ulm(
                df_main_c,
                net,
                source='source',
                target='target',
                weight='weight',
                verbose=True
                )
    # tf_act = tf_act.set_index(df_main.index, drop=True)
    return tf_act
def enrich_tfs(df_main, net):
    "TF enrichment score cell specific grns"
    df_main_c = df_main.copy()
    df_main_c.index = df_main.index.map(lambda x: '@'.join(map(str, x))) # this is needed for enrichment analyisis
    if 'cell_type' in net:
        cell_type_index = df_main.index.get_level_values('cell_type')
        # EA for each cell type
        tf_act_stack = []
        for cell_type in net.cell_type.unique():
            if cell_type == 'agg_type':
                mask = cell_type_index.isin(agg_type)
            else:
                mask = cell_type_index==cell_type
            df_main_celltype = df_main_c[mask]

            net_celltype = net[net.cell_type==cell_type]
            tf_act = enrich_tfs_single(df_main_celltype, net_celltype)
            tf_act_stack.append(tf_act)
        tf_act_df = pd.concat(tf_act_stack).fillna(0)
        tf_act_df = tf_act_df.reindex(df_main_c.index)
            
    else:
        tf_act_df = enrich_tfs_single(df_main_c, net)
    return tf_act_df


In [16]:
import category_encoders as ce
from sklearn.ensemble import RandomForestRegressor
import decoupler as dc
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split, LeaveOneGroupOut, KFold

random_state = 32
id_map = pd.read_csv(f'{kaggle_data_dir}/open-problems-single-cell-perturbations/id_map.csv', index_col=0)

class model_encoder:
    def __init__(self, df_main, n_components=35, shares={
                                        'sm_name':{'de_x':35},
                                        'cell_type': {'de_x':6}
                                        },
                                        te_params = dict(min_samples_leaf=20, smoothing=100),
                                        grn_model=None
                                        ):  
        # target df to encode
        self.df_main = df_main
        self.grn_model = grn_model
        # regressor
        self.emb_model = RandomForestRegressor(n_estimators=100, random_state=random_state)
        # reducer
        self.reducer = TruncatedSVD(n_components=n_components, n_iter=12, random_state=random_state)
        self.Y = self.reducer.fit_transform(df_main)
        # encoder 
        self.enc = ce.TargetEncoder(**te_params) #[0, 0.05, 0.5, 0.1,0.9,10]
        # encode data
        var_x_name = list(shares['sm_name'].keys())[0]
        n_components = shares['sm_name'][var_x_name]
        y_sm_name = self.create_feature_space(self.df_main, var_x=var_x_name, n=n_components)
        
        # cell type
        var_x_name = list(shares['cell_type'].keys())[0]
        n_components = shares['cell_type'][var_x_name]
        y_cell_type = self.create_feature_space(self.df_main, var_x=var_x_name, n=n_components)
        # the order is important
        self.emb_data = {'sm_name':y_sm_name, 'cell_type': y_cell_type}
        self.determine_X()
        print(self.X.shape, self.X_submit.shape)

    def determine_X(self):
        for i, (name, feature_x) in enumerate(self.emb_data.items()):
            for i_target in tqdm.tqdm(range(feature_x.shape[1])):
                if i_target == 0:
                    # print(feature_x[:,i_target])
                    X_encoded = self.enc.fit_transform(self.df_main.reset_index()[name], feature_x[:,i_target])
                    X_submit_encoded = self.enc.transform(id_map.reset_index()[name])
                else:
                    X_encoded_tmp = self.enc.fit_transform(self.df_main.reset_index()[name], feature_x[:,i_target])
                    X_encoded = np.concatenate( [X_encoded, X_encoded_tmp], axis = 1)
                    X_encoded_tmp = self.enc.transform(id_map.reset_index()[name])
                    X_submit_encoded = np.concatenate([X_submit_encoded, X_encoded_tmp], axis = 1)
            if i == 0:
                X = X_encoded
                X_submit = X_submit_encoded
            else:
                X = np.concatenate([X, X_encoded], axis = 1)
                X_submit = np.concatenate([X_submit, X_submit_encoded], axis = 1)

        self.X = X
        self.X_submit = X_submit
    def create_feature_space(self, df_to_encode, var_x='sm_name', n=35):
        if var_x == 'de_x':
            return TruncatedSVD(n_components=n, n_iter=12, random_state=random_state).fit_transform(df_to_encode.values)

        elif var_x == 'tf_x':
            df_main_c = df_to_encode.copy()
            net = self.grn_model
            tf_act = enrich_tfs(df_main_c, net)

            # tf_act = TruncatedSVD(n_components=n, n_iter=12, random_state=random_state).fit_transform(tf_act)
            tf_act = tf_act.values
            return tf_act
        raise ValueError('define feature space')
    def validate(self, mask_tr, mask_va):
        # print(mask_tr)
        X_tr, Y_tr = self.X[mask_tr,:], self.Y[mask_tr,:]
        X_va = self.X[mask_va,:]
        self.emb_model.fit(X_tr, Y_tr)
        return self.reducer.inverse_transform(self.emb_model.predict(X_va))
    def calculate_y_submit(self):
        self.emb_model.fit(self.X, self.Y)
        return self.reducer.inverse_transform(self.emb_model.predict(self.X_submit))


## GRN models

### Load inferred GRNs

In [11]:
grn_model_names = ["collectRI", "figr_grn", "co_grn", "sp_grn"]
# grn_model_names = ["sp_grn"]
grn_models_dict = {}
for name in grn_model_names:
    grn_models_dict[name] = pd.read_csv(f'../output/postprocess/grn_models/{name}.csv', index_col=0)
grn_models_signs_dict = {}
for name in grn_model_names:
    name = f'{name}_sign'
    grn_models_signs_dict[name] = pd.read_csv(f'../output/postprocess/grn_models/{name}.csv', index_col=0)

In [12]:
grn_models_shuffled_dict = {}
for name, grn in grn_models_dict.items():
    grn_s = grn.copy()
    # grn_s['source'] = grn_s['source'].sample(frac=1).reset_index(drop=True)
    grn_s['target'] = grn_s['target'].sample(frac=1).reset_index(drop=True)

    dup_flags = grn_s[['source','target']].duplicated()
    grn_s = grn_s[~dup_flags].reset_index(drop=True)
    if grn_s.duplicated().sum()>0:
        raise ValueError('')
    name = f'{name}_shuffled'
    grn_models_shuffled_dict[name] = grn_s

In [13]:
# grn_models_all_dict = grn_models_dict | grn_models_signs_dict 
# grn_models_all_dict = grn_models_signs_dict | grn_models_shuffled_dict
grn_models_all_dict = grn_models_dict | grn_models_signs_dict | grn_models_shuffled_dict
# del grn_models_all_dict['collectRI_sign']
grn_models_all_dict.keys()

dict_keys(['collectRI', 'figr_grn', 'co_grn', 'sp_grn', 'collectRI_sign', 'figr_grn_sign', 'co_grn_sign', 'sp_grn_sign', 'collectRI_shuffled', 'figr_grn_shuffled', 'co_grn_shuffled', 'sp_grn_shuffled'])

## Number of SVDs

In [14]:
from sklearn.decomposition import PCA, TruncatedSVD

def func_svds(df, net, explained_variance_t=0.9): 
    """Determine number of SVDs to explain 90% of varaince"""
    df_main_c = df.copy()
    tf_act = enrich_tfs(df_main_c, net)
    # print(tf_act.iloc[0:3,:5])
    n_components = min([500, tf_act.shape[1]])
    # reducer = TruncatedSVD(n_components=n_components, n_iter=10, random_state=32)
    reducer = PCA(n_components=n_components, random_state=32) 
    reducer.fit(tf_act)
    variance_explained = reducer.explained_variance_ratio_.cumsum()
    n_components = sum(variance_explained < explained_variance_t) + 1  # Number of components to capture 90% variance
    print(f"Number of components to retain {explained_variance_t} of the variance: {n_components}")
    return n_components
n_components_dict = {}
for name, grn in grn_models_all_dict.items():
    print('----',name)
    n_components_dict[name] = func_svds(df_main_reg, grn)

---- collectRI
Running ulm on mat with 614 samples and 18211 targets for 632 sources.
Number of components to retain 0.9 of the variance: 132
---- figr_grn
Running ulm on mat with 614 samples and 18211 targets for 727 sources.
Number of components to retain 0.9 of the variance: 246
---- co_grn
Running ulm on mat with 17 samples and 18211 targets for 504 sources.
Running ulm on mat with 17 samples and 18211 targets for 520 sources.
Running ulm on mat with 146 samples and 18211 targets for 503 sources.
Running ulm on mat with 434 samples and 18211 targets for 479 sources.
Number of components to retain 0.9 of the variance: 7
---- sp_grn


KeyboardInterrupt: 

## Build regression models

In [17]:
# create encoding models 
enc_models = {}
for name, grn in grn_models_all_dict.items():
    
    enc_models[name] = model_encoder(df_main_reg, grn_model=grn, shares={
                                        'sm_name':{'tf_x':None},
                                        'cell_type': {'tf_x':None}
                                        })
    # ababa

Running ulm on mat with 614 samples and 18211 targets for 632 sources.
Running ulm on mat with 614 samples and 18211 targets for 632 sources.


100%|██████████| 632/632 [00:12<00:00, 48.63it/s]
100%|██████████| 632/632 [00:13<00:00, 47.67it/s]


(614, 1264) (255, 1264)
Running ulm on mat with 614 samples and 18211 targets for 727 sources.
Running ulm on mat with 614 samples and 18211 targets for 727 sources.


100%|██████████| 727/727 [00:15<00:00, 45.89it/s]
 27%|██▋       | 199/727 [00:04<00:11, 47.12it/s]


KeyboardInterrupt: 

In [36]:
## add baseline model. for subset gene study, we create one baseline per grn because the number of target genes are different from one grn to another
enc_models['baseline'] = model_encoder(df_main_reg, shares={
                                            'sm_name':{'de_x':35},
                                            'cell_type': {'de_x':6}
                                            })

100%|██████████| 35/35 [00:00<00:00, 40.17it/s]
100%|██████████| 6/6 [00:00<00:00, 46.49it/s]

(614, 41) (255, 41)





## Kaggle score
We can only submit if they are build on original data

In [136]:
from kaggle.api.kaggle_api_extended import KaggleApi
os.environ['KAGGLE_USERNAME'] = 'jalilnourisa'
os.environ['KAGGLE_KEY'] = '63552f12403af36f40106e6821e80327'
api = KaggleApi()
api.authenticate() 
prefix = 'tfactivity_shuffled'    # group of runs. set this to something that tags your experiemnt 


In [137]:
gene_names = df_main_reg.columns

os.makedirs('../output/submits/', exist_ok=True)

def format_y_submit(Y_submit):
    y_submit_df = pd.DataFrame(Y_submit, columns=gene_names)
    y_submit_df.index.name = 'id'
    y_submit_df = y_submit_df.round(5)
    return y_submit_df
def write_submit(Y_submit_df, file_name):
    Y_submit_df.to_csv(f'../output/submits/{file_name}.csv')
def submit(file_name):
    filename = f"../output/submits/{file_name}.csv"
    competition = "open-problems-single-cell-perturbations"
    api.competition_submit(file_name=filename, message=file_name, competition=competition)

for name, model in enc_models.items():
    # if name in ['collectRI','baseline', 'collectRI_sign']:
    #     continue
    file_name = f'{prefix}_{name}'
    print(name)
    y_submit = model.calculate_y_submit()
    y_submit_df = format_y_submit(y_submit)
    write_submit(y_submit_df, file_name)
    submit(file_name)

collectRI_shuffled


 33%|███▎      | 12.3M/36.7M [00:18<00:27, 944kB/s]   

100%|██████████| 36.7M/36.7M [00:48<00:00, 791kB/s]


figr_grn_shuffled


100%|██████████| 36.6M/36.6M [00:48<00:00, 791kB/s]   


co_grn_shuffled


100%|██████████| 36.7M/36.7M [00:48<00:00, 791kB/s]   


sp_grn_shuffled


100%|██████████| 36.8M/36.8M [00:48<00:00, 789kB/s]   


### kaggle scores

In [193]:
def get_kaggle_scores(prefix):
    submissions = api.competition_submissions("open-problems-single-cell-perturbations")
    kaggle_scores_dict = {}
    for submission in submissions:
        kaggle_scores_dict[submission.fileName.replace('.csv', '')] = [submission.publicScore,  submission.privateScore]
    kaggle_scores_dict = {key:values for key,values in kaggle_scores_dict.items() if (prefix in key)}
    kaggle_scores_df = pd.DataFrame(kaggle_scores_dict.values(), index=kaggle_scores_dict.keys(), columns=['public_test', 'private_test']).reset_index().rename(columns={'index':'grn_model'})
    kaggle_scores_df.grn_model = kaggle_scores_df.grn_model.str.replace(f'{prefix}_','')
    return kaggle_scores_df

In [139]:
kaggle_scores_df = get_kaggle_scores(prefix)

Unnamed: 0,grn_model,public_test,private_test
0,sp_grn_shuffled,0.613,0.791
1,co_grn_shuffled,0.597,0.774
2,figr_grn_shuffled,0.59,0.789
3,collectRI_shuffled,0.598,0.768


In [126]:
kaggle_scores_df

Unnamed: 0,grn_model,public_test,private_test
0,sp_grn_sign,0.614,0.77
1,co_grn_sign,0.602,0.774
2,figr_grn_sign,0.597,0.767
3,sp_grn,0.599,0.788
4,co_grn,0.594,0.789
5,figr_grn,0.595,0.766
6,collectRI,0.602,0.769


In [None]:
aaaaa

## Cross validation

In [166]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
if metric_type=='MSE':
    error_metric = lambda y_pred, y_true: mean_absolute_error(y_pred, y_true)
elif metric_type=='r2':
    error_metric = lambda y_pred, y_true: r2_score(y_pred, y_true)
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import r2_score
# define groups 
if cv_scheme=='sm_name': # cluster of sm_name
    sm_names = df_main_reg.index.get_level_values('sm_name')
    unique_sm_names = sm_names.unique()
    group_assignments = range(len(unique_sm_names))
    group_dict = dict(zip(unique_sm_names, group_assignments))
    groups = sm_names.map(group_dict)
elif cv_scheme=='sm_name_10': # cluster of sm_name
    n = 10
    sm_names = df_main_reg.index.get_level_values('sm_name')
    unique_sm_names = sm_names.unique().values
    group_assignments = range(n)
    np.random.shuffle(unique_sm_names)
    group_dict = {}
    for i, sm_name in enumerate(unique_sm_names):
        group_dict[sm_name] = i%n
    groups = sm_names.map(group_dict)
elif cv_scheme=='cell_type': # one group for each trainig cell_type
    cell_types = df_main_reg.index.get_level_values('cell_type')
    train_cell_types = ['NK cells', 'T cells CD4+', 'T cells CD8+', 'T regulatory cells']
    group_assignments = range(len(train_cell_types))
    group_dict = dict(zip(train_cell_types, group_assignments))
    groups = cell_types.map(group_dict)
elif cv_scheme=='10cv':
    num_groups = 10
    group_assignments = range(num_groups)
    group_size = len(df_main_reg) // num_groups
    groups = np.repeat(np.arange(num_groups), group_size)
    if len(df_main_reg) % num_groups != 0:
        groups = np.concatenate((groups, np.arange(len(df_main_reg) % num_groups)))
    np.random.shuffle(groups)
else:
    raise ValueError('define')

def compute_cv_raw(model, df_main_reg):
    y_true_list = []
    y_pred_list = [] 
    for group in tqdm.tqdm(group_assignments):
        mask_va = groups==group
        mask_tr = groups!=group
        y_true = df_main_reg[mask_va]
        y_pred = pd.DataFrame(model.validate(mask_tr, mask_va), index=y_true.index, columns=y_true.columns)
        # evaluate only those that are sig and also non imputed
        real_values = real_values_mask[mask_va] 
        
        group_mask = real_values

        y_true = y_true[group_mask]
        y_pred = y_pred[group_mask]
        
        y_pred = y_pred.reset_index()
        y_pred['group'] = group
        y_pred_list.append(y_pred)

        y_true = y_true.reset_index()
        y_true['group'] = group
        y_true_list.append(y_true)
    y_pred_df = pd.concat(y_pred_list).reset_index(drop=True)
    y_true_df = pd.concat(y_true_list).reset_index(drop=True)
    return y_pred_df, y_true_df


### actual run

In [154]:
import json
fresh_start = True

# fresh start or load the cv results partly done
cv_scores_dict = {}
if fresh_start: #fresh start
    cv_scores_genes_dict = {}
else: # warm start
    with open(f'../output/postprocess/CV/cv_scores_genes_dict_{original_data}_{only_sig_de}_{subset_genes}.json', 'r') as file:
        cv_scores_genes_dict = json.load(file)
# del cv_scores_genes_dict['collectRI_figr_union_grn']
for name, model in enc_models.items():
    if name in list(cv_scores_genes_dict.keys()):
        continue
    elif name in ['collectRI_sign']:
        continue
    print('----',name)
    
    ## --- calculate mean score 
    # get y pred and y true for all cv groups
    y_pred_df, y_true_df = compute_cv_raw(model, df_main_reg)
    # melt them into arrays
    y_pred_values = y_pred_df[df_main_reg.columns].melt().value.values
    y_true_values = y_true_df[df_main_reg.columns].melt().value.values
    # nan check
    nan_mask = np.isnan(y_true_values)
    y_true_values = y_true_values[~nan_mask]
    y_pred_values = y_pred_values[~nan_mask]
    # zero check
    zero_mask = y_true_values==0
    y_true_values = y_true_values[~zero_mask]
    y_pred_values = y_pred_values[~zero_mask]
    cv_score = error_metric(y_true_values, y_pred_values)
    print(cv_score)
    cv_scores_dict[name] = cv_score
    
    ## --- calculate gene wise score
    r2_scores_genes = []
    for gene in df_main_reg.columns:
        y_true = y_true_df[gene].values
        y_pred = y_pred_df[gene].values

        nan_mask = np.isnan(y_true)

        y_true = y_true[~nan_mask]
        y_pred = y_pred[~nan_mask]

        zero_mask = y_true==0

        y_true = y_true[~zero_mask]
        y_pred = y_pred[~zero_mask]
        if len(y_true)<5:
            # print(gene, 'insufficient samples')
            r2_score_case = np.nan
        else:
            y_mean = y_true.mean()
            
            total_sum_of_squares = ((y_true - y_mean) ** 2).sum()
            sum_of_squares_of_residuals = ((y_true - y_pred) ** 2).sum()
            
            r2_score_case = 1 - (sum_of_squares_of_residuals / total_sum_of_squares)
        r2_scores_genes.append(r2_score_case)
    r2_scores_genes = np.asarray(r2_scores_genes)
    cv_scores_genes_dict[name] = list(r2_scores_genes)

    with open(f'../output/postprocess/CV/cv_scores_genes_dict_{original_data}.json', 'w') as file:
        json.dump(cv_scores_genes_dict, file)


---- collectRI


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [04:09<00:00, 24.91s/it]


0.29847007933264735
---- figr_grn


100%|██████████| 10/10 [04:55<00:00, 29.52s/it]


0.22158774519683688
---- co_grn


100%|██████████| 10/10 [03:25<00:00, 20.54s/it]


0.30067948130563227
---- sp_grn


100%|██████████| 10/10 [00:32<00:00,  3.23s/it]


0.280540622049124
---- figr_grn_sign


100%|██████████| 10/10 [05:02<00:00, 30.28s/it]


0.20600421980247985
---- co_grn_sign


100%|██████████| 10/10 [04:13<00:00, 25.32s/it]


0.36731856616681713
---- sp_grn_sign


100%|██████████| 10/10 [00:40<00:00,  4.01s/it]


0.22382029724542352
---- collectRI_shuffled


100%|██████████| 10/10 [05:26<00:00, 32.67s/it]


0.25812516530071417
---- figr_grn_shuffled


100%|██████████| 10/10 [06:14<00:00, 37.42s/it]


0.19975416992511041
---- co_grn_shuffled


100%|██████████| 10/10 [04:15<00:00, 25.56s/it]


0.3488416282450282
---- sp_grn_shuffled


100%|██████████| 10/10 [00:45<00:00,  4.55s/it]


0.18264536141947096


In [156]:
cv_scores_dict

{'collectRI': 0.29847007933264735,
 'figr_grn': 0.22158774519683688,
 'co_grn': 0.30067948130563227,
 'sp_grn': 0.280540622049124,
 'figr_grn_sign': 0.20600421980247985,
 'co_grn_sign': 0.36731856616681713,
 'sp_grn_sign': 0.22382029724542352,
 'collectRI_shuffled': 0.25812516530071417,
 'figr_grn_shuffled': 0.19975416992511041,
 'co_grn_shuffled': 0.3488416282450282,
 'sp_grn_shuffled': 0.18264536141947096}

## Robustness analysis

### Random baseline

In [168]:
# pool the predictions
y_submits = []
for name, enc_model in tqdm.tqdm(enc_models.items()):
    y_submits.append(enc_model.calculate_y_submit())
pool_ = np.concatenate([np.ndarray.flatten(y_submit) for y_submit in y_submits])


In [183]:
y_submit_format = format_y_submit(y_submits[0])
y_submits_random = []
for i in range(100):
    y_submits_random.append(format_y_submit(np.random.choice(pool_, size=y_submit_format.shape)))

In [185]:
from kaggle.api.kaggle_api_extended import KaggleApi
os.environ['KAGGLE_USERNAME'] = 'jalilnourisa'
os.environ['KAGGLE_KEY'] = '63552f12403af36f40106e6821e80327'
api = KaggleApi()
api.authenticate()
prefix = 'random'    

for i, y_submit_df in enumerate(y_submits_random):
    if i < 17:
        continue
    file_name = f'{prefix}_{i}'
    write_submit(y_submit_df, file_name)
    submit(file_name)



100%|██████████| 36.7M/36.7M [01:01<00:00, 624kB/s] 
100%|██████████| 36.7M/36.7M [00:55<00:00, 690kB/s] 
100%|██████████| 36.7M/36.7M [01:18<00:00, 488kB/s] 
100%|██████████| 36.7M/36.7M [00:58<00:00, 658kB/s] 
100%|██████████| 36.7M/36.7M [00:57<00:00, 670kB/s] 
100%|██████████| 36.7M/36.7M [01:10<00:00, 547kB/s] 
100%|██████████| 36.7M/36.7M [01:02<00:00, 615kB/s] 
100%|██████████| 36.7M/36.7M [00:56<00:00, 687kB/s] 
100%|██████████| 36.7M/36.7M [00:54<00:00, 703kB/s] 
100%|██████████| 36.7M/36.7M [00:59<00:00, 643kB/s] 
100%|██████████| 36.7M/36.7M [00:57<00:00, 673kB/s] 
100%|██████████| 36.7M/36.7M [01:14<00:00, 520kB/s] 
100%|██████████| 36.7M/36.7M [01:15<00:00, 509kB/s] 
100%|██████████| 36.7M/36.7M [01:30<00:00, 426kB/s] 
100%|██████████| 36.7M/36.7M [01:22<00:00, 465kB/s]   
100%|██████████| 36.7M/36.7M [01:04<00:00, 600kB/s]   
100%|██████████| 36.7M/36.7M [01:03<00:00, 611kB/s]   


KeyboardInterrupt: 

In [189]:
kaggle_scores_df

Unnamed: 0,grn_model,public_test,private_test
0,16,,
1,15,0.901,1.289
2,14,0.898,1.291
3,13,0.895,1.286
4,12,0.897,1.286
5,11,0.897,1.286
6,10,0.899,1.289
7,9,0.897,1.286
8,8,0.898,1.287
9,7,0.897,1.282


### Shuffle grn 

In [206]:
prefix = 'shuffletest'
for name, grn in zip(['co_grn'], [co_grn]):
    file_name = f'{prefix}_{name}'
    print(file_name)
    y_submit = enc_model.calculate_y_submit()
    y_submit_df = format_y_submit(y_submit)
    write_submit(y_submit_df, file_name)
    submit(file_name)

    for i in range(0,20):
        grn_s = grn.copy()
        grn_s['source'] = grn_s['source'].sample(frac=1).reset_index(drop=True)
        grn_s['target'] = grn_s['target'].sample(frac=1).reset_index(drop=True)

        dup_flags = grn_s[['source','target']].duplicated()
        grn_s = grn_s[~dup_flags].reset_index(drop=True)
        if grn_s.duplicated().sum()>0:
            raise ValueError('')
        #-- create the model
        sm_name_svd_n = 35
        celltype_svd_n = 6

        enc_model = model_encoder(df_main_reg, grn_model=grn_s, shares={
                                            'sm_name':{'tf_x':sm_name_svd_n},
                                            'cell_type': {'tf_x':celltype_svd_n}
                                            })
        file_name = f'{prefix}_{name}_{i}'
        print(file_name)
        y_submit = enc_model.calculate_y_submit()
        y_submit_df = format_y_submit(y_submit)
        write_submit(y_submit_df, file_name)
        submit(file_name)

shuffletest_co_grn


100%|██████████| 36.7M/36.7M [00:46<00:00, 833kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 530 sources.
Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 146 samples and 18211 targets for 529 sources.
Running ulm on mat with 434 samples and 18211 targets for 522 sources.
Running ulm on mat with 17 samples and 18211 targets for 530 sources.
Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 146 samples and 18211 targets for 529 sources.
Running ulm on mat with 434 samples and 18211 targets for 522 sources.


100%|██████████| 553/553 [00:14<00:00, 39.02it/s]
100%|██████████| 553/553 [00:13<00:00, 39.65it/s]


(614, 1106) (255, 1106)
shuffletest_co_grn_0


100%|██████████| 36.7M/36.7M [00:47<00:00, 806kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 526 sources.
Running ulm on mat with 17 samples and 18211 targets for 535 sources.
Running ulm on mat with 146 samples and 18211 targets for 532 sources.
Running ulm on mat with 434 samples and 18211 targets for 522 sources.
Running ulm on mat with 17 samples and 18211 targets for 526 sources.
Running ulm on mat with 17 samples and 18211 targets for 535 sources.
Running ulm on mat with 146 samples and 18211 targets for 532 sources.
Running ulm on mat with 434 samples and 18211 targets for 522 sources.


100%|██████████| 554/554 [00:14<00:00, 38.28it/s]
100%|██████████| 554/554 [00:13<00:00, 40.86it/s]


(614, 1108) (255, 1108)
shuffletest_co_grn_1


100%|██████████| 36.8M/36.8M [00:47<00:00, 816kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 17 samples and 18211 targets for 525 sources.
Running ulm on mat with 146 samples and 18211 targets for 528 sources.
Running ulm on mat with 434 samples and 18211 targets for 530 sources.
Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 17 samples and 18211 targets for 525 sources.
Running ulm on mat with 146 samples and 18211 targets for 528 sources.
Running ulm on mat with 434 samples and 18211 targets for 530 sources.


100%|██████████| 558/558 [00:14<00:00, 38.79it/s]
100%|██████████| 558/558 [00:12<00:00, 44.28it/s]


(614, 1116) (255, 1116)
shuffletest_co_grn_2


100%|██████████| 36.7M/36.7M [00:46<00:00, 833kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 525 sources.
Running ulm on mat with 17 samples and 18211 targets for 531 sources.
Running ulm on mat with 146 samples and 18211 targets for 528 sources.
Running ulm on mat with 434 samples and 18211 targets for 529 sources.
Running ulm on mat with 17 samples and 18211 targets for 525 sources.
Running ulm on mat with 17 samples and 18211 targets for 531 sources.
Running ulm on mat with 146 samples and 18211 targets for 528 sources.
Running ulm on mat with 434 samples and 18211 targets for 529 sources.


100%|██████████| 552/552 [00:12<00:00, 43.53it/s]
100%|██████████| 552/552 [00:12<00:00, 43.14it/s]


(614, 1104) (255, 1104)
shuffletest_co_grn_3


100%|██████████| 36.7M/36.7M [00:45<00:00, 845kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 528 sources.
Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 146 samples and 18211 targets for 528 sources.
Running ulm on mat with 434 samples and 18211 targets for 530 sources.
Running ulm on mat with 17 samples and 18211 targets for 528 sources.
Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 146 samples and 18211 targets for 528 sources.
Running ulm on mat with 434 samples and 18211 targets for 530 sources.


100%|██████████| 554/554 [00:12<00:00, 44.27it/s]
100%|██████████| 554/554 [00:12<00:00, 44.55it/s]


(614, 1108) (255, 1108)
shuffletest_co_grn_4


100%|██████████| 36.7M/36.7M [00:44<00:00, 857kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 533 sources.
Running ulm on mat with 17 samples and 18211 targets for 525 sources.
Running ulm on mat with 146 samples and 18211 targets for 528 sources.
Running ulm on mat with 434 samples and 18211 targets for 533 sources.
Running ulm on mat with 17 samples and 18211 targets for 533 sources.
Running ulm on mat with 17 samples and 18211 targets for 525 sources.
Running ulm on mat with 146 samples and 18211 targets for 528 sources.
Running ulm on mat with 434 samples and 18211 targets for 533 sources.


100%|██████████| 553/553 [00:12<00:00, 44.49it/s]
100%|██████████| 553/553 [00:12<00:00, 45.28it/s]


(614, 1106) (255, 1106)
shuffletest_co_grn_5


100%|██████████| 36.7M/36.7M [00:45<00:00, 848kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 528 sources.
Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 146 samples and 18211 targets for 530 sources.
Running ulm on mat with 434 samples and 18211 targets for 532 sources.
Running ulm on mat with 17 samples and 18211 targets for 528 sources.
Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 146 samples and 18211 targets for 530 sources.
Running ulm on mat with 434 samples and 18211 targets for 532 sources.


100%|██████████| 554/554 [00:13<00:00, 41.83it/s]
100%|██████████| 554/554 [00:12<00:00, 44.64it/s]


(614, 1108) (255, 1108)
shuffletest_co_grn_6


100%|██████████| 36.7M/36.7M [00:45<00:00, 850kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 17 samples and 18211 targets for 533 sources.
Running ulm on mat with 146 samples and 18211 targets for 528 sources.
Running ulm on mat with 434 samples and 18211 targets for 531 sources.
Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 17 samples and 18211 targets for 533 sources.
Running ulm on mat with 146 samples and 18211 targets for 528 sources.
Running ulm on mat with 434 samples and 18211 targets for 531 sources.


100%|██████████| 555/555 [00:12<00:00, 45.36it/s]
100%|██████████| 555/555 [00:12<00:00, 45.30it/s]


(614, 1110) (255, 1110)
shuffletest_co_grn_7


100%|██████████| 36.7M/36.7M [00:45<00:00, 846kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 527 sources.
Running ulm on mat with 17 samples and 18211 targets for 526 sources.
Running ulm on mat with 146 samples and 18211 targets for 522 sources.
Running ulm on mat with 434 samples and 18211 targets for 523 sources.
Running ulm on mat with 17 samples and 18211 targets for 527 sources.
Running ulm on mat with 17 samples and 18211 targets for 526 sources.
Running ulm on mat with 146 samples and 18211 targets for 522 sources.
Running ulm on mat with 434 samples and 18211 targets for 523 sources.


100%|██████████| 551/551 [00:12<00:00, 42.75it/s]
100%|██████████| 551/551 [00:12<00:00, 43.82it/s]


(614, 1102) (255, 1102)
shuffletest_co_grn_8


100%|██████████| 36.8M/36.8M [00:45<00:00, 844kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 526 sources.
Running ulm on mat with 17 samples and 18211 targets for 535 sources.
Running ulm on mat with 146 samples and 18211 targets for 524 sources.
Running ulm on mat with 434 samples and 18211 targets for 530 sources.
Running ulm on mat with 17 samples and 18211 targets for 526 sources.
Running ulm on mat with 17 samples and 18211 targets for 535 sources.
Running ulm on mat with 146 samples and 18211 targets for 524 sources.
Running ulm on mat with 434 samples and 18211 targets for 530 sources.


100%|██████████| 559/559 [00:12<00:00, 44.34it/s]
100%|██████████| 559/559 [00:12<00:00, 44.06it/s]


(614, 1118) (255, 1118)
shuffletest_co_grn_9


100%|██████████| 36.7M/36.7M [00:46<00:00, 833kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 528 sources.
Running ulm on mat with 17 samples and 18211 targets for 524 sources.
Running ulm on mat with 146 samples and 18211 targets for 525 sources.
Running ulm on mat with 434 samples and 18211 targets for 529 sources.
Running ulm on mat with 17 samples and 18211 targets for 528 sources.
Running ulm on mat with 17 samples and 18211 targets for 524 sources.
Running ulm on mat with 146 samples and 18211 targets for 525 sources.
Running ulm on mat with 434 samples and 18211 targets for 529 sources.


100%|██████████| 554/554 [00:13<00:00, 40.99it/s]
100%|██████████| 554/554 [00:14<00:00, 38.67it/s]


(614, 1108) (255, 1108)
shuffletest_co_grn_10


100%|██████████| 36.7M/36.7M [00:45<00:00, 849kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 525 sources.
Running ulm on mat with 17 samples and 18211 targets for 534 sources.
Running ulm on mat with 146 samples and 18211 targets for 526 sources.
Running ulm on mat with 434 samples and 18211 targets for 531 sources.
Running ulm on mat with 17 samples and 18211 targets for 525 sources.
Running ulm on mat with 17 samples and 18211 targets for 534 sources.
Running ulm on mat with 146 samples and 18211 targets for 526 sources.
Running ulm on mat with 434 samples and 18211 targets for 531 sources.


100%|██████████| 554/554 [00:13<00:00, 41.05it/s]
100%|██████████| 554/554 [00:13<00:00, 40.08it/s]


(614, 1108) (255, 1108)
shuffletest_co_grn_11


100%|██████████| 36.7M/36.7M [00:46<00:00, 825kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 531 sources.
Running ulm on mat with 17 samples and 18211 targets for 525 sources.
Running ulm on mat with 146 samples and 18211 targets for 522 sources.
Running ulm on mat with 434 samples and 18211 targets for 528 sources.
Running ulm on mat with 17 samples and 18211 targets for 531 sources.
Running ulm on mat with 17 samples and 18211 targets for 525 sources.
Running ulm on mat with 146 samples and 18211 targets for 522 sources.
Running ulm on mat with 434 samples and 18211 targets for 528 sources.


100%|██████████| 556/556 [00:14<00:00, 38.37it/s]
100%|██████████| 556/556 [00:14<00:00, 39.27it/s]


(614, 1112) (255, 1112)
shuffletest_co_grn_12


100%|██████████| 36.7M/36.7M [00:45<00:00, 841kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 527 sources.
Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 146 samples and 18211 targets for 528 sources.
Running ulm on mat with 434 samples and 18211 targets for 532 sources.
Running ulm on mat with 17 samples and 18211 targets for 527 sources.
Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 146 samples and 18211 targets for 528 sources.
Running ulm on mat with 434 samples and 18211 targets for 532 sources.


100%|██████████| 553/553 [00:13<00:00, 39.95it/s]
100%|██████████| 553/553 [00:13<00:00, 39.59it/s]


(614, 1106) (255, 1106)
shuffletest_co_grn_13


100%|██████████| 36.8M/36.8M [00:46<00:00, 830kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 523 sources.
Running ulm on mat with 17 samples and 18211 targets for 524 sources.
Running ulm on mat with 146 samples and 18211 targets for 533 sources.
Running ulm on mat with 434 samples and 18211 targets for 530 sources.
Running ulm on mat with 17 samples and 18211 targets for 523 sources.
Running ulm on mat with 17 samples and 18211 targets for 524 sources.
Running ulm on mat with 146 samples and 18211 targets for 533 sources.
Running ulm on mat with 434 samples and 18211 targets for 530 sources.


100%|██████████| 555/555 [00:14<00:00, 38.63it/s]
100%|██████████| 555/555 [00:13<00:00, 39.78it/s]


(614, 1110) (255, 1110)
shuffletest_co_grn_14


100%|██████████| 36.8M/36.8M [00:45<00:00, 845kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 530 sources.
Running ulm on mat with 17 samples and 18211 targets for 524 sources.
Running ulm on mat with 146 samples and 18211 targets for 531 sources.
Running ulm on mat with 434 samples and 18211 targets for 527 sources.
Running ulm on mat with 17 samples and 18211 targets for 530 sources.
Running ulm on mat with 17 samples and 18211 targets for 524 sources.
Running ulm on mat with 146 samples and 18211 targets for 531 sources.
Running ulm on mat with 434 samples and 18211 targets for 527 sources.


100%|██████████| 555/555 [00:13<00:00, 40.21it/s]
100%|██████████| 555/555 [00:13<00:00, 40.79it/s]


(614, 1110) (255, 1110)
shuffletest_co_grn_15


100%|██████████| 36.7M/36.7M [00:47<00:00, 811kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 533 sources.
Running ulm on mat with 17 samples and 18211 targets for 531 sources.
Running ulm on mat with 146 samples and 18211 targets for 526 sources.
Running ulm on mat with 434 samples and 18211 targets for 529 sources.
Running ulm on mat with 17 samples and 18211 targets for 533 sources.
Running ulm on mat with 17 samples and 18211 targets for 531 sources.
Running ulm on mat with 146 samples and 18211 targets for 526 sources.
Running ulm on mat with 434 samples and 18211 targets for 529 sources.


100%|██████████| 555/555 [00:14<00:00, 39.16it/s]
100%|██████████| 555/555 [00:15<00:00, 35.34it/s]


(614, 1110) (255, 1110)
shuffletest_co_grn_16


100%|██████████| 36.7M/36.7M [00:45<00:00, 844kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 526 sources.
Running ulm on mat with 17 samples and 18211 targets for 528 sources.
Running ulm on mat with 146 samples and 18211 targets for 532 sources.
Running ulm on mat with 434 samples and 18211 targets for 528 sources.
Running ulm on mat with 17 samples and 18211 targets for 526 sources.
Running ulm on mat with 17 samples and 18211 targets for 528 sources.
Running ulm on mat with 146 samples and 18211 targets for 532 sources.
Running ulm on mat with 434 samples and 18211 targets for 528 sources.


100%|██████████| 556/556 [00:13<00:00, 41.05it/s]
100%|██████████| 556/556 [00:13<00:00, 40.87it/s]


(614, 1112) (255, 1112)
shuffletest_co_grn_17


100%|██████████| 36.7M/36.7M [00:46<00:00, 822kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 534 sources.
Running ulm on mat with 17 samples and 18211 targets for 527 sources.
Running ulm on mat with 146 samples and 18211 targets for 524 sources.
Running ulm on mat with 434 samples and 18211 targets for 534 sources.
Running ulm on mat with 17 samples and 18211 targets for 534 sources.
Running ulm on mat with 17 samples and 18211 targets for 527 sources.
Running ulm on mat with 146 samples and 18211 targets for 524 sources.
Running ulm on mat with 434 samples and 18211 targets for 534 sources.


100%|██████████| 554/554 [00:13<00:00, 40.99it/s]
100%|██████████| 554/554 [00:13<00:00, 41.93it/s]


(614, 1108) (255, 1108)
shuffletest_co_grn_18


100%|██████████| 36.8M/36.8M [00:46<00:00, 836kB/s] 


Running ulm on mat with 17 samples and 18211 targets for 532 sources.
Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 146 samples and 18211 targets for 529 sources.
Running ulm on mat with 434 samples and 18211 targets for 523 sources.
Running ulm on mat with 17 samples and 18211 targets for 532 sources.
Running ulm on mat with 17 samples and 18211 targets for 529 sources.
Running ulm on mat with 146 samples and 18211 targets for 529 sources.
Running ulm on mat with 434 samples and 18211 targets for 523 sources.


100%|██████████| 553/553 [00:16<00:00, 33.27it/s]
100%|██████████| 553/553 [00:15<00:00, 36.82it/s]


(614, 1106) (255, 1106)
shuffletest_co_grn_19


100%|██████████| 36.7M/36.7M [00:46<00:00, 838kB/s] 


In [207]:
df = get_kaggle_scores(prefix)
real  = df[df.grn_model=='figr_grn']
rest = df[df.grn_model!='figr_grn']
for score in ['public_test', 'private_test']:
    # print(rest[score].astype(float).mean())
    print(real[score].mean(), rest[score].astype(float).mean())

# df = get_kaggle_scores(prefix)
real  = df[df.grn_model=='co_grn']
rest = df[df.grn_model!='co_grn']
for score in ['public_test', 'private_test']:
    # print(rest[score].astype(float).mean())
    print(real[score].mean(), rest[score].astype(float).mean())


0.599 0.5954390243902439
0.764 0.7741463414634147
0.607 0.5952439024390244
0.77 0.774
