# Data preprocessing

In [1]:
# Input files, given relative to the notebook's working directory.
loc_data = '../train_data.csv'    # read into df_data further down
loc_vocab = '../vocab_sgabbr.csv'  # read into df_vocab further down

import os
import copy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option("display.max_columns", 50)
from feature_encoding_250326 import *

import Bio
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq

import RNA

import khmer # python 3.9.16
from functools import reduce

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torchinfo import summary
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_score, recall_score, mean_absolute_error

import random
import time

def set_random_seed(seed):
    """Seed every random number generator used in this notebook.

    Applies ``seed`` to NumPy, Python's ``random`` module and PyTorch
    (CPU generator, current CUDA device, all CUDA devices), then puts
    cuDNN in deterministic mode with benchmark auto-tuning disabled.
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False

In [2]:
# Vocabulary table: every column is read as text.
df_vocab = pd.read_csv(loc_vocab, dtype='str')

# Raw measurements; the row index is copied into a leading 'ID_in_model' column.
df_data = pd.read_csv(loc_data)
df_data.insert(loc=0, column='ID_in_model', value=df_data.index)

### raw data preview

In [3]:
# Preview the first rows of the raw table.
df_data.head()

Unnamed: 0,ID_in_model,id,publication_id,gene_target_symbol_name,gene_target_ncbi_id,gene_target_species,siRNA_duplex_id,siRNA_sense_seq,siRNA_antisense_seq,cell_line_donor,siRNA_concentration,concentration_unit,Transfection_method,Duration_after_transfection_h,modified_siRNA_sense_seq,modified_siRNA_antisense_seq,modified_siRNA_sense_seq_list,modified_siRNA_antisense_seq_list,gene_target_seq,mRNA_remaining_pct
0,0,7,WO9efb397e42,LDHA,NM_005566.3,Homo sapiens,AD-158499,GGUUCCAAGUCCAAUAUGGCA,UGCCAUAUUGGACUUGGAACCAA,Hep3B Cells,10.0,nM,Lipofectamine,24.0,gguuccAfaGfUfCfcaauauggcaL96,uGfccaUfaUfUfggacUfuGfgaaccaa,g g u u c c Af a Gf Uf Cf c a a u a u g g c a L96,u Gf c c a Uf a Uf Uf g g a c Uf u Gf g a a c ...,GTCTGCCGGTCGGTTGTCTGGCTGCGCGCGCCACCCGGGCCTCTCC...,29.76
1,1,16,WO9efb397e42,LDHA,NM_005566.3,Homo sapiens,AD-158536,AUUUAUAAUCUUCUAAAGGAA,UUCCUUUAGAAGAUUAUAAAUCA,Hep3B Cells,10.0,nM,Lipofectamine,24.0,auuuauAfaUfCfUfucuaaaggaaL96,uUfccuUfuAfGfaagaUfuAfuaaauca,a u u u a u Af a Uf Cf Uf u c u a a a g g a a L96,u Uf c c u Uf u Af Gf a a g a Uf u Af u a a a ...,GTCTGCCGGTCGGTTGTCTGGCTGCGCGCGCCACCCGGGCCTCTCC...,30.88
2,2,17,WO9efb397e42,LDHA,NM_005566.3,Homo sapiens,AD-158550,AAUAAGAUUACAGUUGUUGGA,UCCAACAACUGUAAUCUUAUUCU,Hep3B Cells,10.0,nM,Lipofectamine,24.0,aauaagAfuUfAfCfaguuguuggaL96,uCfcaaCfaAfCfuguaAfuCfuuauucu,a a u a a g Af u Uf Af Cf a g u u g u u g g a L96,u Cf c a a Cf a Af Cf u g u a Af u Cf u u a u ...,GTCTGCCGGTCGGTTGTCTGGCTGCGCGCGCCACCCGGGCCTCTCC...,28.87
3,3,22,WO9efb397e42,LDHA,NM_005566.3,Homo sapiens,AD-158571,GCCUGUGCCAUCAGUAUCUUA,UAAGAUACUGAUGGCACAGGCCA,Hep3B Cells,10.0,nM,Lipofectamine,24.0,gccuguGfcCfAfUfcaguaucuuaL96,uAfagaUfaCfUfgaugGfcAfcaggcca,g c c u g u Gf c Cf Af Uf c a g u a u c u u a L96,u Af a g a Uf a Cf Uf g a u g Gf c Af c a g g ...,GTCTGCCGGTCGGTTGTCTGGCTGCGCGCGCCACCCGGGCCTCTCC...,46.81
4,4,35,WO9efb397e42,LDHA,NM_005566.3,Homo sapiens,AD-158626,GUCAUCGAAGACAAAUUGAAA,UUUCAAUUUGUCUUCGAUGACAU,Hep3B Cells,10.0,nM,Lipofectamine,24.0,gucaucGfaAfGfAfcaaauugaaaL96,uUfucaAfuUfUfgucuUfcGfaugacau,g u c a u c Gf a Af Gf Af c a a a u u g a a a L96,u Uf u c a Af u Uf Uf g u c u Uf c Gf a u g a ...,GTCTGCCGGTCGGTTGTCTGGCTGCGCGCGCCACCCGGGCCTCTCC...,11.06


In [4]:
df_vocab  # NOTE(review): `.head()` is disabled here, so the whole vocabulary table is displayed

Unnamed: 0,Abbreviation,Chemical Name,single_abbr,deModi_sgabbr,modil_label_int
0,A,adenosine-3'-phosphate,A,A,1.0
1,C,cytidine-3'-phosphate,C,C,1.0
2,G,guanosine-3'-phosphate,G,G,1.0
3,P,5'-phosphate,P,P,
4,U,uridine-3'-phosphate,U,U,1.0
5,a,2'-O-methyladenosine-3'-phosphate,a,A,3.0
6,c,2'-O-methylcytidine-3'-phosphate,c,C,3.0
7,g,2'-O-methylguanosine-3'-phosphate,g,G,3.0
8,s,phosphorothioate linkage,s,S,
9,t,2'-O-methyl-5-methyluridine-3'-phosphate,t,T,3.0


In [5]:
# Number of missing values in each column of the raw table.
df_data.isnull().sum()

ID_in_model                             0
id                                      0
publication_id                          0
gene_target_symbol_name              3049
gene_target_ncbi_id                  3049
gene_target_species                  3049
siRNA_duplex_id                         0
siRNA_sense_seq                         0
siRNA_antisense_seq                     0
cell_line_donor                      1082
siRNA_concentration                     0
concentration_unit                      0
Transfection_method                  1082
Duration_after_transfection_h        1082
modified_siRNA_sense_seq                0
modified_siRNA_antisense_seq            0
modified_siRNA_sense_seq_list           0
modified_siRNA_antisense_seq_list       0
gene_target_seq                      3049
mRNA_remaining_pct                      0
dtype: int64

In [6]:
# Number of distinct values per column, largest first.
df_data.nunique().sort_values(ascending=False)

ID_in_model                          25782
id                                   25782
siRNA_duplex_id                       8742
modified_siRNA_antisense_seq_list     8427
modified_siRNA_antisense_seq          8427
mRNA_remaining_pct                    7797
modified_siRNA_sense_seq              7588
modified_siRNA_sense_seq_list         7588
siRNA_antisense_seq                   6860
siRNA_sense_seq                       6732
gene_target_ncbi_id                     52
gene_target_seq                         52
gene_target_symbol_name                 45
publication_id                          40
siRNA_concentration                     16
cell_line_donor                         14
gene_target_species                      3
Transfection_method                      2
Duration_after_transfection_h            2
concentration_unit                       1
dtype: int64

## Feature encoding

In [3]:
# Build the structured table from the raw data and the vocabulary.
# NOTE(review): seq_structuring is presumably provided by the feature_encoding_250326
# star import; the preview shows extra columns (seq_sgchar_modi_*, modi_*_P+/VP+/L96+,
# seq_agctus_*, seq_agct_int_*, seq_modi_int_*). Confirm whether it also mutates df_data.
df_structured = seq_structuring(df_data,df_vocab)
df_structured.head()

Unnamed: 0,ID_in_model,id,publication_id,gene_target_symbol_name,gene_target_ncbi_id,gene_target_species,siRNA_duplex_id,siRNA_sense_seq,siRNA_antisense_seq,cell_line_donor,siRNA_concentration,concentration_unit,Transfection_method,Duration_after_transfection_h,modified_siRNA_sense_seq,modified_siRNA_antisense_seq,modified_siRNA_sense_seq_list,modified_siRNA_antisense_seq_list,gene_target_seq,mRNA_remaining_pct,seq_sgchar_modi_sense,seq_sgchar_modi_anti,modi_sense_P+,modi_anti_P+,modi_sense_VP+,modi_anti_VP+,modi_sense_L96+,modi_anti_L96+,seq_agctus_sense,seq_agctus_anti,seq_agct_int_sense,seq_agct_int_anti,seq_modi_int_sense,seq_modi_int_anti
0,0,7,WO9efb397e42,LDHA,NM_005566.3,Homo sapiens,AD-158499,GGUUCCAAGUCCAAUAUGGCA,UGCCAUAUUGGACUUGGAACCAA,Hep3B Cells,10.0,nM,Lipofectamine,24.0,gguuccAfaGfUfCfcaauauggcaL96,uGfccaUfaUfUfggacUfuGfgaaccaa,g g u u c c Af a Gf Uf Cf c a a u a u g g c a L96,u Gf c c a Uf a Uf Uf g g a c Uf u Gf g a a c ...,GTCTGCCGGTCGGTTGTCTGGCTGCGCGCGCCACCCGGGCCTCTCC...,29.76,acgguauaac@$#a!ccuugg,u#cca$a$$ggac$u#gaaccaa,0,0,0,0,1,0,ACGGUAUAACCUGAACCUUGG,UGCCAUAUUGGACUUGGAACCAA,043310100441300441133,13440101133041133004400,333333333322232333333,32333232233332323333333
1,1,16,WO9efb397e42,LDHA,NM_005566.3,Homo sapiens,AD-158536,AUUUAUAAUCUUCUAAAGGAA,UUCCUUUAGAAGAUUAUAAAUCA,Hep3B Cells,10.0,nM,Lipofectamine,24.0,auuuauAfaUfCfUfucuaaaggaaL96,uUfccuUfuAfGfaagaUfuAfuaaauca,a u u u a u Af a Uf Cf Uf u c u a a a g g a a L96,u Uf c c u Uf u Af Gf a a g a Uf u Af u a a a ...,GTCTGCCGGTCGGTTGTCTGGCTGCGCGCGCCACCCGGGCCTCTCC...,30.88,aaggaaaucu$@$a!uauuua,u$ccu$u!#aaga$u!uaaauca,0,0,0,0,1,0,AAGGAAAUCUUCUAAUAUUUA,UUCCUUUAGAAGAUUAUAAAUCA,003300014114100101110,11441110300301101000140,333333333322232333333,32333232233332323333333
2,2,17,WO9efb397e42,LDHA,NM_005566.3,Homo sapiens,AD-158550,AAUAAGAUUACAGUUGUUGGA,UCCAACAACUGUAAUCUUAUUCU,Hep3B Cells,10.0,nM,Lipofectamine,24.0,aauaagAfuUfAfCfaguuguuggaL96,uCfcaaCfaAfCfuguaAfuCfuuauucu,a a u a a g Af u Uf Af Cf a g u u g u u g g a L96,u Cf c a a Cf a Af Cf u g u a Af u Cf u u a u ...,GTCTGCCGGTCGGTTGTCTGGCTGCGCGCGCCACCCGGGCCTCTCC...,28.87,agguuguuga@!$u!gaauaa,u@caa@a!@ugua!u@uuauucu,0,0,0,0,1,0,AGGUUGUUGACAUUAGAAUAA,UCCAACAACUGUAAUCUUAUUCU,033113113040110300100,14400400413100141101141,333333333322232333333,32333232233332323333333
3,3,22,WO9efb397e42,LDHA,NM_005566.3,Homo sapiens,AD-158571,GCCUGUGCCAUCAGUAUCUUA,UAAGAUACUGAUGGCACAGGCCA,Hep3B Cells,10.0,nM,Lipofectamine,24.0,gccuguGfcCfAfUfcaguaucuuaL96,uAfagaUfaCfUfgaugGfcAfcaggcca,g c c u g u Gf c Cf Af Uf c a g u a u c u u a L96,u Af a g a Uf a Cf Uf g a u g Gf c Af c a g g ...,GTCTGCCGGTCGGTTGTCTGGCTGCGCGCGCCACCCGGGCCTCTCC...,46.81,auucuaugac$!@c#uguccg,u!aga$a@$gaug#c!caggcca,0,0,0,0,1,0,AUUCUAUGACUACCGUGUCCG,UAAGAUACUGAUGGCACAGGCCA,011410130410443131443,10030104130133404033440,333333333322232333333,32333232233332323333333
4,4,35,WO9efb397e42,LDHA,NM_005566.3,Homo sapiens,AD-158626,GUCAUCGAAGACAAAUUGAAA,UUUCAAUUUGUCUUCGAUGACAU,Hep3B Cells,10.0,nM,Lipofectamine,24.0,gucaucGfaAfGfAfcaaauugaaaL96,uUfucaAfuUfUfgucuUfcGfaugacau,g u c a u c Gf a Af Gf Af c a a a u u g a a a L96,u Uf u c a Af u Uf Uf g u c u Uf c Gf a u g a ...,GTCTGCCGGTCGGTTGTCTGGCTGCGCGCGCCACCCGGGCCTCTCC...,11.06,aaaguuaaac!#!a#cuacug,u$uca!u$$gucu$c#augacau,0,0,0,0,1,0,AAAGUUAAACAGAAGCUACUG,UUUCAAUUUGUCUUCGAUGACAU,000311000403003410413,11140011131411430130401,333333333322232333333,32333232233332323333333


### onehot3d of modi-seq

In [4]:
# Shape passed to the encoder: (seq len, ACGTUS, modi types).
SPACE_SHAPE = (28, 6, 7)

print('Producing onehot3D of total dataset')
alldata_modiseq, alldata_skip_sample_index = df2onehot3d_v2(df_structured, SPACE_SHAPE)
print('\n', alldata_skip_sample_index)
print('alldata_modiseq.shape:\n', alldata_modiseq.shape)

# One tensor per entry of alldata_modiseq, stored alongside the tabular columns.
ls_alldata_modiseq_onehot3d = list(map(torch.tensor, alldata_modiseq))
df_structured['!!_modiseq_onehot3d'] = ls_alldata_modiseq_onehot3d

Producing onehot3D of total dataset
25000
 []
alldata_modiseq.shape:
 (25782, 2, 28, 6, 7)


### struct mtx: probability of pairing

In [5]:
# Names of the nine values unpacked from each get_struct_ppdp result, in column order.
struct_fields = ['mtx_pp', 'dp_mfe', 'dp_pp', 'dp_centroid', 'dp_MEA',
                 'val_mfe', 'val_pf', 'val_ctrdist', 'val_MEA']

# Same processing for both strands; only the source column and the name suffix differ.
for strand, seq_column in (('sense', 'siRNA_sense_seq'), ('antis', 'siRNA_antisense_seq')):
    print(f'2nd struct calculating ... {strand}')
    struct_columns = [f'{field}_{strand}' for field in struct_fields]
    # Expand every per-sequence result into one column per field.
    struct_values = df_structured[seq_column].apply(get_struct_ppdp).apply(pd.Series)
    df_structured[struct_columns] = struct_values
    df_structured[f'!!_nt_struct_type_{strand}_mea'] = df_structured[f'dp_MEA_{strand}'].apply(get_nt_strtype_mtx)

2nd struct calculating ... sense
2nd struct calculating ... antis


## Data splitting: train-validate set, IID test set, and OOD test set

In [6]:
# Per-publication overview: row count plus the number of distinct values in each
# experimental-context column (len(unique()) counts a missing value as one value).
env_list = df_structured['publication_id'].unique()

distr_summary = []
for data_source in env_list:
    df_temp = df_structured[df_structured['publication_id']==data_source]
    count_sp = len(df_temp['gene_target_species'].unique())
    count_tar = len(df_temp['gene_target_symbol_name'].unique())
    count_cell = len(df_temp['cell_line_donor'].unique())
    count_conc = len(df_temp['siRNA_concentration'].unique())
    count_tfx = len(df_temp['Transfection_method'].unique())
    count_tfxtime = len(df_temp['Duration_after_transfection_h'].unique())
    distr_summary.append((data_source,df_temp.shape[0],
                          count_sp,count_tar,count_cell,count_conc,count_tfx,count_tfxtime))

# Publications held out entirely as the out-of-distribution (OOD) test set.
test_publication_id = ['WOee0afbdeee','WO9cdd416594','WO28aca1a182','WO40428852d0']
# Number of remaining rows set aside as the in-distribution (IID) test set.
N_TEST_IID = 2568

# A misspelled ID would silently shrink the OOD set, so fail loudly instead.
unknown_publications = set(test_publication_id) - set(env_list)
assert not unknown_publications, f'publication_id not found in data: {sorted(unknown_publications)}'

df_structured['dataset_usage'] = None

random.seed(1122)

test_ood_index = list(df_structured[df_structured['publication_id'].isin(test_publication_id)].index)
# Keep this shuffle: it advances the RNG, so removing it would change the IID split below.
random.shuffle(test_ood_index)
df_structured.loc[test_ood_index,'dataset_usage'] = 'OOD_test'
print('TEST_OOD:',len(test_ood_index))

iid_index = list(df_structured.drop(test_ood_index,axis=0).index)
random.shuffle(iid_index)
assert len(iid_index) > N_TEST_IID, 'not enough IID rows to leave a train/validation set'
test_iid_index = iid_index[:N_TEST_IID]
trvl_index = iid_index[N_TEST_IID:]

df_structured.loc[test_iid_index,'dataset_usage'] = 'IID_test'
print('TEST_IID:',len(test_iid_index))
df_structured.loc[trvl_index,'dataset_usage'] = 'IID_trvl'
print('TRAIN & VAL:',len(trvl_index))

# Both intersections are expected to be empty; they are printed for the record and
# then enforced so a leaking split stops the notebook instead of passing unnoticed.
print(set(test_ood_index) & set(iid_index))
print(set(test_iid_index) & set(trvl_index))
assert not set(test_ood_index) & set(iid_index), 'OOD test rows overlap the IID pool'
assert not set(test_iid_index) & set(trvl_index), 'IID test rows overlap train/validation rows'
assert df_structured['dataset_usage'].notna().all(), 'some rows were not assigned to a split'
print()
print(df_structured['dataset_usage'].value_counts(),end='\n\n')
print(df_structured[df_structured['dataset_usage']=='OOD_test']['publication_id'].unique())

TEST_OOD: 2588
TEST_IID: 2568
TRAIN & VAL: 20626
set()
set()

IID_trvl    20626
OOD_test     2588
IID_test     2568
Name: dataset_usage, dtype: int64

['WO28aca1a182' 'WO40428852d0' 'WOee0afbdeee' 'WO9cdd416594']


## experimental context

In [7]:
# Encode the train/validation rows; the returned parameters are reused for both test sets.
df_trvl_encoded, ftr_encoding_params = feature_encoding_train(
    df_structured[df_structured['dataset_usage']=='IID_trvl'])
print(df_trvl_encoded.shape)
print(df_trvl_encoded.isnull().sum().unique())


def _encode_test_split(usage, order_label):
    """Encode the rows tagged ``usage`` with ``ftr_encoding_params`` and print the checks.

    Prints the encoded shape, the distinct per-column null counts and, after
    ``order_label``, the set of results from comparing the columns with those
    of ``df_trvl_encoded``. Returns the encoded frame.
    """
    split_rows = df_structured[df_structured['dataset_usage']==usage]
    encoded = feature_encoding_test(split_rows, df_trvl_encoded, ftr_encoding_params)
    print(encoded.shape)
    print(encoded.isnull().sum().unique())
    print(order_label, set(df_trvl_encoded.columns==encoded.columns))
    return encoded


df_test_iid_encoded = _encode_test_split('IID_test', 'iid_test_column order is same:')
df_test_ood_encoded = _encode_test_split('OOD_test', 'ood_test_column order is same:')

  kmer_counts = np.array(kmer_counts)/sum(kmer_counts)


(20626, 109)
[0]
column order is same: {True}


  kmer_counts = np.array(kmer_counts)/sum(kmer_counts)


(2568, 109)
[0]
iid_test_column order is same: {True}
column order is same: {True}
(2588, 109)
[0]
ood_test_column order is same: {True}


In [11]:
import pickle

# Persist the encoding parameters returned by feature_encoding_train.
# The `with` block closes the file as soon as the dump finishes, so the data is
# flushed to disk and no handle is left open for the rest of the kernel session.
with open('ftr_encoding_params_v3-0329.pkl', 'wb') as params_file:
    pickle.dump(ftr_encoding_params, params_file)

In [8]:
# Join on the row index; a right join keeps every index key of the encoded frame.
index_right_join = dict(left_index=True, right_index=True, how='right')

df_structured_encoded_iid_trvl = df_structured.merge(df_trvl_encoded, **index_right_join)
df_structured_encoded_iid_test = df_structured.merge(df_test_iid_encoded, **index_right_join)
df_structured_encoded_ood_test = df_structured.merge(df_test_ood_encoded, **index_right_join)

# Stack the three splits back into a single table (train/val, IID test, OOD test).
df_structured_encoded = pd.concat([
    df_structured_encoded_iid_trvl,
    df_structured_encoded_iid_test,
    df_structured_encoded_ood_test,
])

In [9]:
# Print the columns whose name contains '!!', then display those matching '!<word>!'.
encoded_columns = df_structured_encoded.columns
print(encoded_columns[encoded_columns.str.contains('!!')])
encoded_columns[encoded_columns.str.contains(r'!\w+!')]

Index(['!!_modiseq_onehot3d', '!!_nt_struct_type_sense_mea',
       '!!_nt_struct_type_antis_mea'],
      dtype='object')


Index(['!modiseq!_GC_ratio_antisense', '!tfx!_hpt_regu', '!tfx!_conc_regu',
       '!tfx!_ct_cell_line_donor_A549 Cells',
       '!tfx!_ct_cell_line_donor_BE(2)-C Cells',
       '!tfx!_ct_cell_line_donor_COS-7 Cells',
       '!tfx!_ct_cell_line_donor_DU145 Cells',
       '!tfx!_ct_cell_line_donor_HeLa Cells',
       '!tfx!_ct_cell_line_donor_Hep3B Cells',
       '!tfx!_ct_cell_line_donor_HepG2 Cells',
       ...
       '!mrna!_kmer_54_freq', '!mrna!_kmer_55_freq', '!mrna!_kmer_56_freq',
       '!mrna!_kmer_57_freq', '!mrna!_kmer_58_freq', '!mrna!_kmer_59_freq',
       '!mrna!_kmer_60_freq', '!mrna!_kmer_61_freq', '!mrna!_kmer_62_freq',
       '!mrna!_kmer_63_freq'],
      dtype='object', length=109)

## save

In [10]:
# Shape of every frame produced above.
for frame_label, frame in (
    ('df_structured:', df_structured),
    ('df_structured_encoded:', df_structured_encoded),
    ('df_trvl_encoded:', df_trvl_encoded),
    ('df_test_iid_encoded:', df_test_iid_encoded),
    ('df_test_ood_encoded:', df_test_ood_encoded),
):
    print(frame_label, frame.shape)

# For each split, print whether the encoded frame has the same index set as the rows tagged with it.
for usage, encoded in (
    ('IID_trvl', df_trvl_encoded),
    ('IID_test', df_test_iid_encoded),
    ('OOD_test', df_test_ood_encoded),
):
    print(set(df_structured[df_structured['dataset_usage']==usage].index)==set(encoded.index))

df_structured: (25782, 56)
df_structured_encoded: (25782, 165)
df_trvl_encoded: (20626, 109)
df_test_iid_encoded: (2568, 109)
df_test_ood_encoded: (2588, 109)
True
True
True


In [11]:
# Destination of the final table. The default is the original, machine-specific location;
# set the SIRNA_ENCODED_PKL environment variable to write elsewhere without editing the notebook.
encoded_pickle_path = os.environ.get(
    'SIRNA_ENCODED_PKL',
    '/home/ken/MyStorage/siRNA_2503/Data/df_structured_encoded_0326.pkl',
)
df_structured_encoded.to_pickle(encoded_pickle_path)

---