In [1]:
import os
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from prettytable import PrettyTable
from IPython.display import Image

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, Activation
from tensorflow.keras.layers import Conv1D, Add, MaxPooling1D, BatchNormalization
from tensorflow.keras.utils import multi_gpu_model


# Root folder that holds the 'train', 'dev' and 'test' partitions.
# The location can be overridden with the DATA_PATH environment variable so the
# notebook is not tied to one machine; when the variable is unset the original
# hard-coded path is used, which keeps the previous behaviour.
data_path = os.environ.get('DATA_PATH', '/home/ziang/goo')
print('Available data', os.listdir(data_path))

def read_data(partition):
  """
  Load every file of one partition into a single DataFrame.

  partition: name of a sub-folder of the module-level data_path
             (the notebook uses 'train', 'dev' and 'test').

  Returns the row-wise concatenation of all files in that folder, each parsed
  with pd.read_csv. The per-file index is kept as-is, so index labels repeat
  across files.

  Raises ValueError (from pd.concat) when the folder contains no files.
  """
  partition_dir = os.path.join(data_path, partition)
  frames = []
  # os.listdir returns entries in arbitrary order; sorting the names makes the
  # row order of the result reproducible across runs and machines.
  for fn in sorted(os.listdir(partition_dir)):
    with open(os.path.join(partition_dir, fn)) as f:
      frames.append(pd.read_csv(f, index_col=None))
  return pd.concat(frames)
# Load the three partitions; the validation split is stored in the 'dev' folder.
df_train, df_val, df_test = [read_data(split) for split in ('train', 'dev', 'test')]

# Every family accession that occurs in the training partition.
classes = df_train['family_accession'].value_counts().index.tolist()
len(classes)

# Keep only rows whose family is present in the training partition.
# reset_index() is called without drop=True, so the previous index is kept as
# an extra 'index' column in each filtered frame.
train_sm = df_train[df_train['family_accession'].isin(classes)].reset_index()
val_sm = df_val[df_val['family_accession'].isin(classes)].reset_index()
test_sm = df_test[df_test['family_accession'].isin(classes)].reset_index()

# One-letter codes that receive their own integer id (see create_dict);
# the order of the letters fixes the id of each code.
codes = list('ACDEFGHIKLMNPQRSTVWY')

def create_dict(codes):
  """
  Build a lookup table from each code to its 1-based position in codes.

  Ids start at 1, so 0 is never assigned to any code. If a code appears more
  than once, its last position wins.
  """
  return {symbol: position for position, symbol in enumerate(codes, start=1)}

# Module-level lookup table (code -> integer id, starting at 1) that
# integer_encoding reads when converting sequences.
char_dict = create_dict(codes)

def integer_encoding(data):
  """
  Turn every sequence in data['sequence'] into an array of integer ids.

  - Each character is looked up in the module-level char_dict.
  - Any character that is not a key of char_dict is encoded as 0.

  Returns a list with one numpy array per row of data, in row order.
  """
  unknown_id = 0
  return [
    np.array([char_dict.get(residue, unknown_id) for residue in sequence])
    for sequence in data['sequence'].values
  ]
# Integer-encode the sequences of each filtered partition.
train_encode, val_encode, test_encode = [
  integer_encoding(frame) for frame in (train_sm, val_sm, test_sm)
]

# Bring every sequence to exactly max_length ids: shorter ones are padded at
# the end, longer ones are cut at the end.
max_length = 2048
train_pad, val_pad, test_pad = [
  pad_sequences(encoded, maxlen=max_length, padding='post', truncating='post')
  for encoded in (train_encode, val_encode, test_encode)
]

# Map family accessions to consecutive integer labels. The encoder is fitted on
# the training labels only and then reused for validation and test.
le = LabelEncoder()

y_train_le = le.fit_transform(train_sm['family_accession'])
y_val_le = le.transform(val_sm['family_accession'])
y_test_le = le.transform(test_sm['family_accession'])

# One-hot encode the labels. num_classes is pinned to the number of classes the
# encoder learned: without it to_categorical infers the width from the largest
# label present in each array, so a split that happens to lack the highest
# label would get fewer columns than y_train.
y_train = to_categorical(y_train_le, num_classes=len(le.classes_))
y_val = to_categorical(y_val_le, num_classes=len(le.classes_))
y_test = to_categorical(y_test_le, num_classes=len(le.classes_))
def display_model_score(model, train, val, test, batch_size):
  """
  Evaluate model on the train, validation and test data and print the scores.

  model: object with an evaluate(x, y, batch_size=..., verbose=...) method
         whose result is indexable; element 0 is printed as the loss and
         element 1 as the accuracy (assumes accuracy is the first metric the
         model reports - confirm against the compile call).
  train, val, test: indexable pairs where [0] holds the inputs and [1] the
         targets.
  batch_size: forwarded unchanged to model.evaluate.

  Returns None; the scores are only printed, with a 70-dash rule between the
  three sections.
  """
  sections = (
    ('Train loss: ', 'Train accuracy: ', train),
    ('Val loss: ', 'Val accuracy: ', val),
    ('Test loss: ', 'Test accuracy: ', test),
  )

  for position, (loss_label, accuracy_label, split) in enumerate(sections):
    # The rule separates sections, so it is emitted before every section
    # except the first one (and therefore never after the last one).
    if position > 0:
      print('-'*70)

    score = model.evaluate(split[0], split[1], batch_size=batch_size, verbose=1)
    print(loss_label, score[0])
    print(accuracy_label, score[1])
def residual_block(data, filters, d_rate):
  """
  Pre-activation residual block built from two Conv1D layers.

  data: input tensor; it is also used as the skip connection, so its shape
        must match the output of the second convolution for Add to work.
  filters: number of filters for both convolutions.
  d_rate: dilation rate passed to the first convolution.

  Returns the element-wise sum of the second convolution's output and data.
  """
  # First stage: BatchNorm -> ReLU -> Conv1D with kernel size 1.
  # NOTE(review): with a kernel of size 1 the dilation rate has no effect on
  # what the convolution sees - confirm whether the kernel sizes of the two
  # convolutions were meant to be the other way round.
  x = BatchNormalization()(data)
  x = Activation('relu')(x)
  x = Conv1D(
    filters,
    1,
    dilation_rate=d_rate,
    padding='same',
    kernel_regularizer=l2(0.001),
  )(x)

  # Second stage: BatchNorm -> ReLU -> Conv1D with kernel size 3.
  x = BatchNormalization()(x)
  x = Activation('relu')(x)
  x = Conv1D(
    filters,
    3,
    padding='same',
    kernel_regularizer=l2(0.001),
  )(x)

  # Skip connection: add the untouched input back onto the convolved signal.
  return Add()([x, data])

# One-hot encode the padded integer sequences. Ids run from 0 (padding and
# unknown characters) to len(codes), so the depth is fixed at len(codes) + 1.
# Passing num_classes explicitly keeps the depth identical for all three
# splits; otherwise to_categorical infers it from the largest id that happens
# to occur in each array.
train_ohe = to_categorical(train_pad, num_classes=len(codes) + 1)
val_ohe = to_categorical(val_pad, num_classes=len(codes) + 1)
test_ohe = to_categorical(test_pad, num_classes=len(codes) + 1)



Available data ['annotate', 'dev', 'make_lmdb.py', 'train', 'test']


In [None]:
from scipy import sparse

# One-hot entries are only 0 or 1, so a boolean dtype stores them losslessly.
train_ohe = train_ohe.astype(np.bool_)
val_ohe = val_ohe.astype(np.bool_)
test_ohe = test_ohe.astype(np.bool_)

# Flatten everything after the sample axis so that each sample becomes one row
# of a 2-D array. The row count is read from the array itself instead of being
# hard-coded, so this cell keeps working when the partitions change size.
train_ohe = np.reshape(train_ohe, (train_ohe.shape[0], -1))
val_ohe = np.reshape(val_ohe, (val_ohe.shape[0], -1))
test_ohe = np.reshape(test_ohe, (test_ohe.shape[0], -1))

# Compress each matrix row-wise (CSR) and save the sparse matrix to disk.
train_ohe_sp = sparse.csr_matrix(train_ohe)
sparse.save_npz('train_ohe_2048.npz', train_ohe_sp)

val_ohe_sp = sparse.csr_matrix(val_ohe)
sparse.save_npz('val_ohe_2048.npz', val_ohe_sp)

test_ohe_sp = sparse.csr_matrix(test_ohe)
sparse.save_npz('test_ohe_2048.npz', test_ohe_sp)



In [None]:
# Print the current shapes of the train / validation / test one-hot arrays.
print(train_ohe.shape,val_ohe.shape,test_ohe.shape)

In [6]:
# Print the shapes of the one-hot label matrices for train and validation
# (y_test is not printed here).
print(y_train.shape,y_val.shape)

(1086741, 17929) (126171, 17929)


In [None]:
# For each split: cast the one-hot label matrix to bool, compress it row-wise
# (CSR) and save the sparse matrix to disk.
y_train = y_train.astype(np.bool_)
y_train_sp = sparse.csr_matrix(y_train)
sparse.save_npz('y_train_2048.npz', y_train_sp)

y_val = y_val.astype(np.bool_)
y_val_sp = sparse.csr_matrix(y_val)
sparse.save_npz('y_val_2048.npz', y_val_sp)

y_test = y_test.astype(np.bool_)
y_test_sp = sparse.csr_matrix(y_test)
sparse.save_npz('y_test_2048.npz', y_test_sp)

In [7]:
# Display the sparse test-label matrix (its repr shows shape, dtype and the
# number of stored elements).
y_test_sp

<126171x17929 sparse matrix of type '<class 'numpy.bool_'>'
	with 126171 stored elements in Compressed Sparse Row format>