In [None]:
import pandas as  pd 
import numpy as np 
import tensorflow as tf
import tensorflow_datasets as tfds
import os 
from sklearn.model_selection import train_test_split
# Expose only GPU 0 to CUDA-based libraries running in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

### Data processing

In [None]:
def write_to_txt(file_name,column):
    """Write each element of ``column`` to ``file_name``, one element per line.

    Args:
        file_name: Path of the text file to create (an existing file is overwritten).
        column: Iterable of values; each one is formatted with ``%s`` and
            followed by a newline.
    """
    with open(file_name, 'w') as out_file:
        out_file.writelines("%s\n" % entry for entry in column)

In [None]:
pip install PyDrive



In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the current Colab user, then build a PyDrive client that reuses
# the application-default Google credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Fetch the training CSV from Google Drive by file id and save it locally as train.csv.
downloaded = drive.CreateFile({'id':"17Noh_WLmNY7XRGwVong641aj2mZxbmWl"})   # replace this id with the id of the Drive file you want to access
downloaded.GetContentFile('train.csv') 

In [None]:
# Fetch the test CSV from Google Drive by file id and save it locally as test.csv.
downloaded = drive.CreateFile({'id':"17S3ghO_EuSn2xSPZw3zy6Nw5t0FwKoyr"})   # replace this id with the id of the Drive file you want to access
downloaded.GetContentFile('test.csv')  

In [None]:
# Load the two CSV files downloaded above into DataFrames.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,ID,Sequence
0,ID_test_0,MSIRKRLTWQFTLIVTSILVLFFVFIYIFYADFRREEYYSRLYNKA...
1,ID_test_1,MRWGILVNLAIILFVSGVLLFIAFCASLERAAVDSRVYQAAVLFEA...
2,ID_test_2,MNLQNYYETVMELSHGIVVALDLNGGIIHGNSELVAMSGYTIEELA...
3,ID_test_3,MRMLMSSLVLVVLATIAGLGWSISEFAALQNQDTATANVNSRIAAL...
4,ID_test_4,MIKEYAKQMVSLLFLLSGIALSSSAQKQQVWKPFYDKCRREKSIID...


In [None]:
# Upper bound on sequence length used by the truncation step below.
# Per the original author's note, the longest sequence in this data set has length 550.
max_seq_length = 550

In [None]:
# Split the labelled data into train (90%) and validation (10%) subsets.
# NOTE: this rebinds `train`, so re-running the cell splits the already-reduced
# frame again; re-run the CSV loading cell first if you need a fresh split.
train, val = train_test_split(train,test_size=0.1,random_state=1994)

# Truncate every sequence to at most max_seq_length characters.
# The original guard (`if max_seq_length>550`) was inverted: it skipped the
# truncation exactly when a smaller limit was requested. Slicing is a no-op for
# sequences that are already short enough, so no guard is needed at all.
train["Sequence"] = train["Sequence"].apply(lambda seq: seq[:max_seq_length])
val["Sequence"] = val["Sequence"].apply(lambda seq: seq[:max_seq_length])
test["Sequence"] = test["Sequence"].apply(lambda seq: seq[:max_seq_length])

In [None]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,ID,Sequence,target
189739,ID_train_189739,MEKEPEPERMVKVVRSHDDSIKLLSCRYRSSGYLTALMAALQDTND...,1
427390,ID_train_427390,MVSILAMVIIILPVIAVIIMNSFEKHMVRSIENELSAYSYSILAVA...,2
133827,ID_train_133827,MDFENGAHVRRAQARVGTVLSGVWRLDALVGLGGMAAVYAATHRSG...,1
21175,ID_train_21175,MTTQTSVIYVISDALGETAEFVSRAAAAQFIGVKTRIRRVPYVRDQ...,5
204839,ID_train_204839,MLWANRRRLRDYAKVGRQLLFQRIAIYSAAIFLAGAYYNWKIALIF...,2


In [None]:
# Write the Sequence column of each split to its own text file, one sequence
# per line.
write_to_txt("train.txt",train.Sequence)
write_to_txt("test.txt",test.Sequence)
write_to_txt("val.txt",val.Sequence)

In [None]:
# Keep only the target column of each split and save it as a CSV (without the
# index) next to the sequence text files.
train_label = train[["target"]].copy()
val_label = val[["target"]].copy()
train_label.to_csv("train_label.csv",index=False)
val_label.to_csv("val_label.csv",index=False)

### Data loaders 

In [None]:
# Read the label CSVs back from disk.
train_label = pd.read_csv("train_label.csv")
val_label = pd.read_csv("val_label.csv")

In [None]:
# Batch sizes for the training and validation pipelines.
train_batch_size = 1024
val_batch_size = 1024

# Number of distinct values in the training target column.
number_of_class = train_label["target"].nunique()

# Ceiling division: a trailing partial batch still counts as one step.
train_steps = (len(train_label) + train_batch_size - 1) // train_batch_size
val_steps = (len(val_label) + val_batch_size - 1) // val_batch_size

In [None]:
# Alphabet of characters that may appear in a sequence.
voc_set = set(['P', 'V', 'I', 'K', 'N', 'B', 'F', 'Y', 'E', 'W', 'R', 'D', 'X', 'S', 'C', 'U', 'Q', 'A', 'M', 'H', 'L', 'G', 'T'])
# Character -> integer id, starting at 1 (no character is ever assigned id 0).
# The ids are assigned in sorted order: iterating the set directly gives an
# order that can change between interpreter runs (string hash randomisation),
# which would make the encoding irreproducible across kernel restarts.
voc_set_map = {char: idx for idx, char in enumerate(sorted(voc_set), start=1)}

In [None]:
def encode(text_tensor, label):
    """Convert one raw sequence tensor into a list of vocabulary ids.

    Args:
        text_tensor: Eager tensor whose ``.numpy()`` value is the byte string
            of one sequence.
        label: Passed through unchanged.

    Returns:
        ``(ids, label)`` where ``ids`` holds ``voc_set_map[ch]`` for every
        character of the decoded sequence.

    Raises:
        KeyError: If the sequence contains a character missing from ``voc_set_map``.
    """
    sequence = text_tensor.numpy().decode()
    token_ids = [voc_set_map[char] for char in sequence]
    return token_ids, label

def encode_map_fn(text, label):
    """``Dataset.map`` wrapper that runs ``encode`` and one-hot encodes the label.

    Args:
        text: Scalar string tensor holding one sequence.
        label: Integer class id tensor.

    Returns:
        ``(token_ids, one_hot_label)``: an int64 vector of unknown length and a
        vector of length ``number_of_class``.
    """
    token_ids, class_id = tf.py_function(func=encode,
                                         inp=[text, label],
                                         Tout=(tf.int64, tf.int64))
    # tf.py_function does not set the shape of the tensors it returns, so the
    # static shapes are restored by hand.
    token_ids.set_shape([None])

    one_hot_label = tf.one_hot(class_id, number_of_class)
    one_hot_label.set_shape([number_of_class])

    return token_ids, one_hot_label

def get_data_loader(file,batch_size,labels):
    """Build an endlessly repeating, shuffled, padded-batch dataset.

    Args:
        file: Path of a text file with one sequence per line; lines are paired
            positionally with ``labels.target``.
        batch_size: Number of examples per batch.
        labels: Object exposing a ``target`` attribute and supporting ``len()``
            (the notebook passes a DataFrame).

    Returns:
        A ``tf.data.Dataset`` yielding ``(encoded_text, one_hot_label)``
        batches forever; callers must bound it with ``steps_per_epoch`` /
        ``validation_steps``.
    """
    label_data = tf.data.Dataset.from_tensor_slices(labels.target)
    text_data = tf.data.TextLineDataset(file)
    data_set = tf.data.Dataset.zip((text_data, label_data))

    # Shuffle BEFORE repeat: repeating first lets the shuffle buffer mix
    # elements of consecutive epochs, so within one "epoch" some examples can
    # be seen several times and others not at all.
    data_set = data_set.shuffle(len(labels))
    data_set = data_set.repeat()
    data_set = data_set.map(encode_map_fn,tf.data.experimental.AUTOTUNE)
    # Sequences have different lengths; padded_batch pads each batch to its
    # longest element.
    data_set = data_set.padded_batch(batch_size)
    data_set = data_set.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return data_set

def get_test_data_loader(file,batch_size):
    """Build a padded-batch dataset of encoded sequences for inference.

    Unlike ``get_data_loader`` there are no labels, no shuffling and no
    repetition, so predictions come back in file order.

    Args:
        file: Path of a text file with one sequence per line.
        batch_size: Number of sequences per batch.

    Returns:
        A ``tf.data.Dataset`` yielding batches of padded int64 id tensors.
    """
    def _encode_text(text_tensor):
        # Same character -> id mapping as `encode`, without a label.
        return [voc_set_map[char] for char in text_tensor.numpy().decode()]

    def _encode_text_map_fn(text):
        encoded_text = tf.py_function(_encode_text, inp=[text], Tout=tf.int64)
        # tf.py_function does not set the shape of the returned tensor.
        encoded_text.set_shape([None])
        return encoded_text

    data_set = tf.data.TextLineDataset(file)
    # The original mapped over an undefined `train_data_loader` (NameError) and
    # used the two-argument (text, label) encoder on a text-only dataset.
    data_set = data_set.map(_encode_text_map_fn,tf.data.experimental.AUTOTUNE)
    data_set = data_set.padded_batch(batch_size)
    data_set = data_set.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return data_set

In [None]:
# Build the training and validation pipelines.
train_dl = get_data_loader("train.txt",train_batch_size,train_label)
# Use val_batch_size here: val_steps is computed from val_batch_size, so
# batching with train_batch_size would break if the two values ever differ.
val_dl = get_data_loader("val.txt",val_batch_size,val_label)

### Model 

In [None]:
from tensorflow.keras.layers import Input,Dense,Dropout,Embedding,Concatenate,Flatten,LSTM ,Bidirectional
from tensorflow.keras.activations import relu ,sigmoid,softmax
from tensorflow.keras.losses import CategoricalCrossentropy
def model():
    """Build, compile and summarise the bidirectional-LSTM sequence classifier.

    Architecture: Embedding(64) -> Bidirectional LSTM(256) -> Dense(512, relu)
    -> Dropout(0.1) -> Dense(256, relu) -> Dense(number_of_class, softmax).
    Reads the notebook globals ``voc_set`` and ``number_of_class``.

    Returns:
        A compiled ``tf.keras.Model`` (Adam with learning rate 0.001,
        categorical cross-entropy loss, categorical accuracy metric named
        "Acc"). The model summary is printed as a side effect.
    """
    name = "seq"
    dropout_rate = 0.1
    learning_rate = 0.001

    sequnce = Input([None],name="sequnce")

    # Vocabulary ids start at 1 and padded_batch pads with 0, so id 0 only ever
    # means "padding". mask_zero=True makes the LSTM skip those positions
    # instead of consuming them as real tokens; input_dim already reserves the
    # extra slot for id 0.
    EMB_layer = Embedding(input_dim = len(voc_set)+1, output_dim = 64, mask_zero = True, name = "emb_layer")

    # return_sequences=False: only the final state of each direction is kept.
    LSTM_layer_2 = LSTM(units=256, name = "lstm_2", return_sequences = False)
    BIDIR_layer_2 = Bidirectional(LSTM_layer_2, name="bidirectional_2")

    Dens_layer_1 = Dense(units=512, activation=relu, kernel_regularizer=None, bias_regularizer=None, name=name+"_dense_layer_1")
    dropout_1 = Dropout(dropout_rate)
    Dens_layer_2 = Dense(units=256, activation=relu, kernel_regularizer=None, bias_regularizer=None, name=name+"_dense_layer_2")

    output = Dense(units=number_of_class, activation=softmax, kernel_regularizer=None, bias_regularizer=None, name=name+"_dense_layer_output")

    # Wire the layers together in forward order.
    hidden = EMB_layer(sequnce)
    hidden = BIDIR_layer_2(hidden)
    hidden = Dens_layer_1(hidden)
    hidden = dropout_1(hidden)
    hidden = Dens_layer_2(hidden)
    # The output layer applies softmax, so these are class probabilities,
    # matching CategoricalCrossentropy's default from_logits=False.
    probabilities = output(hidden)

    classifier = tf.keras.Model(inputs={"sequnce":sequnce, },outputs=probabilities)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    classifier.compile(optimizer=optimizer, loss=CategoricalCrossentropy(), metrics=[tf.keras.metrics.CategoricalAccuracy(name="Acc")])
    classifier.summary()
    return classifier
    

In [None]:
# Build the network.
# NOTE(review): this rebinds the name `model` from the builder function to the
# built Keras model, so re-running this cell without first re-running the
# definition cell no longer calls the builder.
model=model()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequnce (InputLayer)         [(None, None)]            0         
_________________________________________________________________
emb_layer (Embedding)        (None, None, 64)          1536      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               657408    
_________________________________________________________________
seq_dense_layer_1 (Dense)    (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
seq_dense_layer_2 (Dense)    (None, 256)               131328    
_________________________________________________________________
seq_dense_layer_output (Dens (None, 8)                

In [None]:
# You can add an early-stopping callback and save the best model to improve your score.

In [None]:
# Train for 4 epochs. Both datasets repeat forever (get_data_loader calls
# .repeat()), so the number of batches per epoch is passed explicitly for
# training and validation.
history = model.fit(train_dl,
                    validation_data=val_dl,
                    epochs=4,
                    verbose=1,
                    validation_steps=val_steps,
                    steps_per_epoch=train_steps
                   )

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
def encode(text_tensor, label):
    """Convert one raw sequence tensor into a list of vocabulary ids.

    NOTE(review): this repeats the ``encode`` defined earlier in the notebook.

    Args:
        text_tensor: Eager tensor whose ``.numpy()`` value is the byte string
            of one sequence.
        label: Passed through unchanged.

    Returns:
        ``(ids, label)`` where ``ids`` holds ``voc_set_map[ch]`` for every
        character of the decoded sequence.

    Raises:
        KeyError: If the sequence contains a character missing from ``voc_set_map``.
    """
    sequence = text_tensor.numpy().decode()
    token_ids = [voc_set_map[char] for char in sequence]
    return token_ids, label

def encode_map_fn(text, label):
    """``Dataset.map`` wrapper that runs ``encode`` and one-hot encodes the label.

    NOTE(review): this repeats the ``encode_map_fn`` defined earlier in the
    notebook.

    Args:
        text: Scalar string tensor holding one sequence.
        label: Integer class id tensor.

    Returns:
        ``(token_ids, one_hot_label)``: an int64 vector of unknown length and a
        vector of length ``number_of_class``.
    """
    token_ids, class_id = tf.py_function(func=encode,
                                         inp=[text, label],
                                         Tout=(tf.int64, tf.int64))
    # tf.py_function does not set the shape of the tensors it returns, so the
    # static shapes are restored by hand.
    token_ids.set_shape([None])

    one_hot_label = tf.one_hot(class_id, number_of_class)
    one_hot_label.set_shape([number_of_class])

    return token_ids, one_hot_label

def get_data_loader(file,batch_size,labels):
    """Build an endlessly repeating, shuffled, padded-batch dataset.

    NOTE(review): this repeats the ``get_data_loader`` defined earlier in the
    notebook.

    Args:
        file: Path of a text file with one sequence per line; lines are paired
            positionally with ``labels.target``.
        batch_size: Number of examples per batch.
        labels: Object exposing a ``target`` attribute and supporting ``len()``
            (the notebook passes a DataFrame).

    Returns:
        A ``tf.data.Dataset`` yielding ``(encoded_text, one_hot_label)``
        batches forever; callers must bound it with ``steps_per_epoch`` /
        ``validation_steps``.
    """
    label_data = tf.data.Dataset.from_tensor_slices(labels.target)
    text_data = tf.data.TextLineDataset(file)
    data_set = tf.data.Dataset.zip((text_data, label_data))

    # Shuffle BEFORE repeat: repeating first lets the shuffle buffer mix
    # elements of consecutive epochs, so within one "epoch" some examples can
    # be seen several times and others not at all.
    data_set = data_set.shuffle(len(labels))
    data_set = data_set.repeat()
    data_set = data_set.map(encode_map_fn,tf.data.experimental.AUTOTUNE)
    # Sequences have different lengths; padded_batch pads each batch to its
    # longest element.
    data_set = data_set.padded_batch(batch_size)
    data_set = data_set.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return data_set

def get_test_data_loader(file,batch_size):
    """Build a padded-batch dataset of encoded sequences for inference.

    Unlike ``get_data_loader`` there are no labels, no shuffling and no
    repetition, so predictions come back in file order.

    Args:
        file: Path of a text file with one sequence per line.
        batch_size: Number of sequences per batch.

    Returns:
        A ``tf.data.Dataset`` yielding batches of padded int64 id tensors.
    """
    def _encode_text(text_tensor):
        # Same character -> id mapping as `encode`, without a label.
        return [voc_set_map[char] for char in text_tensor.numpy().decode()]

    def _encode_text_map_fn(text):
        encoded_text = tf.py_function(_encode_text, inp=[text], Tout=tf.int64)
        # tf.py_function does not set the shape of the returned tensor.
        encoded_text.set_shape([None])
        return encoded_text

    data_set = tf.data.TextLineDataset(file)
    # The original mapped over an undefined `train_data_loader` (NameError) and
    # used the two-argument (text, label) encoder on a text-only dataset.
    data_set = data_set.map(_encode_text_map_fn,tf.data.experimental.AUTOTUNE)
    data_set = data_set.padded_batch(batch_size)
    data_set = data_set.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return data_set

In [None]:
# Build the test pipeline (batches of 512 sequences) and run the trained model
# on it.
test_dl = get_test_data_loader("test.txt",512)
test_pred = model.predict(test_dl,verbose=True)

NameError: ignored

In [None]:
# Submission frame: the test IDs plus one "target_<k>" column per class, each
# filled from the matching column of test_pred.
sub = test.loc[:, ["ID"]].copy()
for i in range(number_of_class):
    sub[f"target_{i}"] = test_pred[:, i]

NameError: ignored

In [None]:
# Save the submission frame as a CSV without the index column.
sub.to_csv("StarterNotebookDL_sub.csv",index=False)