In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
# Ask TensorFlow to execute tf.function-wrapped code eagerly rather than as a traced graph.
# NOTE(review): this is typically a debugging aid and slows training; confirm it is still wanted.
tf.config.run_functions_eagerly(True)

In [None]:
# Colab-only: mount Google Drive at /content/drive, where the data files read below live.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the tab-separated sequence table (the columns 'Uniprot_ID' and 'Fasta' are used below).
sequence_table_path = '/content/drive/My Drive/Colab Notebooks/sequence_dict.txt'
seq = pd.read_csv(sequence_table_path, sep = '\t')
print(seq.shape[0])
seq.head()

In [None]:
# Lookup table: UniProt ID -> FASTA sequence (for duplicated IDs the last row wins).
seq_dict = dict(zip(seq['Uniprot_ID'], seq['Fasta']))

In [None]:
# Load the tab-separated protein-pair table (columns 'Uniprot_A', 'Uniprot_B', 'Interaction' are used below).
ppi_table_path = '/content/drive/My Drive/Colab Notebooks/ppi_human.txt'
ppi = pd.read_csv(ppi_table_path, sep = '\t')
print(ppi.shape[0])
ppi.head()

In [None]:
# Collapse the interaction score into a binary label: 1 when strictly positive, otherwise 0.
ppi['Interaction'] = ppi['Interaction'].gt(0).astype(int)
ppi.head()

In [None]:
# Attach the amino-acid sequence of each partner protein to every pair.
#
# Whole-column assignment via Series.map replaces the per-row chained assignment
# (`ppi[col][i] = ...`), which raises SettingWithCopyWarning and does not write
# back to the frame at all once pandas Copy-on-Write is enabled.
#
# A plain dict lookup raises KeyError for an unknown ID, whereas Series.map would
# silently produce NaN, so unknown IDs are checked for explicitly first.
unknown_ids = (set(ppi['Uniprot_A']) | set(ppi['Uniprot_B'])) - set(seq_dict)
if unknown_ids:
  raise KeyError(
      f'{len(unknown_ids)} UniProt ID(s) have no entry in seq_dict, e.g. {list(unknown_ids)[:5]}'
  )

ppi['Protein_A_sequence'] = ppi['Uniprot_A'].map(seq_dict)
ppi['Protein_B_sequence'] = ppi['Uniprot_B'].map(seq_dict)

In [None]:
# Preview the first rows to check the two sequence columns that were just filled in.
ppi.head()

In [None]:
# Build the character vocabulary: every distinct symbol that appears in any
# sequence of either partner column, in sorted order.
sequences_a = set(ppi["Protein_A_sequence"].values)
sequences_b = set(ppi["Protein_B_sequence"].values)
words = list(sequences_a | sequences_b)
chars = sorted({symbol for sequence in words for symbol in sequence})
print(chars)

In [None]:
# Map each vocabulary character to an integer code starting at 2, in the order of `chars`.
# Code 0 is what padding_char_indice pads with; code 1 is not assigned anywhere in the visible code.
char_to_index = dict(zip(chars, range(2, len(chars) + 2)))
# Reverse lookup: integer code -> character.
index_to_char = {index: char for char, index in char_to_index.items()}

In [None]:
# Target per-sequence length that integer_coding passes on to padding_char_indice.
max_len_char = 1000

def padding_char_indice(char_indice, max_len_char):
  """Pad or truncate `char_indice` to exactly `max_len_char` entries.

  Args:
    char_indice: list of integer character codes.
    max_len_char: required output length.

  Returns:
    A new list of length `max_len_char`: the input cut off after
    `max_len_char` entries, or the input followed by integer 0 padding.
  """
  if len(char_indice) >= max_len_char:
    # Slice to max_len_char, not max_len_char + 1, so that long inputs come out
    # with the same length as padded ones.
    return char_indice[:max_len_char]
  # Pad with the integer 0 (not the string '0') to keep the list homogeneous.
  return char_indice + [0] * (max_len_char - len(char_indice))

def integer_coding(sentences):
  """Translate an iterable of characters into padded integer codes.

  Each character is looked up in the module-level `char_to_index` (an unknown
  character raises KeyError), and the resulting list is handed to
  `padding_char_indice` together with the module-level `max_len_char`.
  """
  encoded = []
  for char in sentences:
    encoded.append(char_to_index[char])
  return padding_char_indice(encoded, max_len_char)

# Replace each raw sequence string with its padded list of integer codes.
#
# Whole-column assignment via Series.map replaces the per-row chained assignment
# (`ppi[col][i] = ...`), which raises SettingWithCopyWarning and does not write
# back to the frame once pandas Copy-on-Write is enabled.
#
# NOTE: this overwrites the string columns in place, so the cell is not re-runnable
# without first re-running the cell that fills them from seq_dict.
for column in ('Protein_A_sequence', 'Protein_B_sequence'):
  ppi[column] = ppi[column].map(lambda sequence: integer_coding(list(sequence)))

In [None]:
# Concatenate the encoded A and B sequences of every pair into one list per row.
#
# The column is built in a single assignment instead of row-by-row chained
# assignment (`ppi['Sequence'][i] = ...`), which raises SettingWithCopyWarning and
# does not write back to the frame once pandas Copy-on-Write is enabled.
ppi['Sequence'] = pd.Series(
    [
        codes_a + codes_b
        for codes_a, codes_b in zip(ppi['Protein_A_sequence'], ppi['Protein_B_sequence'])
    ],
    index = ppi.index,
    dtype = object,
)
ppi.head()

In [None]:
# Assemble the modelling table from at most the first 100000 pairs:
# 'x' holds the concatenated encoded sequence, 'y' the binary interaction label.
n_pairs = 100000
dataset = pd.DataFrame({
    'x': ppi['Sequence'][:n_pairs],
    'y': ppi['Interaction'][:n_pairs],
})
dataset.describe()

In [None]:
def vectorize(sequences, dimension = 23, length = 2000):
  """One-hot encode a sequence of integer codes into a float32 matrix.

  Args:
    sequences: sliceable sequence of codes; each element is converted with
      `int()`, so integer-like strings such as '0' are accepted.
    dimension: number of columns, i.e. one more than the largest code allowed.
    length: number of rows; only the first `length` codes are used, and rows
      beyond the end of a shorter input stay all-zero. Defaults to 2000, the
      value that was previously hard-coded.

  Returns:
    A `(length, dimension)` float32 array with a single 1 per encoded row.

  Elements for which the conversion or indexing raises TypeError are printed
  and their row is left all-zero (best-effort, as before); any other exception,
  e.g. an out-of-range code, propagates to the caller.
  """
  results = np.zeros((length, dimension), dtype = 'float32')
  for row, token in enumerate(sequences[:length]):
    try:
      results[row, int(token)] = 1
    except TypeError:
      # Keep going, but show the offending element so that it can be inspected.
      print(token)
  return results

# One-hot encode every concatenated sequence.
#
# Whole-column assignment via Series.map replaces the per-row chained assignment
# (`dataset['x'][i] = ...`), which raises SettingWithCopyWarning and does not
# write back to the frame once pandas Copy-on-Write is enabled.
dataset['x'] = dataset['x'].map(vectorize)

# Show only a few rows: every 'x' cell now holds a full 2-D array.
dataset.head()

In [None]:
# Turn the per-row arrays and labels into dense numpy arrays for Keras.
encoded_rows = dataset['x'].values
labels = dataset['y'].values
data_x = np.stack(encoded_rows)   # stacks the per-row arrays along a new leading axis
data_y = np.stack(labels).T
print(data_x.shape, data_y.shape)

In [None]:
# Split into 90% train / 5% validation / 5% test: first hold out 10%, then cut
# that hold-out in half.
# A fixed random_state makes the partition identical on every run, so results
# can be compared between runs; change SPLIT_SEED to draw a different partition.
SPLIT_SEED = 42
train_x, valid_x, train_y, valid_y = train_test_split(
    data_x, data_y, test_size = 0.1, random_state = SPLIT_SEED
)
valid_x, test_x, valid_y, test_y = train_test_split(
    valid_x, valid_y, test_size = 0.5, random_state = SPLIT_SEED
)
print(
    train_x.shape, train_y.shape,
    valid_x.shape, valid_y.shape,
    test_x.shape, test_y.shape
)

In [None]:
# Small fully connected binary classifier over the flattened one-hot matrix.
#
# The input shape is declared up front with keras.Input. It was previously given
# as input_shape=(46000,) on the first Dense layer, which sits *after* Flatten;
# Sequential does not use an input_shape passed to a non-first layer, so the
# model had no declared input when summary() was called.
# (2000, 23) is the per-sample shape produced by vectorize() with its defaults;
# Flatten turns it into the 46000 features the Dense layer receives.
model = keras.models.Sequential()
model.add(keras.Input(shape = (2000, 23)))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(16, activation = 'relu'))
model.add(keras.layers.Dense(16, activation = 'relu'))
# Single sigmoid unit paired with binary cross-entropy for the 0/1 label.
model.add(keras.layers.Dense(1, activation = 'sigmoid'))
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)
model.summary()

In [None]:
# Train for 20 epochs in shuffled mini-batches of 512, evaluating on the
# validation split after every epoch; `history` keeps the per-epoch metrics.
fit_options = dict(
    batch_size = 512,
    epochs = 20,
    verbose = 'auto',
    shuffle = True,
)
history = model.fit(
    train_x,
    train_y,
    validation_data = (valid_x, valid_y),
    **fit_options,
)

In [None]:
# Plot training and validation loss per epoch on a single set of axes.
fig, ax = plt.subplots()
for metric in ('loss', 'val_loss'):
  ax.plot(history.history[metric])
ax.set_title('model_loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc = 'upper left')
plt.show()