In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd 
import numpy as np
from statistics import mean
import re

In [4]:
proteins_with_functions = pd.read_csv("./Data/joined_tables.csv")

In [5]:
proteins_cleaned = proteins_with_functions.drop_duplicates(subset="structureId",keep="first")

## Sequence motif identification

I assumed that a good way to extract extra information from the data is to identify motifs that may be present in the sequences.

First, the first few amino acids at the beginning and end of each sequence are analysed. It is a simplified version of motif discovery which could have been executed on the whole sequence. This simple approach is reasonable, since in the polypeptide chains some signal peptides are often present at the N or C end of a protein. It made me think that to some extent it could be possible to distinguish protein types based on the first few amino acids in the sequence, as well as some short motifs at the end. 

The simplest way to check that was to slice first the first few amino acids, deduplicating sub-sequences, encoding them, and using them as a multi-class variable. To avoid increasing dimensionality, frequency encoding was initially attempted, however, this method produced many classes with a count = 1 and as a result it was not suitable for such case, so numerical encoding was used instead, which was eventually transformed into a one-hot encoding.

In [6]:
proteins_cleaned

Unnamed: 0,structureId,chainId,sequence,residueCount,len,A,C,D,E,F,...,T,U,V,W,Y,X,B,Z,classification,chainCount
0,4LZV,A,LIVTQTMKGLDIQKVAGTWYSLAMAASDISLLDAQSAPLRVYVEEL...,162,162,0.092593,0.030864,0.061728,0.098765,0.024691,...,0.049383,0.0,0.055556,0.012346,0.024691,0.0,0.0,0.0,TRANSPORT PROTEIN,1
1,4FK5,A,GAAAAMSICPHIQQVFQNEKSKDGVLKTCNAARYILNHSVPKEKFL...,767,476,0.042017,0.054622,0.048319,0.048319,0.052521,...,0.042017,0.0,0.042017,0.010504,0.033613,0.0,0.0,0.0,HYDROLASE,4
2,4DOY,A,MGSSHHHHHHSSGLVPRGSHMTLSPEKQHVRPRDAADNDPVAVARG...,3496,437,0.125858,0.002288,0.059497,0.054920,0.032037,...,0.061785,0.0,0.066362,0.022883,0.025172,0.0,0.0,0.0,OXIDOREDUCTASE,8
3,2XZK,A,INDPAKSAAPYHDEFPLFRSANMASPDKLSTGIGFHSFRIPAVVRT...,772,386,0.085492,0.002591,0.072539,0.033679,0.036269,...,0.077720,0.0,0.049223,0.025907,0.033679,0.0,0.0,0.0,HYDROLASE,2
4,4WR9,A,AYLDEELQTELYEIKHQILQTMGVLSLQGSMLSVGDKVFSTNGQSV...,148,148,0.054054,0.027027,0.067568,0.087838,0.027027,...,0.067568,0.0,0.060811,0.013514,0.067568,0.0,0.0,0.0,BINDING PROTEIN,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117508,3QAH,A,MGSHHHHHHHHGSDYDIPTTENLYFQGSTKVKYVDKIHIGNYEIDA...,304,304,0.023026,0.026316,0.052632,0.055921,0.039474,...,0.042763,0.0,0.055921,0.023026,0.075658,0.0,0.0,0.0,TRANSFERASE,1
117509,4M0P,A,MPNIKIFSGSSHQDLSQKIADRLGLELGKVVTKKFSNQETCVEIGE...,652,326,0.095092,0.027607,0.067485,0.052147,0.027607,...,0.042945,0.0,0.085890,0.006135,0.015337,0.0,0.0,0.0,TRANSFERASE,2
117510,4NPM,A,DESEYEERRDAEARRVKSGIKQASIFTLEECARIEAKIDEVVAKAD...,500,250,0.072000,0.020000,0.068000,0.076000,0.040000,...,0.028000,0.0,0.088000,0.004000,0.032000,0.0,0.0,0.0,OXIDOREDUCTASE,2
117511,2AF0,A,ADLGTENLYFQSMKPSPEEAQLWSEAFDELLASKYGLAAFRAFLKS...,146,146,0.089041,0.027397,0.041096,0.116438,0.082192,...,0.054795,0.0,0.006849,0.013699,0.041096,0.0,0.0,0.0,SIGNALING PROTEIN,1


In [7]:
proteins_dict = dict(zip(list(proteins_cleaned["classification"].value_counts().index), range(len(proteins_cleaned["classification"].value_counts().index.tolist()))))
proteins_cleaned["proteins_encoded"] = proteins_cleaned["classification"].map(proteins_dict)

In [8]:
proteins_cleaned["beg_seq"] = proteins_cleaned["sequence"].apply(lambda x: x[:10])
proteins_cleaned["end_seq"] = proteins_cleaned["sequence"].apply(lambda x: x[-10:])

In [9]:
value_dict = dict(zip(proteins_cleaned["beg_seq"].value_counts().keys().tolist(),
                      range(len(proteins_cleaned["beg_seq"].value_counts().index.tolist()))
                      ))


proteins_cleaned["start_seq_enc"] = proteins_cleaned["beg_seq"].map(value_dict)

In [10]:
value_dict = dict(zip(proteins_cleaned["end_seq"].value_counts().keys().tolist(),
                      range(len(proteins_cleaned["end_seq"].value_counts().index.tolist()))
                      ))


proteins_cleaned["end_seq_enc"] = proteins_cleaned["end_seq"].map(value_dict)

The RF was first used to make predictions only using the motifs as X variable and later with the entire combined dataset.

In [12]:
from sklearn.ensemble import RandomForestClassifier

df_size = len(proteins_cleaned)
train_size = int(len(proteins_cleaned)*0.7)

df = proteins_cleaned.sample(frac=1)

train = df[:train_size]
test = df[train_size:]

X_train = train[["start_seq_enc","end_seq_enc"]]
y_train = train["proteins_encoded"]

X_test = test[["start_seq_enc","end_seq_enc"]]
y_test = test["proteins_encoded"]

In [None]:
model = RandomForestClassifier()
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_test)

The results are quite promising, they indicate that there are some patterns in the data that enable to predict protein function based on the beginning and end motif.

In [45]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
print(f"accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred, average='macro')}")
print(f"precision: {precision_score(y_test, y_pred, average='macro')}")
print(f"recall: {recall_score(y_test, y_pred, average='macro')}")

accuracy: 0.6643785102399727
F1: 0.5540940549530348
precision: 0.5744424433817411
recall: 0.5373828015869262


In [None]:
X_train = train.drop(columns=['structureId','chainId','sequence','classification','proteins_encoded', 'beg_seq', 'end_seq'])
y_train = train["proteins_encoded"]


X_test = test.drop(columns=['structureId','chainId','sequence','classification','proteins_encoded', 'beg_seq', 'end_seq'])
y_test = test["proteins_encoded"]

In [None]:
model = RandomForestClassifier()
model.fit(X_train,y_train)

The result haven't improved comparing to base model

In [None]:
y_pred = model.predict(X_test)

print(f"accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred, average='macro')}")
print(f"precision: {precision_score(y_test, y_pred, average='macro')}")
print(f"recall: {recall_score(y_test, y_pred, average='macro')}")

accuracy: 0.7648210132183582
F1: 0.6849236923109903
precision: 0.7943563095250471
recall: 0.6182355059336834


In [13]:
motifs = df[["beg_seq","classification"]].groupby("classification").agg(list).reset_index()

In [14]:
motifs["len"]=motifs["beg_seq"].apply(len)

The sequences do not seem to be characteristic to the group, based on quick visual comparison. There are no typical patterns that characterises particular groups.

In [15]:
motifs

Unnamed: 0,classification,beg_seq,len
0,APOPTOSIS,"[MDEDVLPGEV, AHMAHAGRSG, MGNAQERPSE, MSQSNRELV...",554
1,BINDING PROTEIN,"[HHHHHHDMLN, GPLGSNNVND, MHHHHHHSSG, MHHHHHHSS...",9744
2,BIOSYNTHETIC PROTEIN,"[MEGTGVVAVY, MGSDKIHHHH, PPGPPGPPGP, MSNSFCVVY...",438
3,CELL ADHESION,"[KLGFFKRQYK, GSHMIWEQHT, AKPCTVSTTN, MPKPINVRV...",1386
4,CELL CYCLE,"[MGEGFNWEIE, MASQPNSSAK, MDKPKRKEAV, ASQPNSSAK...",811
5,CHAPERONE,"[LLKANKDLIS, MDQPMEEEEV, PMISEEREPL, VETFAFQAE...",1308
6,CONTRACTILE PROTEIN,"[MCDEDETTAL, MDDIYKAAVE, MNPIHDRTSD, GSHMPLLSI...",292
7,CYTOKINE,"[GAMDTHRLTR, MRKNRPAGAI, MRGSHHHHHH, SAKELRCQC...",386
8,FLUORESCENT PROTEIN,"[GVIKPDMKIK, LYFQGMVSKG, SKGEELFTGV, MGSSLYNSH...",369
9,HORMONE,"[GIVEQCCTSI, GIVEQCCTSI, GIVEQCCASV, GSPGISGGG...",809


Another approach was to cluster similar sequences together. To simplify the process and make it more computationally efficient only 5 amino acids were sliced both from the beginning and the end of the sequence. 

In [16]:
proteins_cleaned["beg_seq"] = proteins_cleaned["beg_seq"].apply(lambda x: x[:5])
proteins_cleaned["end_seq"] = proteins_cleaned["end_seq"].apply(lambda x: x[:5])

In [17]:
proteins_cleaned

Unnamed: 0,structureId,chainId,sequence,residueCount,len,A,C,D,E,F,...,X,B,Z,classification,chainCount,proteins_encoded,beg_seq,end_seq,start_seq_enc,end_seq_enc
0,4LZV,A,LIVTQTMKGLDIQKVAGTWYSLAMAASDISLLDAQSAPLRVYVEEL...,162,162,0.092593,0.030864,0.061728,0.098765,0.024691,...,0.0,0.0,0.0,TRANSPORT PROTEIN,1,4,LIVTQ,PTQLE,177,165
1,4FK5,A,GAAAAMSICPHIQQVFQNEKSKDGVLKTCNAARYILNHSVPKEKFL...,767,476,0.042017,0.054622,0.048319,0.048319,0.052521,...,0.0,0.0,0.0,HYDROLASE,4,0,GAAAA,LLFYT,2113,2045
2,4DOY,A,MGSSHHHHHHSSGLVPRGSHMTLSPEKQHVRPRDAADNDPVAVARG...,3496,437,0.125858,0.002288,0.059497,0.054920,0.032037,...,0.0,0.0,0.0,OXIDOREDUCTASE,8,2,MGSSH,GQYPI,0,4462
3,2XZK,A,INDPAKSAAPYHDEFPLFRSANMASPDKLSTGIGFHSFRIPAVVRT...,772,386,0.085492,0.002591,0.072539,0.033679,0.036269,...,0.0,0.0,0.0,HYDROLASE,2,0,INDPA,LSWIL,4064,2825
4,4WR9,A,AYLDEELQTELYEIKHQILQTMGVLSLQGSMLSVGDKVFSTNGQSV...,148,148,0.054054,0.027027,0.067568,0.087838,0.027027,...,0.0,0.0,0.0,BINDING PROTEIN,1,3,AYLDE,LQYRL,736,986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117508,3QAH,A,MGSHHHHHHHHGSDYDIPTTENLYFQGSTKVKYVDKIHIGNYEIDA...,304,304,0.023026,0.026316,0.052632,0.055921,0.039474,...,0.0,0.0,0.0,TRANSFERASE,1,1,MGSHH,SVCLK,753,4586
117509,4M0P,A,MPNIKIFSGSSHQDLSQKIADRLGLELGKVVTKKFSNQETCVEIGE...,652,326,0.095092,0.027607,0.067485,0.052147,0.027607,...,0.0,0.0,0.0,TRANSFERASE,2,1,MPNIK,PLLEH,1026,387
117510,4NPM,A,DESEYEERRDAEARRVKSGIKQASIFTLEECARIEAKIDEVVAKAD...,500,250,0.072000,0.020000,0.068000,0.076000,0.040000,...,0.0,0.0,0.0,OXIDOREDUCTASE,2,2,DESEY,LKAKR,14903,13361
117511,2AF0,A,ADLGTENLYFQSMKPSPEEAQLWSEAFDELLASKYGLAAFRAFLKS...,146,146,0.089041,0.027397,0.041096,0.116438,0.082192,...,0.0,0.0,0.0,SIGNALING PROTEIN,1,8,ADLGT,FYQDL,15693,23262


In [18]:
sequences_beg = pd.DataFrame({"beg_seq": proteins_cleaned['beg_seq'].value_counts().index, "count":proteins_cleaned["beg_seq"].value_counts().values})
sequences_end = pd.DataFrame({"end_seq": proteins_cleaned['end_seq'].value_counts().index, "count":proteins_cleaned["end_seq"].value_counts().values})

In [19]:
sequences_filtered_beg = sequences_beg[sequences_beg["count"]>=100]
sequences_filtered_end = sequences_end[sequences_end["count"]>=100]

Next step was to identify repetitive sequences within the dataset. From the sliced parts the occurrences of identical motifs were counted. It unfortunately resulted in relatively low values, with many unique sequences and numerous instances repeating only a few hundred times, which is not significant in a dataset of 100 000 instances. Despite this, the approach was tested to determine whether sequences could be grouped into similar motifs that might still be characteristic of specific proteins.  

Repetitive motifs were found in only around 1/3 of the instances in the dataset.

In [None]:
sequences_filtered_beg["count"].max()

np.int64(4703)

In [20]:
sequences_filtered_beg["count"].sum()

np.int64(27658)

In [21]:
sequences_filtered_end["count"].sum()

np.int64(12541)

In [22]:
beg_motifs_df = pd.DataFrame(columns=np.array(sequences_filtered_beg["beg_seq"]), index=range(len(sequences_filtered_beg)))

Similarity between sequences was calculated using local sequence alignment from Biopython package. Then similarity matrix was constructed and calculated pairwise similarity. Finally a threshold (0,6) was established to define if sequences are similar to each other. 

On the resulting matrix connected component algorithm was executed to identify groups of similar motifs. They were labeled as one group. Motifs at the beginning and end of the sequences were identified and assigned corresponding labels. While this approach may not be ideal, as it could imply a numerical order, it was the simplest solution at this stage. Therefore, this encoding was used for the initial test.

In [None]:
# def jaccard_similarity(target,query):
#     set_a = set(target)
#     set_b = set(query)
#     score = len(set_a & set_b) / len(set_a| set_b)
#     return score

In [None]:
from Bio import Align
aligner = Align.PairwiseAligner()

In [29]:
for idx_col, column in enumerate(beg_motifs_df.columns):

    for idx_row, row in beg_motifs_df.iterrows():
        target = beg_motifs_df.columns[idx_col]
        query = beg_motifs_df.columns[idx_row]
        score = aligner.score(target, query) / (max(len(target),len(query)))

        beg_motifs_df.iloc[idx_row,idx_col] = score

In [30]:
beg_motifs_df

Unnamed: 0,MGSSH,MHHHH,GSSGS,MAHHH,GPLGS,HHHHH,MRGSH,MGHHH,KVFGR,PQITL,...,GSHMK,GAMDP,ENSNI,VLSEG,GANKT,DYKDD,SNAMN,MGSHH,IVGGQ,GLSDG
0,1.0,0.4,0.6,0.4,0.4,0.2,0.8,0.6,0.2,0.0,...,0.6,0.2,0.2,0.2,0.2,0.0,0.2,0.8,0.2,0.4
1,0.4,1.0,0.0,0.8,0.0,0.8,0.4,0.8,0.0,0.0,...,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.6,0.0,0.0
2,0.6,0.0,1.0,0.0,0.6,0.0,0.4,0.2,0.2,0.0,...,0.4,0.2,0.2,0.4,0.2,0.0,0.2,0.4,0.4,0.6
3,0.4,0.8,0.0,1.0,0.0,0.6,0.4,0.8,0.0,0.0,...,0.2,0.2,0.0,0.0,0.2,0.0,0.2,0.6,0.0,0.0
4,0.4,0.0,0.6,0.0,1.0,0.0,0.4,0.2,0.2,0.4,...,0.4,0.4,0.2,0.4,0.2,0.0,0.2,0.4,0.4,0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,...,0.2,0.2,0.0,0.0,0.2,1.0,0.0,0.0,0.0,0.2
64,0.2,0.2,0.2,0.2,0.2,0.0,0.2,0.2,0.0,0.0,...,0.4,0.4,0.4,0.2,0.4,0.0,1.0,0.2,0.0,0.2
65,0.8,0.6,0.4,0.6,0.4,0.4,0.8,0.8,0.2,0.0,...,0.6,0.2,0.2,0.2,0.2,0.0,0.2,1.0,0.2,0.4
66,0.2,0.0,0.4,0.0,0.4,0.0,0.2,0.2,0.4,0.2,...,0.2,0.2,0.2,0.4,0.2,0.0,0.0,0.2,1.0,0.4


In [31]:
beg_motifs_df_binary = beg_motifs_df.copy()

In [32]:
for column in beg_motifs_df_binary.columns:
    beg_motifs_df_binary[column] = beg_motifs_df_binary[column].apply(lambda x: 1 if x>=0.6 else 0)

In [33]:
from scipy.sparse.csgraph import connected_components

n_components, labels = connected_components(csgraph=np.array(beg_motifs_df_binary), directed=False, return_labels=True)
n_components

12

In [34]:
beg_motifs_df_binary.index = beg_motifs_df_binary.columns
beg_motifs_df_binary["labels"] = labels
beg_motifs_df_binary["seq"] = beg_motifs_df_binary.index

In [35]:
beg_motifs_df_binary

Unnamed: 0,MGSSH,MHHHH,GSSGS,MAHHH,GPLGS,HHHHH,MRGSH,MGHHH,KVFGR,PQITL,...,ENSNI,VLSEG,GANKT,DYKDD,SNAMN,MGSHH,IVGGQ,GLSDG,labels,seq
MGSSH,1,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,MGSSH
MHHHH,0,1,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,MHHHH
GSSGS,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,GSSGS
MAHHH,0,1,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,MAHHH
GPLGS,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,GPLGS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DYKDD,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,11,DYKDD
SNAMN,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,SNAMN
MGSHH,1,1,0,1,0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,MGSHH
IVGGQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,IVGGQ


In [36]:
beg_seq_dict = dict(zip(beg_motifs_df_binary["seq"], beg_motifs_df_binary["labels"]))

In [37]:
end_motifs_df = pd.DataFrame(columns=np.array(sequences_filtered_end["end_seq"]), index=range(len(sequences_filtered_end)))

for idx_col, column in enumerate(end_motifs_df.columns):

    for idx_row, row in end_motifs_df.iterrows():
        target = end_motifs_df.columns[idx_col]
        query = end_motifs_df.columns[idx_row]
        score = aligner.score(target, query) / (max(len(target),len(query)))

        end_motifs_df.iloc[idx_row,idx_col] = score

end_motifs_df_binary = end_motifs_df.copy()

for column in end_motifs_df_binary.columns:
    end_motifs_df_binary[column] = end_motifs_df_binary[column].apply(lambda x: 1 if x>=0.6 else 0)


n_components, labels = connected_components(csgraph=np.array(end_motifs_df_binary), directed=False, return_labels=True)


end_motifs_df_binary.index = end_motifs_df_binary.columns
end_motifs_df_binary["labels"] = labels
end_motifs_df_binary["seq"] = end_motifs_df_binary.index

end_seq_dict = dict(zip(end_motifs_df_binary["seq"], end_motifs_df_binary["labels"]))

In [38]:
n_components

36

In [39]:
proteins_cleaned["beg_motif"] = proteins_cleaned["beg_seq"].map(beg_seq_dict)
proteins_cleaned["end_motif"] = proteins_cleaned["end_seq"].map(end_seq_dict)

In [40]:
proteins_cleaned["beg_motif"].fillna(-1, inplace=True)
proteins_cleaned["end_motif"].fillna(-1, inplace=True)

Final test using Random Forest were performed and achieved very similar results to the base model. It is possible that the few first nucleotides do not provide enough information, cases belonging to any identified group constitute only around 25% of the dataset and as a result the information is probably insufficient. 

In [41]:
df = proteins_cleaned.sample(frac=1)

train = df[:train_size]
test = df[train_size:]

X_train = train.drop(columns=['structureId','chainId','sequence','classification','proteins_encoded', 'beg_seq', 'end_seq','start_seq_enc', 'end_seq_enc'])
y_train = train["proteins_encoded"]

X_test = test.drop(columns=['structureId','chainId','sequence','classification','proteins_encoded', 'beg_seq', 'end_seq','start_seq_enc', 'end_seq_enc'])
y_test = test["proteins_encoded"]

In [42]:
X_train.columns

Index(['residueCount', 'len', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
       'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'X', 'B',
       'Z', 'chainCount', 'beg_motif', 'end_motif'],
      dtype='object')

In [43]:
model = RandomForestClassifier()
model.fit(X_train,y_train)

In [46]:
y_pred = model.predict(X_test)

print(f"accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred, average='macro')}")
print(f"precision: {precision_score(y_test, y_pred, average='macro')}")
print(f"recall: {recall_score(y_test, y_pred, average='macro')}")

accuracy: 0.7689907528223747
F1: 0.6944700357592838
precision: 0.8010852674526864
recall: 0.6286207505610867


In [None]:
# import json

# with open('./Data/beg_motifs_dict.json', 'w') as fp:
#     json.dump(beg_seq_dict, fp)

# with open('./Data/end_motifs_dict.json', 'w') as fp:
#     json.dump(end_seq_dict, fp)