In [8]:
import numpy as np
from collections import Counter

import matplotlib.pyplot as plt

import pandas as pd
import os
import sys
sys.path.append('../')
from deepfold.data.aminoacids import MAXLEN
from deepfold.data.ontology import Ontology
from deepfold.data.utils.data_utils import is_exp_code, is_cafa_target

### Input file

In [9]:
data_path_dict = {}
base_path = u"/Users/robin/xbiome/datasets/protein"
go_file = os.path.join(base_path, u"go.obo")
swissprot_file = os.path.join(base_path, u"uniprot_sprot.dat.gz")
data_path_dict['go'] = go_file
data_path_dict['uniprot_sprot'] = swissprot_file

for k, v in data_path_dict.items():
    print("%s : %s [%s]"%(k, v, os.path.exists(v)))

go : /Users/robin/xbiome/datasets/protein/go.obo [True]
uniprot_sprot : /Users/robin/xbiome/datasets/protein/uniprot_sprot.dat.gz [True]


### Output file

In [10]:
save_path = u"/Users/robin/xbiome/datasets/protein/data/"
min_count = 5

if not os.path.exists(os.path.join(save_path, f"min_count_{min_count}")):
    os.mkdir(os.path.join(save_path, f"min_count_{min_count}"))

# Result file with a list of proteins, sequences and annotations from swissprot_file in pandas.DataFrame format
out_swissprot_pd_file = os.path.join(save_path, f"min_count_{min_count}/swissprot.pkl")

# Result file with a list of terms for evaluation task
evaluation_terms_file = os.path.join(save_path, f"min_count_{min_count}/exp_terms.pkl")

# Result file with a DataFrame for evaluation task
evaluation_data_file = os.path.join(save_path, f"min_count_{min_count}/test_data.pkl")

train_data_with_exp = os.path.join(save_path, f"min_count_{min_count}/train_data.pkl")

# Result file with a list of terms for training
train_terms_file = os.path.join(save_path, f"min_count_{min_count}/train_terms_swissprot.pkl")
test_terms_file = os.path.join(save_path, f"min_count_{min_count}/test_terms_swissprot.pkl")

# Result file with a DataFrame for training
train_data_swissprot_file = os.path.join(save_path, f"min_count_{min_count}/train_data_swissprot.pkl")

# Result file with a DataFrame for testing
test_data_swissprot_file = os.path.join(save_path, f"min_count_{min_count}/test_data_swissprot.pkl")

data_path_dict['swissprot'] = out_swissprot_pd_file

# Data for evaluation
data_path_dict['evaluation_data'] = evaluation_data_file
data_path_dict['evaluation_terms'] = evaluation_terms_file
data_path_dict['train_with_exp'] = train_data_with_exp

# Data for training
data_path_dict['train_data_swissprot'] = train_data_swissprot_file
data_path_dict['test_data_swissprot'] = test_data_swissprot_file
data_path_dict['train_terms_swissprot'] = train_terms_file
data_path_dict['test_terms_swissprot'] = test_terms_file

for k, v in data_path_dict.items():
    print("%s : %s [%s]"%(k, v, os.path.exists(v)))

go : /Users/robin/xbiome/datasets/protein/go.obo [True]
uniprot_sprot : /Users/robin/xbiome/datasets/protein/uniprot_sprot.dat.gz [True]
swissprot : /Users/robin/xbiome/datasets/protein/data/min_count_5/swissprot.pkl [False]
evaluation_data : /Users/robin/xbiome/datasets/protein/data/min_count_5/test_data.pkl [False]
evaluation_terms : /Users/robin/xbiome/datasets/protein/data/min_count_5/exp_terms.pkl [False]
train_with_exp : /Users/robin/xbiome/datasets/protein/data/min_count_5/train_data.pkl [False]
train_data_swissprot : /Users/robin/xbiome/datasets/protein/data/min_count_5/train_data_swissprot.pkl [False]
test_data_swissprot : /Users/robin/xbiome/datasets/protein/data/min_count_5/test_data_swissprot.pkl [False]
train_terms_swissprot : /Users/robin/xbiome/datasets/protein/data/min_count_5/train_terms_swissprot.pkl [False]
test_terms_swissprot : /Users/robin/xbiome/datasets/protein/data/min_count_5/test_terms_swissprot.pkl [False]


# Gene Ontology from .obo File

In [11]:
go = Ontology(data_path_dict['go'], with_rels=True)

In [12]:
print("Total number of terms in go: %d"%len(go.ontology))

Total number of terms in go: 47342


In [13]:
# Print all go terms
set(go.ontology.keys())

{'GO:0103050',
 'GO:1905617',
 'GO:0050677',
 'GO:0035780',
 'GO:0042404',
 'GO:0072205',
 'GO:0006901',
 'GO:0048804',
 'GO:1903568',
 'GO:1901382',
 'GO:0007018',
 'GO:0010994',
 'GO:0033948',
 'GO:0046934',
 'GO:0045823',
 'GO:2001233',
 'GO:0007145',
 'GO:1990024',
 'GO:0140635',
 'GO:0090415',
 'GO:0010608',
 'GO:0002306',
 'GO:0010341',
 'GO:0039511',
 'GO:0070163',
 'GO:0018815',
 'GO:0016917',
 'GO:0042979',
 'GO:0004382',
 'GO:0002025',
 'GO:0043185',
 'GO:0043270',
 'GO:1903430',
 'GO:0001891',
 'GO:0004569',
 'GO:0007277',
 'GO:0106257',
 'GO:0009446',
 'GO:1901142',
 'GO:0001050',
 'GO:0106050',
 'GO:0018679',
 'GO:0032118',
 'GO:0047379',
 'GO:0043210',
 'GO:0075222',
 'GO:0055019',
 'GO:0002193',
 'GO:0048424',
 'GO:0036380',
 'GO:0072133',
 'GO:0031214',
 'GO:0044697',
 'GO:0009114',
 'GO:0007558',
 'GO:0050497',
 'GO:0008283',
 'GO:0070226',
 'GO:0034408',
 'GO:2000292',
 'GO:0071570',
 'GO:0040007',
 'GO:0005044',
 'GO:0043445',
 'GO:0071065',
 'GO:0071140',
 'GO:19012

# UniProt/SWISS-PROT

In [14]:
df_swissprot = pd.read_pickle(data_path_dict['swissprot'])
df_swissprot.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/robin/xbiome/datasets/protein/data/min_count_5/swissprot.pkl'

In [None]:
print(f'Total number of swissprot: {len(df_swissprot)}')

Total number of swissprot: 565254


In [None]:
# get frequency of each term in 'annotations'
cnt = Counter()
for i, row in df_swissprot.iterrows():
    for annot in row['annotations']:
        go_id, code = annot.split('|')
        cnt[go_id] += 1

# get go terms with sequences no less than min_count
cnt_gt_min = [k for k, v in cnt.items() if v >= min_count]      

# print info
print("Number of sequences in UniProtKB/Swiss-Prot:", len(df_swissprot))
print("Number of annotations refered in UniProt/SWISS-PROT:", len(cnt))
print("Number of annotations with sequences no less than %d: %d"%(min_count, len(cnt_gt_min)))   

Number of sequences in UniProtKB/Swiss-Prot: 565254
Number of annotations refered in UniProt/SWISS-PROT: 28852
Number of annotations with sequences no less than 5: 19803


## Statistics on 3-character GO evidence code

In [None]:
# Experiment Code
EXP_CODES = set([
    'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'IC',
    'HTP', 'HDA', 'HMP', 'HGI', 'HEP'])

def is_exp_code(code):
    return code in EXP_CODES

# Similarity Evidence Code
SIM_CODES = set(['ISS', 'ISO', 'ISA', 'ISM', 'IGC', 'IBA', 'IBD', 'IKR', 'IRD'])

def is_sim_code(code):
    return code in SIM_CODES

def is_non_code(code):
    return code not in (EXP_CODES | SIM_CODES)

# split test set from swissprot

In [None]:
def split_test_set(data_frame, split_rate):
    index = []
    annotations = []
    for i, row in enumerate(data_frame.itertuples()):
        annots = []
        for annot in row.annotations:
            go_id, code = annot.split('|')
            if is_exp_code(code):
                annots.append(go_id)
        # Ignore proteins without experimental annotations
        if len(annots) == 0:
            continue
        index.append(i)
        annotations.append(annots)
    data_frame = data_frame.iloc[index]
    data_frame = data_frame.reset_index()
    data_frame['exp_annotations'] = annotations
    
    # propagate annotation
    prop_annotations = []
    for i, row in data_frame.iterrows():
        annot_set = set()
        annots = row['exp_annotations']
        for go_id in annots:
            annot_set |= go.get_anchestors(go_id)
        annots = list(annot_set)
        prop_annotations.append(annots)
    data_frame['prop_annotations'] = prop_annotations
    data_frame['contain_exp'] = np.ones(len(data_frame))     # Mask sequence with exp terms
    
    # split into train_df and test_df
    ids = np.arange(len(data_frame))
    split_point = int(split_rate * len(data_frame))
    
    np.random.seed(seed=0)
    np.random.shuffle(ids)
    
    train_df = data_frame.iloc[ids[:split_point]]
    test_df = data_frame.iloc[ids[split_point:]]
    
    return train_df, test_df

In [None]:
df_exp = df_swissprot.copy()
train_df, test_df = split_test_set(df_exp, 0.95)
print(f"Number of train_df: {len(train_df)}")
print(f"Number of test_df: {len(test_df)}")

Number of train_df: 73592
Number of test_df: 3874


In [None]:
test_df.head()

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,exp_annotations,prop_annotations,contain_exp
1253,6710,ACK1_YEAST,Q07622; D6VRF1;,MVNQGQPQPNLYDKHINMFPPARARESSHKLGNANSDRHGLPAQNI...,"[GO:0005739|HDA, GO:0008047|IBA, GO:0031505|IM...","[IPR006597, IPR011990]",559292,"[GO:0005739, GO:0031505, GO:0009967]","[GO:0071555, GO:0009987, GO:0007165, GO:004522...",1.0
33330,210625,KIFC1_CRIGR,Q60443;,MKEALEPAKKRTRGLGAVTKIDTSRSKGPLLSSLSQPQGPTAAQKG...,"[GO:0005769|IEA, GO:0005874|IEA, GO:0005815|IE...","[IPR019821, IPR001752, IPR036961, IPR027417]",10029,[GO:0030496],"[GO:0110165, GO:0005575, GO:0030496]",1.0
18394,109472,DSC3_HUMAN,Q14574; A6NN35; Q14200; Q9HAZ9;,MAAAGPRRSVRGAVCLHLLLTLVIFSRAGEACKKVILNVPSKLEAD...,"[GO:0030054|IDA, GO:0005911|IBA, GO:0001533|TA...","[IPR002126, IPR015919, IPR020894, IPR000233, I...",9606,"[GO:0030054, GO:0001533, GO:0016020, GO:000588...","[GO:0140096, GO:0009987, GO:0003824, GO:003005...",1.0
95,687,2AAA_SCHPO,Q9UT08; Q10293;,MQTENQVNDLYPIAVLIDELKHDEITYRLNALERLSTIALALGPER...,"[GO:0005737|IBA, GO:0005829|HDA, GO:0090443|ID...","[IPR011989, IPR016024, IPR000357, IPR021133]",284812,"[GO:0005829, GO:0090443, GO:0110085, GO:004473...","[GO:0045786, GO:0010648, GO:0034613, GO:005112...",1.0
11368,62522,CH1CO_SYNAS,Q2LQN9;,MKGPIKFNALSLQGRSVMSNQSNDTTITQRRDTMNELTEEQKLLME...,"[GO:0003995|IEA, GO:0050660|IDA, GO:0052890|ID...","[IPR006089, IPR006091, IPR036250, IPR009075, I...",56780,"[GO:0050660, GO:0052890, GO:0051262]","[GO:0000166, GO:0044085, GO:0003824, GO:000998...",1.0


In [None]:
# Concat the train dataframe and test dataframe with experience terms
df_exp = pd.concat([train_df, test_df])
df_exp.head()

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,exp_annotations,prop_annotations,contain_exp
63995,454399,ST1S3_DANRE,Q7T2V2;,MEISDFSSMKLNSRPELIDFEGISMIHYFTDNWEKVKNFQARPDDI...,"[GO:0005737|IBA, GO:0004062|IBA, GO:0008146|ID...","[IPR027417, IPR000863]",7955,"[GO:0008146, GO:0006805]","[GO:0051716, GO:0003824, GO:0009987, GO:001674...",1.0
36226,229770,LYAM1_MOUSE,P18337;,MVFPWRCEGTYWGSRNILKLWVWTLLCCDFLIHHGTHCWTYHYSEK...,"[GO:0009986|IDA, GO:0009897|IDA, GO:0005887|IS...","[IPR001304, IPR016186, IPR018378, IPR016187, I...",10090,"[GO:0009986, GO:0009897, GO:0005886, GO:005083...","[GO:0046683, GO:0050839, GO:0010243, GO:190170...",1.0
73638,523000,XYNB_PRUPE,P83344;,ADAIKAGLDLDCGPFLAIHTEAAVRRGLVSQLEINWALANTMTVQM...,"[GO:0016798|TAS, GO:0004553|IEA, GO:0005975|IEA]","[IPR026891, IPR002772, IPR036881, IPR036962, I...",3760,[GO:0016798],"[GO:0003674, GO:0016798, GO:0003824, GO:0016787]",1.0
3003,14698,ALKH_ECOLI,P0A955; P10177;,MKNWKTSAESILTTGPVVPVIVVKKLEHAVPMAKALVAGGVRVLEV...,"[GO:0005829|IDA, GO:0016020|HDA, GO:0106009|ID...","[IPR000887, IPR013785, IPR031337, IPR031338]",83333,"[GO:0005829, GO:0016020, GO:0106009, GO:000867...","[GO:0016830, GO:0003824, GO:0016833, GO:004280...",1.0
25019,151684,GEPH_MOUSE,Q8BUV3; E9QKJ1;,MATEGMILTNHDHQIRVGVLTVSDSCFRNLAEDRSGINLKDLVQDP...,"[GO:0099144|IDA, GO:0005737|ISO, GO:0005856|IE...","[IPR036425, IPR001453, IPR008284, IPR038987, I...",10090,"[GO:0099144, GO:0005829, GO:0030425, GO:001989...","[GO:0016830, GO:0099084, GO:0031503, GO:006223...",1.0


## Statistic of terms

把只带有实验性的序列筛选出来并且统计包含序列大于5的terms，目的是后续评估Fmax、Smin、AUPR时只用这部分数据

In [None]:
# get frequency of each term in 'prop_annotations'
def statistic_terms(dataframe, min_count):
    cnt = Counter()
    for i, row in dataframe.iterrows():
        for term in row['prop_annotations']:
            cnt[term] += 1

    print("Number of prop_annotations:", len(cnt))
    sorted_by_freq_tuples = sorted(cnt.items(), key=lambda x: x[0])
    sorted_by_freq_tuples.sort(key=lambda x: x[1], reverse=True)
    
    # Filter terms with annotations more than min_count
    res = {}
    for key, val in cnt.items():
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)

    terms = []
    for key, val in res.items():
        print(key, len(val))
        terms += val

    print("Number of prop_annotations with sequences more than %d: %d (%.2f%%)"
          %(min_count, len(terms), 100*len(terms)/len(cnt)))
    return sorted_by_freq_tuples, terms

In [None]:
# Minimum number of annotated proteins in each GO annotation(minimum count: 5)
min_count = 5
sort_terms, exp_terms = statistic_terms(df_exp, min_count)

Number of prop_annotations: 31145
GO 18789
Number of prop_annotations with sequences more than 5: 18789 (60.33%)


In [None]:
sort_dict = pd.DataFrame(sort_terms, columns=["term", "sequence_num"])
sort_dict.iloc[:len(exp_terms),:]

Unnamed: 0,term,sequence_num
0,GO:0008150,59955
1,GO:0005575,59004
2,GO:0110165,58132
3,GO:0009987,51728
4,GO:0005622,46447
...,...,...
18784,GO:2001107,5
18785,GO:2001159,5
18786,GO:2001200,5
18787,GO:2001206,5


In [None]:
# Pickling test experiment terms to file
sort_dict.to_pickle(data_path_dict['evaluation_terms'])
test_df.to_pickle(data_path_dict['evaluation_data'])
train_df.to_pickle(data_path_dict['train_with_exp'])

In [None]:
train_x = train_df['index'].values
test_x = test_df['index'].values

In [None]:
exp_index = np.concatenate([train_x, test_x])

In [None]:
df_without_exp = df_swissprot.drop(index=exp_index)
print(f"Total sequence without experiment terms: {len(df_without_exp)}")

Total sequence without experiment terms: 487788


In [None]:
df_without_exp.head()

Unnamed: 0,proteins,accessions,sequences,annotations,interpros,orgs
0,001R_FRG3G,Q6GZX4;,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,[GO:0046782|IEA],[IPR007031],654924
1,002L_FRG3G,Q6GZX3;,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,"[GO:0033644|IEA, GO:0016021|IEA]",[IPR004251],654924
2,002R_IIV3,Q197F8;,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,[],[],345201
3,003L_IIV3,Q197F7;,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,[],[],345201
4,003R_FRG3G,Q6GZX2;,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,[],[],654924


# Split annotations into exp | sim | non annotations

In [None]:
def split_annotations(data_frame):
    index = []
    annotations_exp = []
    annotations_sim = []
    annotations_non = []

    for i, row in enumerate(data_frame.itertuples()):
        annots_exp = []
        annots_sim = []
        annots_non = []
        for annot in row.annotations:
            go_id, code = annot.split('|')
            if is_exp_code(code):
                annots_exp.append(go_id)
            if is_sim_code(code):
                annots_sim.append(go_id) 
            if is_non_code(code):           
                annots_non.append(go_id) 
        # Ignore proteins without experimental annotations
        if len(annots_exp+annots_sim+annots_non) == 0:
            continue
        index.append(i)
        annotations_exp.append(annots_exp)
        annotations_sim.append(annots_sim)
        annotations_non.append(annots_non)
    
    data_frame = data_frame.iloc[index]
    data_frame = data_frame.reset_index()
    data_frame['exp_annotations'] = annotations_exp
    data_frame['sim_annotations'] = annotations_sim
    data_frame['non_annotations'] = annotations_non
    data_frame['contain_exp'] = np.zeros(len(data_frame))
    return data_frame

In [None]:
df_annoted = df_without_exp.copy()
df_annoted = split_annotations(df_annoted)
df_annoted.head()

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,exp_annotations,sim_annotations,non_annotations,contain_exp
0,0,001R_FRG3G,Q6GZX4;,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,[GO:0046782|IEA],[IPR007031],654924,[],[],[GO:0046782],0.0
1,1,002L_FRG3G,Q6GZX3;,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,"[GO:0033644|IEA, GO:0016021|IEA]",[IPR004251],654924,[],[],"[GO:0033644, GO:0016021]",0.0
2,5,004R_FRG3G,Q6GZX1;,MNAKYDTDQGVGRMLFLGTIGLAVVVGGLMAYGYYYDGKTPSSGTS...,"[GO:0033644|IEA, GO:0016021|IEA]",[],654924,[],[],"[GO:0033644, GO:0016021]",0.0
3,13,009L_FRG3G,Q6GZW6;,MDTSPYDFLKLYPWLSRGEADKGTLLDAFPGETFEQSLASDVAMRR...,"[GO:0005524|IEA, GO:0003677|IEA, GO:0004386|IE...","[IPR006935, IPR014001, IPR001650, IPR027417, I...",654924,[],[],"[GO:0005524, GO:0003677, GO:0004386, GO:0016787]",0.0
4,15,010R_FRG3G,Q6GZW5;,MKMDTDCRHWIVLASVPVLTVLAFKGEGALALAGLLVMAAVAMYRD...,"[GO:0033644|IEA, GO:0016021|IEA]",[],654924,[],[],"[GO:0033644, GO:0016021]",0.0


In [None]:
group_names = ['exp_annotations', 'sim_annotations', 'non_annotations']
groups_dict = {}

for gname in group_names:
    sub_group_idx = [idx for idx, annot in enumerate(df_annoted[gname]) if len(annot) > 0]
    groups_dict[gname] = sub_group_idx

for k, v in groups_dict.items():
    print("Number of sequences with %s : %6d (%.2f%%)"%(k, len(v), 100*len(v)/len(df_annoted)))

Number of sequences with exp_annotations :      0 (0.00%)
Number of sequences with sim_annotations :  66495 (14.35%)
Number of sequences with non_annotations : 453852 (97.94%)


In [None]:
def stastic_annotations(dataframe):
    group_names = ['exp_annotations', 'sim_annotations', 'non_annotations']
    groups_dict = {}

    for gname in group_names:
        sub_group_idx = [idx for idx, annot in enumerate(dataframe[gname]) if len(annot) > 0]
        groups_dict[gname] = sub_group_idx

    for k, v in groups_dict.items():
        print("Number of sequences with %s : %6d (%.2f%%)"%(k, len(v), 100*len(v)/len(dataframe)))

## Propagate annotations with the same weight for both annotations

In [None]:
def propgate_terms(data_frame):
    prop_annotations = []
    for i, row in data_frame.iterrows():
        # Propagate annotations
        annot_set = set()
        # exp_
        annots = row['exp_annotations']
        for go_id in annots:
            annot_set |= go.get_anchestors(go_id)
        # sim_
        annots = row['sim_annotations']
        for go_id in annots:
            annot_set |= go.get_anchestors(go_id)
        # non_
        annots = row['non_annotations']
        for go_id in annots:
            annot_set |= go.get_anchestors(go_id)
        # to_list
        annots = list(annot_set)
        prop_annotations.append(annots)

    data_frame['prop_annotations'] = prop_annotations
    return data_frame

In [None]:
df_swissprot_without_exp = propgate_terms(df_annoted)
df_swissprot_without_exp.head()

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,exp_annotations,sim_annotations,non_annotations,contain_exp,prop_annotations
0,0,001R_FRG3G,Q6GZX4;,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,[GO:0046782|IEA],[IPR007031],654924,[],[],[GO:0046782],0.0,"[GO:0016032, GO:0050789, GO:0019083, GO:001908..."
1,1,002L_FRG3G,Q6GZX3;,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,"[GO:0033644|IEA, GO:0016021|IEA]",[IPR004251],654924,[],[],"[GO:0033644, GO:0016021]",0.0,"[GO:0110165, GO:0043657, GO:0005575, GO:003364..."
2,5,004R_FRG3G,Q6GZX1;,MNAKYDTDQGVGRMLFLGTIGLAVVVGGLMAYGYYYDGKTPSSGTS...,"[GO:0033644|IEA, GO:0016021|IEA]",[],654924,[],[],"[GO:0033644, GO:0016021]",0.0,"[GO:0110165, GO:0043657, GO:0005575, GO:003364..."
3,13,009L_FRG3G,Q6GZW6;,MDTSPYDFLKLYPWLSRGEADKGTLLDAFPGETFEQSLASDVAMRR...,"[GO:0005524|IEA, GO:0003677|IEA, GO:0004386|IE...","[IPR006935, IPR014001, IPR001650, IPR027417, I...",654924,[],[],"[GO:0005524, GO:0003677, GO:0004386, GO:0016787]",0.0,"[GO:0000166, GO:0017111, GO:0003824, GO:014065..."
4,15,010R_FRG3G,Q6GZW5;,MKMDTDCRHWIVLASVPVLTVLAFKGEGALALAGLLVMAAVAMYRD...,"[GO:0033644|IEA, GO:0016021|IEA]",[],654924,[],[],"[GO:0033644, GO:0016021]",0.0,"[GO:0110165, GO:0043657, GO:0005575, GO:003364..."


In [None]:
sort_terms_pro, terms_pro = statistic_terms(df_swissprot_without_exp, min_count)

Number of prop_annotations: 24754
GO 17186
Number of prop_annotations with sequences more than 5: 17186 (69.43%)


In [None]:
sort_dict_pro = pd.DataFrame(sort_terms_pro, columns=["term", "sequence_num"])
sort_dict_pro

Unnamed: 0,term,sequence_num
0,GO:0008150,421891
1,GO:0003674,413515
2,GO:0009987,392119
3,GO:0008152,351902
4,GO:0005575,345136
...,...,...
24749,GO:2001229,1
24750,GO:2001291,1
24751,GO:2001292,1
24752,GO:2001294,1


In [None]:
out_of_expterms = set(sort_dict_pro['term'].values) - set(exp_terms)
print(f"Total number of terms which not in exp_terms: {len(out_of_expterms)}")

Total number of terms which not in exp_terms: 7452


In [None]:
for t in out_of_expterms:
    frequence = sort_dict_pro[sort_dict_pro['term']==t].sequence_num.values[0]
    print(f"Sequence number of {t}: {frequence}")

Sequence number of GO:1902081: 3
Sequence number of GO:0047801: 7
Sequence number of GO:0004397: 196
Sequence number of GO:0044604: 1
Sequence number of GO:0140568: 6
Sequence number of GO:0072061: 1
Sequence number of GO:0047343: 3
Sequence number of GO:0032765: 8
Sequence number of GO:0035985: 3
Sequence number of GO:0007557: 1
Sequence number of GO:0016287: 1
Sequence number of GO:0061150: 4
Sequence number of GO:0008715: 44
Sequence number of GO:0102264: 21
Sequence number of GO:0102339: 45
Sequence number of GO:0047711: 4
Sequence number of GO:0047273: 4
Sequence number of GO:0019281: 214
Sequence number of GO:0036482: 1
Sequence number of GO:0031697: 1
Sequence number of GO:0072054: 1
Sequence number of GO:0039615: 121
Sequence number of GO:0047753: 1
Sequence number of GO:0008972: 30
Sequence number of GO:0008983: 29
Sequence number of GO:0051669: 8
Sequence number of GO:0070118: 26
Sequence number of GO:0043838: 1
Sequence number of GO:0008869: 45
Sequence number of GO:0046224:

## Information of dataframe and transfer to the same formation

In [None]:
train_df = train_df.drop(['exp_annotations'], axis=1)
train_df.head()

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,prop_annotations,contain_exp
63995,454399,ST1S3_DANRE,Q7T2V2;,MEISDFSSMKLNSRPELIDFEGISMIHYFTDNWEKVKNFQARPDDI...,"[GO:0005737|IBA, GO:0004062|IBA, GO:0008146|ID...","[IPR027417, IPR000863]",7955,"[GO:0051716, GO:0003824, GO:0009987, GO:001674...",1.0
36226,229770,LYAM1_MOUSE,P18337;,MVFPWRCEGTYWGSRNILKLWVWTLLCCDFLIHHGTHCWTYHYSEK...,"[GO:0009986|IDA, GO:0009897|IDA, GO:0005887|IS...","[IPR001304, IPR016186, IPR018378, IPR016187, I...",10090,"[GO:0046683, GO:0050839, GO:0010243, GO:190170...",1.0
73638,523000,XYNB_PRUPE,P83344;,ADAIKAGLDLDCGPFLAIHTEAAVRRGLVSQLEINWALANTMTVQM...,"[GO:0016798|TAS, GO:0004553|IEA, GO:0005975|IEA]","[IPR026891, IPR002772, IPR036881, IPR036962, I...",3760,"[GO:0003674, GO:0016798, GO:0003824, GO:0016787]",1.0
3003,14698,ALKH_ECOLI,P0A955; P10177;,MKNWKTSAESILTTGPVVPVIVVKKLEHAVPMAKALVAGGVRVLEV...,"[GO:0005829|IDA, GO:0016020|HDA, GO:0106009|ID...","[IPR000887, IPR013785, IPR031337, IPR031338]",83333,"[GO:0016830, GO:0003824, GO:0016833, GO:004280...",1.0
25019,151684,GEPH_MOUSE,Q8BUV3; E9QKJ1;,MATEGMILTNHDHQIRVGVLTVSDSCFRNLAEDRSGINLKDLVQDP...,"[GO:0099144|IDA, GO:0005737|ISO, GO:0005856|IE...","[IPR036425, IPR001453, IPR008284, IPR038987, I...",10090,"[GO:0016830, GO:0099084, GO:0031503, GO:006223...",1.0


In [None]:
test_df = test_df.drop(['exp_annotations'], axis=1)
test_df.head()

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,prop_annotations,contain_exp
1253,6710,ACK1_YEAST,Q07622; D6VRF1;,MVNQGQPQPNLYDKHINMFPPARARESSHKLGNANSDRHGLPAQNI...,"[GO:0005739|HDA, GO:0008047|IBA, GO:0031505|IM...","[IPR006597, IPR011990]",559292,"[GO:0071555, GO:0009987, GO:0007165, GO:004522...",1.0
33330,210625,KIFC1_CRIGR,Q60443;,MKEALEPAKKRTRGLGAVTKIDTSRSKGPLLSSLSQPQGPTAAQKG...,"[GO:0005769|IEA, GO:0005874|IEA, GO:0005815|IE...","[IPR019821, IPR001752, IPR036961, IPR027417]",10029,"[GO:0110165, GO:0005575, GO:0030496]",1.0
18394,109472,DSC3_HUMAN,Q14574; A6NN35; Q14200; Q9HAZ9;,MAAAGPRRSVRGAVCLHLLLTLVIFSRAGEACKKVILNVPSKLEAD...,"[GO:0030054|IDA, GO:0005911|IBA, GO:0001533|TA...","[IPR002126, IPR015919, IPR020894, IPR000233, I...",9606,"[GO:0140096, GO:0009987, GO:0003824, GO:003005...",1.0
95,687,2AAA_SCHPO,Q9UT08; Q10293;,MQTENQVNDLYPIAVLIDELKHDEITYRLNALERLSTIALALGPER...,"[GO:0005737|IBA, GO:0005829|HDA, GO:0090443|ID...","[IPR011989, IPR016024, IPR000357, IPR021133]",284812,"[GO:0045786, GO:0010648, GO:0034613, GO:005112...",1.0
11368,62522,CH1CO_SYNAS,Q2LQN9;,MKGPIKFNALSLQGRSVMSNQSNDTTITQRRDTMNELTEEQKLLME...,"[GO:0003995|IEA, GO:0050660|IDA, GO:0052890|ID...","[IPR006089, IPR006091, IPR036250, IPR009075, I...",56780,"[GO:0000166, GO:0044085, GO:0003824, GO:000998...",1.0


In [None]:
prepare_data = df_swissprot_without_exp.drop(['exp_annotations', 'sim_annotations', 'non_annotations'], axis=1)
col = ['index', 'proteins', 'accessions', 'sequences', 'annotations', 'interpros', 'orgs', 'prop_annotations', 'contain_exp']
prepare_data = prepare_data.loc[:, col]
prepare_data.head()

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,prop_annotations,contain_exp
0,0,001R_FRG3G,Q6GZX4;,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,[GO:0046782|IEA],[IPR007031],654924,"[GO:0016032, GO:0050789, GO:0019083, GO:001908...",0.0
1,1,002L_FRG3G,Q6GZX3;,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,"[GO:0033644|IEA, GO:0016021|IEA]",[IPR004251],654924,"[GO:0110165, GO:0043657, GO:0005575, GO:003364...",0.0
2,5,004R_FRG3G,Q6GZX1;,MNAKYDTDQGVGRMLFLGTIGLAVVVGGLMAYGYYYDGKTPSSGTS...,"[GO:0033644|IEA, GO:0016021|IEA]",[],654924,"[GO:0110165, GO:0043657, GO:0005575, GO:003364...",0.0
3,13,009L_FRG3G,Q6GZW6;,MDTSPYDFLKLYPWLSRGEADKGTLLDAFPGETFEQSLASDVAMRR...,"[GO:0005524|IEA, GO:0003677|IEA, GO:0004386|IE...","[IPR006935, IPR014001, IPR001650, IPR027417, I...",654924,"[GO:0000166, GO:0017111, GO:0003824, GO:014065...",0.0
4,15,010R_FRG3G,Q6GZW5;,MKMDTDCRHWIVLASVPVLTVLAFKGEGALALAGLLVMAAVAMYRD...,"[GO:0033644|IEA, GO:0016021|IEA]",[],654924,"[GO:0110165, GO:0043657, GO:0005575, GO:003364...",0.0


In [None]:
index_drop = []
for t in out_of_expterms:
    prepare_data['contain_t'] = prepare_data.prop_annotations.apply(lambda x: t in x)
    add_data = prepare_data[prepare_data['contain_t']==True]
    print(f"Number of sequence with terms_{t}: {len(add_data)}")
    test_df = pd.concat([test_df, add_data])

Number of sequence with terms_GO:1902081: 3
Number of sequence with terms_GO:0047801: 7
Number of sequence with terms_GO:0004397: 196
Number of sequence with terms_GO:0044604: 1
Number of sequence with terms_GO:0140568: 6
Number of sequence with terms_GO:0072061: 1
Number of sequence with terms_GO:0047343: 3
Number of sequence with terms_GO:0032765: 8
Number of sequence with terms_GO:0035985: 3
Number of sequence with terms_GO:0007557: 1
Number of sequence with terms_GO:0016287: 1
Number of sequence with terms_GO:0061150: 4
Number of sequence with terms_GO:0008715: 44
Number of sequence with terms_GO:0102264: 21
Number of sequence with terms_GO:0102339: 45
Number of sequence with terms_GO:0047711: 4
Number of sequence with terms_GO:0047273: 4
Number of sequence with terms_GO:0019281: 214
Number of sequence with terms_GO:0036482: 1
Number of sequence with terms_GO:0031697: 1
Number of sequence with terms_GO:0072054: 1
Number of sequence with terms_GO:0039615: 121
Number of sequence with

In [None]:
test_df = test_df.drop('contain_t', axis=1)
test_df.head()

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,prop_annotations,contain_exp
1253,6710,ACK1_YEAST,Q07622; D6VRF1;,MVNQGQPQPNLYDKHINMFPPARARESSHKLGNANSDRHGLPAQNI...,"[GO:0005739|HDA, GO:0008047|IBA, GO:0031505|IM...","[IPR006597, IPR011990]",559292,"[GO:0071555, GO:0009987, GO:0007165, GO:004522...",1.0
33330,210625,KIFC1_CRIGR,Q60443;,MKEALEPAKKRTRGLGAVTKIDTSRSKGPLLSSLSQPQGPTAAQKG...,"[GO:0005769|IEA, GO:0005874|IEA, GO:0005815|IE...","[IPR019821, IPR001752, IPR036961, IPR027417]",10029,"[GO:0110165, GO:0005575, GO:0030496]",1.0
18394,109472,DSC3_HUMAN,Q14574; A6NN35; Q14200; Q9HAZ9;,MAAAGPRRSVRGAVCLHLLLTLVIFSRAGEACKKVILNVPSKLEAD...,"[GO:0030054|IDA, GO:0005911|IBA, GO:0001533|TA...","[IPR002126, IPR015919, IPR020894, IPR000233, I...",9606,"[GO:0140096, GO:0009987, GO:0003824, GO:003005...",1.0
95,687,2AAA_SCHPO,Q9UT08; Q10293;,MQTENQVNDLYPIAVLIDELKHDEITYRLNALERLSTIALALGPER...,"[GO:0005737|IBA, GO:0005829|HDA, GO:0090443|ID...","[IPR011989, IPR016024, IPR000357, IPR021133]",284812,"[GO:0045786, GO:0010648, GO:0034613, GO:005112...",1.0
11368,62522,CH1CO_SYNAS,Q2LQN9;,MKGPIKFNALSLQGRSVMSNQSNDTTITQRRDTMNELTEEQKLLME...,"[GO:0003995|IEA, GO:0050660|IDA, GO:0052890|ID...","[IPR006089, IPR006091, IPR036250, IPR009075, I...",56780,"[GO:0000166, GO:0044085, GO:0003824, GO:000998...",1.0


In [None]:
test_df = test_df.drop_duplicates(['proteins'])
test_df.head()

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,prop_annotations,contain_exp
1253,6710,ACK1_YEAST,Q07622; D6VRF1;,MVNQGQPQPNLYDKHINMFPPARARESSHKLGNANSDRHGLPAQNI...,"[GO:0005739|HDA, GO:0008047|IBA, GO:0031505|IM...","[IPR006597, IPR011990]",559292,"[GO:0071555, GO:0009987, GO:0007165, GO:004522...",1.0
33330,210625,KIFC1_CRIGR,Q60443;,MKEALEPAKKRTRGLGAVTKIDTSRSKGPLLSSLSQPQGPTAAQKG...,"[GO:0005769|IEA, GO:0005874|IEA, GO:0005815|IE...","[IPR019821, IPR001752, IPR036961, IPR027417]",10029,"[GO:0110165, GO:0005575, GO:0030496]",1.0
18394,109472,DSC3_HUMAN,Q14574; A6NN35; Q14200; Q9HAZ9;,MAAAGPRRSVRGAVCLHLLLTLVIFSRAGEACKKVILNVPSKLEAD...,"[GO:0030054|IDA, GO:0005911|IBA, GO:0001533|TA...","[IPR002126, IPR015919, IPR020894, IPR000233, I...",9606,"[GO:0140096, GO:0009987, GO:0003824, GO:003005...",1.0
95,687,2AAA_SCHPO,Q9UT08; Q10293;,MQTENQVNDLYPIAVLIDELKHDEITYRLNALERLSTIALALGPER...,"[GO:0005737|IBA, GO:0005829|HDA, GO:0090443|ID...","[IPR011989, IPR016024, IPR000357, IPR021133]",284812,"[GO:0045786, GO:0010648, GO:0034613, GO:005112...",1.0
11368,62522,CH1CO_SYNAS,Q2LQN9;,MKGPIKFNALSLQGRSVMSNQSNDTTITQRRDTMNELTEEQKLLME...,"[GO:0003995|IEA, GO:0050660|IDA, GO:0052890|ID...","[IPR006089, IPR006091, IPR036250, IPR009075, I...",56780,"[GO:0000166, GO:0044085, GO:0003824, GO:000998...",1.0


In [None]:
sort_test_terms, test_terms = statistic_terms(test_df, 1)

Number of prop_annotations: 24919
GO 24919
Number of prop_annotations with sequences more than 1: 24919 (100.00%)


In [None]:
terms_out_of_swissprot = set(go.ont.keys()) - set(test_terms)
print(f"Number of terms not in swissprot: {len(terms_out_of_swissprot)}")

Number of terms not in swissprot: 22423


# add rest of swissprot data in train_df

In [None]:
print(f'Number of sequence in df_swissprot_without_exp: {len(df_swissprot_without_exp)}')
print(f'Number of sequence in test_df: {len(test_df)}')
df_swissprot_without_exp.head()

Number of sequence in df_swissprot_without_exp: 463380
Number of sequence in test_df: 152757


Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,exp_annotations,sim_annotations,non_annotations,contain_exp,prop_annotations
0,0,001R_FRG3G,Q6GZX4;,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,[GO:0046782|IEA],[IPR007031],654924,[],[],[GO:0046782],0.0,"[GO:0016032, GO:0050789, GO:0019083, GO:001908..."
1,1,002L_FRG3G,Q6GZX3;,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,"[GO:0033644|IEA, GO:0016021|IEA]",[IPR004251],654924,[],[],"[GO:0033644, GO:0016021]",0.0,"[GO:0110165, GO:0043657, GO:0005575, GO:003364..."
2,5,004R_FRG3G,Q6GZX1;,MNAKYDTDQGVGRMLFLGTIGLAVVVGGLMAYGYYYDGKTPSSGTS...,"[GO:0033644|IEA, GO:0016021|IEA]",[],654924,[],[],"[GO:0033644, GO:0016021]",0.0,"[GO:0110165, GO:0043657, GO:0005575, GO:003364..."
3,13,009L_FRG3G,Q6GZW6;,MDTSPYDFLKLYPWLSRGEADKGTLLDAFPGETFEQSLASDVAMRR...,"[GO:0005524|IEA, GO:0003677|IEA, GO:0004386|IE...","[IPR006935, IPR014001, IPR001650, IPR027417, I...",654924,[],[],"[GO:0005524, GO:0003677, GO:0004386, GO:0016787]",0.0,"[GO:0000166, GO:0017111, GO:0003824, GO:014065..."
4,15,010R_FRG3G,Q6GZW5;,MKMDTDCRHWIVLASVPVLTVLAFKGEGALALAGLLVMAAVAMYRD...,"[GO:0033644|IEA, GO:0016021|IEA]",[],654924,[],[],"[GO:0033644, GO:0016021]",0.0,"[GO:0110165, GO:0043657, GO:0005575, GO:003364..."


In [None]:
# 求差集
set_diff_df = pd.concat([df_swissprot_without_exp, test_df, test_df]).drop_duplicates('proteins', keep=False)

In [None]:
train_df = pd.concat([train_df, set_diff_df])
print(f"Number of train data: {len(train_df)}")

Number of train data: 388089


In [None]:
sort_train_terms, train_terms = statistic_terms(train_df, 1)

Number of prop_annotations: 30864
GO 30864
Number of prop_annotations with sequences more than 1: 30864 (100.00%)


In [None]:
print(f"Number of train data: {len(test_df)}")

Number of train data: 152757


In [None]:
# train_df = train_df.drop(['exp_annotations', 'sim_annotations', 'non_annotations'], axis=1)
train_df.head()

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,prop_annotations,contain_exp,exp_annotations,sim_annotations,non_annotations
63995,454399,ST1S3_DANRE,Q7T2V2;,MEISDFSSMKLNSRPELIDFEGISMIHYFTDNWEKVKNFQARPDDI...,"[GO:0005737|IBA, GO:0004062|IBA, GO:0008146|ID...","[IPR027417, IPR000863]",7955,"[GO:0051716, GO:0003824, GO:0009987, GO:001674...",1.0,,,
36226,229770,LYAM1_MOUSE,P18337;,MVFPWRCEGTYWGSRNILKLWVWTLLCCDFLIHHGTHCWTYHYSEK...,"[GO:0009986|IDA, GO:0009897|IDA, GO:0005887|IS...","[IPR001304, IPR016186, IPR018378, IPR016187, I...",10090,"[GO:0046683, GO:0050839, GO:0010243, GO:190170...",1.0,,,
73638,523000,XYNB_PRUPE,P83344;,ADAIKAGLDLDCGPFLAIHTEAAVRRGLVSQLEINWALANTMTVQM...,"[GO:0016798|TAS, GO:0004553|IEA, GO:0005975|IEA]","[IPR026891, IPR002772, IPR036881, IPR036962, I...",3760,"[GO:0003674, GO:0016798, GO:0003824, GO:0016787]",1.0,,,
3003,14698,ALKH_ECOLI,P0A955; P10177;,MKNWKTSAESILTTGPVVPVIVVKKLEHAVPMAKALVAGGVRVLEV...,"[GO:0005829|IDA, GO:0016020|HDA, GO:0106009|ID...","[IPR000887, IPR013785, IPR031337, IPR031338]",83333,"[GO:0016830, GO:0003824, GO:0016833, GO:004280...",1.0,,,
25019,151684,GEPH_MOUSE,Q8BUV3; E9QKJ1;,MATEGMILTNHDHQIRVGVLTVSDSCFRNLAEDRSGINLKDLVQDP...,"[GO:0099144|IDA, GO:0005737|ISO, GO:0005856|IE...","[IPR036425, IPR001453, IPR008284, IPR038987, I...",10090,"[GO:0016830, GO:0099084, GO:0031503, GO:006223...",1.0,,,


# Saving result

In [None]:
train_df.to_pickle(data_path_dict['train_data_swissprot'])
test_df.to_pickle(data_path_dict['test_data_swissprot'])

In [None]:
train_sort_dict = pd.DataFrame(sort_train_terms, columns=["term", "sequence_num"])
test_sort_dict = pd.DataFrame(sort_test_terms, columns=["term", "sequence_num"])
train_sort_dict.to_pickle(data_path_dict['train_terms_swissprot'])
test_sort_dict.to_pickle(data_path_dict['test_terms_swissprot'])

In [None]:
print(f"Total sequence of train data: {len(train_df)}")
print(f"Total sequence of test data: {len(test_df)}")

Total sequence of train data: 388089
Total sequence of test data: 152757


In [None]:
print(f"Total number of go terms: {len(go.ont)}")
print(f"Total number of train terms: {len(sort_train_terms)}")
print(f"Total number of test terms: {len(sort_test_terms)}")

Total number of go terms: 47342
Total number of train terms: 30864
Total number of test terms: 24919


## adding CAFA information

In [None]:
def add_CAFA(data_frame):
    cafa_target = []

    for i, row in enumerate(data_frame.itertuples()):
        if is_cafa_target(row.orgs):
            cafa_target.append(True)
        else:
            cafa_target.append(False)

    data_frame['cafa_target'] = cafa_target
    return data_frame

In [None]:
train_df_pro = add_CAFA(train_df_pro)