# Construction of the **PUSec17** dataset

In [1]:
import os
import os.path as osp
import pickle
import re

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Root folder holding both the raw inputs and the generated dataset.
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running the notebook on another machine.
folder = r'D:\dataset'
raw_dir = 'human-body-fluid-protein-raw-20211126'
save_dir = 'BodyFluidData-20220912'

raw_path = osp.join(folder, raw_dir)
save_path = osp.join(folder, save_dir)
# makedirs(exist_ok=True) creates missing parent folders as well and is safe
# to re-run, unlike the exists()/mkdir() pair it replaces.
os.makedirs(save_path, exist_ok=True)

### Protein features

In [2]:
# Load the per-protein attribute table; the 'id' column becomes the index.
feat_df = pd.read_csv(
    osp.join(raw_path, 'protein-attribute.csv'),
    index_col='id',
)
feat_df.head()  # mid-cell expression: evaluated but not rendered

# The first remaining column holds the protein identifiers that are
# intersected with the other data sources further down.
feat_pro = feat_df.iloc[:, 0].tolist()
print(feat_pro[:5])
print('Number of proteins that have feature data:', len(feat_df))

['A0A584', 'Q9BXU3', 'Q15031', 'Q6PKC3', 'P42681']
Number of proteins that have feature data: 20373


### Protein sequences

In [3]:
# Load the protein sequence table; the 'id' column becomes the index.
seq_df = pd.read_csv(
    osp.join(raw_path, 'protein-sequence.csv'),
    index_col='id',
)
seq_df.head()  # mid-cell expression: evaluated but not rendered

# The first remaining column holds the protein identifiers.
seq_pro = seq_df.iloc[:, 0].tolist()
print(seq_pro[:5])
print('Number of proteins that have sequence data:', len(seq_df))

['A0A024RBG1', 'A0A075B6H9', 'A0A075B6I0', 'A0A075B6I1', 'A0A075B6I4']
Number of proteins that have sequence data: 20396


### The PSSM matrix

In [4]:
# One PSSM file per protein; the protein identifier is the part of the
# file name before the first dot.
pssm_files = os.listdir(osp.join(raw_path, 'protein-pssm'))
pssm_pro = [str(file).partition('.')[0] for file in pssm_files]
print('number of pssm records:', len(pssm_pro))
print(pssm_pro[: 5])

number of pssm records: 20383
['A0A024RBG1', 'A0A075B6H9', 'A0A075B6I0', 'A0A075B6I1', 'A0A075B6I4']


In [5]:
# Keep only the proteins present in all three sources (attributes,
# sequences, PSSM files). The sorted list fixes the row order that every
# saved array and every split index below refers to.
pro_set = set(feat_pro).intersection(seq_pro, pssm_pro)
pro_list = sorted(pro_set)
num_pros = len(pro_list)
print('Number of proteins that have all three kinds of data:', num_pros)

Number of proteins that have all three kinds of data: 20345


### The records of secreted proteins

In [6]:
# Load the protein/fluid secretion records from the Excel sheet.
sec_df = pd.read_excel(osp.join(raw_path, 'fluid-protein-information.xlsx'))

# Keep the two columns used below and drop repeated (protein, fluid) pairs.
# The first column ('编号') is later matched against protein identifiers and
# the second ('体液编号') against the numeric fluid IDs 1..num_fluids.
uni_seq_df = sec_df[['编号', '体液编号']].drop_duplicates()
uni_seq_df.head()  # mid-cell expression: evaluated but not rendered
print('Number of secreted protein records:', len(uni_seq_df))

Number of secreted protein records: 46206


In [7]:
# Fluid names, listed in the order of their numeric fluid IDs (1..17).
fluid_names = [
    'Plasma', 'Saliva', 'Urine', 'CSF', 'Seminal',
    'Amniotic', 'Tear', 'BALF', 'Milk', 'Synovial',
    'NAF', 'CVF', 'PE', 'Sputum', 'EBC', 'PJ', 'Sweat'
]
num_fluids = len(fluid_names)

# Count the records per fluid ID. Series.value_counts() replaces the
# top-level pd.value_counts(), which is deprecated since pandas 2.1.
id_counts = uni_seq_df.iloc[:, 1].value_counts().sort_index()

# The names are matched positionally to the sorted fluid IDs, so the
# resulting frame is indexed by fluid ID.
fluid_counts = pd.DataFrame(
    {
        'Fluid names': fluid_names,
        'Number of secreted proteins': id_counts
    }
)
print(fluid_counts)

   Fluid names  Number of secreted proteins
1       Plasma                         6822
2       Saliva                         2758
3        Urine                         7330
4          CSF                         4366
5      Seminal                         4084
6     Amniotic                         3025
7         Tear                         1980
8         BALF                         3434
9         Milk                         2457
10    Synovial                         1642
11         NAF                         1734
12         CVF                          949
13          PE                         1519
14      Sputum                         1809
15         EBC                          351
16          PJ                          702
17       Sweat                         1244


### Extract the secreted proteins as positive dataset

In [8]:
# For each fluid ID (1-based), collect the proteins recorded for that fluid
# and keep those that also have all three kinds of data.
P_pro_list = []
for i, fluid_name in enumerate(fluid_names, start=1):
    mask = uni_seq_df.iloc[:, 1] == i
    i_pro_set = set(uni_seq_df.iloc[mask.values, 0].tolist())
    P_pro = sorted(i_pro_set.intersection(pro_set))
    P_pro_list.append(P_pro)
    print(i, fluid_name, len(P_pro), P_pro[:3])

1 Plasma 6798 ['A0A075B6H9', 'A0A075B6I0', 'A0A075B6I1']
2 Saliva 2744 ['A0A075B6I0', 'A0A075B6I4', 'A0A075B6K4']
3 Urine 7305 ['A0A075B6H9', 'A0A075B6I0', 'A0A075B6I9']
4 CSF 4350 ['A0A075B6I0', 'A0A075B6K4', 'A0A075B6S5']
5 Seminal 4066 ['A0A024RBG1', 'A0A0B4J2D5', 'A0AV96']
6 Amniotic 3013 ['A0A075B6I0', 'A0A075B6K4', 'A0A075B6K5']
7 Tear 1971 ['A0A075B6I0', 'A0A075B6J9', 'A0A075B6K4']
8 BALF 3415 ['A0A075B6I0', 'A0A075B6I1', 'A0A075B6J9']
9 Milk 2444 ['A0A075B6K4', 'A0A075B6K5', 'A0A075B6P5']
10 Synovial 1636 ['A0A075B6I0', 'A0A075B6I9', 'A0A075B6J9']
11 NAF 1723 ['A0A075B6H9', 'A0A0C4DH29', 'A0A0C4DH32']
12 CVF 944 ['A0A075B6I0', 'A0A075B6J9', 'A0A075B6K4']
13 PE 1514 ['A0A0A0MRZ8', 'A0A0B4J1V0', 'A0A5B9']
14 Sputum 1803 ['A0A075B6K6', 'A0A0C4DH67', 'A0A0C4DH72']
15 EBC 350 ['A6NCN2', 'A8K2U0', 'O00299']
16 PJ 697 ['A6NGU5', 'A6NMY6', 'A8K2U0']
17 Sweat 1234 ['A0A0B4J1Y9', 'A0A0J9YVY3', 'A0FGR8']


The PUSec17 dataset is constructed to compare PU methods with the previous traditional methods.

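To enable this, the negative datasets are generated from protein-family information: for a given fluid, a protein is treated as a (fake) negative if it is not a positive and shares no protein family with any positive protein of that fluid. These negative datasets, as used by the previous methods, are then included in the new PU datasets.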

For each fluid, the dataset consists of three parts: a positive dataset, a fake negative dataset, and an unknown dataset.

The traditional methods are trained on the positive and fake negative datasets, whereas the PU methods are trained on the positive and unlabeled datasets, where the unlabeled dataset is the union of the fake negative and unknown datasets.

Both the traditional methods and PU methods are evaluated on the full independent test dataset.

In [9]:
# Load the protein-family table; the 'id' column becomes the index, leaving
# two columns: the protein identifier and its family string.
pfam_df = pd.read_csv(
    osp.join(raw_path, 'protein-family.csv'),
    index_col='id',
)
pfam_df.head()  # mid-cell expression: evaluated but not rendered

pfam_dict = {}    # family -> list of member proteins
nofam_list = []   # proteins whose family field is not a string (e.g. NaN)
for pro, fams in pfam_df.itertuples(index=False, name=None):
    if not isinstance(fams, str):
        nofam_list.append(pro)
        continue
    # Families are ';'-separated and the piece after the last ';' is dropped
    # (presumably every entry ends with a trailing ';' - TODO confirm).
    for fam in fams.split(';')[:-1]:
        pfam_dict.setdefault(fam, []).append(pro)
print('Number of no family proteins:', len(nofam_list))
print('Number of families:', len(pfam_dict))

Number of no family proteins: 2104
Number of families: 6216


In [10]:
# Build the fake negative dataset of every fluid.
#
# A protein is a fake negative for a fluid when it is not a positive of that
# fluid and shares no family with any positive of that fluid. Proteins
# without family information that are not positives therefore count as fake
# negatives.
FN_pro_list = []  # Fake negative datasets

# Turn each family's member list into a set once, instead of rebuilding it
# for every (fluid, protein, family) combination.
fam_member_sets = [set(members) for members in pfam_dict.values()]

for i in range(num_fluids):
    P_pro = P_pro_list[i]
    P_pro_set = set(P_pro)

    # Union of all families that contain at least one positive protein.
    related_pro_set = set()
    for members in fam_member_sets:
        if not P_pro_set.isdisjoint(members):
            related_pro_set |= members

    # Iterating over pro_list keeps the same (sorted) order as before.
    FN_pro = [
        pro for pro in pro_list
        if pro not in P_pro_set and pro not in related_pro_set
    ]

    FN_pro_list.append(FN_pro)
    print(i, fluid_names[i], len(P_pro), len(FN_pro))

0 Plasma 6798 5067
1 Saliva 2744 8433
2 Urine 7305 5008
3 CSF 4350 6573
4 Seminal 4066 7593
5 Amniotic 3013 9138
6 Tear 1971 10057
7 BALF 3415 7785
8 Milk 2444 7651
9 Synovial 1636 10075
10 NAF 1723 10275
11 CVF 944 12569
12 PE 1514 9466
13 Sputum 1803 9914
14 EBC 350 15480
15 PJ 697 13485
16 Sweat 1234 12170


Filter the redundant proteins with CD-HIT (90%).

In [11]:
# Collect the identifiers of the representative sequences kept by CD-HIT:
# every FASTA header line ('>' prefix) contributes one identifier.
cdhit90 = []
with open(osp.join(raw_path, 'cdhit90.fasta'), 'r') as f:
    for row in f:
        if row.startswith('>'):
            # Strip only the line terminator. Slicing with [1:-1] would cut
            # the last character of the identifier when the final line of
            # the file has no trailing newline.
            pro = row[1:].rstrip('\r\n')
            cdhit90.append(pro)
cdhit90_set = set(cdhit90)
print('CD-HIT(90):', len(cdhit90))

CD-HIT(90): 19394


In [12]:
# Restrict every fluid's datasets to the CD-HIT representatives and derive
# the "unknown" dataset: representatives that are neither positive nor fake
# negative for the fluid.
new_P_pro_list, new_FN_pro_list, unknown_pro_list = [], [], []
for fluid_name, P_pro, FN_pro in zip(fluid_names, P_pro_list, FN_pro_list):
    P_pro_set = set(P_pro)
    new_FN_pro_set = cdhit90_set.intersection(FN_pro)

    new_P_pro = sorted(cdhit90_set & P_pro_set)
    new_FN_pro = sorted(new_FN_pro_set)
    unknown_pro = sorted(
        cdhit90_set & (pro_set - P_pro_set - new_FN_pro_set)
    )

    new_P_pro_list.append(new_P_pro)
    new_FN_pro_list.append(new_FN_pro)
    unknown_pro_list.append(unknown_pro)

    # Sizes before filtering (P, FN) and after filtering (P, FN, unknown).
    print(
        fluid_name,
        len(P_pro), len(FN_pro),
        len(new_P_pro), len(new_FN_pro), len(unknown_pro),
    )

Plasma 6798 5067 6530 4856 7972
Saliva 2744 8433 2521 8048 8789
Urine 7305 5008 6972 4760 7626
CSF 4350 6573 4082 6281 8995
Seminal 4066 7593 3929 7230 8199
Amniotic 3013 9138 2876 8725 7757
Tear 1971 10057 1843 9597 7918
BALF 3415 7785 3241 7392 8725
Milk 2444 7651 2324 7333 9701
Synovial 1636 10075 1525 9624 8209
NAF 1723 10275 1640 9800 7918
CVF 944 12569 877 12062 6419
PE 1514 9466 1437 9087 8834
Sputum 1803 9914 1696 9515 8147
EBC 350 15480 326 14903 4129
PJ 697 13485 646 12957 5755
Sweat 1234 12170 1162 11660 6536


In [13]:
# Summarise the dataset sizes per fluid and save them as a CSV file.
P_len_list = list(map(len, new_P_pro_list))
FN_len_list = list(map(len, new_FN_pro_list))
unknown_len_list = list(map(len, unknown_pro_list))
# Unlabeled = fake negative + unknown.
U_len_list = [n_fn + n_unk for n_fn, n_unk in zip(FN_len_list, unknown_len_list)]

ds_info = pd.DataFrame(
    {
        'Positive': P_len_list,
        'Negative': FN_len_list,
        'Unknown': unknown_len_list,
        'Unlabeled': U_len_list
    },
    index=pd.Index(fluid_names, name='Fluid')
)
ds_info.to_csv(osp.join(save_path, 'dataset-info.csv'))
ds_info

Unnamed: 0_level_0,Positive,Negative,Unknown,Unlabeled
Fluid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Plasma,6530,4856,7972,12828
Saliva,2521,8048,8789,16837
Urine,6972,4760,7626,12386
CSF,4082,6281,8995,15276
Seminal,3929,7230,8199,15429
Amniotic,2876,8725,7757,16482
Tear,1843,9597,7918,17515
BALF,3241,7392,8725,16117
Milk,2324,7333,9701,17034
Synovial,1525,9624,8209,17833


## Process the protein features

In [14]:
# Re-index the attribute table by accession and take the rows in pro_list
# order, so that row k of the saved matrix belongs to pro_list[k].
feat_df2 = feat_df.set_index('accession')
feat_values = feat_df2.loc[pro_list].values

# Fill missing attribute values with the per-column median.
imputer = SimpleImputer(strategy='median')
feat_data = imputer.fit_transform(feat_values)

with open(osp.join(save_path, 'pro-attr.pkl'), 'wb') as f:
    pickle.dump(feat_data, f)

feat_data[:5, :5]

array([[0.   , 0.   , 0.05 , 0.011, 4.   ],
       [0.   , 0.   , 0.153, 0.017, 1.   ],
       [0.   , 0.   , 0.235, 0.025, 0.   ],
       [0.   , 0.   , 0.148, 0.017, 1.   ],
       [0.   , 0.   , 0.157, 0.009, 2.   ]])

In [15]:
# Re-index the sequence table by accession and take the sequences in
# pro_list order, so that seq_data[k] belongs to pro_list[k].
seq_df2 = seq_df.set_index('accession')
seq_df3 = seq_df2.loc[pro_list, 'sequence']
seq_data = seq_df3.tolist()

with open(osp.join(save_path, 'pro-seq.pkl'), 'wb') as f:
    pickle.dump(seq_data, f)

seq_data[:5]

['MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQWIVPGGGMEPEEEPGGAAVREVYEEAGVKGKLGRLLGIFEQNQDRKHRTYVYVLTVTEILEDWEDSVNIGRKREWFKVEDAIKVLQCHKPVHAEYLEKLKLGCSPANGNSTVPSLPDNNALFVTAAQTSGLPSSVR',
 'MAWTPLLFLTLLLHCTGSLSQLVLTQSPSASASLGASVKLTCTLSSGHSSYAIAWHQQQPEKGPRYLMKLNSDGSHSKGDGIPDRFSGSSSGAERYLTISSLQSEDEADYYCQTWGTGI',
 'MSVPTMAWMMLLLGLLAYGSGVDSQTVVTQEPSFSVSPGGTVTLTCGLSSGSVSTSYYPSWYQQTPGQAPRTLIYSTNTRSSGVPDRFSGSILGNKAALTITGAQADDESDYYCVLYMGSGI',
 'MAWTPLLLLFPLLLHCTGSLSQPVLTQSSSASASLGSSVKLTCTLSSGHSSYIIAWHQQQPGKAPRYLMKLEGSGSYNKGSGVPDRFSGSSSGADRYLTISNLQFEDEADYYCETWDSNT',
 'MPWALLLLTLLTHSAVSVVQAGLTQPPSVSKGLRQTATLTCTGNSNIVGNQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSALDSSLSA']

In [16]:
# Read one PSSM file per protein, in pro_list order, into an int32 array
# with one row per matrix line of the file.
_blank_run = re.compile(r'\s+')

pssm_data = []
for pro in pro_list:
    pssm_file = osp.join(raw_path, 'protein-pssm', '{:s}.pssm'.format(pro))
    with open(pssm_file, 'r') as f:
        # Only lines that split into more than 42 fields are matrix rows;
        # header and footer lines are shorter. Fields 3..22 are kept
        # (presumably the 20 substitution scores - TODO confirm against the
        # PSSM file format).
        iter_pssm = [
            [int(value) for value in fields[3:23]]
            for fields in map(_blank_run.split, f)
            if len(fields) > 42
        ]
    pssm_data.append(np.array(iter_pssm, dtype=np.int32))

with open(osp.join(save_path, 'pro-pssm.pkl'), 'wb') as f:
    pickle.dump(pssm_data, f)

pssm_data[:5]

[array([[-4, -5, -6, ..., -5, -4, -2],
        [-3, -5, -5, ..., -5, -4,  5],
        [ 0, -1, -1, ..., -6, -4, -4],
        ...,
        [-1, -3,  0, ..., -5, -4, -3],
        [ 1, -2, -3, ..., -4, -2,  3],
        [-3,  6, -2, ..., -4, -3, -4]]),
 array([[-5, -5, -6, ..., -5, -5, -3],
        [ 6, -4, -4, ..., -5, -4, -2],
        [-7, -3, -8, ..., 13, -2, -7],
        ...,
        [-2, -1,  1, ..., -4, -3, -4],
        [-2, -4,  4, ..., -6, -5, -5],
        [-2, -4, -5, ..., -3, -2,  2]]),
 array([[-1, -3, -3, ..., -3, -2,  3],
        [ 0, -1, -1, ..., -3, -2, -1],
        [-1, -3, -2, ..., -3, -2,  3],
        ...,
        [-2, -2,  3, ..., -6, -5, -5],
        [ 2, -3,  3, ..., -4, -1, -3],
        [-2, -3, -4, ..., -3, -2,  3]]),
 array([[-5, -5, -6, ..., -5, -5, -3],
        [ 6, -4, -4, ..., -5, -4, -2],
        [-7, -3, -8, ..., 13, -2, -7],
        ...,
        [-2, -1,  1, ..., -5, -2, -4],
        [-2, -2,  4, ..., -6, -5, -5],
        [-1, -3, -1, ..., -5, -4, -3]]),
 arr

## Data splits

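The dataset corresponding to each fluid is divided into training, validation, and test sets in proportions of 60%, 20%, and 20%, respectively. Within each class (positive, fake negative, unknown), the proteins are sorted by sequence length and assigned to the three sets in a repeating 3:1:1 pattern.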

In [17]:
# Lookup table: protein accession -> (sequence length, row position in
# pro_list). The row position is the index stored in the saved splits.
pro_len = seq_df3.map(len)
len_with_indices = pd.DataFrame(
    {
        'length': pro_len,
        'indices': list(range(num_pros)),
    },
    index=pro_list,
)
len_with_indices.head()

Unnamed: 0,length,indices
A0A024RBG1,181,0
A0A075B6H9,119,1
A0A075B6I0,122,2
A0A075B6I1,120,3
A0A075B6I4,117,4


In [18]:
def _split_by_length(pros):
    """Split proteins into train/valid/test index lists in a 3:1:1 pattern.

    The proteins are sorted by sequence length (via ``len_with_indices``) and
    walked in that order: out of every five consecutive proteins the first
    three go to train, the fourth to valid and the fifth to test.

    Args:
        pros: list of protein accessions present in ``len_with_indices``.

    Returns:
        Tuple ``(train, valid, test)`` of lists holding the proteins' row
        positions in ``pro_list``.
    """
    ordered = len_with_indices.loc[pros].sort_values(by='length')
    train, valid, test = [], [], []
    for i, index in enumerate(ordered['indices']):
        rest = i % 5
        if rest <= 2:
            # 0, 1, 2 for train
            train.append(index)
        elif rest == 3:
            # 3 for valid
            valid.append(index)
        else:
            # 4 for test
            test.append(index)
    return train, valid, test


def _as_index_array(indices):
    """Convert a list of row positions into an int32 NumPy array."""
    return np.array(indices, dtype=np.int32)


splits = {}

for fluid_name, P_pro, FN_pro, unknown_pro in zip(fluid_names, new_P_pro_list, new_FN_pro_list, unknown_pro_list):
    # Positive, fake negative and unknown proteins are split independently.
    tr_pos_index, va_pos_index, te_pos_index = _split_by_length(P_pro)
    tr_neg_index, va_neg_index, te_neg_index = _split_by_length(FN_pro)
    tr_unknown_index, va_unknown_index, te_unknown_index = _split_by_length(unknown_pro)

    # Evaluation set: every protein with all three kinds of data that is not
    # a (CD-HIT filtered) positive of this fluid.
    P_pro_set = set(P_pro)
    eval_pro = sorted(pro_set - P_pro_set)
    eval_index = len_with_indices.loc[eval_pro, 'indices'].tolist()

    tr_pos_len = len(tr_pos_index)
    tr_neg_len = len(tr_neg_index)
    tr_unknown_len = len(tr_unknown_index)
    va_pos_len = len(va_pos_index)
    va_neg_len = len(va_neg_index)
    va_unknown_len = len(va_unknown_index)
    te_pos_len = len(te_pos_index)
    te_neg_len = len(te_neg_index)
    te_unknown_len = len(te_unknown_index)

    tr_len = tr_pos_len + tr_neg_len + tr_unknown_len
    va_len = va_pos_len + va_neg_len + va_unknown_len
    te_len = te_pos_len + te_neg_len + te_unknown_len
    eval_len = len(eval_index)

    num_sum = tr_len + va_len + te_len

    print('{:s}: train({:d}, {:d}, {:d}), valid({:d}, {:d}, {:d}) test({:d}, {:d}, {:d}), eval({:d}), sum({:d})'
          .format(fluid_name, tr_pos_len, tr_neg_len, tr_unknown_len,
                  va_pos_len, va_neg_len, va_unknown_len,
                  te_pos_len, te_neg_len, te_unknown_len, eval_len, num_sum))

    splits[fluid_name] = {
        'tr_pos': _as_index_array(tr_pos_index),
        'tr_neg': _as_index_array(tr_neg_index),
        'tr_unknown': _as_index_array(tr_unknown_index),
        'va_pos': _as_index_array(va_pos_index),
        'va_neg': _as_index_array(va_neg_index),
        'va_unknown': _as_index_array(va_unknown_index),
        'te_pos': _as_index_array(te_pos_index),
        'te_neg': _as_index_array(te_neg_index),
        'te_unknown': _as_index_array(te_unknown_index),
        'eval': _as_index_array(eval_index)
    }

# The file holds two consecutive pickles: first the splits dictionary, then
# pro_list. A reader has to call pickle.load twice on the same file handle.
with open(osp.join(save_path, 'fluid-splits.pkl'), 'wb') as f:
    pickle.dump(splits, f)
    pickle.dump(pro_list, f)

Plasma: train(3918, 2914, 4784), valid(1306, 971, 1594) test(1306, 971, 1594), eval(13815), sum(19358)
Saliva: train(1513, 4830, 5274), valid(504, 1609, 1758) test(504, 1609, 1757), eval(17824), sum(19358)
Urine: train(4184, 2856, 4576), valid(1394, 952, 1525) test(1394, 952, 1525), eval(13373), sum(19358)
CSF: train(2450, 3769, 5397), valid(816, 1256, 1799) test(816, 1256, 1799), eval(16263), sum(19358)
Seminal: train(2358, 4338, 4920), valid(786, 1446, 1640) test(785, 1446, 1639), eval(16416), sum(19358)
Amniotic: train(1726, 5235, 4655), valid(575, 1745, 1551) test(575, 1745, 1551), eval(17469), sum(19358)
Tear: train(1107, 5759, 4752), valid(368, 1919, 1583) test(368, 1919, 1583), eval(18502), sum(19358)
BALF: train(1945, 4436, 5235), valid(648, 1478, 1745) test(648, 1478, 1745), eval(17104), sum(19358)
Milk: train(1395, 4401, 5821), valid(465, 1466, 1940) test(464, 1466, 1940), eval(18021), sum(19358)
Synovial: train(915, 5775, 4926), valid(305, 1925, 1642) test(305, 1924, 1641), 