# 对数据进行降冗余去重
> 2024-12-04   
> zhenkun.shi@tib.cas.cn  

## 1. 导入必要的包

In [1]:
# Extend the module search path so that project-local packages (presumably
# `config` and `tools`, imported below) can be found from this notebook.
# NOTE: '__file__' is passed as a string literal, not the __file__ variable, so
# realpath() resolves it against the current working directory and dirname()
# then yields that directory.
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')  # also search the parent directory
from config import conf as cfg
import pandas as pd
import json
import plotly.graph_objects as go
from tools import btools
from tqdm import tqdm
import tools.bioFunctionLib as bfl
from IPython.display import HTML
from pandarallel import pandarallel # import pandarallel

# Initialize pandarallel with its progress bar disabled.
pandarallel.initialize(progress_bar=False)
# Enable the autoreload extension in mode 2 so edited modules are reloaded
# before cells are executed.
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. 加载数据

In [2]:
# Read the training dataset from the Feather file whose location is set in
# the project configuration (cfg.FILE_DS_TRAIN).
ds_train = pd.read_feather(path=cfg.FILE_DS_TRAIN)

In [3]:
# Display the loaded training DataFrame as the cell output (the rendered
# output shows only the first and last rows).
ds_train

Unnamed: 0,uniprot_id,seq,reaction_id,ec_number,functionCounts,ec_specific_level,isenzyme,label
0,Q6GZX4,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Q6GZX3,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q197F8,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Q197F7,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Q6GZX2,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
508582,Q6UY62,MGNSKSKSKLSANQYEQQTVNSTKQVAILKRQAEPSLYGRHNCRCC...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
508583,P08105,MSSSLEITSFYSFIWTPHIGPLLFGIGLWFSMFKEPSHFCPCQHPH...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
508584,Q88470,MGNCNRTQKPSSSSNNLEKPPQAAEFRRTAEPSLYGRYNCKCCWFA...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
508585,A9JR22,MGLRYSKEVRDRHGDKDPEGRIPITQTMPQTLYGRYNCKSCWFANK...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## 3. CD-HIT clustering

In [8]:
# Cluster the training sequences with bfl.pycdhit at identity 0.9, then show
# only the rows flagged as the representative of their cluster.
cls_90 = bfl.pycdhit(
    uniportid_seq_df=ds_train,
    identity=0.9,
    thred_num=100,
)
cls_90.loc[cls_90["is_representative"]]

Unnamed: 0,cluster_id,uniprot_id,identity,is_representative,cluster_size
114,0,A8W3D6,1.0,True,381
381,1,Q8QPJ5,1.0,True,160
643,2,Q09FU6,1.0,True,155
700,3,P0CC32,1.0,True,145
841,4,Q7J1C5,1.0,True,135
...,...,...,...,...,...
507464,292925,Q9S5E2,1.0,True,1
507465,292926,A7TEE6,1.0,True,1
507466,292927,Q84RE1,1.0,True,1
507467,292928,Q9LUM6,1.0,True,1


In [8]:
# Cluster the training sequences with bfl.pycdhit at identity 0.8, then show
# only the rows flagged as the representative of their cluster.
cls_80 = bfl.pycdhit(
    uniportid_seq_df=ds_train,
    identity=0.8,
    thred_num=100,
)
cls_80.loc[cls_80["is_representative"]]

Unnamed: 0,cluster_id,uniprot_id,identity,is_representative,cluster_size
394,0,Q04910,1.0,True,983
1150,1,A8W3D6,1.0,True,532
1623,2,A4QLB3,1.0,True,201
1725,3,B3GZ21,1.0,True,192
1928,4,A6MM91,1.0,True,177
...,...,...,...,...,...
507464,235489,Q58744,1.0,True,1
507465,235490,Q72FU0,1.0,True,1
507466,235491,Q9YAB0,1.0,True,1
507467,235492,Q7N365,1.0,True,1


In [18]:
# Cluster the training sequences with bfl.pycdhit at identity 0.7, then show
# only the rows flagged as the representative of their cluster.
cls_70 = bfl.pycdhit(
    uniportid_seq_df=ds_train,
    identity=0.7,
    thred_num=100,
)
cls_70.loc[cls_70["is_representative"]]

Unnamed: 0,cluster_id,uniprot_id,identity,is_representative,cluster_size
311,0,Q9XNU9,1.0,True,819
989,1,A8W3D6,1.0,True,536
1482,2,Q8SGA0,1.0,True,489
2173,3,Q6S8J3,1.0,True,336
2417,4,A6W394,1.0,True,334
...,...,...,...,...,...
507464,189133,A1L0Z6,1.0,True,1
507465,189134,K7VCB9,1.0,True,1
507466,189135,A8GG83,1.0,True,1
507467,189136,O43247,1.0,True,1


In [19]:
# Cluster the training sequences with bfl.pycdhit at identity 0.6, then show
# only the rows flagged as the representative of their cluster.
cls_60 = bfl.pycdhit(
    uniportid_seq_df=ds_train,
    identity=0.6,
    thred_num=100,
)
cls_60.loc[cls_60["is_representative"]]

Unnamed: 0,cluster_id,uniprot_id,identity,is_representative,cluster_size
291,0,Q8SGA0,1.0,True,1582
1621,1,P29685,1.0,True,753
2525,2,A8W3D6,1.0,True,587
3277,3,B3E073,1.0,True,579
3687,4,C1CXF6,1.0,True,575
...,...,...,...,...,...
507464,147245,Q58120,1.0,True,1
507465,147246,Q02002,1.0,True,1
507466,147247,Q67N85,1.0,True,1
507467,147248,Q5DP50,1.0,True,1


In [20]:
# Cluster the training sequences with bfl.pycdhit at identity 0.5, then show
# only the rows flagged as the representative of their cluster.
cls_50 = bfl.pycdhit(
    uniportid_seq_df=ds_train,
    identity=0.5,
    thred_num=100,
)
cls_50.loc[cls_50["is_representative"]]

Unnamed: 0,cluster_id,uniprot_id,identity,is_representative,cluster_size
331,0,Q8SGA0,1.0,True,1682
2460,1,Q1JDK8,1.0,True,891
3170,2,O24310,1.0,True,847
4074,3,A9ETF4,1.0,True,801
4366,4,P19483,1.0,True,763
...,...,...,...,...,...
507464,109991,A5USV2,1.0,True,1
507465,109992,Q7VGZ3,1.0,True,1
507466,109993,P39657,1.0,True,1
507467,109994,Q796C3,1.0,True,1


In [21]:
# Cluster the training sequences with bfl.pycdhit at identity 0.4, then show
# only the rows flagged as the representative of their cluster.
cls_40 = bfl.pycdhit(
    uniportid_seq_df=ds_train,
    identity=0.4,
    thred_num=100,
)
cls_40.loc[cls_40["is_representative"]]

Unnamed: 0,cluster_id,uniprot_id,identity,is_representative,cluster_size
2,0,P51131,1.0,True,1662
2261,1,Q8EWY8,1.0,True,1022
3381,2,B3CQ26,1.0,True,980
3845,3,Q9ZI47,1.0,True,943
5536,4,Q01607,1.0,True,942
...,...,...,...,...,...
507464,78058,P30975,1.0,True,1
507465,78059,Q49AM3,1.0,True,1
507466,78060,Q20332,1.0,True,1
507467,78061,P54356,1.0,True,1
