In [2]:
import pandas as pd

In [19]:
idp_single = pd.read_csv('./datasets/idp_single.csv')

In [20]:
idp_single

Unnamed: 0,SASBDB_ID,uniprot_id,uniprot_range_first,uniprot_range_last,sequence
0,SASDBY5,Q9VYW2,509.0,716.0,GSGGIEGRHAGRQKVQEMKEKFSTIIKAEMPTQSSSPDLPASQAPQ...
1,SASDBZ6,Q8NBI3,,,MAGPAIHTAPMLFLVLLLPLELSLAGALAPGTPARNLPENHIDLPG...
2,SASDC62,Q15554,42.0,86.0,GPPGSMAGGGGSSDGSGRAAGRRASRSSGRARRGRHEPGLGGPAERGAG
3,SASDC53,P08083,1.0,90.0,MGSNGADNAHNNAFGGGKNPGIGNTSGAGSNGSASSNRGNSNGWSW...
4,SASDCY4,Q46977,603.0,850.0,ERQQDRRKPRQNNRRDRNERRDTRSERTEGSDNREENRRNRRQAQQ...
...,...,...,...,...,...
109,SASDT67,P08551,441.0,543.0,MGCGPAYYNSHVQEEQTEVEETIEATKAEEAKDEPPSEGEAEEEEK...
110,SASDTY6,P08551,441.0,543.0,MGCGPAYYNSHVQEEQTEVEETIEATKAEEAKDEPPSEGEAEEEEK...
111,SASDU37,F4IED2,161.0,274.0,SGSGPKNGEQYGAPFIEEEWAEDDDDDVDEPANQLVVSASVDNSLW...
112,SASDU67,Q13426,201.0,336.0,GSAAQEREKDIKQEGETAICSEMTADRDPVYDESTDEESENQTDLS...


In [21]:
from localcider.sequenceParameters import SequenceParameters

# Function to compute features using LocalCIDER
def compute_localcider_features(seq):
    try:
        seq_obj = SequenceParameters(seq)

        return {
            "NCPR": seq_obj.get_NCPR(),               # Net charge per residue
            "FCR": seq_obj.get_FCR(),                 # Fraction of charged residues
            "Hydropathy": seq_obj.get_mean_hydropathy(),
            "Omega": seq_obj.get_Omega(),
            "Kappa": seq_obj.get_kappa(),             # Charge patterning parameter
            "Sequence_length": len(seq)
        }
    except:

        return {
            "NCPR": None,
            "FCR": None,
            "Hydropathy": None,
            "Omega": None,
            "Kappa": None,
            "Sequence_length": None
        }

# Apply function to each row and expand results into columns
features_df = idp_single["sequence"].apply(compute_localcider_features).apply(pd.Series)

# Merge back with original dataframe
idp_single = pd.concat([idp_single, features_df], axis=1)

print(idp_single)

    SASBDB_ID uniprot_id  uniprot_range_first  uniprot_range_last  \
0     SASDBY5     Q9VYW2                509.0               716.0   
1     SASDBZ6     Q8NBI3                  NaN                 NaN   
2     SASDC62     Q15554                 42.0                86.0   
3     SASDC53     P08083                  1.0                90.0   
4     SASDCY4     Q46977                603.0               850.0   
..        ...        ...                  ...                 ...   
109   SASDT67     P08551                441.0               543.0   
110   SASDTY6     P08551                441.0               543.0   
111   SASDU37     F4IED2                161.0               274.0   
112   SASDU67     Q13426                201.0               336.0   
113   SASDUX9        NaN                  NaN                 NaN   

                                              sequence      NCPR       FCR  \
0    GSGGIEGRHAGRQKVQEMKEKFSTIIKAEMPTQSSSPDLPASQAPQ... -0.009217  0.211982   
1    MAGPAIHTAP

In [32]:
idp_single.dropna(subset=['uniprot_id']).sort_values('Sequence_length').drop_duplicates(subset=['sequence']).reset_index()


Unnamed: 0,index,SASBDB_ID,uniprot_id,uniprot_range_first,uniprot_range_last,sequence,NCPR,FCR,Hydropathy,Omega,Kappa,Sequence_length
0,37,SASDHH8,P15516,20.0,43.0,DSHAKRHHGYKRKFHEKHHSHRGY,0.208333,0.375,2.045833,0.032558,0.248794,24.0
1,2,SASDC62,Q15554,42.0,86.0,GPPGSMAGGGGSSDGSGRAAGRRASRSSGRARRGRHEPGLGGPAERGAG,0.122449,0.244898,3.391837,0.16993,0.32888,49.0
2,46,SASDKD6,Q9UTR8,288.0,345.0,GAMGISLPLLKQDDWLSSSKPFGSSTPNVVIEFDSDDDGDDFSNSK...,-0.114286,0.228571,3.49,0.185588,0.35868,70.0
3,71,SASDQE7,O55777,200.0,254.0,MSYYHHHHHHLESTSLYKKAGFTPTEEPPVIPEAAAGSGRRGDLSK...,-0.103896,0.285714,3.376623,0.181415,0.365304,77.0
4,70,SASDQD7,O55777,200.0,254.0,MSYYHHHHHHLESTSLYKKAGFTPTEEPPVIPEYYYGSGRRGDLSK...,-0.103896,0.285714,3.255844,0.181415,0.365304,77.0
5,75,SASDSN2,A0A178VHQ2,54.0,131.0,AAPEGGISDVVEKSIKEAQETCAGDPVSGECVAAWDEVEELSAAAS...,-0.153846,0.358974,3.589744,0.102266,0.1571,78.0
6,33,SASDF27,Q9WTL8,530.0,625.0,GPDASSPGGKKILNGGTPDIPSTGLLPGQAQETPGYPYSDSSSILG...,-0.122449,0.163265,4.227551,0.153661,0.204474,98.0
7,3,SASDC53,P08083,1.0,90.0,MGSNGADNAHNNAFGGGKNPGIGNTSGAGSNGSASSNRGNSNGWSW...,0.020408,0.122449,3.016327,0.515468,0.25791,98.0
8,5,SASDCY9,A6Q0K5,28.0,107.0,HHHHHHHHHHSSGHIEGRHMSGQPAVDLNKKVQDAVKEAEDACAKG...,-0.1,0.3,3.707,0.186189,0.13746,100.0
9,53,SASDKU8,P04275,1596.0,1668.0,MEDREQAPNLVYMVTGNPASDEIKRLPGDIQVVPIGVGPNANVQEL...,-0.058252,0.213592,3.913592,0.193836,0.102972,103.0
