In [90]:
import pandas as pd
import numpy as np
import os

In [91]:
# Load the BioGRID tab3 (tab-separated) interaction exports for the two genes
# of interest: BioGRID gene 118845 (SETD2) -> df1, gene 120505 (SETD5) -> df2.
_biogrid_tab3_files = (
    r'/lustre/home/acct-bmelgn/bmelgn-2/QianWei/MIPPI2/src/kaggle/setd_test/BIOGRID-GENE-118845-4.0.189.tab3.txt',
    r'/lustre/home/acct-bmelgn/bmelgn-2/QianWei/MIPPI2/src/kaggle/setd_test/BIOGRID-GENE-120505-4.0.189.tab3.txt',
)
df1, df2 = [pd.read_csv(tab3_path, sep='\t') for tab3_path in _biogrid_tab3_files]

## Collect all protein UniProt IDs so the corresponding FASTA sequences can be fetched

In [92]:
# Distinct accession strings appearing as either interactor in each BioGRID table.
_accession_cols = ['SWISS-PROT Accessions Interactor A', 'SWISS-PROT Accessions Interactor B']
all_id1, all_id2 = (
    pd.unique(frame[_accession_cols].values.ravel('K')) for frame in (df1, df2)
)

In [93]:
# Union of the accession strings found in both interaction tables.
all_id = set(all_id1.tolist()).union(all_id2.tolist())
# Manual step: upload these protein IDs on the UniProt ID-mapping page, download
# the corresponding FASTA file and rename it to 'uniprot_.fasta'.

## Build the protein-to-sequence table

In [94]:
# Parse 'uniprot_.fasta' into two parallel lists:
#   protein_name[k] -> accession taken from the header (second '|'-separated field)
#   fasta[k]        -> the record's sequence with line breaks removed
# Blank lines are skipped (indexing line[0] on an empty line would raise
# IndexError), residue lines that precede the first header are ignored, and an
# empty file yields two empty lists so the lists always have equal length.
protein_name = []
fasta = []
with open(r'uniprot_.fasta', 'r') as f:
    seq_chunks = None  # None until the first '>' header has been seen
    for line in f:
        line = line.strip()
        if not line:
            continue
        if line.startswith('>'):
            if seq_chunks is not None:
                # A new header closes the previous record.
                fasta.append(''.join(seq_chunks))
            protein_name.append(line.split('|')[1])
            seq_chunks = []
        elif seq_chunks is not None:
            seq_chunks.append(line)
    if seq_chunks is not None:
        # Flush the last record, which has no following header.
        fasta.append(''.join(seq_chunks))

In [95]:
# Sequence lookup table: one row per parsed FASTA record, indexed by accession.
df = pd.DataFrame({'id': protein_name, 'seq': fasta}).set_index('id')

In [96]:
# Preview the first rows of the accession -> sequence table.
df.head()

Unnamed: 0_level_0,seq
id,Unnamed: 1_level_1
Q8TEL6,MAAAPVAAGSGAGRGRRSAATVAAWGGWGGRPRPGNILLQLRQGQL...
Q8NHQ8,MELKVWVDGVQRIVCGVTEVTTCQEVVIALAQAIGRTGRYTLIEKW...
Q86WV1,MQAAALPEEIRWLLEDAEEFLAEGLRNENLSAVARDHRDHILRGFQ...
P14866,MSRRLLPRAEKRRRRLEQRQQPDEQRRRSGAMVKMAAAGGGGGGGR...
Q15717,MSNGYEDHMAEDCRGDIGRTNLIVNYLPQNMTQDELRSLFSSIGEV...


## Build the interaction table

In [97]:
# Stack both BioGRID tables and keep only interactions where both partners
# have a SWISS-PROT accession ('-' marks a missing one). reset_index() keeps
# the previous row labels as an 'index' column.
df_all = pd.concat([df1, df2])
_both_known = (
    (df_all['SWISS-PROT Accessions Interactor A'] != '-')
    & (df_all['SWISS-PROT Accessions Interactor B'] != '-')
)
df_all = df_all[_both_known].reset_index()
df_all.head()

Unnamed: 0,index,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,...,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types,Organism Name Interactor A,Organism Name Interactor B
0,0,7994,3064,29072,109314,118845,-,HSPC069,HTT,SETD2,...,-,NP_054878,-,-,-,-,-,-,Homo sapiens,Homo sapiens
1,1,260572,29072,3064,118845,109314,HSPC069,-,SETD2,HTT,...,-,NP_002102,-,-,-,-,-,-,Homo sapiens,Homo sapiens
2,2,458039,55677,29072,120807,118845,-,HSPC069,IWS1,SETD2,...,-,NP_054878,-,-,-,-,-,-,Homo sapiens,Homo sapiens
3,3,479475,29072,7157,118845,113010,HSPC069,-,SETD2,TP53,...,H2EHT1|A0A087WT22|A0A087WXZ1|Q53GA5|A0A087X1Q1...,NP_001119586|NP_001119590|NP_001119584|NP_0005...,-,-,-,-,-,-,Homo sapiens,Homo sapiens
4,4,479476,7157,29072,113010,118845,-,HSPC069,TP53,SETD2,...,-,NP_054878,-,-,-,-,-,-,Homo sapiens,Homo sapiens


In [98]:
# Build an orientation-independent key per interaction: the accession of the
# protein of interest (Q9C0A6 or Q9BYW2) comes first when it is interactor A;
# otherwise the two accessions are swapped so that interactor B leads.
_proteins_of_interest = ('Q9C0A6', 'Q9BYW2')
unique_id = [
    acc_a + acc_b if acc_a in _proteins_of_interest else acc_b + acc_a
    for acc_a, acc_b in zip(
        df_all['SWISS-PROT Accessions Interactor A'],
        df_all['SWISS-PROT Accessions Interactor B'],
    )
]
df_all['uid'] = unique_id

In [99]:
# Row/column count before de-duplicating on 'uid'.
df_all.shape

(85, 39)

In [100]:
# Keep only the first row for each 'uid' key (modifies df_all in place).
df_all.drop_duplicates(['uid'], inplace=True)

In [101]:
# Row/column count after de-duplicating on 'uid'.
df_all.shape

(68, 39)

In [102]:
# For every interaction, decide which partner is the protein of interest
# ("mut": Q9C0A6 or Q9BYW2) and which is its partner ("par"), then fetch both
# sequences from the accession-indexed table `df`.
_mutated_accessions = ('Q9C0A6', 'Q9BYW2')


def _sequence_for(accession):
    """Return the sequence stored in `df` for a BioGRID accession field.

    The field may list several accessions joined by '|'; the first one is used
    for the lookup. Splitting on '|' (rather than slicing a fixed number of
    characters) also works for accessions longer than six characters. Raises
    KeyError when that accession is not present in `df`.
    """
    return df.loc[accession.split('|')[0], 'seq']


mut_id = []
par_id = []
mut_seq = []
par_seq = []
for i in df_all.index:
    acc_a = df_all.loc[i, 'SWISS-PROT Accessions Interactor A']
    acc_b = df_all.loc[i, 'SWISS-PROT Accessions Interactor B']
    if acc_a in _mutated_accessions:
        mutated, partner = acc_a, acc_b
    else:
        mutated, partner = acc_b, acc_a
    # The accession fields are stored unchanged; only the sequence lookup
    # normalises multi-accession fields, identically for both orientations.
    mut_id.append(mutated)
    par_id.append(partner)
    mut_seq.append(_sequence_for(mutated))
    par_seq.append(_sequence_for(partner))

In [103]:
# Attach the per-interaction accession and sequence lists as new columns.
for _column, _values in (
    ('mutAC', mut_id),
    ('parAC', par_id),
    ('mut0', mut_seq),
    ('par0', par_seq),
):
    df_all[_column] = _values

In [None]:
# Point substitutions to apply (1-based residue positions).
# Q9BYW2 (SETD2) gets three variants, Q9C0A6 (SETD5) gets one.
setd2_pos = [1624, 1815, 1666]
setd2_ori = ['S', 'L', 'Y']
setd2_mut = ['C', 'T', 'C']
setd5_pos = 77
setd5_ori = 'R'
setd5_mut = 'C'

# Build one frame per variant with `.assign`, which returns a new frame instead
# of writing columns onto a filtered slice of df_all, and concatenate once at
# the end. Row order: the SETD5 variant first, then the SETD2 variants in list order.
_variant_frames = [
    df_all[df_all['mutAC'] == 'Q9C0A6'].assign(
        pos=setd5_pos, oriaa=setd5_ori, mutaa=setd5_mut
    )
]
_setd2_rows = df_all[df_all['mutAC'] == 'Q9BYW2']
for _pos, _ori, _mut in zip(setd2_pos, setd2_ori, setd2_mut):
    _variant_frames.append(_setd2_rows.assign(pos=_pos, oriaa=_ori, mutaa=_mut))

# NOTE: despite its name, df_setd5 holds the rows of both proteins.
df_setd5 = pd.concat(_variant_frames)

In [106]:
# Renumber the rows 0..n-1 in place; drop=True discards the old index labels
# instead of adding them as a column.
df_setd5.reset_index(inplace=True, drop=True)

In [107]:
# Preview the variant table after adding pos / oriaa / mutaa.
df_setd5.head()

Unnamed: 0,index,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,...,Organism Name Interactor A,Organism Name Interactor B,uid,mutAC,parAC,mut0,par0,pos,oriaa,mutaa
0,0,120115,55209,151871,120505,127407,-,-,SETD5,DPPA2,...,Homo sapiens,Homo sapiens,Q9C0A6Q7Z7J5,Q9C0A6,Q7Z7J5,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,MSDANLDSSKKNFLEGEVDDEESVILTLVPVKDDANMEQMEPSVSS...,77,R,C
1,1,120480,11007,55209,116198,120505,-,-,CCDC85B,SETD5,...,Homo sapiens,Homo sapiens,Q9C0A6Q15834,Q9C0A6,Q15834,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,MEAEAGGLEELTDEEMAALGKEELVRRLRREEAARLAALVQRGRLM...,77,R,C
2,2,121055,7186,55209,113038,120505,-,-,TRAF2,SETD5,...,Homo sapiens,Homo sapiens,Q9C0A6Q12933,Q9C0A6,Q12933,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,MAAASVTPPGSLELLQPGFSKTLLGTKLEAKYLCSACRNVLRRPFQ...,77,R,C
3,3,838797,55209,23641,120505,117169,-,-,SETD5,LDOC1,...,Homo sapiens,Homo sapiens,Q9C0A6O95751,Q9C0A6,O95751,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,MVDELVLLLHALLMRHRALSIENSQLMEQLRLLVCERASLLRQVRP...,77,R,C
4,4,1046359,23281,55209,116880,120505,-,-,MTUS2,SETD5,...,Homo sapiens,Homo sapiens,Q9C0A6Q5JR59,Q9C0A6,Q5JR59,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,MSVPVAPKKSCYTQLRDNRNAARNNNESILSLGDTNANQIMLEVSS...,77,R,C


In [108]:
# For every row build:
#   mut1    - full sequence with the residue at `pos` (1-based) replaced by `mutaa`
#   mut0_51 - 51-residue window of the original sequence centred on `pos`
#   mut1_51 - the same window taken from the mutated sequence
# Sequences are padded with 25 '0' characters on each side so that the window
# always has 51 characters, even near the termini.
# A row whose position is out of range or whose residue does not equal `oriaa`
# gets None in all three lists, keeping them aligned with df_setd5's rows.
mut1 = []
mut0_51 = []
mut1_51 = []
_pad = '0' * 25
for i in df_setd5.index:
    wild_seq = df_setd5.loc[i, 'mut0']
    pos = df_setd5.loc[i, 'pos']
    expected = df_setd5.loc[i, 'oriaa']
    if 1 <= pos <= len(wild_seq) and wild_seq[pos - 1] == expected:
        mut1_ = wild_seq[:pos - 1] + df_setd5.loc[i, 'mutaa'] + wild_seq[pos:]
        mut1.append(mut1_)
        # In the padded string, residue `pos` sits at index pos - 1 + 25, i.e.
        # the middle of the slice [pos - 1, pos + 50).
        mut0_51.append((_pad + wild_seq + _pad)[pos - 1:pos + 50])
        mut1_51.append((_pad + mut1_ + _pad)[pos - 1:pos + 50])
    else:
        print('match error: row {} does not have {} at position {}'.format(i, expected, pos))
        mut1.append(None)
        mut0_51.append(None)
        mut1_51.append(None)

In [109]:
# Store the mutated sequences and the two 51-residue windows as columns.
for _column, _values in (
    ('mut1', mut1),
    ('mut0_51', mut0_51),
    ('mut1_51', mut1_51),
):
    df_setd5[_column] = _values

In [110]:
# Preview the table with the mut1 / mut0_51 / mut1_51 columns added.
df_setd5.head()

Unnamed: 0,index,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,...,mutAC,parAC,mut0,par0,pos,oriaa,mutaa,mut1,mut0_51,mut1_51
0,0,120115,55209,151871,120505,127407,-,-,SETD5,DPPA2,...,Q9C0A6,Q7Z7J5,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,MSDANLDSSKKNFLEGEVDDEESVILTLVPVKDDANMEQMEPSVSS...,77,R,C,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,GCRGLPYATIIPRSDLNGLPSPVEERCGDSPNSEGETVPTWCPCGL...,GCRGLPYATIIPRSDLNGLPSPVEECCGDSPNSEGETVPTWCPCGL...
1,1,120480,11007,55209,116198,120505,-,-,CCDC85B,SETD5,...,Q9C0A6,Q15834,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,MEAEAGGLEELTDEEMAALGKEELVRRLRREEAARLAALVQRGRLM...,77,R,C,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,GCRGLPYATIIPRSDLNGLPSPVEERCGDSPNSEGETVPTWCPCGL...,GCRGLPYATIIPRSDLNGLPSPVEECCGDSPNSEGETVPTWCPCGL...
2,2,121055,7186,55209,113038,120505,-,-,TRAF2,SETD5,...,Q9C0A6,Q12933,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,MAAASVTPPGSLELLQPGFSKTLLGTKLEAKYLCSACRNVLRRPFQ...,77,R,C,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,GCRGLPYATIIPRSDLNGLPSPVEERCGDSPNSEGETVPTWCPCGL...,GCRGLPYATIIPRSDLNGLPSPVEECCGDSPNSEGETVPTWCPCGL...
3,3,838797,55209,23641,120505,117169,-,-,SETD5,LDOC1,...,Q9C0A6,O95751,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,MVDELVLLLHALLMRHRALSIENSQLMEQLRLLVCERASLLRQVRP...,77,R,C,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,GCRGLPYATIIPRSDLNGLPSPVEERCGDSPNSEGETVPTWCPCGL...,GCRGLPYATIIPRSDLNGLPSPVEECCGDSPNSEGETVPTWCPCGL...
4,4,1046359,23281,55209,116880,120505,-,-,MTUS2,SETD5,...,Q9C0A6,Q5JR59,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,MSVPVAPKKSCYTQLRDNRNAARNNNESILSLGDTNANQIMLEVSS...,77,R,C,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,GCRGLPYATIIPRSDLNGLPSPVEERCGDSPNSEGETVPTWCPCGL...,GCRGLPYATIIPRSDLNGLPSPVEECCGDSPNSEGETVPTWCPCGL...


In [111]:
# Variant identifier of the form '<accession>_<original><position><mutant>',
# e.g. 'Q9C0A6_R77C'.
_substitution = df_setd5['oriaa'] + df_setd5['pos'].astype('str') + df_setd5['mutaa']
mutAC1 = df_setd5['mutAC'] + '_' + _substitution
df_setd5['mutAC1'] = mutAC1

In [112]:
# par_unique = df_setd5.drop_duplicates(['parAC'], keep='first')
# mut_unique = df_setd5.drop_duplicates(['mutAC', 'pos', 'oriaa', 'mutaa'], keep='first')

# for i in par_unique.index:
#     with open('./ori_mut_fastas/' + par_unique.loc[i, 'parAC'], 'w') as f:
#         f.write('>' + par_unique.loc[i, 'parAC'] + '\n')
#         f.write(par_unique.loc[i, 'par0'])

# for i in mut_unique.index:
#     with open('./ori_mut_fastas/' + mut_unique.loc[i, 'mutAC'], 'w') as f:
#         f.write('>' + mut_unique.loc[i, 'mutAC'] + '\n')
#         f.write(mut_unique.loc[i, 'mut0'])

# for i in mut_unique.index:
#     with open('./ori_mut_fastas/' + mut_unique.loc[i, 'mutAC'] + '_' + mut_unique.loc[i, 'oriaa'] + mut_unique.loc[i, 'pos'].astype('str') + mut_unique.loc[i, 'mutaa'], 'w') as f:
#         f.write('>' + mut_unique.loc[i, 'mutAC'] + '_' + mut_unique.loc[i, 'oriaa'] + mut_unique.loc[i, 'pos'].astype('str') + mut_unique.loc[i, 'mutaa'] + '\n')
#         f.write(mut_unique.loc[i, 'mut1'])

In [113]:
def load_file(file_path):
    """Read a text file and return its lines.

    Returns the list produced by ``readlines()``. If the file cannot be
    opened, a message is printed and the string ``'nan'`` is returned instead.
    """
    try:
        with open(file_path) as f_in:
            return f_in.readlines()
    except IOError:
        print('Can not open file: ' + file_path)
        return 'nan'


def parse_pssm(filelines, winsize=51, pssm_root=None, mutated_pos=None, most1024=False):
    """Load ``<pssm_root>/<filelines>.pssm`` and return its score matrix.

    Parameters
    ----------
    filelines : str
        File name without the ``.pssm`` extension.
    winsize : int
        Number of rows in the window returned when ``mutated_pos`` is given.
    pssm_root : str
        Directory containing the PSSM files; must be provided.
    mutated_pos : int or None
        1-based position. When given, only a ``winsize``-row window centred on
        that position is returned, zero-padded where it runs past either end.
    most1024 : bool
        When True, matrices longer than 1024 rows are cut to the first 1024.

    Returns
    -------
    numpy.ndarray or str
        A float array with 20 columns, or the string ``'nan'`` when the file
        could not be opened.

    Only lines with exactly 44 whitespace-separated fields are used; fields
    2..21 of each such line form one row. Matrices shorter than 1024 rows are
    zero-padded to 1024 rows before any windowing.
    """
    filelines = load_file(os.path.join(pssm_root, filelines + '.pssm'))
    if isinstance(filelines, str) and filelines == 'nan':
        return 'nan'

    # Collect the rows first and convert once, instead of re-allocating the
    # whole array for every line.
    score_rows = []
    for line in filelines:
        fields = line.split()
        if len(fields) == 44:
            score_rows.append(fields[2:22])
    pssmvalue = np.array(score_rows).astype(float).reshape(-1, 20)

    if pssmvalue.shape[0] < 1024:
        pssmvalue = np.r_[pssmvalue, np.zeros([1024 - pssmvalue.shape[0], 20])]
    if most1024 and pssmvalue.shape[0] > 1024:
        pssmvalue = pssmvalue[:1024, :]

    if mutated_pos is not None:
        # Pad half a window of zero rows on each side; the target row then
        # sits at padded index (mutated_pos - 1 + half), the window centre.
        half = winsize // 2
        padding = np.zeros([half, 20])
        pssmvalue = np.r_[padding, pssmvalue, padding]
        pssmvalue = pssmvalue[mutated_pos - 1: mutated_pos - 1 + winsize, :]

    return pssmvalue

In [None]:
# Directory that holds the '<name>.pssm' files read by parse_pssm;
# change it to your own PSSM root.
pssm_path = '/lustre/home/acct-bmelgn/bmelgn-2/QianWei/app/psipred_file/psipred/BLAST+/setd_test0923/pssm'

In [114]:
def _pssm_loaded(column):
    """Boolean mask that is True where `column` holds a PSSM array.

    parse_pssm returns the string 'nan' when a file cannot be opened. Testing
    each cell with isinstance avoids comparing whole arrays against a string,
    which is what `column == 'nan'` does on a column of ndarrays.
    """
    is_missing = column.map(lambda value: isinstance(value, str) and value == 'nan')
    return ~is_missing.astype(bool)


# Full-length PSSM of the partner protein.
pssm_par0 = [parse_pssm(x, pssm_root=pssm_path) for x in df_setd5['parAC']]
df_setd5['pssm_par0'] = pssm_par0
# .copy() makes each filtered frame independent, so the column assignments
# below do not write onto a slice of the previous frame.
df_setd5 = df_setd5[_pssm_loaded(df_setd5['pssm_par0'])].copy()
print('after pssm_par0: {}'.format(df_setd5.shape))

# Windowed PSSM of the original protein around the substituted position.
# The PSSM window is only available for single-mutation items.
pssm_mut0 = [parse_pssm(df_setd5.loc[i, 'mutAC'], pssm_root=pssm_path, mutated_pos=df_setd5.loc[i, 'pos']) for i in df_setd5.index]
df_setd5['pssm_win_mut0'] = pssm_mut0
df_setd5 = df_setd5[_pssm_loaded(df_setd5['pssm_win_mut0'])].copy()
print('after pssm_mut0: {}'.format(df_setd5.shape))

# Windowed PSSM of the mutated protein (file named after mutAC1, e.g. 'Q9C0A6_R77C').
pssm_mut1 = [parse_pssm(df_setd5.loc[i, 'mutAC1'], pssm_root=pssm_path, mutated_pos=df_setd5.loc[i, 'pos']) for i in df_setd5.index]
df_setd5['pssm_win_mut1'] = pssm_mut1
df_setd5 = df_setd5[_pssm_loaded(df_setd5['pssm_win_mut1'])].copy()
print('after pssm_mut1: {}'.format(df_setd5.shape))



# pssm_mut0 = [parse_pssm(df_w.loc[i, 'oriAC'], pssm_root=pssm_path, mutated_pos=df_w.loc[i, 'pos1_w']) for i in df_w.index]
# # pssm 51 window only available to single mutation items
# df_w['pssm_mut0_win'] = pssm_mut0
# df_w = df_w[~(df_w['pssm_mut0_win'] == 'nan')]
# print('after pssm_mut0: {}'.format(df_w.shape))

# pssm_mut1 = [parse_pssm(df_w.loc[i, 'mutAC'], pssm_root=pssm_path, mutated_pos=df_w.loc[i, 'pos1_w']) for i in df_w.index]
# # pssm 51 window only available to single mutation items
# df_w['pssm_mut1_win'] = pssm_mut1
# df_w = df_w[~(df_w['pssm_mut1_win'] == 'nan')]
# print('after pssm_mut1: {}'.format(df_w.shape))

Can not open file: /lustre/home/acct-bmelgn/bmelgn-2/QianWei/app/psipred_file/psipred/BLAST+/setd_test0923/pssm/P86478|P86479|P86481|P86480|P86496.pssm


  result = libops.scalar_compare(x.ravel(), y, op)


after pssm_par0: (163, 51)
after pssm_mut0: (163, 52)
after pssm_mut1: (163, 53)


In [115]:
# Display the table after the three PSSM columns were added and filtered.
df_setd5

Unnamed: 0,index,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,...,pos,oriaa,mutaa,mut1,mut0_51,mut1_51,mutAC1,pssm_par0,pssm_win_mut0,pssm_win_mut1
0,0,120115,55209,151871,120505,127407,-,-,SETD5,DPPA2,...,77,R,C,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,GCRGLPYATIIPRSDLNGLPSPVEERCGDSPNSEGETVPTWCPCGL...,GCRGLPYATIIPRSDLNGLPSPVEECCGDSPNSEGETVPTWCPCGL...,Q9C0A6_R77C,"[[-3.0, -4.0, -5.0, -6.0, -4.0, -3.0, -4.0, -5...","[[-1.0, -3.0, 0.0, 1.0, -2.0, -2.0, -2.0, 3.0,...","[[-1.0, -2.0, 1.0, 0.0, -1.0, 0.0, -1.0, 2.0, ..."
1,1,120480,11007,55209,116198,120505,-,-,CCDC85B,SETD5,...,77,R,C,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,GCRGLPYATIIPRSDLNGLPSPVEERCGDSPNSEGETVPTWCPCGL...,GCRGLPYATIIPRSDLNGLPSPVEECCGDSPNSEGETVPTWCPCGL...,Q9C0A6_R77C,"[[-1.0, -2.0, -2.0, -3.0, -2.0, -2.0, -3.0, -3...","[[-1.0, -3.0, 0.0, 1.0, -2.0, -2.0, -2.0, 3.0,...","[[-1.0, -2.0, 1.0, 0.0, -1.0, 0.0, -1.0, 2.0, ..."
2,2,121055,7186,55209,113038,120505,-,-,TRAF2,SETD5,...,77,R,C,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,GCRGLPYATIIPRSDLNGLPSPVEERCGDSPNSEGETVPTWCPCGL...,GCRGLPYATIIPRSDLNGLPSPVEECCGDSPNSEGETVPTWCPCGL...,Q9C0A6_R77C,"[[-6.0, -7.0, -8.0, -9.0, -7.0, -6.0, -7.0, -8...","[[-1.0, -3.0, 0.0, 1.0, -2.0, -2.0, -2.0, 3.0,...","[[-1.0, -2.0, 1.0, 0.0, -1.0, 0.0, -1.0, 2.0, ..."
3,3,838797,55209,23641,120505,117169,-,-,SETD5,LDOC1,...,77,R,C,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,GCRGLPYATIIPRSDLNGLPSPVEERCGDSPNSEGETVPTWCPCGL...,GCRGLPYATIIPRSDLNGLPSPVEECCGDSPNSEGETVPTWCPCGL...,Q9C0A6_R77C,"[[-4.0, -5.0, -5.0, -6.0, -5.0, -4.0, -5.0, -6...","[[-1.0, -3.0, 0.0, 1.0, -2.0, -2.0, -2.0, 3.0,...","[[-1.0, -2.0, 1.0, 0.0, -1.0, 0.0, -1.0, 2.0, ..."
4,4,1046359,23281,55209,116880,120505,-,-,MTUS2,SETD5,...,77,R,C,MSIAIPLGVTTSDTSYSDMAAGSDPESVEASPAVNEKSVYSTHNYG...,GCRGLPYATIIPRSDLNGLPSPVEERCGDSPNSEGETVPTWCPCGL...,GCRGLPYATIIPRSDLNGLPSPVEECCGDSPNSEGETVPTWCPCGL...,Q9C0A6_R77C,"[[-3.0, -4.0, -4.0, -5.0, -4.0, -3.0, -4.0, -5...","[[-1.0, -3.0, 0.0, 1.0, -2.0, -2.0, -2.0, 3.0,...","[[-1.0, -2.0, 1.0, 0.0, -1.0, 0.0, -1.0, 2.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,60,2631975,8359,29072,113955,118845,-,HSPC069,HIST1H4A,SETD2,...,1666,Y,C,MKQLQPQPPPKMGDFYDPEHPTPEEEENEAKIENVQKTGFIKGPMF...,TVNGQLRVGFFTTKLVPSGSELTFDYQFQRYGKEAQKCFCGSANCR...,TVNGQLRVGFFTTKLVPSGSELTFDCQFQRYGKEAQKCFCGSANCR...,Q9BYW2_Y1666C,"[[-5.0, -6.0, -6.0, -7.0, -6.0, -5.0, -6.0, -7...","[[-1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, -1.0, 2....","[[-1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, -1.0, 2...."
160,61,2639647,26094,29072,117545,118845,-,HSPC069,DCAF4,SETD2,...,1666,Y,C,MKQLQPQPPPKMGDFYDPEHPTPEEEENEAKIENVQKTGFIKGPMF...,TVNGQLRVGFFTTKLVPSGSELTFDYQFQRYGKEAQKCFCGSANCR...,TVNGQLRVGFFTTKLVPSGSELTFDCQFQRYGKEAQKCFCGSANCR...,Q9BYW2_Y1666C,"[[-4.0, -5.0, -6.0, -7.0, -5.0, -4.0, -6.0, -6...","[[-1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, -1.0, 2....","[[-1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, -1.0, 2...."
161,62,2747619,9820,29072,115159,118845,-,HSPC069,CUL7,SETD2,...,1666,Y,C,MKQLQPQPPPKMGDFYDPEHPTPEEEENEAKIENVQKTGFIKGPMF...,TVNGQLRVGFFTTKLVPSGSELTFDYQFQRYGKEAQKCFCGSANCR...,TVNGQLRVGFFTTKLVPSGSELTFDCQFQRYGKEAQKCFCGSANCR...,Q9BYW2_Y1666C,"[[-3.0, -3.0, -4.0, -5.0, -3.0, -2.0, -4.0, -4...","[[-1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, -1.0, 2....","[[-1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, -1.0, 2...."
162,63,2754851,1489680,29072,4383947,118845,SARS-CoV-1ab-nsp9ab,HSPC069,nsp9ab,SETD2,...,1666,Y,C,MKQLQPQPPPKMGDFYDPEHPTPEEEENEAKIENVQKTGFIKGPMF...,TVNGQLRVGFFTTKLVPSGSELTFDYQFQRYGKEAQKCFCGSANCR...,TVNGQLRVGFFTTKLVPSGSELTFDCQFQRYGKEAQKCFCGSANCR...,Q9BYW2_Y1666C,"[[-2.0, -2.0, -3.0, -4.0, -2.0, -1.0, -3.0, -4...","[[-1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, -1.0, 2....","[[-1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, -1.0, 2...."


In [118]:
# Persist the assembled table, including the array-valued PSSM columns, as a pickle file.
df_setd5.to_pickle('../data/setd_dataset.dataset')

In [116]:
# Shape of the first row's windowed PSSM.
df_setd5['pssm_win_mut0'].values[0].shape

(51, 20)

In [117]:
# Print the positional row number of every windowed PSSM that does not have
# exactly 51 rows; no output means all windows have the expected length.
for row_number, window in enumerate(df_setd5['pssm_win_mut0'].values):
    if window.shape[0] != 51:
        print(row_number)

In [119]:
# Show every field of the first row.
df_setd5.iloc[0]

index                                                                                 0
#BioGRID Interaction ID                                                          120115
Entrez Gene Interactor A                                                          55209
Entrez Gene Interactor B                                                         151871
BioGRID ID Interactor A                                                          120505
BioGRID ID Interactor B                                                          127407
Systematic Name Interactor A                                                          -
Systematic Name Interactor B                                                          -
Official Symbol Interactor A                                                      SETD5
Official Symbol Interactor B                                                      DPPA2
Synonyms Interactor A                                                                 -
Synonyms Interactor B           