In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import load_model
# from keras.metrics import sparse_top_k_categorical_accuracy
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import tensorflow.keras.backend as K
import numpy as np
import pandas as pd
import os
from tensorflow.keras.utils import to_categorical
import sys
from sklearn.model_selection import train_test_split
from mippiNetbuild import *

In [59]:

# df_path = r'../../../data/raw/raw_s51_pssm_0905_modify.dataset'
df_path = r'../data/setd_dataset.dataset'
df = pd.read_pickle(df_path)
# df = df[~(df['label'] == 4)]

aaDict = {'0':0, 'D':1, 'S':2, 'Q':3, 'K':4,
          'I':5, 'P':6, 'T':7, 'F':8, 'N':9,
          'G':10, 'H':11, 'L':12, 'R':13, 'W':14,
          'A':15, 'V':16, 'E':17, 'Y':18, 'M':19, 'C':20}

max_len = 1024
window_len = 51

mut0_c = [[aaDict[x] for x in a] for a in df['mut0_51']]
mut1_c = [[aaDict[x] for x in a] for a in df['mut1_51']]
par0_c = [[aaDict[x] for x in a] for a in df['par0']]

mut0_c = keras.preprocessing.sequence.pad_sequences(mut0_c, maxlen=window_len, padding='post')
mut1_c = keras.preprocessing.sequence.pad_sequences(mut1_c, maxlen=window_len, padding='post')
par0_c = keras.preprocessing.sequence.pad_sequences(par0_c, maxlen=max_len, padding='post')

pssm_win_mut0 = df['pssm_win_mut0'].values
pssm_win_mut0 = np.stack(pssm_win_mut0, axis=0).astype('float32')
pssm_win_mut1 = df['pssm_win_mut1'].values
pssm_win_mut1 = np.stack(pssm_win_mut1, axis=0).astype('float32')
pssm_par0 = df['pssm_par0'].values
pssm_par0 = [x[:1024, :].astype('float32') for x in pssm_par0]
pssm_par0 = np.stack(pssm_par0, axis=0).astype('float32')


In [60]:
data = [mut0_c, mut1_c, par0_c,
          pssm_win_mut0, pssm_win_mut1, pssm_par0]
data_re = [mut1_c, mut0_c, par0_c,
          pssm_win_mut1, pssm_win_mut0, pssm_par0]

In [61]:
K.clear_session()
model = build_model()
adam = optimizers.Adam(learning_rate=0.0002)
model.compile(adam, loss=categorical_focal_loss(alpha=[.25, .25, .1, .25], gamma=2.), 
              metrics=['acc', tf.keras.metrics.TopKCategoricalAccuracy(k=2, name='top2acc')])

## merge 5-fold models prediction to give final result

In [62]:
init_flag = True
all_pred = np.array([])
all_re_pred = np.array([])
all_score = np.array([])
all_re_score = np.array([])
for i in range(5):
    best_acc_model_path = r'../models/bestAcc.h5' + str(i)
    model.load_weights(best_acc_model_path).expect_partial()
    y_pred = model.predict(data)
    y_pred_re = model.predict(data_re)
    y_pred_class = y_pred.argmax(axis=-1)
    y_pred_re_class = y_pred_re.argmax(axis=-1)
    df['pred_class' + str(i)] = y_pred_class
    df['re_pred_class' + str(i)] = y_pred_re_class
    df['score' + str(i)] = y_pred.max(axis=-1)
    df['re_score' + str(i)] = y_pred_re.max(axis=-1)
    if init_flag:
        all_pred = y_pred_class
        all_score = y_pred
        all_re_pred = y_pred_re_class
        all_re_score = y_pred_re
        init_flag = False
    else:
        all_pred = np.c_[all_pred, y_pred_class]
        all_score += y_pred
        all_re_pred = np.c_[all_re_pred, y_pred_re_class]
        all_re_score += y_pred_re

In [63]:
from collections import Counter
consistent_score = np.zeros(df.shape[0])
most_common = np.zeros(df.shape[0])
for i in range(all_pred.shape[0]):
    consistent_score[i] = Counter(all_pred[i]).most_common()[0][1]
    most_common[i] = Counter(all_pred[i]).most_common()[0][0]

In [64]:
score_cv5_class = all_score.argmax(axis=-1)
score_cv5_reverse_class = all_re_score.argmax(axis=-1)
df['cv5_class'] = score_cv5_class
df['cv5_reverse_class'] = score_cv5_reverse_class
df['cv5_score'] = (all_score / 5).max(axis=-1)
df['cv5_reverse_score'] = (all_re_score / 5).max(axis=-1)
df.head()

Unnamed: 0,index,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,...,score3,re_score3,pred_class4,re_pred_class4,score4,re_score4,cv5_class,cv5_reverse_class,cv5_score,cv5_reverse_score
0,0,120115,55209,151871,120505,127407,-,-,SETD5,DPPA2,...,0.862982,0.926179,2,2,0.58958,0.704174,2,2,0.803372,0.839118
1,1,120480,11007,55209,116198,120505,-,-,CCDC85B,SETD5,...,0.901425,0.94797,2,2,0.872148,0.902068,2,2,0.84988,0.892739
2,2,121055,7186,55209,113038,120505,-,-,TRAF2,SETD5,...,0.879841,0.939003,3,1,0.578974,0.477155,2,2,0.609446,0.669295
3,3,838797,55209,23641,120505,117169,-,-,SETD5,LDOC1,...,0.941576,0.969819,2,2,0.841737,0.868257,2,2,0.883834,0.91934
4,4,1046359,23281,55209,116880,120505,-,-,MTUS2,SETD5,...,0.922741,0.959661,2,2,0.938612,0.952473,2,2,0.927492,0.939971


In [65]:
df['con_score'] = consistent_score
df['most_common'] = most_common

In [66]:
df['all_score0'] = all_score[:,0]
df['all_score1'] = all_score[:,1]
df['all_score2'] = all_score[:,2]
df['all_score3'] = all_score[:,3]

In [139]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,...,cv5_class,cv5_reverse_class,cv5_score,cv5_reverse_score,con_score,most_common,all_score0,all_score1,all_score2,all_score3
0,0,0,120115,55209,151871,120505,127407,-,-,SETD5,...,2,2,0.820022,0.699346,5.0,2.0,0.810765,0.044138,4.100112,0.044985
1,1,1,120480,11007,55209,116198,120505,-,-,CCDC85B,...,2,2,0.68499,0.637614,4.0,2.0,0.578725,0.32057,3.424951,0.675754
2,2,2,121055,7186,55209,113038,120505,-,-,TRAF2,...,2,2,0.558783,0.52894,4.0,2.0,0.775046,0.54703,2.793913,0.88401
3,3,3,838797,55209,23641,120505,117169,-,-,SETD5,...,2,2,0.827524,0.719965,5.0,2.0,0.68442,0.080763,4.137618,0.097199
4,4,4,1046359,23281,55209,116880,120505,-,-,MTUS2,...,2,2,0.864992,0.767151,5.0,2.0,0.47025,0.094637,4.32496,0.110153


## load genetic evidence table to select high evidence gene

In [142]:
df_map = pd.read_csv(r'../data/20191208_gene_GeneticEvidence_MasterTable.csv')

In [143]:
df_map.head()

Unnamed: 0,#GeneID,Symbol,chr,map location,n exons,tx start,tx end,bp,description,gene type,...,"ASD, DD neurodevelopmental Risk Genes",Neurodegenerative disease genes,ExAC nonpsych pLI score,ExAC nonpsych pLI >=0.9,ExAC all pLI score,ExAC all pLI >=0.9,Haploinsufficiency Score,Haploinsufficiency RankPercent,likely haploinsufficient genes,Total evidence
0,6812.0,STXBP1,9,9q34.11,19.0,130374682.0,130446756.0,1812.0,syntaxin binding protein 1,protein-coding,...,1.0,0.0,0.99971,1.0,0.999883,1.0,0.391277,0.21329,0.0,19.0
1,23077.0,MYCBP2,13,13q22.3,83.0,77619512.0,77900796.0,13923.0,"MYC binding protein 2, E3 ubiquitin protein li...",protein-coding,...,0.0,0.0,1.0,1.0,1.0,1.0,0.667579,0.0959,1.0,12.0
2,2332.0,FMR1,X,Xq27.3,17.0,146993697.0,147030364.0,1899.0,fragile X mental retardation 1,protein-coding,...,0.0,0.0,0.296796,0.0,0.127198,0.0,0.940937,0.024995,1.0,7.0
3,23352.0,UBR4,1,1p36.13,106.0,19401325.0,19536742.0,15552.0,ubiquitin protein ligase E3 component n-recogn...,protein-coding,...,0.0,0.0,1.0,1.0,1.0,1.0,0.369648,0.230391,0.0,8.0
4,3190.0,HNRNPK,9,9q21.32,15.0,86584321.0,86593167.0,1395.0,heterogeneous nuclear ribonucleoprotein K,protein-coding,...,0.0,0.0,0.999504,1.0,0.999782,1.0,0.920492,0.030276,1.0,9.0


In [144]:
df_map_ = df_map[['Total evidence', 'Symbol']]

In [145]:
df_map_

Unnamed: 0,Total evidence,Symbol
0,19.0,STXBP1
1,12.0,MYCBP2
2,7.0,FMR1
3,8.0,UBR4
4,9.0,HNRNPK
...,...,...
60263,0.0,RPY
60264,,DELYQ11
60265,,DFNY1
60266,,HEY


In [146]:
df['mutProtein'] = 0
df['parProtein'] = 0
for i in df.index:
    if (df.loc[i, 'Official Symbol Interactor A'] == 'SETD2' or df.loc[i, 'Official Symbol Interactor A'] == 'SETD5'):
        df.loc[i, 'mutProtein'] = df.loc[i, 'Official Symbol Interactor A']
        df.loc[i, 'parProtein'] = df.loc[i, 'Official Symbol Interactor B']
    else:
        df.loc[i, 'mutProtein'] = df.loc[i, 'Official Symbol Interactor B']
        df.loc[i, 'parProtein'] = df.loc[i, 'Official Symbol Interactor A']

In [147]:
df_all = df.merge(df_map, left_on='parProtein', right_on='Symbol')

In [150]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,...,cv5_score,cv5_reverse_score,con_score,most_common,all_score0,all_score1,all_score2,all_score3,mutProtein,parProtein
0,0,0,120115,55209,151871,120505,127407,-,-,SETD5,...,0.820022,0.699346,5.0,2.0,0.810765,0.044138,4.100112,0.044985,SETD5,DPPA2
1,1,1,120480,11007,55209,116198,120505,-,-,CCDC85B,...,0.68499,0.637614,4.0,2.0,0.578725,0.32057,3.424951,0.675754,SETD5,CCDC85B
2,2,2,121055,7186,55209,113038,120505,-,-,TRAF2,...,0.558783,0.52894,4.0,2.0,0.775046,0.54703,2.793913,0.88401,SETD5,TRAF2
3,3,3,838797,55209,23641,120505,117169,-,-,SETD5,...,0.827524,0.719965,5.0,2.0,0.68442,0.080763,4.137618,0.097199,SETD5,LDOC1
4,4,4,1046359,23281,55209,116880,120505,-,-,MTUS2,...,0.864992,0.767151,5.0,2.0,0.47025,0.094637,4.32496,0.110153,SETD5,MTUS2


In [151]:
#for i in df_all.index:
#    if (df_all.loc[i, 'Symbol_x'] == 'SETD2' or df_all.loc[i, 'Symbol_x'] == 'SETD5'):
#        tmp = df_all.loc[i, 'Symbol_x']
#        df_all.loc[i, 'Symbol_x'] = df_all.loc[i, 'Symbol_y']
#        df_all.loc[i, 'Symbol_y'] = tmp

In [152]:
df_all[:20]

Unnamed: 0.1,Unnamed: 0,index,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,...,"ASD, DD neurodevelopmental Risk Genes",Neurodegenerative disease genes,ExAC nonpsych pLI score,ExAC nonpsych pLI >=0.9,ExAC all pLI score,ExAC all pLI >=0.9,Haploinsufficiency Score,Haploinsufficiency RankPercent,likely haploinsufficient genes,Total evidence
0,0,0,120115,55209,151871,120505,127407,-,-,SETD5,...,0.0,0.0,3.23e-08,0.0,7.76e-08,0.0,0.00189,0.914192,0.0,0.0
1,1,1,120480,11007,55209,116198,120505,-,-,CCDC85B,...,0.0,0.0,0.5063296,0.0,0.4913796,0.0,0.075622,0.566932,0.0,1.0
2,2,2,121055,7186,55209,113038,120505,-,-,TRAF2,...,0.0,0.0,0.9941008,1.0,0.9964809,1.0,0.106243,0.504602,0.0,4.0
3,3,3,838797,55209,23641,120505,117169,-,-,SETD5,...,0.0,0.0,0.5833637,0.0,0.5754803,0.0,0.038067,0.670257,0.0,2.0
4,4,4,1046359,23281,55209,116880,120505,-,-,MTUS2,...,0.0,0.0,0.1713212,0.0,0.3306731,0.0,0.119692,0.480182,0.0,1.0
5,5,5,1047991,55209,80321,120505,123234,-,-,SETD5,...,0.0,0.0,5.98e-10,0.0,5.95e-11,0.0,0.084092,0.54931,0.0,0.0
6,6,6,1047992,55209,125115,120505,125917,-,-,SETD5,...,0.0,0.0,0.000693622,0.0,0.000203217,0.0,0.036747,0.674859,0.0,0.0
7,7,7,1053078,10482,55209,115745,120505,-,-,NXF1,...,0.0,0.0,0.9992515,1.0,0.9989905,1.0,0.222751,0.345744,0.0,4.0
8,8,8,1189457,8631,55209,114184,120505,-,-,SKAP1,...,0.0,0.0,0.000826293,0.0,0.00030184,0.0,0.891327,0.037126,1.0,1.0
9,9,9,1191638,3275,55209,109511,120505,-,-,PRMT2,...,0.0,0.0,0.9000655,1.0,0.7514202,0.0,0.035254,0.680245,0.0,3.0


In [153]:
df_all.shape

(154, 166)

### save raw result table

In [154]:
df_all.to_csv(r'../data/setd_select.csv')

### optional: filter genes with genetic total evidence >5

In [155]:
# df_all = df_all[df_all['Total evidence'] >= 5]

### optional:filter entries with model confidence score >0.3

In [156]:
# df_all = df_all[df_all['cv5_score'] > 0.3]

In [157]:
df_all['mut_'] = df_all['oriaa'] + df_all['pos'].astype('str') + df_all['mutaa']

In [158]:
df_all = df_all[df_all['mutProtein'] == 'SETD2']

In [159]:
df_b = df_all[['parProtein', 'pos', 'oriaa', 'mutaa', 'mut_', 'cv5_class', 'cv5_score', 'cv5_reverse_class', 'cv5_reverse_score', 'Total evidence', 'Throughput', 'Experimental System', 'Publication Source']]
df_b

Unnamed: 0,parProtein,pos,oriaa,mutaa,mut_,cv5_class,cv5_score,cv5_reverse_class,cv5_reverse_score,Total evidence,Throughput,Experimental System,Publication Source
19,HTT,1624,S,C,S1624C,3,0.40563,1,0.410458,7.0,Low Throughput,Two-hybrid,PUBMED:9700202
20,HTT,1815,L,T,L1815T,1,0.474224,3,0.504523,7.0,Low Throughput,Two-hybrid,PUBMED:9700202
21,HTT,1666,Y,C,Y1666C,3,0.625117,1,0.584795,7.0,Low Throughput,Two-hybrid,PUBMED:9700202
25,TP53,1624,S,C,S1624C,2,0.34089,2,0.373236,6.0,Low Throughput,Affinity Capture-Western,PUBMED:18585004
26,TP53,1815,L,T,L1815T,2,0.491011,2,0.521,6.0,Low Throughput,Affinity Capture-Western,PUBMED:18585004
27,TP53,1666,Y,C,Y1666C,3,0.484876,1,0.421901,6.0,Low Throughput,Affinity Capture-Western,PUBMED:18585004
31,SETD2,1624,S,C,S1624C,2,0.37624,2,0.40022,9.0,Low Throughput,Biochemical Activity,PUBMED:16118227
32,SETD2,1815,L,T,L1815T,2,0.43399,2,0.469095,9.0,Low Throughput,Biochemical Activity,PUBMED:16118227
33,SETD2,1666,Y,C,Y1666C,3,0.455039,1,0.374677,9.0,Low Throughput,Biochemical Activity,PUBMED:16118227
34,POLR2A,1624,S,C,S1624C,0,0.386663,0,0.293914,8.0,Low Throughput,Affinity Capture-Western,PUBMED:16118227


In [160]:
df_b = df_b[df_b.duplicated(['parProtein'], keep=False)]

In [162]:
# df_b.to_csv('setd_select_havetest_brief_copy1.csv')
df_b.to_csv('../data/setd_select_brief.csv')

In [163]:
gene = df_b['parProtein'].unique().tolist()
mut = df_b['oriaa'] + df_b['pos'].astype(str) + df_b['mutaa']
mut = mut.unique().tolist()

In [164]:
df_b_n = pd.DataFrame({'partner': gene})
df_b_n['L1815T'] = 0
df_b_n['S1624C'] = 0
df_b_n['Y1666C'] = 0
df_b_n['evidence'] = 0
df_b_n['throughput'] = 0
df_b_n['Experimental System'] = 0
df_b_n['Publication Source'] = 0

In [165]:
for i in df_b_n.index:
    if len(df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'L1815T')]['cv5_class'].tolist()) > 0:
        df_b_n.loc[i, 'L1815T'] = df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'L1815T')]['cv5_class'].tolist()[0]
    else:
        df_b_n.loc[i, 'L1815T'] = '-'
    if len(df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'S1624C')]['cv5_class'].tolist()) > 0:
        df_b_n.loc[i, 'S1624C'] = df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'S1624C')]['cv5_class'].tolist()[0]
    else:
        df_b_n.loc[i, 'S1624C'] = '-'
    if len(df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'Y1666C')]['cv5_class'].tolist()) > 0:
        df_b_n.loc[i, 'Y1666C'] = df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'Y1666C')]['cv5_class'].tolist()[0]
    else:
        df_b_n.loc[i, 'Y1666C'] = '-'
    df_b_n.loc[i, 'evidence'] = df_b[df_b['parProtein'] == df_b_n.loc[i, 'partner']]['Total evidence'].tolist()[0]
    df_b_n.loc[i, 'Throughput'] = df_b[df_b['parProtein'] == df_b_n.loc[i, 'partner']]['Throughput'].tolist()[0]
    df_b_n.loc[i, 'Experimental System'] = df_b[df_b['parProtein'] == df_b_n.loc[i, 'partner']]['Experimental System'].tolist()[0]
    df_b_n.loc[i, 'Publication Source'] = df_b[df_b['parProtein'] == df_b_n.loc[i, 'partner']]['Publication Source'].tolist()[0]

In [166]:
df_b_n['L1815T_score'] = 0
df_b_n['S1624C_score'] = 0
df_b_n['Y1666C_score'] = 0

for i in df_b_n.index:
    if len(df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'L1815T')]['cv5_score'].tolist()) > 0:
        df_b_n.loc[i, 'L1815T_score'] = df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'L1815T')]['cv5_score'].tolist()[0]
    else:
        df_b_n.loc[i, 'L1815T_score'] = '-'
    if len(df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'S1624C')]['cv5_score'].tolist()) > 0:
        df_b_n.loc[i, 'S1624C_score'] = df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'S1624C')]['cv5_score'].tolist()[0]
    else:
        df_b_n.loc[i, 'S1624C_score'] = '-'
    if len(df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'Y1666C')]['cv5_score'].tolist()) > 0:
        df_b_n.loc[i, 'Y1666C_score'] = df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'Y1666C')]['cv5_score'].tolist()[0]
    else:
        df_b_n.loc[i, 'Y1666C_score'] = '-'

In [167]:
df_b_n['L1815T_re'] = 0
df_b_n['S1624C_re'] = 0
df_b_n['Y1666C_re'] = 0

for i in df_b_n.index:
    if len(df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'L1815T')]['cv5_reverse_class'].tolist()) > 0:
        df_b_n.loc[i, 'L1815T_re'] = df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'L1815T')]['cv5_reverse_class'].tolist()[0]
    else:
        df_b_n.loc[i, 'L1815T_re'] = '-'
    if len(df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'S1624C')]['cv5_reverse_class'].tolist()) > 0:
        df_b_n.loc[i, 'S1624C_re'] = df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'S1624C')]['cv5_reverse_class'].tolist()[0]
    else:
        df_b_n.loc[i, 'S1624C_re'] = '-'
    if len(df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'Y1666C')]['cv5_reverse_class'].tolist()) > 0:
        df_b_n.loc[i, 'Y1666C_re'] = df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'Y1666C')]['cv5_reverse_class'].tolist()[0]
    else:
        df_b_n.loc[i, 'Y1666C_re'] = '-'

In [168]:
df_b_n['L1815T_re_score'] = 0
df_b_n['S1624C_re_score'] = 0
df_b_n['Y1666C_re_score'] = 0

for i in df_b_n.index:
    if len(df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'L1815T')]['cv5_reverse_score'].tolist()) > 0:
        df_b_n.loc[i, 'L1815T_re_score'] = df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'L1815T')]['cv5_reverse_score'].tolist()[0]
    else:
        df_b_n.loc[i, 'L1815T_re_score'] = '-'
    if len(df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'S1624C')]['cv5_reverse_score'].tolist()) > 0:
        df_b_n.loc[i, 'S1624C_re_score'] = df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'S1624C')]['cv5_reverse_score'].tolist()[0]
    else:
        df_b_n.loc[i, 'S1624C_re_score'] = '-'
    if len(df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'Y1666C')]['cv5_reverse_score'].tolist()) > 0:
        df_b_n.loc[i, 'Y1666C_re_score'] = df_b[(df_b['parProtein'] == df_b_n.loc[i, 'partner']) & (df_b['mut_'] == 'Y1666C')]['cv5_reverse_score'].tolist()[0]
    else:
        df_b_n.loc[i, 'Y1666C_re_score'] = '-'

In [169]:
df_b_n

Unnamed: 0,partner,L1815T,S1624C,Y1666C,evidence,throughput,Experimental System,Publication Source,Throughput,L1815T_score,S1624C_score,Y1666C_score,L1815T_re,S1624C_re,Y1666C_re,L1815T_re_score,S1624C_re_score,Y1666C_re_score
0,HTT,1,3,3,7.0,0,Two-hybrid,PUBMED:9700202,Low Throughput,0.474224,0.40563,0.625117,3,1,1,0.504523,0.410458,0.584795
1,TP53,2,2,3,6.0,0,Affinity Capture-Western,PUBMED:18585004,Low Throughput,0.491011,0.34089,0.484876,2,2,1,0.521,0.373236,0.421901
2,SETD2,2,2,3,9.0,0,Biochemical Activity,PUBMED:16118227,Low Throughput,0.43399,0.37624,0.455039,2,2,1,0.469095,0.40022,0.374677
3,POLR2A,0,0,3,8.0,0,Affinity Capture-Western,PUBMED:16118227,Low Throughput,0.507733,0.386663,0.480945,2,0,0,0.347462,0.293914,0.435339
4,ELAVL1,0,0,3,5.0,0,Affinity Capture-RNA,PUBMED:19322201,High Throughput,0.442214,0.469945,0.487382,3,0,0,0.347974,0.406258,0.495873
5,ATXN1,2,2,3,6.0,0,Two-hybrid,PUBMED:16713569,High Throughput,0.450511,0.339791,0.49881,2,2,1,0.454561,0.331618,0.420791
6,CIC,2,0,3,10.0,0,Two-hybrid,PUBMED:16713569,High Throughput,0.486832,0.377264,0.405597,2,2,0,0.578755,0.403002,0.442048
7,CBX8,2,2,3,6.0,0,Affinity Capture-MS,PUBMED:21282530,Low Throughput,0.547437,0.336831,0.389026,2,2,0,0.630705,0.484957,0.41366
8,SOX2,2,0,3,10.0,0,Affinity Capture-MS,PUBMED:23667531,High Throughput,0.509101,0.306139,0.40774,2,2,1,0.630465,0.449687,0.363275
9,SMAD3,2,0,3,7.0,0,Affinity Capture-Western,PUBMED:21988832,Low Throughput,0.431886,0.357274,0.487559,2,2,0,0.448943,0.291175,0.420435


In [170]:
# df_b_n.to_csv('setd_ppi_havetest_copy1.csv')

df_b_n.to_csv('../data/setd_ppi.csv')