In [145]:
import pandas as pd
import os
import numpy as np

# Grabbing Features

In [146]:
def build_data(file_names,directory):
    df_Final = pd.DataFrame(columns=['PDB code','VDWAALS','EEL','EGB','ESURF','GGAS','GSOLV','TOTAL'])
    for file in file_names:
        df = pd.read_csv(directory+"/"+file,delimiter='\t')
        df = df.iloc[83:92, :]
        df.drop([85,86],axis = 0,inplace=True)
        l1 = [file[:4]]
        for _,row in df.iterrows():
            val = float(row.iloc[0].split()[1])
            l1.append(val)
        df_Final.loc[len(df_Final.index)] = l1
    return df_Final

def create_pairs(df):
    df_Final = pd.DataFrame(columns=["PDB code 1","VDWAALS 1","EEL 1","EGB 1","ESURF 1","GGAS 1","GSOLV 1","TOTAL 1",'hbdist_mean1','hbdist_std1','hbnum_mean1','hbnum_std1','hbnum_mean_<.35_1','hbnum_std_<.35_1',"PDB code 2","VDWAALS 2","EEL 2","EGB 2","ESURF 2","GGAS 2","GSOLV 2","TOTAL 2",'hbdist_mean2','hbdist_std2','hbnum_mean2','hbnum_std2','hbnum_mean_<.35_2','hbnum_std_<.35_2'])
    for _ , row1 in df.iterrows():
        for _ , row2 in df.iterrows():
            if row1["PDB code"] == row2["PDB code"]:
                continue
            temp = row1.to_list() + row2.to_list()
            df_Final.loc[len(df_Final.index)] = temp
    return df_Final

In [147]:
directory = "complex_data_raw"
df_features_np = build_data(os.listdir(directory),directory)

# Complexes still missing

In [148]:
'''test = pd.read_csv("Aptamer-Protein-Information.csv")
for code in test["Unnamed: 0"]:
    if code not in set(df_features["PDB code 1"]):
        print(code)'''

'test = pd.read_csv("Aptamer-Protein-Information.csv")\nfor code in test["Unnamed: 0"]:\n    if code not in set(df_features["PDB code 1"]):\n        print(code)'

# More features

In [150]:
def read_file1(complex,file):
    try:
        _,y = np.loadtxt("hbond_analysis/" + complex+'/'+file,comments = ['@','#'],unpack=True)
        return np.mean(y), np.std(y)
    except:
        return None,None

def read_file2(complex,file):
    try:
        _,y,_ = np.loadtxt("hbond_analysis/" + complex+'/'+file,comments = ['@','#'],unpack=True)
        return np.mean(y), np.std(y)
    except:
        return None,None

def read_file3(complex,file):
    try:
        _,y,z = np.loadtxt("hbond_analysis/" + complex+'/'+file,comments = ['@','#'],unpack=True)
        return np.mean(y), np.std(y), np.mean(z), np.mean(y)
    except:
        return None,None

In [151]:
l1 = os.listdir("hbond_analysis")
#df_extra_features = pd.DataFrame(columns=['PDB code','hbdist_mean','hbdist_std','hblife_mean','hblife_std','hbnum_mean1','hbnum_std1','hbnum_mean2','hbnum_std2'])
df_extra_features = pd.DataFrame(columns=['PDB code','hbdist_mean','hbdist_std','hbnum_mean','hbnum_std','hbnum_mean_<.35','hbnum_std_<.35'])
for complex in l1:
    l2 = os.listdir("hbond_analysis/" + complex)
    l1 = [complex]
    for file in l2:
        if file == 'hbdist.xvg':
            mean, std = read_file1(complex,file)
            l1 += [mean,std]
        elif file == 'hblife.xvg':
            #mean, std = read_file2(complex,file)
            #l1 += [mean,std]
            continue
        elif file == 'hbnum.xvg':
            mean1, std1, mean2, std2 = read_file3(complex,file)
            l1 += [mean1,std1,mean2,std2]
    df_extra_features.loc[len(df_extra_features.index)] = l1

In [152]:
df_features = pd.merge(df_extra_features,df_features_np,on = "PDB code",how="inner")

In [168]:
df_extra_features[df_extra_features["PDB code"].isin(df_features_np["PDB code"]) == False]

Unnamed: 0,PDB code,hbdist_mean,hbdist_std,hbnum_mean,hbnum_std,hbnum_mean_<.35,hbnum_std_<.35
30,2kkf,2.857145,6.338872,16.995005,2.077176,13.296703,16.995005
45,1knz,2.857141,5.631103,98.955045,5.06353,83.223776,98.955045


In [153]:
df_features = create_pairs(df_features)

# Data outputs

In [154]:
def translate_kd(kd):
    kd.strip()
    kd = kd[3:]
    factor = 0
    if "nM" in kd:
        factor = 1
    elif "pM" in kd:
        factor = .001
    elif "uM" in kd:
        factor = 1000
    kd = kd[:-2]
    try:
        return float(kd) * factor
    except:
        print(kd)
        return None
    
def output(df):
    df_Final = pd.DataFrame(columns=["PDB code 1","Protein Name 1","Ligand Name 1","Affinity Data 1","kd value (nM) 1","PDB code 2","Protein Name 2","Ligand Name 2","Affinity Data 2","kd value (nM) 2","Output"])
    for _ , row1 in df.iterrows():
        for _ , row2 in df.iterrows():
            if row1["PDB code"] == row2["PDB code"]:
                continue
            output = 1 if row1["kd value (nM)"] < row2["kd value (nM)"] else 0
            temp = row1.to_list() + row2.to_list()
            temp.append(output)
            df_Final.loc[len(df_Final.index)] = temp
    return df_Final

In [155]:
df = pd.read_csv("KD_data.csv")
df.columns = df.iloc[0]
df.drop(index = 0,inplace=True)
df = df[['PDB code','Protein Name','Ligand Name','Affinity Data']]
df["kd value (nM)"] = [translate_kd(kd) for kd in df["Affinity Data"]]
df = df[df["PDB code"].isin(df_features["PDB code 1"])]
df_outputs = output(df)

0=4.9
0=0.03
0=0.3
0=4.6
0=4
0=0.25
0=126


In [156]:
df_outputs

Unnamed: 0,PDB code 1,Protein Name 1,Ligand Name 1,Affinity Data 1,kd value (nM) 1,PDB code 2,Protein Name 2,Ligand Name 2,Affinity Data 2,kd value (nM) 2,Output
0,1exd,GLUTAMINYL-TRNA SYNTHETASE,GLUTAMINE TRNA APTAMER,Kd=7.1nM,7.1,1il2,ASPARTYL-TRNA SYNTHETASE,ASPARTYL TRANSFER RNA,Kd=3uM,3000.00,1
1,1exd,GLUTAMINYL-TRNA SYNTHETASE,GLUTAMINE TRNA APTAMER,Kd=7.1nM,7.1,1jbr,Ribotoxin Restrictocin and a 31-mer SRD RNA In...,RNA,Kd=1uM,1000.00,1
2,1exd,GLUTAMINYL-TRNA SYNTHETASE,GLUTAMINE TRNA APTAMER,Kd=7.1nM,7.1,1je8,Nitrate/Nitrite Response Regulator Protein NARL,DNA,Kd=0.15nM,0.15,0
3,1exd,GLUTAMINYL-TRNA SYNTHETASE,GLUTAMINE TRNA APTAMER,Kd=7.1nM,7.1,1omh,trwC protein,DNA OLIGONUCLEOTIDE,Kd=70nM,70.00,1
4,1exd,GLUTAMINYL-TRNA SYNTHETASE,GLUTAMINE TRNA APTAMER,Kd=7.1nM,7.1,1osb,trwC protein,DNA OLIGONUCLEOTIDE,Kd=70nM,70.00,1
...,...,...,...,...,...,...,...,...,...,...,...
2445,5yts,"Nuclease-sensitive element-binding protein 1, ...",RNA A2U (UCUUCU),Kd=2.77uM,2770.0,6eo6,human alpha-thrombin,modified 15-mer DNA aptamer T4W,Kd=1nM,1.00,0
2446,5yts,"Nuclease-sensitive element-binding protein 1, ...",RNA A2U (UCUUCU),Kd=2.77uM,2770.0,6eo7,human alpha-thrombin,modified 15-mer DNA aptamer T4K,Kd=0.39nM,0.39,0
2447,5yts,"Nuclease-sensitive element-binding protein 1, ...",RNA A2U (UCUUCU),Kd=2.77uM,2770.0,5ytx,"Nuclease-sensitive element-binding protein 1, ...",RNA U3A (UCAACU),Kd=1.34uM,1340.00,0
2448,5yts,"Nuclease-sensitive element-binding protein 1, ...",RNA A2U (UCUUCU),Kd=2.77uM,2770.0,5ytv,"Nuclease-sensitive element-binding protein 1, ...",RNA CAUC(UCAUCU),Kd=1.26uM,1260.00,0


# Compile into single file

In [157]:
X = df_features
Y = df_outputs
Y = Y[["PDB code 1","PDB code 2", "Output"]]
Y["PDB codes"] = Y["PDB code 1"] +" , "+ Y["PDB code 2"]
Y = Y[["PDB codes","Output"]]
Y.set_index("PDB codes",inplace=True)
X["PDB codes"] = X["PDB code 1"] +" , "+ X["PDB code 2"]
X.set_index("PDB codes", inplace=True)
X.drop(["PDB code 1", "PDB code 2"],axis=1,inplace=True)
df, right = X.align(Y,join="right",axis = 0)
df.dropna(inplace=True)
Y = Y[Y.index.isin(df.index)]
df["Output"] = Y["Output"]
df.to_csv("features_outputs.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y["PDB codes"] = Y["PDB code 1"] +" , "+ Y["PDB code 2"]
