In [1]:
import numpy as np
import os
from biopandas.pdb import PandasPdb
import pandas as pd
from scipy.spatial.distance import cdist
import itertools
import GNM, MFD, local_OPD, GT_curvature
import sys
import re
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
import igraph as ig

In [101]:
def make_graph(c1, cutoff):
    """Build the C-alpha contact graph of a protein.

    Parameters
    ----------
    c1 : pandas.DataFrame
        Atom table with at least the columns 'atom_name', 'x_coord',
        'y_coord' and 'z_coord'.
    cutoff : float
        Two CA atoms are connected when their distance is <= cutoff.

    Returns
    -------
    The graph produced by `make_graph_np` for the CA rows of `c1`
    (an igraph graph converted with `to_networkx()`).
    """
    # The graph construction is identical to the nanoparticle case once the
    # table is restricted to CA atoms, so reuse it instead of duplicating it.
    return make_graph_np(c1[c1['atom_name'] == 'CA'], cutoff)

def make_graph_np(c1, cutoff):
    """Build a distance-cutoff contact graph over every row of an atom table.

    Parameters
    ----------
    c1 : pandas.DataFrame
        Atom table with the columns 'x_coord', 'y_coord' and 'z_coord'.
        Every row becomes one node (no atom-name filtering is done here).
    cutoff : float
        Two atoms are connected when 0 < distance <= cutoff.

    Returns
    -------
    The igraph graph converted with `to_networkx()`. Node ids are the row
    positions 0..len(c1)-1.
    """
    coords = c1[['x_coord', 'y_coord', 'z_coord']].to_numpy()
    num_nodes = len(coords)
    dists = cdist(coords, coords)

    # Both conditions must hold. The previous `==` also matched entries where
    # both were False (distance 0 with a negative cutoff), i.e. self-loops.
    within_cutoff = (dists <= cutoff) & (dists > 0)
    connects = np.array(np.where(within_cutoff)).T
    # Each contact appears as (i, j) and (j, i); sort within the pair and
    # de-duplicate so every undirected edge is listed once.
    edges = np.unique(np.sort(connects, axis=1), axis=0)
    g_ig = ig.Graph(n=num_nodes, edges=edges)

    return g_ig.to_networkx()

def CA_coord(pdb_name, chain1, np_name):
    """Load the protein and nanoparticle atom tables used by the feature pipeline.

    Reads `<protein_dir>/<pdb_name>.pdb` and `<nano_dir>/<np_name>.pdb`
    (both directories are module-level globals).

    Parameters
    ----------
    pdb_name : str
        Protein PDB file name without the '.pdb' extension.
    chain1 : iterable of str
        Chain ids to keep; a string such as 'AB' is iterated character by
        character. Rows are concatenated in the order the chains are given.
    np_name : str
        Nanoparticle PDB file name without the '.pdb' extension.

    Returns
    -------
    c1 : pandas.DataFrame
        Protein ATOM rows (alt_loc '' or 'A') of the requested chains, with
        every atom of a residue that has no CA atom removed.
    c2 : pandas.DataFrame
        Nanoparticle ATOM rows whose element_symbol is not 'H'.
    c1_CA : numpy.ndarray
        x/y/z coordinates of the CA rows of `c1`.
    c2_all : numpy.ndarray
        x/y/z coordinates of all rows of `c2`.
    c1_incomplete_res : numpy.ndarray
        Row positions (in the pre-drop `c1`) that were removed. Empty array
        when nothing was removed; 0-d when exactly one row was removed
        (shape kept as in the original implementation).
    """
    res_keys = ['chain_id', 'residue_number', 'insertion']
    xyz = ['x_coord', 'y_coord', 'z_coord']

    coord1 = PandasPdb()
    # coord1.fetch_pdb(pdb_name)
    coord1.read_pdb(os.path.join(protein_dir, pdb_name + '.pdb'))
    prot1_df = coord1.df['ATOM']
    prot1_df = prot1_df[(prot1_df['alt_loc'] == "") | (prot1_df['alt_loc'] == "A")]

    c1 = pd.concat([prot1_df[prot1_df['chain_id'] == chain] for chain in chain1]).reset_index(drop=True)

    c1_all_res = c1[res_keys].drop_duplicates().reset_index(drop=True)
    # De-duplicate the right-hand side so the left merge stays one row per
    # residue and the resulting mask lines up with c1_all_res.
    c1_ca_res = c1.loc[c1['atom_name'] == 'CA', res_keys].drop_duplicates()
    c1_no_cas = (pd.merge(c1_all_res, c1_ca_res, how='left', indicator=True)['_merge'] == 'left_only').to_numpy()

    if c1_no_cas.any():
        atom_res_ids = c1[res_keys].astype(str).agg('_'.join, axis=1).to_numpy()
        missing_res_ids = c1_all_res[c1_no_cas].astype(str).agg('_'.join, axis=1).to_numpy()
        # Membership test: the previous element-wise `==` between these two
        # differently sized arrays only worked when a single residue was missing.
        c1_incomplete_res = np.squeeze(np.where(np.isin(atom_res_ids, missing_res_ids))[0])
        c1 = c1.drop(np.atleast_1d(c1_incomplete_res)).reset_index(drop=True)
    else:
        c1_incomplete_res = np.array([])

    c1_CA = c1.loc[c1['atom_name'] == 'CA', xyz].to_numpy()

    coord2 = PandasPdb()
    coord2.read_pdb(os.path.join(nano_dir, np_name + '.pdb'))
    c2 = coord2.df['ATOM']
    c2 = c2[c2['element_symbol'] != 'H']
    c2_all = c2[xyz].to_numpy()

    return c1, c2, c1_CA, c2_all, c1_incomplete_res

def distance_data_range(c1, c2, c1_CA, c2_all, cutoff=7):
    """Build the table of all (protein CA, nanoparticle heavy atom) pairs with a contact label.

    Parameters
    ----------
    c1 : pandas.DataFrame
        Protein atom table; only rows with atom_name == 'CA' are used.
    c2 : pandas.DataFrame
        Nanoparticle atom table; rows with element_symbol == 'H' are ignored.
    c1_CA, c2_all :
        Unused; kept so existing callers keep working. Coordinates are
        re-derived from `c1` and `c2`.
    cutoff : float, default 7
        Distance threshold in the coordinate units of the input tables.

    Returns
    -------
    pandas.DataFrame
        One row per pair, protein index varying slowest, with the chain id,
        residue number, insertion code and residue name of both partners and
        a 'distance' column. Despite its name, 'distance' is a binary label:
        0.0 when the pair distance is <= cutoff, 1.0 otherwise.
    """
    ca_rows = c1[c1['atom_name'] == 'CA']
    heavy_rows = c2[c2['element_symbol'] != 'H']
    n_ca, n_heavy = len(ca_rows), len(heavy_rows)

    xyz = ['x_coord', 'y_coord', 'z_coord']
    dists_all = cdist(ca_rows[xyz].to_numpy(), heavy_rows[xyz].to_numpy())
    # Row-major flattening matches the pair order built below.
    labels = np.where(dists_all <= cutoff, 0.0, 1.0).ravel()

    print('Len_CA_all_res', n_ca * n_heavy)

    # Column order of the output table is part of the interface.
    fields = ('chain_id', 'residue_number', 'insertion', 'residue_name')
    res_info_dict = {}
    for field in fields:
        # each protein value is repeated once per nanoparticle atom
        res_info_dict['c1_' + field] = np.repeat(ca_rows[field].to_numpy(), n_heavy)
    for field in fields:
        # the nanoparticle column is cycled once per protein residue
        res_info_dict['c2_' + field] = np.tile(heavy_rows[field].to_numpy(), n_ca)
    res_info_dict['distance'] = labels

    pair_table = pd.DataFrame(res_info_dict)

    return pair_table

def binary_fill(c1, c2, data_feature1, data_feature2):
    """Add residue names and 0/1 residue-class indicator features to both dicts.

    `data_feature1` receives the residue names of the CA rows of `c1`;
    `data_feature2` receives the residue names of every row of `c2`. For each
    class below a 0/1 array is stored under the class key, in the order
    'pos', 'neg', 'polar', 'amp', 'hp'. Both dicts are modified in place and
    returned.
    """
    # Class membership lists. Note that TYR and MET each belong to two classes.
    residue_classes = (
        ('pos', ['ARG','LYS','HIS','MOL']),
        ('neg', ['ASP','GLU']),
        ('polar', ['GLN','ASN', 'SER', 'THR', 'TYR', 'CYS']),
        ('amp', ['TRP','TYR', 'MET']),
        ('hp', ['ALA','ILE', 'LEU', 'MET', 'PHE', 'VAL', 'PRO', 'GLY']),
    )

    ca_rows = c1[c1['atom_name'] =='CA']
    data_feature1['res_name'] = ca_rows['residue_name'].to_numpy()
    data_feature2['res_name'] = c2['residue_name'].to_numpy()

    for class_key, members in residue_classes:
        for features in (data_feature1, data_feature2):
            flags = [int(name in members) for name in features['res_name']]
            features[class_key] = np.array(flags)

    return data_feature1, data_feature2

# Hydrophobicity index per three-letter residue code; 'MOL' is the residue name
# used for the nanoparticle rows elsewhere in this notebook.
hphob = {
    'MOL': 4.5, 'ILE': 4.5, 'VAL': 4.2, 'LEU': 3.8, 'PHE': 2.8, 'CYS': 2.5, 'MET': 1.9,
    'ALA': 1.8, 'GLY': -0.4, 'THR': -0.7, 'SER': -0.8, 'TRP': -0.9, 'TYR': -1.3, 'PRO': -1.6,
    'HIS': -3.2, 'GLU': -3.5, 'GLN': -3.5, 'ASP': -3.5, 'ASN': -3.5, 'LYS': -3.9, 'ARG': -4.5,
}


def hydrophobicity_fill(data_feature1, data_feature2):
    """Store the `hphob` value of every residue name under the key 'hp_idx'.

    Both dicts must already contain 'res_name'. A residue name missing from
    `hphob` raises KeyError. The dicts are modified in place and returned.
    """
    # Look up both arrays before writing, so a failed lookup leaves both dicts untouched.
    hp_idx1, hp_idx2 = (
        np.array([hphob[name] for name in features['res_name']])
        for features in (data_feature1, data_feature2)
    )
    data_feature1['hp_idx'] = hp_idx1
    data_feature2['hp_idx'] = hp_idx2

    return data_feature1, data_feature2

# geometry data needs to be calculated separately
def geometry(pdb_name, chain1, np_name, data_feature1, data_feature2, c1_CA, c2_all):
    """Attach precomputed geometry columns to both feature dicts.

    Reads `<geometry_dir>/<pdb_name>_<chain1>.txt` for the protein and
    `<geometry_dir>/<np_name>.txt` for the nanoparticle (`geometry_dir` is a
    module-level global). From each file the first 43 lines are skipped and
    columns 3, 4 and 7 are loaded; they are stored under 'shell', 'rd' and
    'poc' respectively, truncated to len(c1_CA) / len(c2_all) entries.
    The dicts are modified in place and returned.
    """
    protein_txt = os.path.join(geometry_dir, pdb_name + '_' + chain1 + '.txt')
    particle_txt = os.path.join(geometry_dir, np_name + '.txt')

    # NOTE(review): 43 header lines and columns (3, 4, 7) are tied to the layout
    # of the external geometry files — confirm if the file format changes.
    load_options = dict(skiprows=43, usecols=(3, 4, 7), unpack=True)
    protein_columns = tuple(np.loadtxt(protein_txt, **load_options))
    particle_columns = tuple(np.loadtxt(particle_txt, **load_options))

    targets = (
        (data_feature1, protein_columns, c1_CA),
        (data_feature2, particle_columns, c2_all),
    )
    for features, (shell_acc, rin_acc, pocketness), coords in targets:
        n_rows = len(coords)
        features['rd'] = rin_acc[:n_rows]
        features['shell'] = shell_acc[:n_rows]
        features['poc'] = pocketness[:n_rows]

    return data_feature1, data_feature2

def graph_curvature(c1, c2, data_feature1, data_feature2):
    """Add graph-curvature features ('ollivier', 'forman') to both feature dicts.

    The protein graph is built from the CA atoms of `c1` with a cutoff of 7;
    the nanoparticle graph from all rows of `c2` with a cutoff of 4. The
    results of `GT_curvature.ollivier_ricci` and `GT_curvature.forman_ricci`
    are converted to arrays and stored. The dicts are modified in place and
    returned.
    """
    cutoff = 7
    np_cutoff = 4
    G1 = make_graph(c1, cutoff)
    G2 = make_graph_np(c2, np_cutoff)

    ollivier1 = np.array(GT_curvature.ollivier_ricci(G1))
    ollivier2 = np.array(GT_curvature.ollivier_ricci(G2))
    forman1 = np.array(GT_curvature.forman_ricci(G1))
    forman2 = np.array(GT_curvature.forman_ricci(G2))

    data_feature1['ollivier'] = ollivier1
    data_feature2['ollivier'] = ollivier2
    data_feature1['forman'] = forman1
    data_feature2['forman'] = forman2

    return data_feature1, data_feature2

def graph_gnm(c1, c2, c1_CA, c2_CA, pdb_name, chain1, chain2, data_feature1, data_feature2):
    """Add the 'gnm' feature to both feature dicts.

    Parameters
    ----------
    c1, c2, c1_CA, c2_CA :
        Unused; kept for call compatibility.
    pdb_name : str
        Protein PDB file name (without '.pdb') inside the global `protein_dir`.
    chain1 :
        Chain selector forwarded to `GNM.gnm_sum_mode`.
    chain2 : str
        Nanoparticle PDB file name (without '.pdb') inside the global
        `nano_dir`. The callers in this notebook pass the nanoparticle name
        in this position.
    data_feature1, data_feature2 : dict
        Modified in place and returned.
    """
    # NOTE(review): the literal 10 is forwarded to the GNM helpers; presumably
    # the number of modes to sum — confirm against the GNM module.
    gnm1 = np.array(GNM.gnm_sum_mode(os.path.join(protein_dir, pdb_name + '.pdb'), 10, chain1))
    # Use the argument rather than the kernel-global `np_name`, so the function
    # no longer depends on hidden notebook state.
    gnm2 = np.array(GNM.gnm_sum_mode_np(os.path.join(nano_dir, chain2 + '.pdb'), 10))

    print('gnm1', len(gnm1))
    print('gnm2', len(gnm2))

    data_feature1['gnm'] = gnm1
    print('1_same')

    data_feature2['gnm'] = gnm2
    print('2_same')

    return data_feature1, data_feature2

def graph_fd(c1, c2, data_feature1, data_feature2):
    """Add fractal-dimension features to both feature dicts.

    Graphs are built with a cutoff of 7 (protein CA atoms) and 4
    (nanoparticle atoms). Column 1 of `MFD.fractal_dimension` is stored as
    'fd'; columns 1-4 of `MFD.more_box(graph, 5)` are stored as
    'more_fd_1' ... 'more_fd_4'. The dicts are modified in place and returned.
    """
    protein_graph = make_graph(c1, 7)
    particle_graph = make_graph_np(c2, 4)

    protein_fd = MFD.fractal_dimension(protein_graph)
    particle_fd = MFD.fractal_dimension(particle_graph)
    print('fractal dimension ok')

    box_radius = 5
    protein_boxes = MFD.more_box(protein_graph, box_radius)
    particle_boxes = MFD.more_box(particle_graph, box_radius)

    box_keys = ('more_fd_1', 'more_fd_2', 'more_fd_3', 'more_fd_4')
    results = (
        (data_feature1, protein_fd, protein_boxes),
        (data_feature2, particle_fd, particle_boxes),
    )
    for features, fd_values, box_values in results:
        features['fd'] = fd_values[:, 1]
        # column 0 of the more_box output is skipped, as in 'fd' above
        for column, key in enumerate(box_keys, start=1):
            features[key] = box_values[:, column]

    return data_feature1, data_feature2

def graph_os(c1, c2, data_feature1, data_feature2):
    """Add the four `local_OPD.GOS_single` columns to both feature dicts.

    The protein input is restricted to CA rows; the nanoparticle table is
    passed as is. Columns 0-3 of each result are stored as 'G_os_5',
    'G_os_7', 'G_os_10' and 'G_os_15'.
    NOTE(review): the numeric suffixes presumably name the radii used inside
    GOS_single — confirm against local_OPD.
    The dicts are modified in place and returned.
    """
    column_keys = ('G_os_5', 'G_os_7', 'G_os_10', 'G_os_15')

    protein_os = local_OPD.GOS_single(c1[c1['atom_name'] == 'CA'])
    particle_os = local_OPD.GOS_single(c2)

    for features, os_values in ((data_feature1, protein_os), (data_feature2, particle_os)):
        for column, key in enumerate(column_keys):
            features[key] = os_values[:, column]

    return data_feature1, data_feature2

def get_intpoints(pair_table, res_keys1, res_keys2, pdb_name, chain1, chain2):
    """Return the unique residue ids on each side whose pair 'distance' is below 2.

    Parameters
    ----------
    pair_table : pandas.DataFrame
        Needs the columns 'c1_chain_id', 'c1_residue_number', 'c1_insertion',
        the matching 'c2_*' columns and 'distance'. Any index is accepted.
    res_keys1, res_keys2, pdb_name, chain1, chain2 :
        Unused; kept for call compatibility.

    Returns
    -------
    (c1_ints, c2_ints)
        Unique ids of the form '<chain>_<residue_number>_<insertion>' for the
        first and second partner, in order of first appearance.
    """
    def _residue_ids(prefix):
        columns = [prefix + 'chain_id', prefix + 'residue_number', prefix + 'insertion']
        return pair_table[columns].astype(str).agg('_'.join, axis=1).reset_index(drop=True)

    # Positional mask: the id series are re-indexed from 0, so a label-aligned
    # boolean Series would fail whenever pair_table has a non-default index.
    # NOTE(review): the original carried the remark "distance 4" next to this
    # threshold of 2 — confirm which cutoff is intended.
    selected = (pair_table['distance'] < 2).to_numpy()

    c1_ints = _residue_ids('c1_')[selected].unique()
    c2_ints = _residue_ids('c2_')[selected].unique()

    return c1_ints, c2_ints

In [73]:
def atom_count(c1, c2, data_feature1, data_feature2):
    """Add per-residue atom counts ('N_count' ... 'S_count') to both feature dicts.

    Counts are read from `<new_data_dir>/amino_acid_atom_count.csv`
    (`new_data_dir` is a module-level global). The first CSV column is used as
    the row label and is looked up with the residue names of the CA rows of
    `c1` and of every row of `c2`; a name missing from the table raises
    KeyError. The dicts are modified in place and returned.
    """
    count_table = pd.read_csv(os.path.join(new_data_dir, 'amino_acid_atom_count.csv'))
    count_table.index = count_table.iloc[:,0]

    protein_names = c1[c1['atom_name'] == 'CA']['residue_name']
    particle_names = c2['residue_name']

    element_keys = (('N', 'N_count'), ('C', 'C_count'), ('O', 'O_count'),
                    ('H', 'H_count'), ('S', 'S_count'))
    for features, names in ((data_feature1, protein_names), (data_feature2, particle_names)):
        for element, key in element_keys:
            features[key] = count_table.loc[names][element].to_numpy()

    return data_feature1, data_feature2


def atom_charge(c1, c2, data_feature1, data_feature2):
    """Add per-residue charge features ('N_charge' ... 'S_charge') to both feature dicts.

    Protein values are read from `<new_data_dir>/amino_acid_charge.csv`
    (`new_data_dir` is a module-level global), using the first CSV column as
    the row label and the residue names of the CA rows of `c1` as lookup
    keys. Every nanoparticle charge feature is an all-zero array with one
    entry per row of `c2`. The dicts are modified in place and returned.
    """
    charge_table = pd.read_csv(os.path.join(new_data_dir, 'amino_acid_charge.csv'))
    charge_table.index = charge_table.iloc[:,0]

    protein_names = c1[c1['atom_name'] == 'CA']['residue_name']
    particle_names = c2['residue_name']

    element_keys = (('N', 'N_charge'), ('C', 'C_charge'), ('O', 'O_charge'),
                    ('H', 'H_charge'), ('S', 'S_charge'))

    for element, key in element_keys:
        data_feature1[key] = charge_table.loc[protein_names][element].to_numpy()

    for _, key in element_keys:
        # a fresh array per key, so the features never share storage
        data_feature2[key] = np.zeros(len(particle_names))

    return data_feature1, data_feature2

In [111]:
# Show the nanoparticle PDB files available for featurization.
# NOTE(review): nano_dir is assigned in a cell that appears further down in this
# notebook, so this cell depends on that cell having been run first.
os.listdir(nano_dir)

['dv1_opt.pdb',
 'racv4_opt.pdb',
 'd_v4_opt.pdb',
 'lv1_opt_cam.pdb',
 'rac_v0_opt.pdb',
 'l_v4_opt.pdb',
 'rac_v0_opt_H.pdb',
 'v10_bare.pdb',
 'dv1_opt_cam.pdb',
 'lv1_opt.pdb']

In [128]:
# Batch driver: build and save the pairwise feature matrix for every nanoparticle
# PDB found in nano_dir, paired with one protein chain.
# Depends on nano_dir / out_dir and on the directory globals read by the helper
# functions already being defined in the kernel.
pdb_name = 'actin_alphafold_noh'
chain1 = 'A'

# Order in which the helper functions fill each per-entity feature dict;
# checked below so a silent change in column order cannot reach the saved files.
feature_names = ['res_name', 'pos', 'neg', 'polar', 'amp', 'hp', 'hp_idx',
                 'rd', 'shell', 'poc',
                 'N_count', 'C_count', 'O_count', 'H_count', 'S_count',
                 'N_charge', 'C_charge', 'O_charge', 'H_charge', 'S_charge',
                 'ollivier', 'forman', 'gnm',
                 'fd', 'more_fd_1', 'more_fd_2', 'more_fd_3', 'more_fd_4',
                 'G_os_5', 'G_os_7', 'G_os_10', 'G_os_15']

# The residue-name encoder does not depend on the nanoparticle: fit it once.
aa_mol = ['ALA','ARG','ASN','ASP','CYS','GLN','GLU','GLY','HIS','ILE','LEU','LYS','MET','PHE','PRO','SER','THR','TRP','TYR','VAL', 'MOL']
le = preprocessing.LabelEncoder()
le.fit(aa_mol)

for ii in os.listdir(nano_dir):
    # Ignore anything that is not a PDB file (e.g. checkpoint folders); the
    # name is derived by stripping the 4-character '.pdb' suffix.
    if not ii.endswith('.pdb'):
        continue
    np_name = ii[:-4]

    # File-name pattern kept exactly as before (no separator between chain and
    # nanoparticle name) so previously written outputs are still recognised.
    out_path = os.path.join(out_dir, 'data_feature'+'_'+pdb_name+'_'+chain1+np_name+'.npy')
    if os.path.isfile(out_path):
        # already processed in an earlier run
        continue

    print(pdb_name, chain1, np_name)
    c1, c2, c1_CA, c2_all, c1_incomplete_res = CA_coord(pdb_name, chain1, np_name)

    data_feature1 = {}
    data_feature2 = {}

    data_feature1, data_feature2 = binary_fill(c1, c2, data_feature1, data_feature2)
    print('binary_fill ok')
    data_feature1, data_feature2 = hydrophobicity_fill(data_feature1, data_feature2)
    print('hydrophobicity_fill ok')
    data_feature1, data_feature2 = geometry(pdb_name, chain1, np_name, data_feature1, data_feature2, c1_CA, c2_all)
    print('geometry_fill ok')
    data_feature1, data_feature2 = atom_count(c1, c2, data_feature1, data_feature2)
    print('atom_count_fill ok')
    data_feature1, data_feature2 = atom_charge(c1, c2, data_feature1, data_feature2)
    print('atom_charge_fill ok')
    data_feature1, data_feature2 = graph_curvature(c1, c2, data_feature1, data_feature2)
    print('graph_curvature_fill ok')
    data_feature1, data_feature2 = graph_gnm(c1, c2, c1_CA, c2_all, pdb_name, chain1, np_name, data_feature1, data_feature2)
    print('graph_gnm_fill ok')
    data_feature1, data_feature2 = graph_fd(c1, c2, data_feature1, data_feature2)
    print('graph_fd_fill ok')
    data_feature1, data_feature2 = graph_os(c1, c2, data_feature1, data_feature2)
    print('os fill ok')

    # Replace residue-name strings by integer codes so the matrix is numeric.
    data_feature1['res_name'] = le.transform(data_feature1['res_name'])#.astype(int)
    data_feature2['res_name'] = le.transform(data_feature2['res_name'])

    for item in data_feature1.keys():
        print(item, len(data_feature1[item]))
    for item in data_feature2.keys():
        print(item, len(data_feature2[item]))

    assert list(data_feature1) == list(data_feature2) == feature_names, "unexpected feature order"

    data_feature1_pd = pd.DataFrame(data_feature1)
    data_feature2_pd = pd.DataFrame(data_feature2)

    # One row per (protein residue, nanoparticle atom) pair: protein features
    # followed by nanoparticle features, protein index varying slowest.
    data_feature_pair = pd.DataFrame(
        [np.concatenate(_) for _ in itertools.product(data_feature1_pd.to_numpy(), data_feature2_pd.to_numpy())])

    pair_table = distance_data_range(c1, c2, c1_CA, c2_all)
    data_feature_pair['distance'] = pair_table['distance']
    np.save(out_path, data_feature_pair)

In [127]:
# Display the pairwise feature matrix left in the kernel by the most recent run
# (protein features, then nanoparticle features, then the 'distance' label column).
data_feature_pair

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,distance
0,12.0,0.0,0.0,0.0,1.0,1.0,1.9,4.827,75.07,10.12,...,1.119918,0.222222,0.666667,0.930556,1.000000,-1.394281e+06,-1.272552e+06,7.352147e+05,-1.195600e+05,1.0
1,12.0,0.0,0.0,0.0,1.0,1.0,1.9,4.827,75.07,10.12,...,1.345470,0.166667,0.569444,0.930556,1.000000,-2.991394e+03,1.504753e+05,-6.506870e+05,1.453358e+06,1.0
2,12.0,0.0,0.0,0.0,1.0,1.0,1.9,4.827,75.07,10.12,...,1.271862,0.138889,0.388889,0.680556,0.930556,3.488362e+05,-7.599788e+05,-2.482775e+06,-9.458188e+05,1.0
3,12.0,0.0,0.0,0.0,1.0,1.0,1.9,4.827,75.07,10.12,...,1.431116,0.152778,0.513889,0.972222,1.000000,-3.707579e+06,-2.742700e+06,-9.727098e+04,6.782258e+05,1.0
4,12.0,0.0,0.0,0.0,1.0,1.0,1.9,4.827,75.07,10.12,...,1.593109,0.083333,0.388889,0.736111,0.972222,2.492208e+03,-1.766624e+05,4.157355e+04,9.284627e+04,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27370,14.0,0.0,0.0,0.0,0.0,1.0,2.8,4.589,78.63,12.75,...,1.288714,0.125000,0.180556,0.375000,0.666667,7.237721e+05,2.860090e+06,-9.213698e+05,-6.796138e+05,1.0
27371,14.0,0.0,0.0,0.0,0.0,1.0,2.8,4.589,78.63,12.75,...,1.160785,0.166667,0.347222,0.611111,0.888889,-4.065239e+06,-4.544109e+04,-1.217978e+05,-1.111925e+05,1.0
27372,14.0,0.0,0.0,0.0,0.0,1.0,2.8,4.589,78.63,12.75,...,1.212356,0.152778,0.347222,0.611111,0.888889,1.518490e+06,-7.439433e+06,-2.483649e+06,-2.429507e+05,1.0
27373,14.0,0.0,0.0,0.0,0.0,1.0,2.8,4.589,78.63,12.75,...,1.491225,0.083333,0.180556,0.375000,0.666667,-2.091395e+06,-4.544109e+04,1.904218e+06,2.127728e+06,1.0


In [74]:
# Protein structure (file name without '.pdb') and chain to featurize.
pdb_name = 'actin_alphafold_noh'
chain1 = 'A'
# NOTE(review): np_name is left commented out here, so the cells below that use
# np_name depend on it already being set in the kernel.
# np_name = 'lv1_opt_cam_noh'

In [114]:
# Directory layout, relative to the notebook's working directory.
# NOTE(review): this cell appears after the cells that use these names; it must
# be run before them on a fresh kernel.
data_dir = '../../data'
# protein PDB files (read by CA_coord and graph_gnm)
protein_dir = os.path.join(data_dir, 'pdb_protein')
# nanoparticle PDB files (read by CA_coord and graph_gnm, listed by the batch loop)
nano_dir = os.path.join(data_dir, 'pdb_np')
# precomputed geometry text files (read by geometry)
geometry_dir = os.path.join(data_dir, 'geometry')
# not referenced by any code visible in this notebook section
pdb_list_dir = os.path.join(data_dir, 'PPI_list')
# amino-acid atom-count / charge CSV tables (read by atom_count and atom_charge)
new_data_dir = os.path.join(data_dir, 'charge_count')
# not referenced by any code visible in this notebook section
pair_table_dir = os.path.join(data_dir, 'pair_table')
# not referenced by any code visible in this notebook section
data_feature_dir = os.path.join(data_dir, 'descriptors_matrix')

In [116]:
# Destination for the saved pairwise feature matrices (.npy files).
out_dir = os.path.join(data_dir, 'processed_test_data', 'PNI')

In [None]:
# Save the current pairwise matrix with the same file-name pattern as the batch loop.
# Uses whatever pdb_name, chain1, np_name and data_feature_pair are currently in the kernel.
np.save(os.path.join(out_dir, 'data_feature'+'_'+pdb_name+'_'+chain1+np_name+'.npy'), data_feature_pair)

In [76]:
# Single-pair walkthrough, step 1: load the atom tables and coordinates for the
# protein/nanoparticle pair currently named in the kernel.
print(pdb_name, chain1, np_name)
c1, c2, c1_CA, c2_all, c1_incomplete_res = CA_coord(pdb_name, chain1, np_name)

actin_alphafold_noh A lv1_opt_cam_noh


In [77]:
# Step 2: per-atom residue ids ('<chain>_<residue_number>_<insertion>') and
# empty feature containers.
# NOTE(review): res_nums1 / res_nums2 and feature_names are not read by any
# other cell visible in this notebook section.
res_nums1 = c1[['chain_id', 'residue_number', 'insertion']].astype(str).agg('_'.join, axis=1).reset_index(drop=True)
res_nums2 = c2[['chain_id', 'residue_number', 'insertion']].astype(str).agg('_'.join, axis=1).reset_index(drop=True)

# data_feature1: protein (one entry per CA atom); data_feature2: nanoparticle atoms
data_feature1 = {}
data_feature2 = {}

# Order in which the fill functions below add their keys to each dict.
feature_names = ['res_name', 'pos', 'neg', 'polar', 'amp', 'hp', 'hp_idx',
                 'rd', 'shell', 'poc',
                 'N_count', 'C_count', 'O_count', 'H_count', 'S_count',
                 'N_charge', 'C_charge', 'O_charge', 'H_charge', 'S_charge',
                 'ollivier', 'forman', 'gnm',
                 'fd', 'more_fd_1', 'more_fd_2', 'more_fd_3', 'more_fd_4',
                 'G_os_5', 'G_os_7', 'G_os_10', 'G_os_15']

In [78]:
# Step 3: run every feature-filling stage in order. Each stage adds its keys to
# both dicts; the print after each call marks how far the pipeline got.
data_feature1, data_feature2 = binary_fill(c1, c2, data_feature1, data_feature2)
print('binary_fill ok')
data_feature1, data_feature2 = hydrophobicity_fill(data_feature1, data_feature2)
print('hydrophobicity_fill ok')
data_feature1, data_feature2 = geometry(pdb_name, chain1, np_name, data_feature1, data_feature2, c1_CA, c2_all)
print('geometry_fill ok')
data_feature1, data_feature2 = atom_count(c1, c2, data_feature1, data_feature2)
print('atom_count_fill ok')
data_feature1, data_feature2 = atom_charge(c1, c2, data_feature1, data_feature2)
print('atom_charge_fill ok')
data_feature1, data_feature2 = graph_curvature(c1, c2, data_feature1, data_feature2)
print('graph_curvature_fill ok')
# np_name is passed in the position of graph_gnm's `chain2` parameter
data_feature1, data_feature2 = graph_gnm(c1, c2, c1_CA, c2_all, pdb_name, chain1, np_name, data_feature1, data_feature2)
print('graph_gnm_fill ok')
data_feature1, data_feature2 = graph_fd(c1, c2, data_feature1, data_feature2)
print('graph_fd_fill ok')
data_feature1, data_feature2 = graph_os(c1, c2, data_feature1, data_feature2)
print('os fill ok')

binary_fill ok
hydrophobicity_fill ok
geometry_fill ok


In [82]:
# Step 4: replace residue-name strings with integer codes. The encoder is fitted
# on the 20 amino-acid codes plus 'MOL'.
# NOTE(review): this overwrites 'res_name' in place, so re-running the cell
# feeds the already-encoded integers back into an encoder fitted on strings.
aa_mol = ['ALA','ARG','ASN','ASP','CYS','GLN','GLU','GLY','HIS','ILE','LEU','LYS','MET','PHE','PRO','SER','THR','TRP','TYR','VAL', 'MOL']
le = preprocessing.LabelEncoder()
le.fit(aa_mol)
data_feature1['res_name'] = le.transform(data_feature1['res_name'])#.astype(int)
data_feature2['res_name'] = le.transform(data_feature2['res_name'])

In [83]:
# Step 5: print the length of every feature (protein first, then nanoparticle)
# as a sanity check, then turn both dicts into DataFrames.
for features in (data_feature1, data_feature2):
    for item, values in features.items():
        print(item, len(values))

data_feature1_pd, data_feature2_pd = pd.DataFrame(data_feature1), pd.DataFrame(data_feature2)

res_name 375
pos 375
neg 375
polar 375
amp 375
hp 375
hp_idx 375
rd 375
shell 375
poc 375
N_count 375
C_count 375
O_count 375
H_count 375
S_count 375
N_charge 375
C_charge 375
O_charge 375
H_charge 375
S_charge 375
ollivier 375
forman 375
gnm 375
fd 375
more_fd_1 375
more_fd_2 375
more_fd_3 375
more_fd_4 375
G_os_5 375
G_os_7 375
G_os_10 375
G_os_15 375
res_name 73
pos 73
neg 73
polar 73
amp 73
hp 73
hp_idx 73
rd 73
shell 73
poc 73
N_count 73
C_count 73
O_count 73
H_count 73
S_count 73
N_charge 73
C_charge 73
O_charge 73
H_charge 73
S_charge 73
ollivier 73
forman 73
gnm 73
fd 73
more_fd_1 73
more_fd_2 73
more_fd_3 73
more_fd_4 73
G_os_5 73
G_os_7 73
G_os_10 73
G_os_15 73


In [86]:
# Step 6: one row per (protein residue, nanoparticle atom) pair — the protein
# feature row followed by the nanoparticle feature row, protein index varying slowest.
data_feature_pair = pd.DataFrame([np.concatenate(_) for _ in itertools.product(data_feature1_pd.to_numpy(), data_feature2_pd.to_numpy())])

In [102]:
# Step 7: append the contact label. distance_data_range enumerates pairs in the
# same order as the matrix above, and the column is assigned by index.
# The 'distance' column holds the 0/1 label written by distance_data_range.
pair_table = distance_data_range(c1, c2, c1_CA, c2_all)
data_feature_pair['distance'] = pair_table['distance']

Len_CA_all_res 27375


In [118]:
# Scratch copy of the matrix, written to the current working directory.
np.save('test.npy', data_feature_pair.to_numpy())

In [104]:
# Load a previously saved matrix for the same protein/nanoparticle pair to
# compare against the freshly computed one.
# NOTE(review): hard-coded absolute path on one machine; it looks like the same
# file the save cell writes under out_dir — confirm and derive it from out_dir.
df1 = np.load('/home/jqma/PycharmProjects/unified_fast/data/processed_test_data/PNI/data_feature_actin_alphafold_noh_Alv1_opt_cam_noh.npy')

In [105]:
# Binarise the last column of the reference matrix in place: every non-zero
# value becomes 1, so it is comparable with the 0/1 label computed above.
df1[df1[:, -1] != 0, -1] = 1

In [106]:
# Column indices in which the fresh matrix and the reference differ
# (beyond np.isclose's default tolerances).
np.unique(np.nonzero(~np.isclose(data_feature_pair, df1))[1])

array([21, 53])