In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import inchi
from tqdm import tqdm
from time import sleep
from tqdm.notebook import tqdm

In [None]:
#Load the files that we want to merge: k-train, c-train

def load_files(transport):
    """Read the two training tables that will be merged for one transport type.

    Parameters
    ----------
    transport : str
        Transport identifier interpolated into both file names
        (the driver loop passes 'influx', 'efflux', 'pampa' and 'bbb').

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(c_train, k_train)``: the Cornelissen table followed by the Kadar
        table, each read with the first CSV column as the index.
    """
    # Paths are relative to the notebook's working directory.
    csv_paths = (
        f'../cornelissen_data_prep/datasets/cornelissen_{transport}_train_raw.csv',
        f'../kadar_data_prep/train_data/kadar_{transport}_train.csv',
    )
    c_train, k_train = (pd.read_csv(csv_path, index_col=0) for csv_path in csv_paths)
    return c_train, k_train


In [None]:
#Code the classes: 1 for positive, 0 for negative class

def fix_class(df,t):
    """Encode the textual class labels of ``status_<t>`` as 1.0 / 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``status_<t>`` column. The column is overwritten
        in place, so the caller's frame is modified.
    t : str
        Transport type. 'influx', 'efflux', 'pampa' and 'bbb' are recognised;
        for any other value the frame is returned untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the positive label mapped to 1.0 and the
        negative label mapped to 0.0. Values that are not in the label
        vocabulary are left as they are.
    """
    # Label vocabulary per transport type: positive -> 1.0, negative -> 0.0.
    label_codes = {
        "influx": {'Substrate': 1.0, 'Non-substrate': 0.0},
        "efflux": {'Substrate': 1.0, 'Non-substrate': 0.0},
        "pampa": {'high': 1.0, 'low': 0.0},
        "bbb": {'BBB+': 1.0, 'BBB-': 0.0},
    }

    mapping = label_codes.get(t)
    if mapping is None:
        # Unknown transport type: nothing to recode.
        return df

    status_col = f'status_{t}'
    # `replace` on an object column no longer downcasts silently in newer
    # pandas, so the numeric dtype is requested explicitly; a column that
    # still holds unmapped strings simply stays object.
    df[status_col] = df[status_col].replace(mapping).infer_objects()

    return df


In [None]:
#Concat datasets

def concat_dfs(df1,df2):
    """Stack two tables row-wise, keeping only the columns they share.

    The original row labels are discarded; the result is indexed 0..n-1 with
    the rows of ``df1`` first, followed by the rows of ``df2``.
    """
    frames = [df1, df2]
    return pd.concat(frames, join="inner", ignore_index=True)

In [None]:
#Remove all contradicting duplicates, and ensure all the other molecules are present only once

def remove_duplicates(df,t):
    """Drop conflicting molecules and collapse the remaining repeats.

    Molecules are identified by the ``inchi_connectivity`` column. Any
    identifier that appears with more than one distinct value in
    ``status_<t>`` is removed entirely; for every other identifier only the
    first row is kept. Row counts are printed after each step.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``inchi_connectivity`` and ``status_<t>`` columns.
    t : str
        Transport type used to build the status column name.

    Returns
    -------
    pandas.DataFrame
        Filtered table with a fresh 0..n-1 index.
    """
    key_col = 'inchi_connectivity'
    label_col = f'status_{t}'

    print(f'----{t}----')
    print(f'length: {len(df)}')
    print(f'unique_inchi: {df[key_col].nunique()}')

    # Number of distinct labels seen for each molecule identifier.
    labels_per_key = df.groupby(key_col)[label_col].nunique()
    conflicting_keys = labels_per_key.loc[labels_per_key > 1].index
    print(f'Contradicting duplicates: {len(conflicting_keys)}')

    # Discard every row of a molecule whose labels disagree.
    is_conflicting = df[key_col].isin(conflicting_keys)
    consistent = df[~is_conflicting]
    print(f'After contradicting removed: {len(consistent)}')

    # Remaining repeats agree on the label, so the first occurrence suffices.
    deduped = consistent.drop_duplicates(subset=[key_col], keep="first")
    deduped = deduped.reset_index(drop=True)
    print(f'After duplicated removed: {len(deduped)}')

    return deduped

In [None]:
#Define transports
transports = ['influx','efflux','pampa','bbb']

#Run the functions to create the combined training datasets
for t in transports:
    c_train_raw, k_train_raw = load_files(t)        #Load the files for merge
    k_train = fix_class(k_train_raw,t)              #Code the classes with 0 and 1
    df_combined = concat_dfs(c_train_raw,k_train)   #Concat k-train and c-train
    df = remove_duplicates(df_combined,t)           #Deduplicate; result already has a fresh 0..n-1 index

    file_name=f'datasets/combined_{t}_train_raw.csv'    #Define name for saving file
    df.to_csv(file_name,index=True)                 #Save file (index written as the first column)