## **Importing libraries**

In [3]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target protein**

### **Target search for SARS coronavirus 3C-like proteinase**

Select and retrieve bioactivity data for the target protein. 3C-like proteinase, is key enzyme in SARS-CoV-2

In [4]:
# Target search for coronavirus
target = new_client.target
target_query = target.search('coronavirus')
targets = pd.DataFrame.from_dict(target_query)
targets = targets.drop(columns=['species_group_flag', 'cross_references'])
targets

Unnamed: 0,organism,pref_name,score,target_chembl_id,target_components,target_type,tax_id
0,Coronavirus,Coronavirus,17.0,CHEMBL613732,[],ORGANISM,11119
1,Feline coronavirus,Feline coronavirus,14.0,CHEMBL612744,[],ORGANISM,12663
2,Murine coronavirus,Murine coronavirus,14.0,CHEMBL5209664,[],ORGANISM,694005
3,Canine coronavirus,Canine coronavirus,14.0,CHEMBL5291668,[],ORGANISM,11153
4,Human coronavirus 229E,Human coronavirus 229E,13.0,CHEMBL613837,[],ORGANISM,11137
5,Human coronavirus OC43,Human coronavirus OC43,13.0,CHEMBL5209665,[],ORGANISM,31631
6,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,10.0,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,694009
7,Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,CHEMBL4296578,[],ORGANISM,1335626
8,Severe acute respiratory syndrome-related coro...,Replicase polyprotein 1ab,4.0,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,694009
9,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


In [5]:
selected_target = targets.target_chembl_id.loc[targets['pref_name'].str.contains('3C-like proteinase')].iloc[0]
selected_target

'CHEMBL3927'

**Why filter by 'IC50'?**

'IC50' stands for half maximal inhibitory concentration. It’s a key pharmacological measurement that tells How much of a substance (a drug or compound) is needed to inhibit a given biological process (or enzyme) by 50%.

The standard value is the potency of the drug. The lower the number the better the potency of the drug becomes. The number reflects the concentration. If you have a higher number it means that you requirer a higher concentration of drug for the same effect.

- It's a standard and comparable way to evaluate inhibitory potency.
- Lower IC50 values mean the compound is more potent at inhibiting the target.





In [6]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type='IC50')
df = pd.DataFrame.from_dict(res)

# Find and remove entries with null standard_value
df = df[df.standard_value.notna()]

In [7]:
df.to_csv('coronavirus_3C-like_protease.csv', index=False)

## Handeling missing data or irrelevant columns

Classifying columns by dtype and checking unique values for the int ones, we can say thar `src_id` and `standard_flag` columns are irrevelant therefore we can drop them.

We will also drop the 0 Non-Null columns. And we will study columns where more than the 75% of the values are null in order to decide if they can be dropped or not.

In [10]:
dtype_classification = df.dtypes.to_dict()

object_columns = [column for column, dtype in dtype_classification.items() if dtype == 'object']
int_columns = [column for column, dtype in dtype_classification.items() if dtype == 'int64']

# Print the classification
print("Object Columns:", object_columns)
print("Integer Columns:", int_columns)

for col in int_columns:
    if len(df[col].unique()) < 10:
        print(f'{col}: {df[col].unique()}')

Object Columns: ['action_type', 'activity_comment', 'activity_properties', 'assay_chembl_id', 'assay_description', 'assay_type', 'assay_variant_accession', 'assay_variant_mutation', 'bao_endpoint', 'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment', 'data_validity_description', 'document_chembl_id', 'document_journal', 'ligand_efficiency', 'molecule_chembl_id', 'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value', 'qudt_units', 'relation', 'standard_relation', 'standard_text_value', 'standard_type', 'standard_units', 'standard_upper_value', 'standard_value', 'target_chembl_id', 'target_organism', 'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type', 'units', 'uo_units', 'upper_value', 'value']
Integer Columns: ['activity_id', 'document_year', 'potential_duplicate', 'record_id', 'src_id', 'standard_flag']
document_year: [2005 2006 2007 2008 2010 2012 2013 2019 2023]
potential_duplicate: [0 1]
src_id: [1]
standard_flag: [1]


In [11]:
df.drop(columns=['src_id','standard_flag'], inplace=True)

null_columns = df.columns[df.isnull().sum() == len(df)]
if null_columns.empty:
    print("There are no columns with zero non-null values.")
else:
    print("Columns with zero non-null values:")
    for col in null_columns:
        print(col)

df.drop(columns=[col for col in null_columns],inplace=True)

Columns with zero non-null values:
activity_comment
assay_variant_accession
assay_variant_mutation
standard_text_value
standard_upper_value
text_value
toid
upper_value


In [13]:
for col in df.columns[df.isnull().sum() >= 0.75*len(df)]:
    if df[col].apply(lambda x: isinstance(x, (dict, list))).any():
        print(f'Column "{col}" contains unhashable types (dict or list).')
    else:
        non_null_values = df[col].dropna().unique()  # Get unique non-null values
        if non_null_values.size > 0:  # Check if there are any non-null values
            print(f'Non-null values in column "{col}": {non_null_values}, count: {df[col].notna().sum()}')

Column "action_type" contains unhashable types (dict or list).
Non-null values in column "data_validity_comment": ['Outside typical range'], count: 31
Non-null values in column "data_validity_description": ['Values for this activity type are unusually large/small, so may not be accurate'], count: 31
Non-null values in column "molecule_pref_name": ['BETULINIC ACID' 'NICLOSAMIDE' 'CURCUMIN' 'SAVININ' 'HINOKININ'
 'BETUNOLIC ACID' 'TANSHINONE IIA' 'TANSHINONE IIB' 'METHYL TANSHINONATE'
 'CRYPTOTANSHINONE' 'TANSHINONE I' 'DIHYDROTANSHINONE I' 'MILTIRONE'
 'CINANSERIN'], count: 36


Since the unique values for `data_validity_comment` and `data_validity_description` are Null or non informative, we can drop them as well.

Action type is unhasable (dict or list) therefore further analysis is needed. `molecule_pref_name` will be keeped for now as well.

## Data pre-processing of the bioactivity data

### **Labeling compounds as either being active, inactive or intermediate**
The bioactivity data is in the IC50 unit. Compounds having values of less than 1000 nM will be considered to be **active** while those greater than 10,000 nM will be considered to be **inactive**. As for those values in between 1,000 and 10,000 nM will be referred to as **intermediate**. 

In [14]:
bioactivity_class = []
for i in df.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

bioactivity_class = pd.Series(bioactivity_class, name='bioactivity_class')

`molecule_chembl_id`: chemical structure that produces a modulatory activity. It exerts some effect on the target protein.
Multiple compounds (rows) might contain the same molecule. For simplicity we will keep only one of them.


In [15]:
bioactivity_columns = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df_bioactivity = df[bioactivity_columns]
pd.concat([df_bioactivity, bioactivity_class], axis=1)

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0,intermediate
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0,intermediate
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0,inactive
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0,inactive
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0,intermediate
...,...,...,...,...
244,CHEMBL2365410,CC(C)C[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](CC1C...,161.0,active
245,CHEMBL5436771,S=C([S-])NCc1cccnc1.[K+],165.96,
246,CHEMBL2365410,CC(C)C[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](CC1C...,162.18,
220,,,,inactive


In [16]:
df_bioactivity.to_csv('bioactivity_preprocessed_data.csv', index=False)

```python
mol_cid = [i for i in df.molecule_chembl_id]
canonical_smiles = [i for i in df.canonical_smiles]
standard_value = [i for i in df.standard_value]

data_tuple = list(zip(mol_cid, canonical_smiles, bioactivity_class, standard_value))
df_bioactivity = pd.DataFrame(data_tuple, columns=['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class', 'standard_value'])