In [916]:
# Code developed by KEVIN ESTIBEIRO - Student ID 202205805 - x2022FYV
# Welcome to the Toxic Chemical Prediction program 

# This program reads in training and test data, and uses rdKit and ML models to
# predict toxicity in the test data. If the chemical is toxic, it is represented
# as 1, else as 0. 
# FEATURES: There are 209 Features used in this program (208 from rdkit
# molecular descriptors +  Assay Id). Data exploration is done to identify missing
# or Null values in the training data, and these are imputed by the Mean value.
# The training data was modeled using different classifiers such as the 
# HistGradientBoosting classifier, XGBoost classifier, AdaBoost classifier, 
# RandomForest classifier, LGBM classifier, CatBoost classifier, etc.
# Prediction results for given test data are written to submission.csv output file.


# Importing libraries
import numpy as np  
import pandas as pd  
import os
import csv
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from sklearn import tree
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier
import xgboost as xgb

In [917]:
# Loading the DESCRIPTORS train and test data from local path
train_data = pd.read_csv('train_with_descriptors.csv')
test_data = pd.read_csv('test_with_descriptors.csv')

# Loading the original train and test data from kaggle
original_train_data = pd.read_csv("train_II.csv")
original_test_data = pd.read_csv("test_II.csv")

# Splitting first column of original train data to get Chemical Id and Assay Id columns in a new training dataset 
newtrain_data = original_train_data['Id'].str.split(';',n=1,expand=True).rename(columns={0:'Id',1:'AssayId'})
# Added a new column AssayId to train_data which contains the values of AssayId column from newtrain_data
train_data['AssayId'] = newtrain_data['AssayId'] 

# Splitting first column of original test data to get Chemical Id and Assay Id columns in a new test dataset 
newtest_data = original_test_data['x'].str.split(';',n=1,expand=True).rename(columns={0:'Id',1:'AssayId'})
# Added a new column AssayId to test_data which contains the values of AssayId column from newtest_data
test_data["AssayId"] = newtest_data["AssayId"]

In [918]:
#The "Expected" column from original_train_data is assigned to y_train
y_train = original_train_data["Expected"]

In [919]:
newtrain_data.head(3)

Unnamed: 0,Id,AssayId
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644
1,CCCCCCCCC(=O)C,2451
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384


In [920]:
# To know the statistical measures for both training data and test data
print("\nTraining data statistics:  \n",train_data.describe())
print("\nTest data statistics:  \n",test_data.describe())


Training data statistics:  
        MaxEStateIndex  MinEStateIndex  MaxAbsEStateIndex  MinAbsEStateIndex  \
count    75383.000000    75383.000000       75383.000000       75383.000000   
mean        10.139974       -1.231018          10.139974           0.246767   
std          6.731101        6.204736           6.731101           5.968613   
min       -666.000000     -666.000000        -666.000000        -666.000000   
25%          8.982088       -1.688251           8.982088           0.041146   
50%         11.058519       -0.588989          11.058519           0.149429   
75%         12.340562       -0.001335          12.340562           0.339410   
max         17.262020        2.120000          17.262020           7.668295   

                qed         MolWt  HeavyAtomMolWt    ExactMolWt  \
count  75383.000000  75383.000000    75383.000000  75383.000000   
mean       0.516070    286.463619      270.176704    285.960991   
std        5.949503    138.420129      132.407715    138.

In [921]:
# To check if any null values exist in any column of the training data or test data
print("\nNull or NA values in the Training data: ",train_data.isna().sum(), "\n")
print("\nNull or NA values in the Test data: ",test_data.isna().sum(), "\n")


Null or NA values in the Training data:  MaxEStateIndex       0
MinEStateIndex       0
MaxAbsEStateIndex    0
MinAbsEStateIndex    0
qed                  0
                    ..
fr_thiocyan          0
fr_thiophene         0
fr_unbrch_alkane     0
fr_urea              0
AssayId              0
Length: 209, dtype: int64 


Null or NA values in the Test data:  MaxEStateIndex       0
MinEStateIndex       0
MaxAbsEStateIndex    0
MinAbsEStateIndex    0
qed                  0
                    ..
fr_thiocyan          0
fr_thiophene         0
fr_unbrch_alkane     0
fr_urea              0
AssayId              0
Length: 209, dtype: int64 



In [922]:
# To check for any duplicate rows in train and test data
print("Duplicate rows in train data: ",train_data.duplicated().sum())
print("Duplicate rows in test data: ",test_data.duplicated().sum())

Duplicate rows in train data:  189
Duplicate rows in test data:  7


In [923]:
# Ensuring NA or Null values are substituted by mean of those columns
x_train = train_data
x_train = x_train.fillna(x_train.mean())

In [924]:
x_train['AssayId']= x_train['AssayId'].astype(int)


In [925]:
test_data['AssayId']= test_data['AssayId'].astype(int)

In [926]:
# Label encoding for Expected column in training data to convert to categorical data
le = LabelEncoder()
y_train = le.fit_transform(original_train_data["Expected"])

In [927]:
# Training the model using the CatBoost classifier (params refers to parameters)
params = {'learning_rate': 0.2, 'max_depth': 12, 'n_estimators': 500, 'random_state': 42}
rfc = CatBoostClassifier(**params)

In [928]:
# Fitting the model with train data
rfc.fit(x_train, y_train)

0:	learn: 0.5471738	total: 1.59s	remaining: 13m 14s
1:	learn: 0.4697503	total: 3.22s	remaining: 13m 20s
2:	learn: 0.4210519	total: 4.66s	remaining: 12m 51s
3:	learn: 0.3856607	total: 6.09s	remaining: 12m 34s
4:	learn: 0.3679920	total: 7.46s	remaining: 12m 18s
5:	learn: 0.3510917	total: 8.91s	remaining: 12m 13s
6:	learn: 0.3408401	total: 10.4s	remaining: 12m 12s
7:	learn: 0.3333477	total: 11.8s	remaining: 12m 5s
8:	learn: 0.3266269	total: 13.2s	remaining: 11m 58s
9:	learn: 0.3195334	total: 14.6s	remaining: 11m 56s
10:	learn: 0.3142953	total: 16s	remaining: 11m 53s
11:	learn: 0.3093420	total: 17.5s	remaining: 11m 51s
12:	learn: 0.3043422	total: 18.9s	remaining: 11m 47s
13:	learn: 0.2999678	total: 20.4s	remaining: 11m 47s
14:	learn: 0.2955959	total: 21.8s	remaining: 11m 45s
15:	learn: 0.2920096	total: 23.2s	remaining: 11m 43s
16:	learn: 0.2885845	total: 24.7s	remaining: 11m 40s
17:	learn: 0.2848007	total: 26.1s	remaining: 11m 39s
18:	learn: 0.2835419	total: 27.6s	remaining: 11m 37s
19:	le

<catboost.core.CatBoostClassifier at 0x1a8f843f4c0>

In [929]:
# Making predictions on the test data using the trained model
test_data = test_data.fillna(test_data.mean())
y_pred = rfc.predict(test_data)

In [None]:
# Finding cross validation F1 scores
# This code was modified after sourcing from projectpro.io/recipes/check-models-f1-score-using-cross-validation-in-python 
print(cross_val_score(rfc, x_train, y_train, scoring="f1", cv=5))
mean_score = cross_val_score(rfc, x_train, y_train, scoring="f1", cv=5).mean()
print("\n The Mean score is ", mean_score)
# std_score = cross_val_score(rfc, x_train, y_train, scoring="f1", cv=5).std()
# print("\n The Standard score is ", std_score)

0:	learn: 0.5575408	total: 1.53s	remaining: 12m 45s
1:	learn: 0.4808287	total: 3.13s	remaining: 13m
2:	learn: 0.4413773	total: 3.38s	remaining: 9m 20s
3:	learn: 0.4038441	total: 4.9s	remaining: 10m 7s
4:	learn: 0.3777803	total: 6.47s	remaining: 10m 40s
5:	learn: 0.3592567	total: 8.01s	remaining: 10m 59s
6:	learn: 0.3476352	total: 9.63s	remaining: 11m 18s
7:	learn: 0.3383094	total: 11.4s	remaining: 11m 41s
8:	learn: 0.3317857	total: 13s	remaining: 11m 48s
9:	learn: 0.3245881	total: 14.6s	remaining: 11m 55s
10:	learn: 0.3190458	total: 16.2s	remaining: 12m
11:	learn: 0.3137345	total: 17.7s	remaining: 12m 1s
12:	learn: 0.3080526	total: 19.4s	remaining: 12m 6s
13:	learn: 0.3031987	total: 21s	remaining: 12m 8s
14:	learn: 0.3005925	total: 22.6s	remaining: 12m 9s
15:	learn: 0.2968899	total: 24.3s	remaining: 12m 13s
16:	learn: 0.2940301	total: 25.8s	remaining: 12m 12s
17:	learn: 0.2920821	total: 27.4s	remaining: 12m 12s
18:	learn: 0.2878398	total: 29s	remaining: 12m 13s
19:	learn: 0.2838470	tot

In [None]:
get_ids = pd.read_csv("test_II.csv")

In [None]:
# The inverse transform function is invoked to reverse the process of label encoding.
output = pd.DataFrame({'Id': get_ids['x'], 'Predicted': le.inverse_transform(y_pred)})
# The predictions indicating if the chemicals are toxic are output to the 'submission.csv' file
output.to_csv('submission63.csv', index=False)
print("\nToxicity Predictions are available in submission63.csv   End of program!!")