# HAICK 2023<br/>
### PCBM challenge

# Utils

In [3]:
!kaggle competitions download -c pcbm-challenge

Downloading pcbm-challenge.zip to /content
 80% 41.0M/51.2M [00:00<00:00, 60.7MB/s]
100% 51.2M/51.2M [00:00<00:00, 67.3MB/s]


In [4]:
!unzip pcbm-challenge.zip 

Archive:  pcbm-challenge.zip
  inflating: ERBB1.csv               
  inflating: ERBB2.csv               
  inflating: FLT-3.csv               
  inflating: HDACL1.csv              
  inflating: LCK.csv                 
  inflating: sample3.csv             
  inflating: test_ERBB1_Target_Descriptors.csv  
  inflating: test_ERBB2_Target_Descriptors.csv  
  inflating: test_FLT-3_Target_Descriptors.csv  
  inflating: test_HDACL1_Target_Descriptors.csv  
  inflating: test_LCK_Target_Descriptors.csv  


In [129]:
! pip install feature-engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature-engine
  Downloading feature_engine-1.5.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.0/290.0 KB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.5.2


---------------------

# Data & Packages importations

In [130]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt

In [218]:
# ERBB1
train_df_1 = pd.read_csv('ERBB1.csv')
test_df_1 = pd.read_csv('test_ERBB1_Target_Descriptors.csv')

# ERBB2
train_df_2 = pd.read_csv('ERBB2.csv')
test_df_2 = pd.read_csv('test_ERBB2_Target_Descriptors.csv')

# FLT-3 
train_df_3 = pd.read_csv('FLT-3.csv')
test_df_3 = pd.read_csv('test_FLT-3_Target_Descriptors.csv')

# HDACL1
train_df_4 = pd.read_csv('HDACL1.csv')
test_df_4 = pd.read_csv('test_HDACL1_Target_Descriptors.csv')

# LcK
train_df_5 = pd.read_csv('LCK.csv')
test_df_5 = pd.read_csv('test_LCK_Target_Descriptors.csv')

# Exploratory Data Analysis

In [7]:
test_df_1.head()

Unnamed: 0.1,Unnamed: 0,CHEMBL_ID,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AATSC8i,AATSC0s,AATSC1s,AATSC2s,AATSC3s,AATSC4s,AATSC5s,AATSC6s,AATSC7s,AATSC8s
0,0,CHEMBL2064389_1,-0.5467,0.298881,45.8532,61.742653,16,17,53,32,...,-0.267073,2.39676,-0.038196,0.130467,0.270203,-0.005721,-0.061523,0.08675,0.121923,-0.383883
1,1,CHEMBL3775897_1,-1.8615,3.465182,55.8179,70.440997,16,17,61,32,...,-0.130088,0.812132,0.015281,0.061601,0.058054,0.042514,-0.004266,0.032531,0.089533,-0.146941
2,2,CHEMBL4084868_1,-0.6346,0.402717,100.9378,99.762513,18,18,86,45,...,-0.156428,0.933019,0.014589,0.080341,-0.003335,-0.063086,-0.103296,0.046651,0.150547,-0.004654
3,3,CHEMBL2325098_1,0.7808,0.609649,26.0473,61.919067,22,23,47,28,...,0.0192,0.247606,0.001568,-0.026382,-0.006207,-0.021769,-0.012945,0.013427,0.007537,-0.014867
4,4,CHEMBL2087358_1,-0.2229,0.049684,43.6652,57.22386,16,17,48,28,...,0.05934,0.574681,-0.019619,-0.10417,0.027906,0.02078,-0.012281,-0.016431,-0.053559,0.040081


# Data Preprocessing and Cleaning

In [219]:
train_df_1['Activity'] = train_df_1['Activity'].map({'active': 1, 'nonactive': 0})
train_df_2['Activity'] = train_df_2['Activity'].map({'active': 1, 'nonactive': 0})
train_df_3['Activity'] = train_df_3['Activity'].map({'active': 1, 'nonactive': 0})
train_df_4['Activity'] = train_df_4['Activity'].map({'active': 1, 'nonactive': 0})
train_df_5['Activity'] = train_df_5['Activity'].map({'active': 1, 'nonactive': 0})

In [220]:
train_df_1.drop('CHEMBL_ID', axis=1, inplace=True)
train_df_2.drop('CHEMBL_ID', axis=1, inplace=True)
train_df_3.drop('CHEMBL_ID', axis=1, inplace=True)
train_df_4.drop('CHEMBL_ID', axis=1, inplace=True)
train_df_5.drop('CHEMBL_ID', axis=1, inplace=True)

In [221]:
# Count the number of NaN values in each column
nan_counts = train_df_1.isna().sum()
nan_counts.describe()

count    255.000000
mean       1.129412
std        2.791106
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        8.000000
dtype: float64

In [222]:
train_df_1 = train_df_1.fillna(0)
train_df_2 = train_df_2.fillna(0)
train_df_3 = train_df_3.fillna(0)
train_df_4 = train_df_4.fillna(0)
train_df_5 = train_df_5.fillna(0)

# Feature Engineering

In [None]:
! pip install featurewiz

In [None]:
from featurewiz import featurewiz
# import optuna

In [None]:
# automatic feature selection by using featurewiz package
target = 'Activity'
 
features_1, train = featurewiz(train_df_1, target, corr_limit=0.7, verbose=2, sep=",",
header=0,test_data="", feature_engg="", category_encoders="")
train_df_1 = train

features_2, train = featurewiz(train_df_2, target, corr_limit=0.7, verbose=2, sep=",",
header=0,test_data="", feature_engg="", category_encoders="")
train_df_2 = train

features_3, train = featurewiz(train_df_3, target, corr_limit=0.7, verbose=2, sep=",",
header=0,test_data="", feature_engg="", category_encoders="")
train_df_3 = train

features_4, train = featurewiz(train_df_4, target, corr_limit=0.7, verbose=2, sep=",",
header=0,test_data="", feature_engg="", category_encoders="")
train_df_4 = train

features4, train = featurewiz(train_df_5, target, corr_limit=0.9, verbose=0, sep=",",
header=0,test_data="", feature_engg="", category_encoders="")
train_df_5 = train

# Model Building

In [223]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report, f1_score

#### Model 1

In [224]:
# Extract the features and target variable
X_1 = train_df_1.drop('Activity', axis=1)
y_1 = train_df_1['Activity']

# Split the data into training and validation sets
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(X_1, y_1, test_size=0.2, random_state=42)

# Create a scaler object and fit it on the training data
scaler_1 = StandardScaler()
scaler_1.fit(X_train_1)

# Transform the training and validation data using the scaler
X_train_scaled_1 = scaler_1.transform(X_train_1)
X_val_scaled_1 = scaler_1.transform(X_val_1)

In [226]:
# Define models to fit and evaluate
models = {
    'Extreme Gradient Boosting': XGBClassifier(),
    'Random Forest' : RandomForestClassifier(n_estimators=100),
    'Support Vector Machine (Linear)': SVC(),
    'AdaBoost Classifier': AdaBoostClassifier(),
}

# Fit and evaluate each model
for name, model in models.items():
    # Fit the model
    model.fit(X_train_scaled_1, y_train_1)
    
    # Predict on test set
    y_pred_1 = model.predict(X_val_scaled_1)
    
    # Print results
    print(f'{name}:')
    acc = accuracy_score(y_val_1, y_pred_1)
    f1 = f1_score(y_val_1, y_pred_1, average='weighted')
    report = classification_report(y_val_1, y_pred_1)
    
    print("Accuracy:", acc)
    print('F1 Score:', f1)
    print('Classification Report:', report)


Extreme Gradient Boosting:
Accuracy: 0.7710371819960861
F1 Score: 0.7715838846954306
Classification Report:               precision    recall  f1-score   support

           0       0.81      0.78      0.79       578
           1       0.73      0.76      0.74       444

    accuracy                           0.77      1022
   macro avg       0.77      0.77      0.77      1022
weighted avg       0.77      0.77      0.77      1022

Random Forest:
Accuracy: 0.7984344422700587
F1 Score: 0.7978262512630833
Classification Report:               precision    recall  f1-score   support

           0       0.81      0.84      0.82       578
           1       0.78      0.75      0.76       444

    accuracy                           0.80      1022
   macro avg       0.80      0.79      0.79      1022
weighted avg       0.80      0.80      0.80      1022

Support Vector Machine (Linear):
Accuracy: 0.7925636007827789
F1 Score: 0.7924519366265456
Classification Report:               precision    r

#### Model 2

In [227]:
# Extract the features and target variable
X_2 = train_df_2.drop('Activity', axis=1)
y_2 = train_df_2['Activity']

# Split the data into training and validation sets
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(X_2, y_2, test_size=0.2, random_state=42)

# Create a scaler object and fit it on the training data
scaler_2 = StandardScaler()
scaler_2.fit(X_train_2)

# Transform the training and validation data using the scaler
X_train_scaled_2 = scaler_2.transform(X_train_2)
X_val_scaled_2 = scaler_2.transform(X_val_2)

In [230]:
# Define models to fit and evaluate
models = {
    'Logistic Regression' : LogisticRegression(),
    'Random Forest' : RandomForestClassifier(n_estimators=200, max_depth=40),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'Extreme Gradient Boosting': XGBClassifier()
}

# Fit and evaluate each model
for name, model in models.items():
    # Fit the model
    model.fit(X_train_scaled_2, y_train_2)
    
    # Predict on test set
    y_pred_2 = model.predict(X_val_scaled_2)
    
    # Print results
    print(f'{name}:')
    acc = accuracy_score(y_val_2, y_pred_2)
    f1 = f1_score(y_val_2, y_pred_2, average='weighted')
    report = classification_report(y_val_2, y_pred_2)
    
    print("Accuracy:", acc)
    print('F1 Score:', f1)
    print('Classification Report:', report)


Logistic Regression:
Accuracy: 0.7481751824817519
F1 Score: 0.7477746182284769
Classification Report:               precision    recall  f1-score   support

           0       0.77      0.79      0.78       155
           1       0.72      0.70      0.71       119

    accuracy                           0.75       274
   macro avg       0.74      0.74      0.74       274
weighted avg       0.75      0.75      0.75       274

Random Forest:
Accuracy: 0.8357664233576643
F1 Score: 0.8358442581220205
Classification Report:               precision    recall  f1-score   support

           0       0.86      0.85      0.85       155
           1       0.81      0.82      0.81       119

    accuracy                           0.84       274
   macro avg       0.83      0.83      0.83       274
weighted avg       0.84      0.84      0.84       274

K-Nearest Neighbors:
Accuracy: 0.781021897810219
F1 Score: 0.7819260866706121
Classification Report:               precision    recall  f1-score   s

#### Model 3

In [229]:
# Extract the features and target variable
X_3 = train_df_3.drop('Activity', axis=1)
y_3 = train_df_3['Activity']

# Split the data into training and validation sets
X_train_3, X_val_3, y_train_3, y_val_3 = train_test_split(X_3, y_3, test_size=0.2, random_state=42)

# Create a scaler object and fit it on the training data
scaler_3 = StandardScaler()
scaler_3.fit(X_train_3)

# Transform the training and validation data using the scaler
X_train_scaled_3 = scaler_3.transform(X_train_3)
X_val_scaled_3 = scaler_3.transform(X_val_3)

In [231]:
# Define models to fit and evaluate
models = {
    'Logistic Regression' : LogisticRegression(),
    'Decision Tree' : DecisionTreeClassifier(),
    'Random Forest' : RandomForestClassifier(n_estimators=200),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'Extreme Gradient Boosting': XGBClassifier()
}

# Fit and evaluate each model
for name, model in models.items():
    # Fit the model
    model.fit(X_train_scaled_3, y_train_3)
    
    # Predict on test set
    y_pred_3 = model.predict(X_val_scaled_3)
    
    # Print results
    print(f'{name}:')
    acc = accuracy_score(y_val_3, y_pred_3)
    f1 = f1_score(y_val_3, y_pred_3, average='weighted')
    report = classification_report(y_val_3, y_pred_3)
    
    print("Accuracy:", acc)
    print('F1 Score:', f1)
    print('Classification Report:', report)


Logistic Regression:
Accuracy: 0.7736486486486487
F1 Score: 0.7719664471089791
Classification Report:               precision    recall  f1-score   support

           0       0.75      0.69      0.72       125
           1       0.79      0.84      0.81       171

    accuracy                           0.77       296
   macro avg       0.77      0.76      0.76       296
weighted avg       0.77      0.77      0.77       296

Decision Tree:
Accuracy: 0.6959459459459459
F1 Score: 0.6943951625769808
Classification Report:               precision    recall  f1-score   support

           0       0.65      0.61      0.63       125
           1       0.73      0.76      0.74       171

    accuracy                           0.70       296
   macro avg       0.69      0.68      0.69       296
weighted avg       0.69      0.70      0.69       296

Random Forest:
Accuracy: 0.8243243243243243
F1 Score: 0.8222440317412385
Classification Report:               precision    recall  f1-score   suppor

#### Model 4

In [233]:
# Extract the features and target variable
X_4 = train_df_4.drop('Activity', axis=1)
y_4 = train_df_4['Activity']


# Split the data into training and validation sets
X_train_4, X_val_4, y_train_4, y_val_4 = train_test_split(X_4, y_4, test_size=0.2, random_state=42)

# Create a scaler object and fit it on the training data
scaler_4 = StandardScaler()
scaler_4.fit(X_train_4)

# Transform the training and validation data using the scaler
X_train_scaled_4 = scaler_4.transform(X_train_4)
X_val_scaled_4 = scaler_4.transform(X_val_4)

In [234]:
# Define models to fit and evaluate
models = {
    'Logistic Regression' : LogisticRegression(),
    'Decision Tree' : DecisionTreeClassifier(),
    'Random Forest' : RandomForestClassifier(n_estimators=200),
    'Perceptron': Perceptron(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'Extreme Gradient Boosting': XGBClassifier()
}

# Fit and evaluate each model
for name, model in models.items():
    # Fit the model
    model.fit(X_train_scaled_4, y_train_4)
    
    # Predict on test set
    y_pred_4 = model.predict(X_val_scaled_4)
    
    # Print results
    print(f'{name}:')
    acc = accuracy_score(y_val_4, y_pred_4)
    f1 = f1_score(y_val_4, y_pred_4, average='weighted')
    report = classification_report(y_val_4, y_pred_4)
    
    print("Accuracy:", acc)
    print('F1 Score:', f1)
    print('Classification Report:', report)


Logistic Regression:
Accuracy: 0.793918918918919
F1 Score: 0.7944105302719371
Classification Report:               precision    recall  f1-score   support

           0       0.83      0.81      0.82       171
           1       0.75      0.78      0.76       125

    accuracy                           0.79       296
   macro avg       0.79      0.79      0.79       296
weighted avg       0.80      0.79      0.79       296

Decision Tree:
Accuracy: 0.7195945945945946
F1 Score: 0.7148484783458979
Classification Report:               precision    recall  f1-score   support

           0       0.73      0.82      0.77       171
           1       0.70      0.58      0.64       125

    accuracy                           0.72       296
   macro avg       0.72      0.70      0.70       296
weighted avg       0.72      0.72      0.71       296

Random Forest:
Accuracy: 0.8040540540540541
F1 Score: 0.8000743017789297
Classification Report:               precision    recall  f1-score   support

#### Model 5

In [235]:
# Extract the features and target variable
X_5 = train_df_5.drop('Activity', axis=1)
y_5 = train_df_5['Activity']


# # Split the data into training and validation sets
X_train_5, X_val_5, y_train_5, y_val_5 = train_test_split(X_5, y_5, test_size=0.2, random_state=42)

# Create a scaler object and fit it on the training data
scaler_5 = StandardScaler()
scaler_5.fit(X_train_5)

# Transform the training and validation data using the scaler
X_train_scaled_5 = scaler_5.transform(X_train_5)
X_val_scaled_5 = scaler_5.transform(X_val_5)

In [237]:
# Define models to fit and evaluate
models = {
    'Logistic Regression' : LogisticRegression(),
    'Decision Tree' : DecisionTreeClassifier(),
    'Random Forest' : RandomForestClassifier(n_estimators=200),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'Extreme Gradient Boosting': XGBClassifier()
}

# Fit and evaluate each model
for name, model in models.items():
    # Fit the model
    model.fit(X_train_scaled_5, y_train_5)
    
    # Predict on test set
    y_pred_5 = model.predict(X_val_scaled_5)
    
    # Print results
    print(f'{name}:')
    acc = accuracy_score(y_val_5, y_pred_5)
    f1 = f1_score(y_val_5, y_pred_5, average='weighted')
    report = classification_report(y_val_5, y_pred_5)
    
    print("Accuracy:", acc)
    print('F1 Score:', f1)
    print('Classification Report:', report)


Logistic Regression:
Accuracy: 0.7647058823529411
F1 Score: 0.7651186790505676
Classification Report:               precision    recall  f1-score   support

           0       0.80      0.78      0.79       154
           1       0.72      0.75      0.73       118

    accuracy                           0.76       272
   macro avg       0.76      0.76      0.76       272
weighted avg       0.77      0.76      0.77       272

Decision Tree:
Accuracy: 0.7426470588235294
F1 Score: 0.7409998395751688
Classification Report:               precision    recall  f1-score   support

           0       0.76      0.81      0.78       154
           1       0.72      0.66      0.69       118

    accuracy                           0.74       272
   macro avg       0.74      0.73      0.74       272
weighted avg       0.74      0.74      0.74       272

Random Forest:
Accuracy: 0.8161764705882353
F1 Score: 0.8157701409476258
Classification Report:               precision    recall  f1-score   suppor

# Model Evaluation

### Final Train

In [238]:
model_1 = RandomForestClassifier(n_estimators=100)
model_1.fit(X_1, y_1)

model_2 = RandomForestClassifier(n_estimators=100)
model_2.fit(X_2, y_2)

model_3 = RandomForestClassifier(n_estimators=100)
model_3.fit(X_3, y_3)

model_4 = XGBClassifier()
model_4.fit(X_4, y_4)

model_5 = XGBClassifier()
model_5.fit(X_5, y_5)

XGBClassifier()

# Submission

In [239]:
X_test_1 = test_df_1.drop('Unnamed: 0', axis=1)
X_test_2 = test_df_2.drop('Unnamed: 0', axis=1)
X_test_3 = test_df_3.drop('Unnamed: 0', axis=1)
X_test_4 = test_df_4.drop('Unnamed: 0', axis=1)
X_test_5 = test_df_5.drop('Unnamed: 0', axis=1)

In [240]:
X_test_1.drop('CHEMBL_ID', axis=1, inplace=True)
X_test_2.drop('CHEMBL_ID', axis=1, inplace=True)
X_test_3.drop('CHEMBL_ID', axis=1, inplace=True)
X_test_4.drop('CHEMBL_ID', axis=1, inplace=True)
X_test_5.drop('CHEMBL_ID', axis=1, inplace=True)

In [241]:
X_test_1 = X_test_1.fillna(0)
X_test_2 = X_test_2.fillna(0)
X_test_3 = X_test_3.fillna(0)
X_test_4 = X_test_4.fillna(0)
X_test_5 = X_test_5.fillna(0)

In [206]:
X_test_1 = X_test_1[features_1]
X_test_2 = X_test_2[features_2]
X_test_3 = X_test_3[features_3]
X_test_4 = X_test_4[features_4]
X_test_5 = X_test_5[features4]

In [242]:
y_test_pred_1 = model_1.predict(X_test_1)
y_test_pred_2 = model_2.predict(X_test_2)
y_test_pred_3 = model_3.predict(X_test_3)
y_test_pred_4 = model_4.predict(X_test_4)
y_test_pred_5 = model_5.predict(X_test_5)

In [243]:
Sample = pd.read_csv('./sample3.csv')
Sample.head()

Unnamed: 0,Id,Activity
0,CHEMBL2064389_1,active
1,CHEMBL3775897_1,active
2,CHEMBL4084868_1,active
3,CHEMBL2325098_1,active
4,CHEMBL2087358_1,active


In [208]:
submission_df_1 = pd.DataFrame(y_test_pred_1, columns=['Activity'])
submission_df_2 = pd.DataFrame(y_test_pred_2, columns=['Activity'])
submission_df_3 = pd.DataFrame(y_test_pred_3, columns=['Activity'])
submission_df_4 = pd.DataFrame(y_test_pred_4, columns=['Activity'])
submission_df_5 = pd.DataFrame(y_test_pred_5, columns=['Activity'])

In [244]:
submission_df_1['Activity'] = submission_df_1['Activity'].map({1 : 'active' , 0 : 'nonactive'})
submission_df_2['Activity'] = submission_df_2['Activity'].map({1 : 'active' , 0 : 'nonactive'})
submission_df_3['Activity'] = submission_df_3['Activity'].map({1 : 'active' , 0 : 'nonactive'})
submission_df_4['Activity'] = submission_df_4['Activity'].map({1 : 'active' , 0 : 'nonactive'})
submission_df_5['Activity'] = submission_df_5['Activity'].map({1 : 'active' , 0 : 'nonactive'})

In [245]:
submission_df_1 = pd.concat([test_df_1['CHEMBL_ID'] ,submission_df_1['Activity'] ], axis=1)
submission_df_2 = pd.concat([test_df_2['CHEMBL_ID'] ,submission_df_2['Activity'] ], axis=1)
submission_df_3 = pd.concat([test_df_3['CHEMBL_ID'] ,submission_df_3['Activity'] ], axis=1)
submission_df_4 = pd.concat([test_df_4['CHEMBL_ID'] ,submission_df_4['Activity'] ], axis=1)
submission_df_5 = pd.concat([test_df_5['CHEMBL_ID'] ,submission_df_5['Activity'] ], axis=1)

In [246]:
submission_df = pd.concat([submission_df_1, submission_df_2, submission_df_3, submission_df_4, submission_df_5])

In [247]:
submission_df = submission_df.reset_index()[['CHEMBL_ID','Activity']].rename(columns={"CHEMBL_ID": "Id"})
submission_df

Unnamed: 0,Id,Activity
0,CHEMBL2064389_1,
1,CHEMBL3775897_1,
2,CHEMBL4084868_1,
3,CHEMBL2325098_1,
4,CHEMBL2087358_1,
...,...,...
5313,CHEMBL100553_5,
5314,CHEMBL2204065_5,
5315,CHEMBL1076476_5,
5316,CHEMBL380889_5,


In [248]:
submission_df.to_csv('submission.csv', index=False)

In [249]:
df = pd.read_csv('./submission.csv')
df.describe()

Unnamed: 0,Activity
count,0.0
mean,
std,
min,
25%,
50%,
75%,
max,
