In [1]:
#Import the necessary libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from collections import Counter
from utilities import *
from train_utils import *
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score,average_precision_score,f1_score,recall_score, precision_score, balanced_accuracy_score

In [2]:
# Read the train and test TSV files (no header row) and transpose each one
# to get the features as columns.
train, test = (
    pd.read_csv(tsv_path, sep='\t', header=None).T
    for tsv_path in ('homo_sapien-tox21_train.tsv', 'homo_sapien-tox21_test.tsv')
)

In [3]:
# Pre-process the train and test feature frames, printing the shape of each result
X_traindata = pre_proccess_df(df=train)
print(X_traindata.shape)

X_testdata = pre_proccess_df(df=test)
print(X_testdata.shape)

(7808, 5135)
(628, 5135)


In [4]:
# Read the train and test label tables (the first CSV row is used as the header)
# and print the top-left entry of each one as a quick look at the content.
trainlabels = pd.read_csv('tox21_train_label.csv', header=0)
print(trainlabels.values[0, 0])

testlabels = pd.read_csv('tox21_test_label.csv', header=0)
print(testlabels.values[0, 0])

Br
BrCC(Br)c1ccccc1


In [5]:
# Pre-process the train and test label tables
train_labels = pre_proccess_labels(trainlabels, train=True)
# NOTE(review): this prints the shape of the frame that was passed in (`trainlabels`),
# not of the returned `train_labels` - confirm which one is intended.
print(trainlabels.shape)

test_labels = pre_proccess_labels(testlabels, train=False)
# NOTE(review): likewise, `testlabels` is the input frame, not the returned `test_labels`.
print(testlabels.shape)

(7808, 12)
(628, 12)


In [6]:
# Build the train/test features and labels for the selected assay
my_assay = 'nr-ahr'
X_train, y_train, X_test, y_test = pre_proccess_dfassay(
    my_assay, X_traindata, train_labels, X_testdata, test_labels
)

# Report the shape of every returned piece, in the order train data, train labels,
# test data, test labels.
for caption, part in (
    ("Shape of train data for the assay", X_train),
    ("Shape of train labels for the assay", y_train),
    ("Shape of test data for the assay", X_test),
    ("Shape of test labels for the assay", y_test),
):
    print(caption, part.shape)

Shape of train data for the assay (6511, 5135)
Shape of train labels for the assay (6511,)
Shape of test data for the assay (592, 5135)
Shape of test labels for the assay (592,)


In [7]:
# Summarize class distribution using the imbalance ratio (IR):
# the number of samples with label 0 divided by the number with label 1.
counter_train = Counter(y_train)
counter_test = Counter(y_test)
for split_name, counts in (("train", counter_train), ("test", counter_test)):
    # A Counter returns 0 for a label it never saw, so an assay split without any
    # label-1 sample would otherwise raise ZeroDivisionError; report inf instead.
    ratio = counts[0] / counts[1] if counts[1] else float("inf")
    # Name the split in the message so the two printed lines can be told apart.
    print("Imbalanced ratio for the assay ({} set) {:0.2f}".format(split_name, ratio))

Imbalanced ratio for the assay 7.87
Imbalanced ratio for the assay 7.34


In [8]:
# Generate the resampled data for the assay.
# `datasets` holds one transform() result per sampler; only SMOTEENN is used here.
datasets = [transform(SMOTEENN(), X_train, y_train)]

Transforming SMOTEENN


In [9]:
# Run benchmark() on every resampled dataset and collect what it returns
# (SMOTEENN + random forest for the single dataset built so far).
separator = '______________________________________________________________'
benchmark_scores = []
for sample_type, X, y in datasets:
    print(separator)
    print('{}'.format(sample_type))
    # Labels are flattened to a 1-D array before being handed to benchmark()
    score_entry = benchmark(sample_type, X, y.values.ravel())
    benchmark_scores.append(score_entry)
    print(separator)

______________________________________________________________
SMOTEENN
Fitting 30 folds for each of 10 candidates, totalling 300 fits
Tuned Random Forest parameters: {'n_estimators': 500, 'max_depth': 47}
Best estimator AUC score: 0.9992020418097709
______________________________________________________________


In [10]:
result = []
# Train the model based on benchmark parameters: for each benchmarked sampling type,
# fit a random forest on the matching resampled dataset and score it on the
# assay's test split. One tuple of metrics is appended to `result` per fit.
for sampling_type, score, param in benchmark_scores:
    print("Training a {}+RF".format(sampling_type))
    rf = RandomForestClassifier(**param)
    for s_type, X, y in datasets:
        if s_type == sampling_type:
            rf.fit(X, y.values.ravel())
            pred_test = rf.predict(X_test)
            # Second predict_proba column is used as the score for the ranking metrics
            pred_test_probs = rf.predict_proba(X_test)[:, 1]
            # Pin the label order so the matrix is always 2x2, even when one class is
            # missing from both y_test and the predictions; without `labels` the
            # matrix would collapse to 1x1 and indexing cm[1][0] would fail.
            cm = confusion_matrix(y_test, pred_test, labels=[0, 1])
            # Row-major flattening of [[TN, FP], [FN, TP]]
            TN, FP, FN, TP = cm.ravel()

            # Tuple order must match the column list used to build the results table
            result.append((sampling_type, TN, FN, TP, FP,
                           f1_score(y_test, pred_test),
                           precision_score(y_test, pred_test),
                           recall_score(y_test, pred_test),
                           roc_auc_score(y_test, pred_test_probs),
                           average_precision_score(y_test, pred_test_probs),
                           balanced_accuracy_score(y_test, pred_test)))

Training a SMOTEENN+RF


In [11]:
# Collect the evaluation tuples into a table with one row per entry of `result`.
# The column order mirrors the layout of the tuples appended to `result`.
# The "TP" column name previously carried a stray leading space (" TP"), which made
# sampling_result["TP"] fail with a KeyError.
sampling_result = pd.DataFrame(
    result,
    columns=['Sampling Type', "TN", "FN", "TP", "FP",
             'F1_score', 'Precision', 'Recall', 'AUCROC', "AUPRC", "Balanced_Accuracy"],
)

In [14]:
# Show the evaluation table as the cell output
sampling_result

Unnamed: 0,Sampling Type,TN,FN,TP,FP,F1_score,Precision,Recall,AUCROC,AUPRC,Balanced_Accuracy
0,SMOTEENN,381,7,64,140,0.465455,0.313725,0.901408,0.898313,0.54667,0.816347


In [12]:
# Get the feature importances by the fitted attribute feature_importances_
# NOTE(review): `rf` is whatever forest was fitted last in the training loop; with a
# single sampling type that is the SMOTEENN+RF model - revisit if more are added.
importances = rf.feature_importances_
# Sort the feature importance in descending order (most important feature appears first)
sorted_indices = np.argsort(importances)[::-1]
# Get the top 100 most important features for the assay to run an enrichment analysis.
# The slice previously ended at 101 and therefore exported 101 features.
top_n = 100
proteins = pd.DataFrame(X_train.columns[sorted_indices][:top_n],
                        columns=['Important proteins(features)'])
# Save the features to a csv file
# NOTE(review): the file name prefix is the last column label of `train`, not the
# assay name held in `my_assay` - confirm this is the intended naming.
proteins.to_csv("{}__proteins.csv".format(train.keys()[-1]))