# Prediction of Breast Cancer peptides with the best MLP classifier

In [2]:
# Notebook setup: autoreload mode 2 re-imports edited modules before each cell
# is executed; matplotlib figures are rendered inline in the notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Silence DeprecationWarning messages only; other warning categories still show
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [3]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix,accuracy_score, roc_auc_score,f1_score, recall_score, precision_score
from sklearn.utils import class_weight

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.svm import LinearSVC

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import RFECV, VarianceThreshold, SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel, SelectPercentile, f_classif

import seaborn as sns; sns.set() # data visualization library 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from imblearn.over_sampling import SMOTE

In [4]:
from myFunctions import *

# Name of the output (class label) column
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed
outVar = 'Class'

# Number of cross-validation folds (a count, not a list)
nfold = 3

# Label for output files
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed
label = 'Outer'

# Random seed passed to the SMOTE balancing helper, the CV splitter and the MLP classifier
seed = 74

## Individual ML and preprocessing

In [5]:
# Training dataset for the best model (per the original note, the pipeline
# below will generate "Mix-Best300" from it)
sFile = './best_classifier/Mix_BreastCancer.csv'

print('\n-> Read dataset', sFile)
df = pd.read_csv(filepath_or_buffer=sFile)
# Number of columns as loaded
print(df.shape[1])


-> Read dataset ./best_classifier/Mix_BreastCancer.csv


FileNotFoundError: [Errno 2] File ./best_classifier/Mix_BreastCancer.csv does not exist: './best_classifier/Mix_BreastCancer.csv'

### Preprocessing

In [61]:
# Step 1: strip the extra columns (ProtID is still kept at this point)
df = ClearDatasets(df)
print(df.shape[1])

# Step 2: remove the ProtID identifier so that only descriptors + Class
# remain (raw dataset). Re-running this cell on the same frame fails because
# the column is already gone.
print('\n-> Drop ProtID column')
df = df.drop(columns=['ProtID'])
print('Done!')
print(df.shape[1])

# Step 3: print the dataset report (project helper)
DataCheckings(df)

# Step 4: project preprocessing helper
df = DataPreprocessing(df)
print(df.shape[1])

# Step 5: discard the columns that have zero variance
df = Remove0VarCols(df)


-> Modify dataset
Done!
8742

-> Drop ProtID column
Done!
8741

-> Checking dataset

Data points = 376

Columns (output + features)= 8741

Data types = [dtype('float64') dtype('int64')]


Column Names:
 Index(['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
       ...
       'DAYM780201.lag22', 'DAYM780201.lag23', 'DAYM780201.lag24',
       'DAYM780201.lag25', 'DAYM780201.lag26', 'DAYM780201.lag27',
       'DAYM780201.lag28', 'DAYM780201.lag29', 'DAYM780201.lag30', 'Class'],
      dtype='object', length=8741)

Categorical features: []

Columns with NaN:  0  /  8741

No of data points with NaN: 0  /  376
Done!

-> Dataset preprocessing
Inicial shape: (376, 8741)
Data points = 376
Columns (output + features)= 8741
Data types = [dtype('float64') dtype('int64')]


Column Names:
 Index(['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
       ...
       'DAYM780201.lag22', 'DAYM780201.lag23', 'DAYM780201.lag24',
       'DAYM780201.lag25', 'DAYM780201.lag26', 'DAYM780201.lag27',
       'D

In [62]:
# Dataset dimensions after cleaning, preprocessing and zero-variance column removal
df.shape

(376, 8709)

In [63]:
# Persist the preprocessed, still unscaled dataset next to the input file
# (sFile[:-4] strips the 4-character '.csv' extension)
rawFile = '{}.ds_raw.csv'.format(sFile[:-4])
print('\n-> Save raw dataset:',rawFile)
df.to_csv(path_or_buf=rawFile, index=False)
print('Done!')


-> Save raw dataset: ./best_classifier/Mix_BreastCancer.ds_raw.csv
Done!


In [64]:
# Screening (prediction) set, described with the same Mix descriptors as the
# training data. Alternative inputs: Screening_1_Metastasis.csv,
# Screening_2_Cancer_Immunotherapy_Genes.csv
sFilep = './best_classifier/Screening_3_RBPs.csv'

print('\n-> Read dataset', sFilep)
dfp = pd.read_csv(filepath_or_buffer=sFilep)
# Number of columns as loaded
print(dfp.shape[1])


-> Read dataset ./best_classifier/Screening_3_RBPs.csv
8741


In [65]:
# Preprocessing of prediction file
# The cleaning steps applied to the training set (ClearDatasets, dropping the
# ProtID column) are skipped here; they were previously left in this cell as
# commented-out code operating on df. The screening file is used as loaded.

# Check dataset. print() additionally echoes whatever DataCheckings returns
# (a trailing 0 in the recorded output).
# NOTE(review): the meaning of that return value is not visible here — confirm
# whether printing it is intended.
print(DataCheckings(dfp))


-> Checking dataset

Data points = 1369

Columns (output + features)= 8741

Data types = [dtype('float64') dtype('int64')]


Column Names:
 Index(['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
       ...
       'Pc2.Hydrophilicity.26', 'Pc2.Hydrophobicity.27',
       'Pc2.Hydrophilicity.27', 'Pc2.Hydrophobicity.28',
       'Pc2.Hydrophilicity.28', 'Pc2.Hydrophobicity.29',
       'Pc2.Hydrophilicity.29', 'Pc2.Hydrophobicity.30',
       'Pc2.Hydrophilicity.30', 'Class'],
      dtype='object', length=8741)

Categorical features: []

Columns with NaN:  0  /  8741

No of data points with NaN: 0  /  1369
Done!
0


In [66]:
# Restrict the prediction set to exactly the columns (and column order) that
# survived preprocessing of the training set; a missing column raises KeyError
training_columns = df.columns.tolist()
dfp = dfp.loc[:, training_columns]
dfp.shape

(1369, 8709)

In [67]:
# Split both frames into feature matrix, class vector and feature names
# (per the original note, the output variable is Class)
Xdata, Ydata, Features = getDataFromDataFrame(df)
Xdatap, Ydatap, Featuresp = getDataFromDataFrame(dfp)

# Min-max scaling: the scaler is fitted on the training data only and then
# applied unchanged to the prediction data.
# NOTE(review): the scaler is fitted on the whole training set before the
# cross-validation split performed later, so CV scores may be slightly optimistic.
scaler = MinMaxScaler()
Xdata = scaler.fit(Xdata).transform(Xdata)
Xdatap = scaler.transform(Xdatap)

# Rebuild the data frames from the scaled matrices and re-attach the class column
df = pd.DataFrame(Xdata, columns=Features)
df['Class'] = Ydata

dfp = pd.DataFrame(Xdatap, columns=Featuresp)
dfp['Class'] = Ydatap

# Persist the fitted scaler. The '_Std' / '.ds_std' file names are kept as they
# are even though the scaler is a MinMaxScaler.
scalerFile = '{}.scaler_Std.pkl'.format(sFile[:-4])
print('* Save scaler:', scalerFile)
joblib.dump(scaler, scalerFile)

# Scaled training dataset
scaledFile = '{}.ds_std.csv'.format(sFile[:-4])
print('* Save scaled dataset:', scaledFile)
df.to_csv(scaledFile, index=False)

# Scaled prediction dataset
scaledFilep = '{}.ds_std.csv'.format(sFilep[:-4])
print('* Save scaled dataset:', scaledFilep)
dfp.to_csv(scaledFilep, index=False)

print('Done!')


-> Get X & Y data, Features list
Shape (376, 8709)
Shape X data: (376, 8708)
Shape Y data: (376,)
Done!

-> Get X & Y data, Features list
Shape (1369, 8709)
Shape X data: (1369, 8708)
Shape Y data: (1369,)
Done!
* Save scaler: ./best_classifier/Mix_BreastCancer.scaler_Std.pkl
* Save scaled dataset: ./best_classifier/Mix_BreastCancer.ds_std.csv
* Save scaled dataset: ./best_classifier/Screening_3_RBPs.ds_std.csv
Done!


In [68]:
# Number of features to keep
nFeats = 300

# Feature matrix / class vector of the scaled training set (output variable: Class)
Xdata, Ydata, Features = getDataFromDataFrame(df)

print('\n-> Univariate Feature selection')
# Keep the nFeats features with the highest chi-squared score against the class.
# NOTE(review): the selector is fitted on the whole training set before the
# cross-validation split performed later, so CV scores may be optimistic.
selector = SelectKBest(score_func=chi2, k=nFeats)
Xdata = selector.fit_transform(Xdata, Ydata)

# Persist the fitted selector
selectorFile = '{}.featSelector_Univariate{}.pkl'.format(sFile[:-4], nFeats)
print('* Save selector:', selectorFile)
joblib.dump(selector, selectorFile)

# Names of the retained features, in the order reported by the selector
SelFeatures = [Features[i] for i in selector.get_support(indices=True)]

# Training frame restricted to the selected features, plus the class column
df = pd.DataFrame(Xdata, columns=SelFeatures)
df['Class'] = Ydata
print('Final columns:', list(df.columns))

# Save the selected-features training dataset
selectFile = '{}.ds_sel.csv'.format(sFile[:-4])
print('* Save selected features dataset:', selectFile)
df.to_csv(selectFile, index=False)

# Same column subset (selected features + Class) for the prediction frame
dfp = dfp.loc[:, list(df.columns)]

# Save the selected-features prediction dataset
selectFilep = '{}.ds_sel.csv'.format(sFilep[:-4])
print('* Save selected features dataset:', selectFilep)
dfp.to_csv(selectFilep, index=False)

print('Done!')


-> Get X & Y data, Features list
Shape (376, 8709)
Shape X data: (376, 8708)
Shape Y data: (376,)
Done!

-> Univariate Feature selection
* Save selector: ./best_classifier/Mix_BreastCancer.featSelector_Univariate300.pkl
Final columns: ['MN', 'LG', 'QI', 'NK', 'EM', 'QM', 'MM', 'EY', 'FAA', 'FNA', 'PNA', 'MDA', 'YHA', 'YKA', 'WFA', 'GPA', 'NTA', 'EYA', 'PAR', 'QDR', 'KER', 'SQR', 'QGR', 'LLR', 'HKR', 'TKR', 'TMR', 'YMR', 'MFR', 'EAN', 'HAN', 'MRN', 'SNN', 'EDN', 'QCN', 'QQN', 'GQN', 'PGN', 'IHN', 'NKN', 'HKN', 'LKN', 'AMN', 'TMN', 'VMN', 'MPN', 'PSN', 'YTN', 'KWN', 'PWN', 'EYN', 'PYN', 'LVN', 'PVN', 'SVN', 'VAD', 'HRD', 'IND', 'PDD', 'IQD', 'NHD', 'YHD', 'NID', 'HFD', 'ITD', 'RYD', 'IYD', 'QRC', 'DNC', 'SNC', 'MDC', 'AQC', 'CGC', 'MGC', 'VHC', 'CKC', 'IKC', 'SKC', 'MMC', 'PFC', 'MPC', 'MVC', 'FVC', 'FDE', 'YDE', 'SQE', 'TQE', 'RHE', 'MHE', 'HIE', 'FKE', 'EME', 'QME', 'LME', 'MME', 'VME', 'SFE', 'DAQ', 'TNQ', 'IDQ', 'DCQ', 'KCQ', 'GLQ', 'FKQ', 'AMQ', 'CMQ', 'VPQ', 'PSQ', 'IWQ', 'YWQ', '

In [69]:
# Training frame after feature selection: selected features + Class column
df.shape

(376, 301)

In [70]:
# Prediction frame after feature selection: must have the same number of columns as df
dfp.shape

(1369, 301)

In [71]:
# Balancing dataframe using SMOTE
# SMOTEdf is a project helper; in the recorded run it grew the data from 376 to
# 466 rows and wrote the balanced dataset to disk.
# NOTE(review): oversampling happens before the cross-validation split performed
# later, so synthetic points derived from a sample can land in a different fold
# than that sample and the CV metrics may be optimistic. Consider resampling
# inside each training fold instead.
df = SMOTEdf(df,sFile,seed)


-> Get X & Y data, Features list
Shape (376, 301)
Shape X data: (376, 300)
Shape Y data: (376,)
Done!

-> Dataframe SMOTE balancing
Initial dimensions: (376, 301)
Final shape: (466, 301)
* Save balanced dataset: ./best_classifier/Mix_BreastCancer.ds_bal.csv
Done!


In [44]:
# Dimensions of the training frame after SMOTE balancing
df.shape

(466, 301)

### ML

In [72]:
# get ds for ML
# Split the balanced training frame into feature matrix, class vector and feature names
Xdata, Ydata, Features = getDataFromDataFrame(df)# out var = Class 


-> Get X & Y data, Features list
Shape (466, 301)
Shape X data: (466, 300)
Shape Y data: (466,)
Done!


In [73]:
# Calculate class weights
# set_weights is a project helper; after balancing, the recorded run reports
# equal weights for both classes. The result is only printed here — it is not
# passed to the classifier in the cells visible in this notebook.
class_weights = set_weights(Ydata)
print("Class weights = ", class_weights)

Class weights =  {0: 1.0, 1: 1.0}


In [74]:
# Stratified, shuffled outer cross-validation. The fold count comes from the
# notebook-level `nfold` setting (instead of a second hard-coded 3) and the
# shuffle is seeded so the folds are reproducible.
outer_cv = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)

In [75]:
# Outer cross-validation: train one MLP per fold, save it to disk, and record
# its AUROC and accuracy on the held-out part of the fold.
ACCs = []               # accuracy per fold
AUROCs = []             # area under the ROC curve per fold
models = []             # fitted classifier per fold
SelectedFeatures = []   # kept for compatibility; not filled in this cell

ifold = 0  # stays 0 if the splitter yields no folds
for ifold, (train_index, test_index) in enumerate(outer_cv.split(Xdata, Ydata), start=1):
    print("Fold =",ifold)
    start = time.time()

    X_train, X_test = Xdata[train_index], Xdata[test_index]
    y_train, y_test = Ydata[train_index], Ydata[test_index]

    # One hidden layer of 20 units. The trailing comma matters: (20) is just
    # the integer 20, whereas (20,) is the one-element tuple that describes
    # the layer sizes explicitly.
    clf = MLPClassifier(hidden_layer_sizes=(20,),
                        random_state=seed,
                        max_iter=50000, shuffle=False)
    clf.fit(X_train, y_train)

    # Persist this fold's model; the file name carries the 1-based fold number
    joblib.dump(clf, './best_classifier/MLP_model'+str(ifold)+'.pkl', compress = 1)
    models.append(clf)

    # AUROC is computed from the second probability column (class 1)
    y_pred = clf.predict_proba(X_test)
    AUROC = roc_auc_score(y_test, y_pred[:, 1])
    AUROCs.append(AUROC)

    # score() of a classifier is its mean accuracy on the given data
    ACC = clf.score(X_test,y_test)
    ACCs.append(ACC)

    # Elapsed time covers fitting, saving and scoring of this fold
    print("AUROC=",AUROC,"ACC=",ACC, (time.time() - start)/60,"mins")

Fold = 1
AUROC= 0.9837278106508875 ACC= 0.9358974358974359 0.013615576426188152 mins
Fold = 2
AUROC= 0.9807692307692308 ACC= 0.9423076923076923 0.017777299880981444 mins
Fold = 3
AUROC= 0.9748692865575983 ACC= 0.9285714285714286 0.028612228234608968 mins


In [76]:
# Mean and standard deviation of the per-fold AUROC values
print(np.mean(AUROCs),np.std(AUROCs))

0.9797887759925722 0.0036823299648140366


In [77]:
# Mean and standard deviation of the per-fold accuracy values
print(np.mean(ACCs),np.std(ACCs))

0.9355921855921855 0.005611958580845595


In [78]:
# Sanity check: the prediction frame still has the selected features + Class
dfp.shape

(1369, 301)

In [79]:
# get ds for ML
# Split the prediction frame into feature matrix, class vector and feature names.
# Ydatap holds whatever the Class column of the screening file contains at this
# point; it is replaced by the model's predictions further down.
Xdatap, Ydatap, Featuresp = getDataFromDataFrame(dfp)# out var = Class


-> Get X & Y data, Features list
Shape (1369, 301)
Shape X data: (1369, 300)
Shape Y data: (1369,)
Done!


In [80]:
# Feature matrix of the prediction set (Class column removed)
Xdatap.shape

(1369, 300)

In [81]:
# Reload the classifier trained on fold 2 (AUROC 0.9808, ACC 0.9423 in the
# recorded run — the highest accuracy of the three folds)
clf = joblib.load('./best_classifier/MLP_model{}.pkl'.format(2))

# Hard class labels and class probabilities for the prediction set;
# predict_proba gives n_samples x n_classes (class 0, class 1)
Ydatap = clf.predict(Xdatap)
Ydatapprob = clf.predict_proba(Xdatap)

# Collect descriptors, predicted class and both class probabilities in one frame
dffp = pd.DataFrame(Xdatap, columns=Featuresp).assign(
    Class=Ydatap,
    Prob0=Ydatapprob[:, 0],
    Prob1=Ydatapprob[:, 1],
)

In [82]:
# Attach the protein information columns to the predictions.
# concat with axis=1 aligns rows on the index: dffp carries the default integer
# index and the AC file is presumably read with one too, so the AC file must
# list the proteins in the same order as the screening set — confirm.
# Counterpart files: AC.Screening_1_Metastasis.csv, AC.Screening_2_Cancer_Immunotherapy_Genes
proteinInfo = pd.read_csv('./best_classifier/AC.Screening_3_RBPs.csv')
result = pd.concat([dffp, proteinInfo], axis=1)

In [83]:
# Create the final column order: prediction columns first, then V1 and V2
# (presumably the annotation columns from the AC file — confirm)
newHeader = ['Class', 'Prob1', 'Prob0', 'V1', 'V2']
# Keep only those columns and put the highest class-1 probabilities on top
result = result[newHeader].sort_values(by='Prob1', ascending=False)
# index=True writes the pre-sort row labels as the first CSV column
result.to_csv('{}_predictions.csv'.format(sFilep[:-4]), index=True)

Have fun with ML! @muntisa