In [18]:
import numpy as np
import pandas as pd

# Import some helpful packages
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Import different learners
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Import meta classifier
from sklearn.linear_model import LogisticRegression

# Load data, remove rows with NaN.

In [19]:
data = pd.read_csv('BreastCancer.csv')
data = data[np.logical_not(np.isnan(data))]
data = data[~np.isnan(data).any(axis=1)]

* Shuffle the data set. 
* Then assign the features and the classes into different matrices. 
* The first column is an ID number for each observation, therefore it is ignored.
* Replaced 0s in Y with -1s

In [20]:
data = shuffle(data)
X = data.iloc[:,1:-1].values
y = data.iloc[:,-1].values
y[y==0] = -1;

# Scaled the feature set, and split into training (80%) and test set(20%).

In [21]:
y = y[:, None]
X_scaled = X
X_scaled = (X_scaled - X_scaled.mean(axis=0))/X_scaled.std(axis=0)

X_train, X_testMeta, y_train, y_testMeta = train_test_split(X_scaled, y, test_size=0.2)
X_trainBase, X_testBase, y_trainBase, y_testBase = train_test_split(X_train, y_train, test_size=0.25)

# In the first layer there will be three models.
* SVM
* Random Forest
* LDA-QDA

In [22]:
svm_init = svm.SVC()
svm_init.fit(X_trainBase, y_trainBase.ravel())
yPredSVM = svm_init.predict(X_testBase)

In [23]:
randomForest_init = RandomForestClassifier(max_depth=2, random_state=0)
randomForest_init.fit(X_trainBase, y_trainBase.ravel())
yPredRF = randomForest_init.predict(X_testBase)

In [24]:
LDA_init = LinearDiscriminantAnalysis()
LDA_init.fit(X_trainBase, y_trainBase.ravel())
yPredLDA = LDA_init.predict(X_testBase)

In [25]:
yMeta = np.vstack([yPredSVM, yPredRF, yPredLDA])

# I choosed Logistic Regression Classifier as my meta model.

In [26]:
yStackedPredict = LogisticRegression(random_state=0).fit(yMeta.T, y_testBase.ravel())

In [27]:
yPredSVMTest = svm_init.predict(X_testMeta)
yPredRFTest = randomForest_init.predict(X_testMeta)
yPredLDATest = LDA_init.predict(X_testMeta)
yMetaTest = np.vstack([yPredSVMTest, yPredRFTest, yPredLDATest])

yStackedPredictions = yStackedPredict.predict(yMetaTest.T)
stackedAcc = accuracy_score(y_testMeta.ravel(), yStackedPredictions)

# I got 96.4% accuracy with stacking. Now compare this with base model accuracies.

In [28]:
SVMAccuracy = accuracy_score(y_testMeta.ravel(), yPredSVMTest)
RFAccuracy = accuracy_score(y_testMeta.ravel(), yPredRFTest)
LDAAccuracy = accuracy_score(y_testMeta.ravel(), yPredLDATest)
print('\n SVM Accuracy: ',SVMAccuracy, '\n Random Forest Accuracy: ', RFAccuracy, '\n LDA Accuracy: ',LDAAccuracy)

print('Stacked Model Accuracy: ', stackedAcc)


 SVM Accuracy:  0.9416058394160584 
 Random Forest Accuracy:  0.9562043795620438 
 LDA Accuracy:  0.9343065693430657
Stacked Model Accuracy:  0.9635036496350365


# Ensemble stacking has the highest accuracy.