In [1]:
from sklearn.datasets import load_breast_cancer
dataset = load_breast_cancer()
import pandas as pd
import numpy as np
cancer_df = pd.DataFrame(np.c_[dataset['data'],dataset['target']], 
                         columns = np.append(dataset['feature_names'],['target_names']))

In [29]:
X=cancer_df #.drop(['target_names'], axis=1)

In [30]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 
vif_data = pd.DataFrame() 
vif_data["feature"] = X.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(X.values, i) 
                          for i in range(len(X.columns))] 
  
print(vif_data)

                    feature           VIF
0               mean radius  63955.019898
1              mean texture    251.093672
2            mean perimeter  58213.349861
3                 mean area   1320.282793
4           mean smoothness    393.418392
5          mean compactness    200.983909
6            mean concavity    158.033586
7       mean concave points    155.053368
8             mean symmetry    184.712502
9    mean fractal dimension    683.379907
10             radius error    237.257583
11            texture error     24.675505
12          perimeter error    211.407233
13               area error     72.759750
14         smoothness error     26.365964
15        compactness error     45.049562
16          concavity error     33.650689
17     concave points error     54.060876
18           symmetry error     37.190834
19  fractal dimension error     27.619720
20             worst radius   9993.710616
21            worst texture    343.419477
22          worst perimeter   4498

In [31]:
vif_data['VIF']>500

0      True
1     False
2      True
3      True
4     False
5     False
6     False
7     False
8     False
9      True
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20     True
21    False
22     True
23     True
24    False
25    False
26    False
27    False
28    False
29    False
30    False
Name: VIF, dtype: bool

In [32]:
features=[]
for i in range(len(X.columns)):
    if vif_data.loc[i,'VIF']>500:
        features.append(vif_data.loc[i,'feature'])

In [33]:
X=cancer_df[features]
y=cancer_df['target_names']

In [34]:
from sklearn import model_selection
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC 

In [35]:
models=[]
models.append(('LDA',LinearDiscriminantAnalysis()))
models.append(('KNN',KNeighborsClassifier()))
models.append(('CART',DecisionTreeClassifier()))
models.append(('NB',GaussianNB()))
models.append(('SVM',SVC()))

In [36]:
X_train,X_test,y_train,y_test=model_selection.train_test_split(X,y,test_size=0.20,shuffle=True,random_state=6)

In [37]:
results=[]
names=[]
for name,model in models:
    kfold=model_selection.KFold(n_splits=10,random_state=6,shuffle=True)
    cv_results=model_selection.cross_val_score(model,X_train,y_train,cv=kfold,scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print(name, cv_results.mean(),cv_results.std())

LDA 0.9409178743961352 0.03501669085890359
KNN 0.9254589371980677 0.039051659240425717
CART 0.9076328502415458 0.04481769336848325
NB 0.9077294685990338 0.04439873726164649
SVM 0.9145410628019324 0.04684817324173046
