# K-fold cross validation

In [1]:
# example is from SKlearn's website: 
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

import numpy as np  
from sklearn.model_selection import KFold
X = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
y = np.array([1, 2, 3, 4, 5, 6, 7, 8])

#KFold(n_splits=’warn’, shuffle=False, random_state=None)
kf = KFold(n_splits=4)           

for train_index, validation_index in kf.split(X):
   print("TRAIN:", X[train_index], "VALIDATION:", X[validation_index])
   X_train, X_validation = X[train_index], X[validation_index]
   y_train, y_validation = y[train_index], y[validation_index]

TRAIN: [0.3 0.4 0.5 0.6 0.7 0.8] VALIDATION: [0.1 0.2]
TRAIN: [0.1 0.2 0.5 0.6 0.7 0.8] VALIDATION: [0.3 0.4]
TRAIN: [0.1 0.2 0.3 0.4 0.7 0.8] VALIDATION: [0.5 0.6]
TRAIN: [0.1 0.2 0.3 0.4 0.5 0.6] VALIDATION: [0.7 0.8]


In [14]:
#Example is from SKLearn's website. Please see http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
from sklearn import datasets, decomposition
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split # loads functions from the ML library sklearn 
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

iris = datasets.load_iris()
X = iris.data
y = iris.target

# split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# PCA 
nof_prin_components = 2 
pca = decomposition.PCA(n_components=nof_prin_components, 
                        svd_solver='full').fit(X_train)

# applies PCA to the train and test images to calculate the principal components
X_train_pca = pca.transform(X_train) 
X_test_pca = pca.transform(X_test)

nohn = 50 # nof hidden neurons

# ref: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
clf = MLPClassifier(hidden_layer_sizes=(nohn,), 
                    solver='sgd', 
                    activation='tanh', 
                    batch_size=8, 
                    verbose=False, 
                    early_stopping=True).fit(X_train_pca, 
                                             y_train)

y_pred = clf.predict(X_test_pca) # reoognises the test data 

[0.92105263 0.92105263 0.89189189 1.        ]


In [None]:
print(cross_val_score(clf, X, y, cv=4))

# AdaBoost

In [18]:
from sklearn.datasets import load_breast_cancer
cancer_data = load_breast_cancer()

In [26]:
from sklearn.model_selection import train_test_split

X, y = cancer_data.data, cancer_data.target

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=42)

In [27]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2), 
    n_estimators=200,
    algorithm="SAMME.R", 
    learning_rate=0.5, 
    random_state=42)


In [1]:
ada_clf.fit(X_train, y_train)

NameError: name 'ada_clf' is not defined

In [22]:
y_pred = ada_clf.predict(X_test)

In [23]:
y_pred

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [24]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_pred))

0.958041958041958


## Bagging classifier

In [25]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

pag_clf = BaggingClassifier(
          DecisionTreeClassifier(random_state=42), n_estimators=1000,
          max_samples=100, bootstrap=False, n_jobs=-1, random_state=42)
pag_clf.fit(X_train, y_train)
y_pred = pag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.958041958041958
