# K-fold cross validation

Implement a random k-fold cross validation algorithm from scratch.

Your algorithm should:
- load the iris dataset and split its columns into features and target
- split the dataset into k folds to perform cross validation

You can use the code below to implement your algorithm, or implement it yourself from scratch.



In [40]:
# we will implement a k-fold cross validation from scratch
# we will use the iris dataset

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Fetch the iris dataset, then pull out the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

def k_fold_cross_validation(X, y, k, model):
    """Estimate a model's accuracy with random k-fold cross validation.

    The sample indices are shuffled once and split into ``k`` folds of
    (nearly) equal size. Each fold is used exactly once as the test set,
    while the model is fitted on the other ``k - 1`` folds combined.

    Args:
        X: Feature matrix, one row per sample (anything ``np.asarray``
            accepts).
        y: Target labels, one per row of ``X``.
        k: Number of folds; must satisfy ``2 <= k <= len(X)``.
        model: Object exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is
            called on this same object once per fold.

    Returns:
        The mean, over the ``k`` folds, of the fraction of test samples
        whose prediction equals the true label.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, or if ``k`` is not
            in the range ``2 <= k <= len(X)``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    # k == 1 leaves nothing to train on; k > n_samples creates empty folds.
    if not 2 <= k <= n_samples:
        raise ValueError(
            f"k must satisfy 2 <= k <= len(X) ({n_samples}), got {k}"
        )

    # Shuffle indices rather than the data itself so that X and y stay
    # aligned, then cut the shuffled indices into k folds.
    shuffled = np.random.permutation(n_samples)
    folds = np.array_split(shuffled, k)

    accuracies = []
    for test_index, test_rows in enumerate(folds):
        # Train on every fold except the one currently held out for testing.
        train_rows = np.concatenate(folds[:test_index] + folds[test_index + 1:])

        model.fit(X[train_rows], y[train_rows])
        y_pred = model.predict(X[test_rows])

        # Accuracy on this fold: fraction of correctly predicted labels.
        accuracies.append(np.mean(y_pred == y[test_rows]))

    # Average the per-fold accuracies into a single score.
    return np.mean(accuracies)

In [41]:
#You can use the code below to test your function

#import the random forest model
from sklearn.ensemble import RandomForestClassifier

# Instantiate a random forest classifier with its default hyperparameters
model = RandomForestClassifier()

# Run 5-fold cross validation on X and y; as the last expression in the
# cell, the returned mean accuracy is displayed as the cell output
k_fold_cross_validation(X, y, 5, model)

0.9533333333333335