# K-fold Cross-validation 

In [5]:
# import pandas and model_selection module from scikit-learn
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":
    # Plain K-fold: split the training data into 5 folds and record, for every
    # row, the fold in which that row serves as validation data.

    # load the dataset from the csv file
    df = pd.read_csv("train.csv")

    # create the kfold column; -1 marks a row as "not yet assigned to a fold"
    df["kfold"] = -1

    # shuffle the rows and rebuild a 0..n-1 index so that the positional
    # indices returned by KFold.split() are also valid labels for df.loc
    df = df.sample(frac=1).reset_index(drop=True)

    # initialize the KFold class from model_selection
    kf = model_selection.KFold(n_splits=5)

    # fill the kfold column: only the validation indices of each split are
    # needed, the training indices are ignored
    for fold, (_, validation_) in enumerate(kf.split(X=df)):
        df.loc[validation_, "kfold"] = fold

    # save the new csv with the kfold column
    # (the extension was previously misspelled as ".cdv")
    df.to_csv("train_folds.csv", index=False)

# Stratified K-fold Cross-validation
* It should be used when the target classes are imbalanced, so that every fold keeps the same class proportions as the full dataset

In [7]:
# importing pandas and model_selection from scikit-learn
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":
    # Stratified K-fold: split the training data into 5 folds while keeping
    # the class proportions of the "targets" column the same in every fold.

    # load the dataset from the csv file
    df = pd.read_csv("train.csv")

    # create the kfold column; -1 marks a row as "not yet assigned to a fold"
    df["kfold"] = -1

    # shuffle the rows and rebuild a 0..n-1 index so that the positional
    # indices returned by split() are also valid labels for df.loc
    df = df.sample(frac=1).reset_index(drop=True)

    # fetch the targets AFTER shuffling: y[i] must be the label of row i of
    # the shuffled dataframe, otherwise the stratification is computed on
    # labels that belong to different rows
    y = df.targets.values

    # initialize the StratifiedKFold class from model_selection
    kf = model_selection.StratifiedKFold(n_splits=5)

    # fill the kfold column with the assigned fold; only the validation
    # indices of each split are needed
    for fold, (_, validation_) in enumerate(kf.split(X=df, y=y)):
        # the column name was previously misspelled "kflod", which created a
        # second column and left "kfold" at -1 for every row
        df.loc[validation_, "kfold"] = fold

    # save the new csv with the kfold column
    df.to_csv("train_stratified_kfold.csv", index=False)

# Stratified K-fold Cross-validation for Regression
* It can be used when the distribution of the targets is not consistent (for example, skewed)
* We first divide the continuous target into bins, and then we can use stratified k-fold on the bin labels
* If we have a lot of samples, we can simply divide the target into 10 or 20 bins
* If we have fewer samples, we can use __Sturges' Rule__ to pick the number of bins
       Sturges' Rule :
           Number of Bins = 1 + log2(N), where N is the number of samples

In [9]:
# importing pandas numpy, datasets from sklearn and model_selection from sklearn
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn import model_selection

def create_folds(
    data: pd.DataFrame,
    n_splits: int = 5,
    random_state: "int | None" = None,
) -> pd.DataFrame:
    """Assign a stratified fold number to every row of a regression dataset.

    The continuous ``target`` column is cut into equal-width bins (the number
    of bins follows Sturges' rule, ``floor(1 + log2(N))``) and the bin labels
    are used as classes for ``StratifiedKFold``.

    Args:
        data: Dataframe containing a numeric ``target`` column. It is not
            modified; a shuffled copy is returned instead.
        n_splits: Number of folds to create.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the shuffle reproducible.

    Returns:
        A shuffled copy of ``data`` with a fresh 0..N-1 index and an integer
        ``kfold`` column holding the validation fold of each row.

    Raises:
        ValueError: If ``data`` has no rows (the number of bins would be
            undefined).
    """
    if len(data) == 0:
        raise ValueError("cannot create folds for an empty dataframe")

    # shuffle first: sample() returns a new dataframe, so every column added
    # below lands on the copy and the caller's dataframe stays untouched
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # create the kfold column; -1 marks a row as "not yet assigned to a fold"
    data["kfold"] = -1

    # number of bins according to Sturges' rule
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # bin the targets; labels=False yields integer bin ids. The ids are kept
    # in a local series rather than a temporary dataframe column, so an
    # existing "bins" column of the caller is never overwritten or dropped
    bins = pd.cut(data["target"], bins=num_bins, labels=False)

    # initialize the stratified kfold class from model_selection
    kf = model_selection.StratifiedKFold(n_splits=n_splits)

    # fill the kfold column; the index was reset above, so the positional
    # validation indices are also valid labels for .loc
    for fold, (_, validation_idx) in enumerate(kf.split(X=data, y=bins.values)):
        data.loc[validation_idx, "kfold"] = fold

    return data

if __name__ == "__main__":
    # Demo: build a synthetic regression problem (15000 samples, 100
    # features, a single target), attach stratified folds and store it.
    X, y = datasets.make_regression(
        n_samples=15000,
        n_features=100,
        n_targets=1,
    )

    # one column per feature, named f_0 ... f_{n-1}
    feature_names = [f"f_{idx}" for idx in range(X.shape[1])]

    # wrap the feature matrix in a dataframe and append the target column
    df = pd.DataFrame(X, columns=feature_names)
    df["target"] = y

    # add the kfold column and write the result to a new csv file
    df = create_folds(df)
    df.to_csv("train_regression_stratified.csv", index=False)
    

