In [240]:
import numpy as np
import pandas as pd

In [241]:
# preparing dataset structure
NoofSamples = 2333
NoofFeatures = 8
# independent features array
IF = np.random.rand(NoofSamples, NoofFeatures)
IF.shape

(2333, 8)

In [242]:
# create feature column having 3 classes
Target = np.random.choice([0, 1, 2], p = [.29, .32, .39], size = NoofSamples).reshape(-1, 1)
Target = pd.DataFrame(Target)

In [243]:
# let us append target's for dataset
data = np.concatenate((IF, Target), axis = 1)

In [244]:
data

array([[0.80645612, 0.16633257, 0.16268624, ..., 0.2527723 , 0.84554867,
        2.        ],
       [0.79707799, 0.4105341 , 0.09720724, ..., 0.13045358, 0.34801146,
        2.        ],
       [0.90412655, 0.95361082, 0.88613967, ..., 0.55464031, 0.7863483 ,
        1.        ],
       ...,
       [0.05833278, 0.7970211 , 0.96494943, ..., 0.11489377, 0.32059879,
        1.        ],
       [0.20715247, 0.81835615, 0.28723042, ..., 0.25302147, 0.54774897,
        2.        ],
       [0.52039167, 0.22357294, 0.36460684, ..., 0.60561889, 0.16714341,
        1.        ]])

In [245]:
# preparing dataframe from np arrays
df = pd.DataFrame(data, columns = ["F1", "F2", "F3", "F4", "F5", "F6","F7", "F8", "Target"])
df.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,Target
0,0.806456,0.166333,0.162686,0.165912,0.12717,0.141674,0.252772,0.845549,2.0
1,0.797078,0.410534,0.097207,0.887955,0.749137,0.418989,0.130454,0.348011,2.0
2,0.904127,0.953611,0.88614,0.997652,0.862917,0.722249,0.55464,0.786348,1.0
3,0.6495,0.845755,0.954661,0.139992,0.784067,0.834052,0.740787,0.910084,2.0
4,0.984361,0.761462,0.067454,0.433798,0.265657,0.549154,0.912615,0.80263,2.0


In [250]:
# implementation of train test split class
class TrainTestSplit():
    def __init__(self, dataset):
        
        # shuffle dataset using sample method on dataset
        dataset = dataset.sample(frac = 1)
        
        # set dataset property
        self.dataset = dataset
        
        # get values from dataset
        self.datavalues = self.dataset.values
        
        
        # get noof samples and fetures and set them
        self.NoofSamples, self.NoofFeatures = self.dataset.shape[0], self.dataset.shape[1]
        
        # unique classes in dataset
        self.Classes = self.dataset.iloc[:,-1].unique()
        
        self.ValueCounts = list()
        
        # unique values of each class in self.dataset
        self.ValueCounts = list(self.dataset["Target"].value_counts())
        
        # initilize total no of classes
        # we can use it later for verification purpose after splitting
        self.NoofClsses = len(self.Classes)
        
        # initilize required values
        self.KFTrainSplits = None
        self.KFTestSplit = None
        self.SKFTrainSplits = None
        self.SKFTestSplit = None
    
    # kfold cross validation splitting technique in non stratified way
    def KfoldCV(self, K):
        
        # if no of splits greater than total no of samples
        if K > self.NoofSamples:
            
            # print error message
            print(f"value of k({K}) should not be greater than no of samples({self.NoofSamples})")
            
        
        # calculate no of samples per each split
        NoofSamplesPerSplit = self.NoofSamples // K
        
        # initilize traintest split to empty numpy array
        TrainSplits = np.empty([K-1, NoofSamplesPerSplit, self.NoofFeatures])
        
        # start with 0 index
        start = 0
        
        # end until noof samples are met
        end = NoofSamplesPerSplit
        
        # looping k-1 times
        for i in range(K-1):
            
            # update values of empty array
            TrainSplits[i] = self.datavalues[start : end, :]
            
            # increase index starting pointer to NoofSamplesPerSplit
            start += NoofSamplesPerSplit
            
            # same for end index too
            end += NoofSamplesPerSplit
        
        # remain 1 split can be used as test dataset
        # add remaining samples to test split
        TestSplit = np.array(self.dataset.iloc[start:, :].values.reshape(-1, 9))
        
        self.KFTrainSplits, self.KFTestSplit = TrainSplits, TestSplit
        
        self.Kfoldstatus()
    
    def Kfoldstatus(self):
        for i in range(len(self.KFTrainSplits)):
            UniqueClasses = np.unique(self.KFTrainSplits[i][:, -1])
            print(f"Split{i+1} shape = {self.KFTrainSplits[i].shape}", end = "")
            for cls in UniqueClasses:
                vals = np.count_nonzero(self.KFTrainSplits[i][:, -1] == cls)
                print(f" {vals} {int(cls)}'s", end = "")
            print("\n")

        UniqueClasses = np.unique(self.KFTestSplit[:, -1])
        print(f"Test shape = {self.KFTestSplit.shape}", end = "")
        for cls in UniqueClasses:
            vals = np.count_nonzero(self.KFTestSplit[:, -1] == cls)
            print(f" {vals} {int(cls)}'s", end = "")
        print("\n")
    
    def Stratifiedkfval(self, k):
        
        # if no of splits greater than total no of samples
        if k > self.NoofSamples:
            
            # print error message
            print(f"value of k({k}) should not be greater than no of samples({self.NoofSamples})")
           
        # get the copy of datavalues
        datavals = self.datavalues
        
        # calculate no of samples per each split
        NoofSamplesPerSplit = self.NoofSamples // k
        
        # initilize traintest split to empty numpy array
        TrainSplits = np.empty([k-1, NoofSamplesPerSplit, self.NoofFeatures])
        
        # spliting targets for a datasplit in stratified way
        SplitRatios = [x//k for x in self.ValueCounts]        
        
        # getting each split by looping k-1 times
        for i in range(k-1):
            
            # initilizing targets arr that holds target classes
            arr = list()
            
            # appending stratified targets to arr
            for j in range(len(SplitRatios)):
                arr = arr + [self.Classes[j]]* SplitRatios[j]
                
            # updating values that we initilized as empty before
            for l in range(len(arr)):
                
                # getting indexes of sampels having target arr[l] which includes all unique classes in it
                indexes = np.where(datavals[:, -1] == arr[l])
                
                # getting only first occurance of target value
                FirstOccurance = indexes[0][0]
                
                # changing values in sample
                TrainSplits[i][l] = datavals[FirstOccurance, :]
                
                # deleting sample from datavals, so that the same sample won't include again in dataset
                datavals = np.delete(datavals, FirstOccurance, 0)
        
        
        # appending traintest split results to self obj properties
        self.SKFTrainSplits, self.SKFTestSplit = TrainSplits, datavals
        
        # calling SLfoldstatus def
        self.SKfoldstatus()
          
    
    # function that analyses stratifed k fold split results
    def SKfoldstatus(self):
         
        for i in range(len(self.SKFTrainSplits)):
            UniqueClasses = np.unique(self.SKFTrainSplits[i][:, -1])
            print(f"Split{i+1} shape = {self.SKFTrainSplits[i].shape}", end = "")
            for cls in UniqueClasses:
                vals = np.count_nonzero(self.SKFTrainSplits[i][:, -1] == cls)
                print(f" {vals} {int(cls)}'s", end = "")
            print("\n")

        UniqueClasses = np.unique(self.SKFTestSplit[:, -1])
        print(f"Test shape = {self.SKFTestSplit.shape}", end = "")
        for cls in UniqueClasses:
            vals = np.count_nonzero(self.SKFTestSplit[:, -1] == cls)
            print(f" {vals} {int(cls)}'s", end = "")
        print("\n")
        

In [251]:
tts = TrainTestSplit(df)

In [252]:
tts.Stratifiedkfval(6)

Split1 shape = (388, 9) 115 0's 120 1's 153 2's

Split2 shape = (388, 9) 116 0's 119 1's 153 2's

Split3 shape = (388, 9) 115 0's 119 1's 154 2's

Split4 shape = (388, 9) 115 0's 119 1's 154 2's

Split5 shape = (388, 9) 115 0's 119 1's 154 2's

Test shape = (398, 9) 120 0's 120 1's 158 2's



In [253]:
tts.KfoldCV(6)

Split1 shape = (388, 9) 112 0's 121 1's 155 2's

Split2 shape = (388, 9) 117 0's 131 1's 140 2's

Split3 shape = (388, 9) 126 0's 125 1's 137 2's

Split4 shape = (388, 9) 106 0's 126 1's 156 2's

Split5 shape = (388, 9) 124 0's 98 1's 166 2's

Test shape = (393, 9) 110 0's 114 1's 169 2's



In [None]:
# we can k fold cross validation technique when implementing ml algorithm for better performance
# from above split we can clearly say that stratified way of splitting dataset
# us far better than normal k fold split
# we can perform leave one out cv too with our dataset
# i'll update it in a new file