In [1]:
import pandas as pd
from sklearn import model_selection

In [2]:
# Read the training CSV into a DataFrame, then preview its first three rows.
train_csv = "../input/train.csv"
df = pd.read_csv(train_csv)
df.head(3)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0


In [3]:
# Add a placeholder column `kfold`, initialised to -1 ("no fold assigned yet");
# the fold-assignment loop further down overwrites it with each row's fold number.
df['kfold'] = -1

In [4]:
# Shuffle all rows (frac=1 samples the whole frame) and rebuild a 0..n-1 index.
# A fixed random_state makes the shuffle -- and therefore the fold each row is
# assigned to later -- identical on every Restart & Run All.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

In [5]:
# Preview the first four rows after shuffling; kfold is still the -1 placeholder.
df.head(4)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,kfold
0,78638,0,0,1,T,Y,Blue,Square,Dog,Russia,...,3,Contributor,Freezing,f,G,ac,1,4,0,-1
1,101982,0,1,0,T,N,Blue,Trapezoid,Hamster,Russia,...,3,Grandmaster,Lava Hot,e,E,aM,2,3,1,-1
2,58694,0,1,0,T,Y,Blue,Square,Cat,Canada,...,1,Grandmaster,Freezing,l,A,Nk,2,9,0,-1
3,197334,0,0,1,T,Y,Blue,Triangle,Hamster,China,...,1,Expert,Boiling Hot,h,Q,TZ,3,10,0,-1


Observe that the `id` values in the dataframe (df) above are no longer in ascending order: the rows have been shuffled, while the row index itself has been reset to 0, 1, 2, ….

In [6]:
# Share of each target class across the full dataset (fractions rather than counts).
df["target"].value_counts(normalize=True)

0    0.69412
1    0.30588
Name: target, dtype: float64



### [Stratified K-Fold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html)

- Provides train/test indices to split data in train/test sets.
- This cross-validation object is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class.

In [7]:
# Five-fold stratified splitter; shuffle=True with a fixed random_state keeps the
# fold assignment repeatable for a given row order.
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [8]:
# Display the splitter's configuration.
kf

StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

In [9]:
for fold, (train_idx, val_idx) in enumerate(kf.split(X = df, y = df.target.values)):
    # kf.split(X = df, y = df.target.values) yields, for each fold, a pair of
    # integer index arrays: (training row positions, validation row positions)
    print(len(train_idx), len(val_idx))
    print("fold: ", fold)
    # Record the fold in which each row serves as validation data. Using .loc with
    # positional indices is valid here because the index was reset to 0..n-1 after shuffling.
    df.loc[val_idx, 'kfold'] = fold

240000 60000
fold:  0
240000 60000
fold:  1
240000 60000
fold:  2
240000 60000
fold:  3
240000 60000
fold:  4


In [10]:
val_idx  # validation-row indices left over from the last loop iteration (fold 4), not training indices

array([     1,      6,      8, ..., 299985, 299987, 299996])

In [11]:
# Class proportions among the validation rows of the last fold produced by the loop.
df.loc[val_idx, "target"].value_counts(normalize=True)

0    0.694117
1    0.305883
Name: target, dtype: float64

Observe that the stratified split has preserved the original class proportions (about 69% zeros and 31% ones) in the validation fold.

In [12]:
# Preview the rows again: kfold now holds the fold in which each row is used for validation.
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,kfold
0,78638,0,0,1,T,Y,Blue,Square,Dog,Russia,...,3,Contributor,Freezing,f,G,ac,1,4,0,1
1,101982,0,1,0,T,N,Blue,Trapezoid,Hamster,Russia,...,3,Grandmaster,Lava Hot,e,E,aM,2,3,1,4
2,58694,0,1,0,T,Y,Blue,Square,Cat,Canada,...,1,Grandmaster,Freezing,l,A,Nk,2,9,0,1
3,197334,0,0,1,T,Y,Blue,Triangle,Hamster,China,...,1,Expert,Boiling Hot,h,Q,TZ,3,10,0,0
4,23442,1,0,0,T,N,Green,Trapezoid,Snake,India,...,1,Novice,Lava Hot,l,I,gJ,3,2,0,0


In [13]:
# (rows, columns) of the frame, including the added kfold column.
df.shape

(300000, 26)

In [14]:
# Number of rows assigned to each fold; no row should remain at the -1 placeholder.
df.kfold.value_counts()

1    60000
4    60000
0    60000
3    60000
2    60000
Name: kfold, dtype: int64