# Split the Dataset Using Stratified K-Folds Cross-Validator

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Titanic passenger table from the relative path Data/Titanic.csv.
df=pd.read_csv("Data/Titanic.csv")

In [3]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# (number of rows, number of columns) of the loaded table.
df.shape

(891, 12)

In [6]:
# Class balance of the target: number of rows per value of "Survived".
# Stratified splitting is meant to keep this ratio in every fold.
df["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [7]:
# List the column labels; "Survived" is the one used as the target below.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [8]:
# Every column except the target "Survived" is kept as a feature.
features = [column for column in df.columns if column != "Survived"]
features

['PassengerId',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [9]:
# Feature matrix: all feature columns copied into a NumPy array.
# The columns mix text and numbers (see df.head()), so NumPy presumably
# stores X with dtype=object -- confirm with X.dtype.
X = np.array(df.loc[:, features])
# Target vector: the "Survived" column copied into a NumPy array.
y = np.array(df.Survived)

In [10]:
# Build a stratified K-fold splitter. No shuffle/random_state is passed, so
# scikit-learn's defaults for those options apply.
# NOTE(review): this import would normally live with the other imports in the
# first code cell of the notebook.
from sklearn.model_selection import StratifiedKFold 
size=3  # number of folds, passed to the splitter as n_splits
skf = StratifiedKFold(n_splits=size)
# Displays the number of folds the splitter is configured for.
skf.get_n_splits(X, y)

3

In [16]:
# skf.split(X, y) returns a generator (see the repr displayed below): the
# (train, test) index pairs are produced lazily, one per call to next(), and
# each pair can be consumed only once.
folds_indices=skf.split(X, y)
folds_indices

<generator object _BaseKFold.split at 0x1a1602e678>

In [17]:
# First fold: take the next (train, test) pair of row indices from the
# generator, then slice the feature matrix and the target with them.
# Re-running this cell advances the generator, so it would then return the
# following fold rather than the first one.
train_index, test_index = next(folds_indices)
X_train = X[train_index]
X_test = X[test_index]
y_train = y[train_index]
y_test = y[test_index]

In [None]:
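## Generate all Training and Test Sets (optional)
# L_data=list()
# L_target=list()
# for i in range(0,size):
#     train_index, test_index=next(folds_indices)
#     L_data.append((X[train_index], X[test_index]))
#     L_target.append((y[train_index], y[test_index]))
## Caution: folds_indices yields exactly `size` folds in total. If this loop is run after a
## fold has already been taken with next(folds_indices), its last iteration raises
## StopIteration; create a fresh generator with skf.split(X, y) before looping.
## Note that it is not necessary to store the folds in a list: because skf.split(X, y) is a
## generator, each call to next(folds_indices) returns the following training/test index
## pair at the moment it is needed.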

## Count Unique Values Using Numpy

In [19]:
# Distinct target values in the training fold and how often each occurs.
unique_1, counts_1 = np.unique(y_train, return_counts=True)
# Print one row per class: [value, count].
print(np.column_stack((unique_1, counts_1)))

[[  0 366]
 [  1 228]]


In [20]:
# Distinct target values in the test fold and how often each occurs.
unique_2, counts_2 = np.unique(y_test, return_counts=True)
# Print one row per class: [value, count].
print(np.column_stack((unique_2, counts_2)))

[[  0 183]
 [  1 114]]


## Count Unique Values Using Pandas

In [22]:
# Put the training rows of the current fold back into a labelled DataFrame.
# NOTE(review): X_train was sliced from an array built out of mixed text and
# numeric columns, so these columns presumably all come back with object
# dtype -- confirm with df_train.dtypes.
df_train=pd.DataFrame(X_train, columns=features)
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,285,1,"Smith, Mr. Richard William",male,,0,0,113056,26.0,A19,S
1,286,3,"Stankovic, Mr. Ivan",male,33.0,0,0,349239,8.6625,,C
2,288,3,"Naidenoff, Mr. Penko",male,22.0,0,0,349206,7.8958,,S
3,293,2,"Levy, Mr. Rene Jacques",male,36.0,0,0,SC/Paris 2163,12.875,D,C
4,294,3,"Haas, Miss. Aloisia",female,24.0,0,0,349236,8.85,,S


In [23]:
# Wrap both halves of the current fold in DataFrames so that pandas helpers
# such as value_counts() can be applied to them.
df_train = pd.DataFrame(X_train, columns=features)
df_test = pd.DataFrame(X_test, columns=features)
# The targets become single-column frames labelled "Survived".
df_target_train = pd.DataFrame({"Survived": y_train})
df_target_test = pd.DataFrame({"Survived": y_test})

In [25]:
# Per-class row counts in the training part of the fold.
df_target_train["Survived"].value_counts()

0    366
1    228
Name: Survived, dtype: int64

In [26]:
# Per-class row counts in the test part of the fold.
df_target_test["Survived"].value_counts()

0    183
1    114
Name: Survived, dtype: int64

## Generate the Second Training Set and Test Set

In [27]:
# Advance the generator by one step to obtain the next (train, test) pair of
# row indices, then rebuild the arrays for this fold.
train_index, test_index = next(folds_indices)
X_train = X[train_index]
X_test = X[test_index]
y_train = y[train_index]
y_test = y[test_index]
# Preview the training rows of this fold with their column labels.
df_train = pd.DataFrame(X_train, columns=features)
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S
