# Split the Dataset Using Stratified K-Folds Cross-Validator

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Titanic passenger table from the project's Data folder
df = pd.read_csv(filepath_or_buffer="Data/Titanic.csv")

In [3]:
# Preview the first five rows of the loaded table
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# (number of rows, number of columns) of the loaded table
len(df.index), len(df.columns)

(891, 12)

In [5]:
# Class balance of the column that is later used as the stratification target
df["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [6]:
# Every column except the target "Sex" is kept as a feature, in the original column order
features = df.columns[df.columns != "Sex"].tolist()
features

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [7]:
# Convert the feature columns and the target column to NumPy arrays so that
# they can be sliced with the integer index arrays produced by the splitter
X = np.array(df.loc[:, features])
y = np.array(df.loc[:, "Sex"])

In [8]:
from sklearn.model_selection import StratifiedKFold

# Number of folds requested from the cross-validator
size = 3

# Stratified splitter configured with `size` folds
skf = StratifiedKFold(n_splits=size)

# Displays the number of splitting iterations the splitter will produce
skf.get_n_splits(X, y)

3

In [9]:
# skf.split(X, y) is lazy: it returns a generator, and each next() call on it
# hands back one (train_index, test_index) pair.
# The generator is stateful, so every next() call in a later cell consumes a fold.
folds_indices = skf.split(X, y)
folds_indices

<generator object _BaseKFold.split at 0x113f808e0>

In [10]:
# Take a single (train, test) index pair from the generator and use it
# to slice the features and the target into training and test sets
train_index, test_index = next(folds_indices)
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

In [11]:
## Generate all Training and Test Sets
# L_data=list()
# L_target=list()
# for i in range(0,size):
#     train_index, test_index=next(folds_indices)
#     L_data.append((X[train_index], X[test_index]))
#     L_target.append((y[train_index], y[test_index]))
## Note: storing every fold in a list is optional. Each call to next(folds_indices) yields the next
## (train_index, test_index) pair, so the training and test sets can be built on demand when needed.

## Count Unique Values Using Numpy

In [12]:
# Distinct labels in the training targets and the number of occurrences of each
unique_1, counts_1 = np.unique(y_train, return_counts=True)
# Print one [label, count] row per distinct label
print(np.transpose(np.asarray((unique_1, counts_1))))

[['female' 209]
 ['male' 384]]


In [13]:
# Distinct labels in the test targets and the number of occurrences of each
unique_2, counts_2 = np.unique(y_test, return_counts=True)
# Print one [label, count] row per distinct label
print(np.transpose(np.asarray((unique_2, counts_2))))

[['female' 105]
 ['male' 193]]


## Count Unique Values Using Pandas

In [14]:
# Wrap the current fold's arrays in DataFrames so pandas tools can be used on them.
# Feature frames reuse the original feature column names.
df_train = pd.DataFrame(data=X_train, columns=features)
df_test = pd.DataFrame(data=X_test, columns=features)
# Target frames hold a single "Sex" column.
df_target_train = pd.DataFrame({"Sex": y_train})
df_target_test = pd.DataFrame({"Sex": y_test})

In [15]:
# Preview the first five rows of the training features
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,298,0,1,"Allison, Miss. Helen Loraine",2.0,1,2,113781,151.55,C22 C26,S
1,300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",50.0,0,1,PC 17558,247.521,B58 B60,C
2,301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",,0,0,9234,7.75,,Q
3,302,1,3,"McCoy, Mr. Bernard",,2,0,367226,23.25,,Q
4,303,0,3,"Johnson, Mr. William Cahoone Jr",19.0,0,0,LINE,0.0,,S


In [16]:
# Label counts in the training targets
df_target_train["Sex"].value_counts()

male      384
female    209
Name: Sex, dtype: int64

In [17]:
# Label counts in the test targets
df_target_test["Sex"].value_counts()

male      193
female    105
Name: Sex, dtype: int64

## Generate the Second Training and Test Sets

In [18]:
# Advance the generator by one fold; the current training and test
# variables are overwritten with the slices for this new fold
train_index, test_index = next(folds_indices)
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

# Rebuild the training feature frame for this fold and preview its first five rows
df_train = pd.DataFrame(data=X_train, columns=features)
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",35,0,0,373450,8.05,,S
