# Train Test Split

In [1]:
import pandas as pd
import numpy as np

In [5]:
from sklearn.datasets import load_iris

In [15]:
# Load the iris dataset object; its fields (data, target, feature_names,
# target_names) are read by the cells that build the DataFrame below
iris = load_iris()

In [8]:
# Show which top-level fields the loaded dataset object provides
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [16]:
# Feature table: one row per sample, one column per entry in iris.feature_names
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head(n=3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


In [17]:
# Add the class label of every sample as a new `target` column
df["target"] = iris.target
df.head(n=2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0


In [19]:
# Translate each integer label into its class name by using the labels
# as positions into the array of target names
label_codes = df["target"].to_numpy()
df["target_name"] = iris.target_names[label_codes]
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_name
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [20]:
# Alternate approach: build a categorical column directly from the integer
# codes, with the target names as the category labels
df["species"] = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_name,species
0,5.1,3.5,1.4,0.2,0,setosa,setosa
1,4.9,3.0,1.4,0.2,0,setosa,setosa
2,4.7,3.2,1.3,0.2,0,setosa,setosa
3,4.6,3.1,1.5,0.2,0,setosa,setosa
4,5.0,3.6,1.4,0.2,0,setosa,setosa


## split without using `sklearn`'s `train_test_split()` API

In [24]:
# Small demo: draw 5 uniform random numbers between 0 and 1 ...
val = np.random.uniform(low=0, high=1, size=5)
print(val, type(val))
# ... and compare them against the 0.75 threshold to get a boolean array
val <= 0.75

[0.40814339 0.77201225 0.44070165 0.03206128 0.90288655] <class 'numpy.ndarray'>


array([ True, False,  True,  True, False])

#### create train and test data

In [29]:
# One uniform draw per row; rows whose draw is at most 0.75 are flagged
# as training rows, the rest as test rows
row_draws = np.random.uniform(low=0, high=1, size=len(df))
df["is_train"] = row_draws <= 0.75
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_name,species,is_train
0,5.1,3.5,1.4,0.2,0,setosa,setosa,True
1,4.9,3.0,1.4,0.2,0,setosa,setosa,True
2,4.7,3.2,1.3,0.2,0,setosa,setosa,True
3,4.6,3.1,1.5,0.2,0,setosa,setosa,False
4,5.0,3.6,1.4,0.2,0,setosa,setosa,True


#### split data

In [30]:
# `is_train` is a boolean column, so it can be used directly as a row mask:
# flagged rows form the training set, the remaining rows form the test set
train_mask = df["is_train"]
train = df[train_mask]
test = df[~train_mask]

# Report the resulting sizes and the share of rows that went to training
print(f"Length of the dataframe: {len(df)}")
print(f"Length of the training data: {len(train)}")
print(f"Length of the test data: {len(test)}")
print(f"% of train data = {round((len(train)/len(df)) * 100)}%")

Length of the dataframe: 150
Length of the training data: 116
Length of the test data: 34
% of train data = 77%


### observation

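The code above randomly assigns each row to the train or test set. Because the threshold value is `0.75`, roughly 75% of the rows are expected to end up in the training set (here 116 of 150, i.e. 77%). However, each row is assigned independently at random, so the exact sizes of the training and test sets will vary on every run.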

## Let's find another way, where we can actually control the size

In [32]:
# Shuffle the rows: frac=1 samples the whole dataset, so the result contains
# every row of `df` exactly once, in random order.
# A fixed `random_state` makes the shuffle (and the split built on it below)
# reproducible across kernel restarts; change the value for a different order.
shuffle_df = df.sample(frac=1, random_state=42)
shuffle_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_name,species,is_train
59,5.2,2.7,3.9,1.4,1,versicolor,versicolor,True
53,5.5,2.3,4.0,1.3,1,versicolor,versicolor,True
40,5.0,3.5,1.3,0.3,0,setosa,setosa,True
32,5.2,4.1,1.5,0.1,0,setosa,setosa,True
7,5.0,3.4,1.5,0.2,0,setosa,setosa,True


In [33]:
# Number of rows in the training split: 70% of the dataset,
# truncated to a whole number of rows
train_fraction = 0.7
train_size = int(train_fraction * len(df))

In [34]:
# Positional split of the shuffled rows: the first `train_size` rows become
# the training set, everything after that becomes the test set
train_set = shuffle_df.iloc[:train_size]
test_set = shuffle_df.iloc[train_size:]

# Report the resulting sizes and the share of rows that went to training
print(f"Length of the dataframe: {len(df)}")
print(f"Length of the training data: {len(train_set)}")
print(f"Length of the test data: {len(test_set)}")
print(f"% of train data = {round((len(train_set)/len(df)) * 100)}%")

Length of the dataframe: 150
Length of the training data: 105
Length of the test data: 45
% of train data = 70%
