## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Load the data

In [2]:
# Read the Iris measurements from a CSV file; the path is relative to the
# notebook's working directory.
iris = pd.read_csv(filepath_or_buffer='iris.csv')

In [3]:
# Report the frame's dimensions as a (row count, column count) tuple.
print('Size of Iris Dataframe', (len(iris.index), len(iris.columns)))

Size of Iris Dataframe (150, 5)


In [4]:
# Show the first five setosa rows. Filtering on the species column (instead of
# relying on setosa happening to come first in the file) keeps the printed
# label truthful even if the rows are reordered.
print('First 5 rows of setosa species')
iris[iris['species'] == 'Iris-setosa'].head()

First 5 rows of setosa species


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Show the first five versicolor rows. Selecting by species value avoids the
# hard-coded row offsets 50:55, which are only correct while the file stays
# sorted by species with exactly 50 rows per class.
print('First 5 rows of versicolor species')
iris[iris['species'] == 'Iris-versicolor'].head()

First 5 rows of versicolor species


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,Iris-versicolor
51,6.4,3.2,4.5,1.5,Iris-versicolor
52,6.9,3.1,4.9,1.5,Iris-versicolor
53,5.5,2.3,4.0,1.3,Iris-versicolor
54,6.5,2.8,4.6,1.5,Iris-versicolor


In [6]:
# Show the first five virginica rows. Selecting by species value avoids the
# hard-coded row offsets 100:105, which are only correct while the file stays
# sorted by species with exactly 50 rows per class.
print('First 5 rows of virginica species')
iris[iris['species'] == 'Iris-virginica'].head()

First 5 rows of virginica species


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
100,6.3,3.3,6.0,2.5,Iris-virginica
101,5.8,2.7,5.1,1.9,Iris-virginica
102,7.1,3.0,5.9,2.1,Iris-virginica
103,6.3,2.9,5.6,1.8,Iris-virginica
104,6.5,3.0,5.8,2.2,Iris-virginica


In [7]:
# Number of rows per species in the full dataset.
iris.loc[:, "species"].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: species, dtype: int64

## Perform Random Sampling

In [8]:
# Draw a simple random sample of 30 rows (without replacement, the default).
# A fixed random_state makes the draw, and every result derived from it,
# identical on each Restart & Run All.
sample_random = iris.sample(n=30, random_state=42)
print(sample_random.shape)

(30, 5)


In [9]:
# Peek at the first five sampled rows; the index keeps the original row labels.
sample_random.iloc[:5]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
19,5.1,3.8,1.5,0.3,Iris-setosa
126,6.2,2.8,4.8,1.8,Iris-virginica
90,5.5,2.6,4.4,1.2,Iris-versicolor
34,4.9,3.1,1.5,0.1,Iris-setosa
47,4.6,3.2,1.4,0.2,Iris-setosa


In [10]:
# Species make-up of the random sample. Simple random sampling does not
# guarantee equal counts per class.
sample_random.loc[:, "species"].value_counts()

Iris-setosa        11
Iris-versicolor    10
Iris-virginica      9
Name: species, dtype: int64

In [11]:
# Species counts of the full dataset, repeated here for side-by-side
# comparison with the sample's counts.
iris.loc[:, "species"].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: species, dtype: int64

## Perform Sampling with condition

In [12]:
# Boolean mask over the rows: True where sepal_width is below 3.
condition = iris['sepal_width'] < 3

# Index labels of the rows that satisfy the mask. The mask is already boolean,
# so it can select directly; comparing it with `== True` is redundant.
true_index = condition[condition].index


In [13]:
# Randomly pick 10 rows from those that satisfy the condition. A fixed
# random_state keeps the draw reproducible across kernel restarts.
sample_random_condition = iris[condition].sample(n=10, random_state=42)

In [14]:
# Species make-up of the conditional sample.
sample_random_condition.loc[:, "species"].value_counts()

Iris-versicolor    7
Iris-virginica     3
Name: species, dtype: int64

## Systematic Sampling 

In [15]:
# Systematic sampling: start at the first row and keep every `rate`-th row.
rate = 5
sample_systematic = iris.iloc[::rate]  # positional step slice over the rows

In [16]:
# Species make-up of the systematic sample.
sample_systematic.loc[:, "species"].value_counts()

Iris-setosa        10
Iris-versicolor    10
Iris-virginica     10
Name: species, dtype: int64

In [17]:
# First five rows of the systematic sample; the index shows the step between
# the selected rows.
sample_systematic.iloc[:5]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
10,5.4,3.7,1.5,0.2,Iris-setosa
15,5.7,4.4,1.5,0.4,Iris-setosa
20,5.4,3.4,1.7,0.2,Iris-setosa


### Stratified Sampling

Separate the columns 0 to 3 as X.

Separate the last column as target or y.

Use the parameter *stratify* to sample based on y.

In [18]:
col_length = len(iris.columns)

# .iloc selects by integer position as [rows, columns];
# .loc selects the same way but by label.
X = iris.iloc[:, 0:col_length - 1]  # every column except the last: the features
y = iris.iloc[:, -1]                # last column: the target

# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of y in both splits, and random_state pins the shuffle so the split is
# reproducible on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [19]:
# With 150 rows in total, the split yields 150 * 0.8 = 120 training rows
# and 150 * 0.2 = 30 test rows.
print(X_train.shape, X_test.shape, sep='\n')  # train (80%), then test (20%)

(120, 4)
(30, 4)


In [20]:
# Class counts in the test split, most frequent first. Stratification should
# give each species an equal share.
y_test.value_counts(sort=True, ascending=False)

Iris-setosa        10
Iris-virginica     10
Iris-versicolor    10
Name: species, dtype: int64