## SAMPLING METHODS

#### Data Generation

In [6]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the synthetic data is the same on every
# Restart & Run All.
np.random.seed(42)

s = 100  # number of synthetic employees
data_employee = {
    # IDs run 1..s; derived from s so every column always has the same length.
    'employee_id': np.arange(1, s + 1),
    # randint's upper bound is exclusive: ages 25..60, service 0..40, score 0/1.
    'Age': np.random.randint(25, 61, size=s),
    'Basic Pay': np.random.randint(15600, 67100, size=s),
    'Years of Service': np.random.randint(0, 41, size=s),
    'Performance Score': np.random.randint(0, 2, size=s),
}

In [7]:

# Assemble the employee table, fixing the column order explicitly.
employee_columns = [
    'employee_id',
    'Age',
    'Basic Pay',
    'Years of Service',
    'Performance Score',
]
df = pd.DataFrame(data_employee, columns=employee_columns)
df

Unnamed: 0,employee_id,Age,Basic Pay,Years of Service,Performance Score
0,1,51,28853,35,1
1,2,39,40170,5,0
2,3,41,63612,2,0
3,4,58,19911,25,0
4,5,25,45594,31,1
...,...,...,...,...,...
95,96,44,47126,10,0
96,97,54,17682,18,1
97,98,60,24339,18,0
98,99,34,52152,1,1


### Random Sampling

In [9]:

# Simple random sample: draw a fixed number of rows without replacement.
sample_size = 10
simple_random_sample = df.sample(n=sample_size)

simple_random_sample


Unnamed: 0,employee_id,Age,Basic Pay,Years of Service,Performance Score
5,6,40,50013,32,0
31,32,45,57212,36,1
40,41,59,48482,21,1
62,63,38,34147,40,0
94,95,36,26073,13,1
2,3,41,63612,2,0
65,66,59,29450,20,0
77,78,49,25074,10,1
52,53,30,49729,27,1
68,69,39,15716,10,0


In [10]:

# Draw 20% of the rows WITH replacement, so the same employee may appear twice.
df.sample(replace=True, frac=0.2)


Unnamed: 0,employee_id,Age,Basic Pay,Years of Service,Performance Score
82,83,38,26619,29,1
6,7,54,58204,10,0
57,58,29,61528,19,1
63,64,47,51437,26,0
0,1,51,28853,35,1
46,47,45,51736,1,0
86,87,50,36230,4,0
63,64,47,51437,26,0
45,46,60,38536,1,1
11,12,38,49916,15,1


In [11]:

# Draw 20% of the rows; the fixed random_state makes the draw repeatable.
df.sample(random_state=1, frac=0.2)


Unnamed: 0,employee_id,Age,Basic Pay,Years of Service,Performance Score
80,81,26,40503,23,0
84,85,41,25445,27,0
33,34,27,50807,24,0
81,82,59,56060,36,1
93,94,58,50234,20,1
17,18,44,17738,22,1
36,37,51,23228,15,1
82,83,38,26619,29,1
69,70,30,50798,26,1
65,66,59,29450,20,0


### Categorical data

In [15]:
import pandas as pd
import random

zones = ['Airport', 'HiTech city', 'KPHB', 'SecBad']
time_slots = ['Morning', 'Afternoon', 'Evening', 'Night']
day_types = ['Weekday', 'Weekend']

# Seed the stdlib RNG so the generated rows are the same on every run.
random.seed(42)

n_rows = 100  # number of synthetic records to generate

# Each row is [zone, time_slot, day_type], every field drawn independently.
# A comprehension avoids leaving loop temporaries (notably a global named
# `time`) behind in the notebook namespace.
data = [
    [random.choice(zones), random.choice(time_slots), random.choice(day_types)]
    for _ in range(n_rows)
]

In [18]:
# Turn the generated rows into a table with one column per categorical field.
categorical_columns = ['zones', 'time_slots', 'day_types']
df1 = pd.DataFrame(data, columns=categorical_columns)
df1

Unnamed: 0,zones,time_slots,day_types
0,Airport,Morning,Weekday
1,HiTech city,Morning,Weekday
2,SecBad,Evening,Weekend
3,SecBad,Morning,Weekday
4,SecBad,Morning,Weekday
...,...,...,...
95,Airport,Morning,Weekday
96,HiTech city,Night,Weekend
97,KPHB,Night,Weekday
98,SecBad,Evening,Weekend


In [19]:
# Number of rows in the employee table.
len(df.index)

100

### Systematic Sampling

In [20]:
def systematic_sampling(df, step, start=0):
    """Return every ``step``-th row of ``df``, beginning at position ``start``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to sample from; rows are selected by position, not by label.
    step : int
        Sampling interval (must be >= 1). ``step=4`` keeps positions
        ``start, start + 4, start + 8, ...``.
    start : int, default 0
        Position of the first selected row. Pass a random value in
        ``[0, step)`` for a randomly started systematic sample.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in their original order. Empty when ``df`` is
        empty or ``start`` is past the last row.

    Raises
    ------
    ValueError
        If ``step`` is less than 1 or ``start`` is negative.
    """
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step!r}")
    if start < 0:
        raise ValueError(f"start must be non-negative, got {start!r}")

    # Positional indexes start, start + step, ... up to (not including) len(df).
    indexes = np.arange(start, len(df), step=step)
    return df.iloc[indexes]
    
# Keep every 4th employee, beginning with the first row.
systematic_sample = systematic_sampling(df, step=4)

systematic_sample

Unnamed: 0,employee_id,Age,Basic Pay,Years of Service,Performance Score
0,1,51,28853,35,1
4,5,25,45594,31,1
8,9,54,29228,24,1
12,13,40,39540,37,0
16,17,60,40617,22,1
20,21,51,48877,11,0
24,25,42,46874,12,0
28,29,44,30929,34,1
32,33,52,25152,32,1
36,37,51,23228,15,1


### Clustered Sampling

In [21]:
# Assign each employee to a cluster of 10 consecutive IDs
# (IDs 1-10 -> cluster 1, IDs 11-20 -> cluster 2, ...).

df['Cluster'] = df['employee_id'].sub(1).floordiv(10).add(1)

    employee_id - 1 makes IDs start from 0
    // 10 divides into groups of 10
    + 1 makes cluster numbers start from 1

In [22]:
# Show the cluster label assigned to each employee.
df['Cluster']

0      1
1      1
2      1
3      1
4      1
      ..
95    10
96    10
97    10
98    10
99    10
Name: Cluster, Length: 100, dtype: int32

In [37]:
# Pick two distinct cluster labels at random (no label can be drawn twice).

cluster_labels = df['Cluster'].unique()
selected_clusters = np.random.choice(cluster_labels, size=2, replace=False)

In [38]:
# Display the two cluster labels that were drawn.
selected_clusters

array([9, 5])

In [39]:
# Cluster sample = every employee whose cluster label was selected.

in_selected_cluster = df['Cluster'].isin(selected_clusters)
cluster_sample = df.loc[in_selected_cluster]

In [40]:
# Display all rows belonging to the selected clusters.
cluster_sample

Unnamed: 0,employee_id,Age,Basic Pay,Years of Service,Performance Score,Cluster
40,41,59,48482,21,1,5
41,42,32,30550,32,0,5
42,43,50,29230,15,0,5
43,44,49,19206,2,0,5
44,45,38,61632,15,0,5
45,46,60,38536,1,1,5
46,47,45,51736,1,0,5
47,48,44,55811,29,0,5
48,49,37,51923,6,0,5
49,50,34,18686,10,1,5


### Stratified Sampling

In [None]:

# Stratified sample: draw 20% of the rows from each 'Performance Score' group.
# GroupBy.sample does the per-group sampling directly, so no
# groupby.apply(lambda ...) is needed (apply operating on the grouping column
# is deprecated in recent pandas) and the grouping column stays in the result.
stratified_sample = df.groupby('Performance Score').sample(frac=0.2, random_state=42)

stratified_sample
