In [13]:
import numpy as np
import pandas as pd

# Random Sampling

Used usually when there is no additional information.

In [3]:
N = 10000
mu = 10
std = 2

population_df = np.random.normal(mu, std, N)
population_df

array([ 9.84452338, 10.07946478,  9.80359981, ...,  9.1563559 ,
        7.28179228,  9.18779374])

In [10]:
random_sample = np.random.choice(population_df, replace=False, size=1000)
random_sample

array([10.70474148,  7.49577188, 12.41617537, 11.20441524,  8.51939216,
        8.52086256, 10.71391461, 10.42224103,  8.95798139,  8.09151738,
       10.72325313, 10.89199926,  9.70283664,  8.69498958, 10.34229363,
       10.31145463,  6.5631553 ,  8.68857315,  9.65800751,  9.83832444,
       11.3135006 ,  8.23730776, 10.12740863,  7.50972246,  8.33034412,
       10.96824758, 12.61809169,  9.65633059, 10.02783215,  9.03398906,
       11.3533996 , 11.63327991, 15.22059193,  9.27619596, 10.97031855,
        9.13220553, 12.96255815,  9.4066958 , 12.16513788, 10.97512912,
        9.58640618,  5.26432105, 11.75206812, 10.74757652, 10.01916323,
       12.96516019, 11.94191988, 13.30358753,  7.23348929,  8.64425093,
       14.92253841, 11.11630208,  7.59950318, 15.16385975,  7.83830703,
       11.60257951,  9.53479976,  8.83886605, 11.82705803,  8.01945542,
        8.5802749 ,  9.94741653, 10.08125411,  9.53575005, 14.16081839,
       10.61227982, 11.27820356, 11.22476231, 10.10822179, 10.69

# Systematic sampling

Produces a random sample, but doesn't address the bias.

In [17]:
def systematic_sampling(df, step):
    id = pd.Series(np.arange(1, len(df), 1))
    df = pd.Series(df)
    df_pd = pd.concat([id, df], axis=1)
    df_pd.columns = ["id", "data"]

    selected_index = np.arange(1, len(df), step)

    return df_pd.iloc[selected_index]

sample_size = 10
step_size = N/sample_size
systematic_sample = systematic_sampling(population_df, step_size)
systematic_sample

Unnamed: 0,id,data
1,2.0,10.079465
1001,1002.0,9.757181
2001,2002.0,9.789676
3001,3002.0,8.475806
4001,4002.0,7.291114
5001,5002.0,11.90233
6001,6002.0,11.124445
7001,7002.0,9.244151
8001,8002.0,5.460468
9001,9002.0,10.819754


# Cluster Sampling

Produces a random sample but does not address the bias in the sample.

- Price: generated using Uniform distribution,
- Id
- event_type: which is a categorical variable with 3 possible values {type1, type2, type3}
- click: binary variable taking values {0: no click, 1: click}

In [18]:
price_vb = pd.Series(np.random.uniform(1, 4, size=N))
id = pd.Series(np.arange(0, len(price_vb), 1))
event_type = pd.Series(np.random.choice(["type1", "type2", "type3"], size=len(price_vb)))
click = pd.Series(np.random.choice([0, 1], size=len(price_vb)))

df = pd.concat([id, price_vb, event_type, click], axis=1)
df.columns = ["id", "price", "event_type", "click"]
df

Unnamed: 0,id,price,event_type,click
0,0,2.000831,type1,0
1,1,3.991851,type1,1
2,2,2.978841,type3,1
3,3,2.028109,type2,1
4,4,1.443728,type1,1
...,...,...,...,...
9995,9995,3.967940,type1,1
9996,9996,1.299092,type3,1
9997,9997,2.473600,type1,1
9998,9998,1.554249,type2,0


In [19]:
def clustered_sampling(df, n_per_cluster, num_select_clusters):
    N = len(df)
    K = int(N/n_per_cluster)
    data = None

    for k in range(K):
        sample_k = df.sample(n_per_cluster)
        sample_k["cluster"] = np.repeat(k, len(sample_k))
        df = df.drop(index= sample_k.index)
        data = pd.concat([data, sample_k], axis=0)

    random_chosen_clusters = np.random.randint(0, K, size=num_select_clusters)
    return data[data.cluster.isin(random_chosen_clusters)]

clustered_sample = clustered_sampling(df, 100, 2)
clustered_sample

Unnamed: 0,id,price,event_type,click,cluster
4018,4018,1.397140,type1,1,34
6568,6568,2.050874,type2,0,34
2768,2768,2.660684,type3,0,34
1980,1980,2.469532,type2,1,34
9893,9893,3.226760,type1,0,34
...,...,...,...,...,...
8586,8586,1.361382,type2,0,67
8991,8991,3.009675,type1,0,67
6053,6053,1.808599,type3,1,67
5061,5061,2.039120,type3,1,67


# Weighted Sampling

Produces a random and unbiased sample.

In [21]:
def weighted_sampling(df, n):
    def get_class_prob(x):
        weight_x = int(np.rint(n* len(x[x.click != 0])) / len(df[df.click != 0]))
        sampled_x = x.sample(weight_x).reset_index(drop=True)
        return sampled_x
    
    weighted_sample = df.groupby("event_type").apply(get_class_prob)
    print(weighted_sample["event_type"].value_counts())
    return weighted_sample

weighted_sample = weighted_sampling(df, 100)
weighted_sample

event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64


  weighted_sample = df.groupby("event_type").apply(get_class_prob)


Unnamed: 0_level_0,Unnamed: 1_level_0,id,price,event_type,click
event_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
type1,0,718,1.649533,type1,0
type1,1,6544,3.421670,type1,0
type1,2,3547,2.819380,type1,0
type1,3,9041,2.719342,type1,1
type1,4,4858,1.126370,type1,1
...,...,...,...,...,...
type3,27,6728,2.655326,type3,0
type3,28,9416,1.049936,type3,1
type3,29,1264,3.809083,type3,1
type3,30,139,2.107555,type3,1


# Stratified Sampling

It's a way of combining clustered and weighted sampling.

In [23]:
def stratified_sampling(df, n, num_clusters_needed):
    N = len(df)
    num_obs_per_clusters = int(N/n)
    K = int(N/num_obs_per_clusters)
    
    stratas = None
    for k in range(K):
        weighted_sample_k = weighted_sampling(df, num_obs_per_clusters).reset_index(drop=True)
        weighted_sample_k["cluster"] = np.repeat(k, len(weighted_sample_k))
        stratas = pd.concat([stratas, weighted_sample_k], axis=0)
        df.drop(index=weighted_sample_k.index)

    selected_strata_clusters = np.random.randint(0, K, size=num_clusters_needed)
    return stratas[stratas.cluster.isin(selected_strata_clusters)]

stratified_sampling(df, 100, 2)

  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_p

event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name

  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_p

event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name: count, dtype: int64
event_type
type1    34
type2    33
type3    32
Name

  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_prob)
  weighted_sample = df.groupby("event_type").apply(get_class_p

Unnamed: 0,id,price,event_type,click,cluster
0,4771,2.264260,type1,1,19
1,8738,1.983886,type1,0,19
2,1635,3.071280,type1,0,19
3,683,1.673255,type1,0,19
4,176,3.792270,type1,1,19
...,...,...,...,...,...
94,7662,3.240353,type3,1,65
95,8096,1.239540,type3,1,65
96,9325,3.085328,type3,1,65
97,1444,3.063573,type3,0,65
