In [1]:
import math
import os

import numpy as np
import pandas as pd
from scipy import stats

import sampling_abtest.model as sabmodel
import sampling_abtest.utils as sabutils


# 1. Apply Sample Size Estimation  
## 1.1. Population Size Unknown Case

In [2]:
# Fix the seed so the synthetic population is identical on every Restart & Run All.
SEED = 42
np.random.seed(SEED)
df = pd.DataFrame(np.random.rand(3000, 4))

# Sample size estimate when the population size is treated as unknown.
sabutils.sample_size_estimate(df, known=False)

1068

## 1.2. Population Size Known Case  
The population size N is taken as the number of rows of the DataFrame (from its shape).

In [3]:
# Sample size estimate with known=True; per the section text, the population
# size N is taken from the number of rows of df.
sabutils.sample_size_estimate(df, known=True)

788

# 2. SRS (Simple Random Sampling)

In [4]:
# SRS: draw a simple random sample of rows from df.
# NOTE(review): 10 appears to be the number of rows to draw (the output has
# 10 rows) — confirm against sabmodel.srs.
sabmodel.srs(df, 10)

Unnamed: 0,0,1,2,3
719,0.738913,0.962417,0.577457,0.385853
2295,0.663474,0.981764,0.321076,0.222312
2659,0.361758,0.369122,0.494729,0.558011
1011,0.7944,0.600582,0.374692,0.412534
554,0.261777,0.531939,0.285283,0.644015
1072,0.285058,0.081164,0.132216,0.855991
2422,0.511362,0.698037,0.589931,0.973566
697,0.323118,0.138166,0.567151,0.185224
1523,0.599716,0.916155,0.413889,0.033811
1083,0.313257,0.949127,0.904564,0.732814


# 3. Systematic Sampling

In [5]:
# Systematic sampling of rows from df.
# NOTE(review): the output index advances in steps of 5, so the last argument
# looks like the sampling interval; the meaning of 100 is not visible here —
# confirm against sabmodel.systematic.
sabmodel.systematic(df, 100, 5)

Unnamed: 0,0,1,2,3
0,0.267893,0.374767,0.385392,0.573698
5,0.507262,0.193649,0.63906,0.545709
10,0.867039,0.558433,0.292974,0.820928
15,0.412657,0.475854,0.03602,0.57798
20,0.301141,0.007483,0.108897,0.123916
25,0.039022,0.819849,0.447776,0.454576
30,0.574885,0.918353,0.544653,0.442673
35,0.309208,0.742118,0.58684,0.791361
40,0.877862,0.658576,0.173156,0.076092
45,0.641858,0.642615,0.532792,0.618495


# 4. Binary Sampling  
The binary sampling method returns a dictionary with the keys 'a' and 'b'.

In [6]:
# Binary sampling: the result is a dict with keys 'a' and 'b', each holding a
# DataFrame of sampled rows.
# NOTE(review): 20 is presumably the total sample size split across the two
# groups — confirm against sabmodel.binary.
sabmodel.binary(df, 20)

{'a':              0         1         2         3
 614   0.045833  0.633505  0.682157  0.506694
 2936  0.350513  0.055808  0.768107  0.084782
 1418  0.582727  0.845378  0.440248  0.531533
 198   0.141734  0.075582  0.054443  0.106938
 1970  0.625814  0.705397  0.463036  0.167727
 676   0.241814  0.116439  0.387899  0.859536
 902   0.439818  0.796614  0.056689  0.793942
 1416  0.523330  0.807814  0.676731  0.702628
 910   0.680964  0.781842  0.555959  0.446357
 18    0.053193  0.176508  0.975079  0.865801,
 'b':              0         1         2         3
 2779  0.032778  0.612909  0.149350  0.892527
 973   0.026653  0.814620  0.594868  0.111499
 2137  0.852159  0.127457  0.931777  0.209836
 2745  0.788827  0.060365  0.071948  0.803574
 1451  0.006148  0.471282  0.299660  0.226566
 2953  0.020930  0.909120  0.030569  0.774890
 647   0.540865  0.681246  0.121364  0.733491
 2575  0.350886  0.242974  0.269854  0.604488
 615   0.043066  0.698642  0.356887  0.281773
 2587  0.969487  0.8981

# 5. Stratified Sampling  
First, the wine dataset is used to demonstrate hierarchical stratification.
The data is from https://www.kaggle.com/datasets/dev7halo/wine-information

In [7]:
# Load the wine dataset; the CSV is expected in the current working directory.
path = os.path.join(os.getcwd(), 'wine_info.csv')
wine = pd.read_csv(path)

## 5.1. One Stratum Example  
Sample by the ratio of stratum 'nation'

In [8]:
# Stratified sample of wine using the single stratum column 'nation'.
# NOTE(review): 1000 and 10 are passed positionally and their meaning is not
# visible here — confirm against sabmodel.strat's signature.
sabmodel.strat(wine, 1000, ['nation'],10)

Unnamed: 0,id,name,producer,nation,local1,local2,local3,local4,varieties1,varieties2,...,abv,degree,sweet,acidity,body,tannin,price,year,ml,new_group
14649,160718,Evangello,크티마 게로바실리우 Ktima Gerovassiliou,그리스 Greece,Petite Sirah,,,,Viognier,,...,14.5,18~20,SWEET1,ACIDITY4,BODY5,TANNIN4,349000.0,2010.0,750.0,그리스 Greece
14652,160721,Gerovassiliou White,크티마 게로바실리우 Ktima Gerovassiliou,그리스 Greece,,,,,Assyrtiko,Malagousia,...,13.5,8~10,SWEET1,ACIDITY4,BODY2,TANNIN1,85000.0,2014.0,750.0,그리스 Greece
14687,160762,Howas,끼르야니 Kir Yianni,그리스 Greece,마케도니아 Macedonia,나우싸 Naoussa,,,Xinomavro,Merlot,...,14~15,20~23,SWEET1,ACIDITY4,BODY5,TANNIN5,140000.0,2009.0,750.0,그리스 Greece
1641,139879,"Meerlust, Cabernet Sauvignon",밀루스트 에스테이트 Meerlust Estate,남아프리카 공화국 the Republic of South Africa,코스탈 리젼 Coastal Region,스텔렌보쉬 Stellenbosch,,,Cabernet Sauvignon,,...,14,16~18,SWEET1,ACIDITY4,BODY3,TANNIN4,,2014.0,750.0,남아프리카 공화국 the Republic of South Africa
7221,148860,Cape Mountain Natural Sweet Red,케이프 마운틴 Cape Mountain,남아프리카 공화국 the Republic of South Africa,,,,,Muscat,Cinsault,...,8,10~12,SWEET4,ACIDITY3,BODY3,TANNIN4,25000.0,,750.0,남아프리카 공화국 the Republic of South Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11100,154140,"Handpicked, Margaret River Sauvignon Blanc Sem...",핸드픽트 Handpicked,호주 Australia,웨스턴 오스트레일리아 Western Australia,마가렛 리버 Margaret River,,,Sauvignon blanc,Semillon,...,,,SWEET1,ACIDITY4,BODY3,TANNIN1,,2012.0,750.0,호주 Australia
16543,162786,"Giant Steps, Applejack Vineyard Pinot Noir",자이언트 스텝스 Giant Steps,호주 Australia,빅토리아 Victoria,야라 밸리 Yarra Valley,,,Pinot Noir,,...,13.8,15~17,SWEET1,ACIDITY4,BODY4,TANNIN3,140000.0,2017.0,750.0,호주 Australia
10390,152915,Gatt Cabernet Sauvignon,가트 와인즈 Gatt Wines,호주 Australia,사우스 오스트레일리아 South Australia,바로사 밸리 Barossa Valley,,,Cabernet Sauvignon,,...,,,SWEET1,ACIDITY3,BODY5,TANNIN4,160000.0,2009.0,750.0,호주 Australia
4010,144894,"Rosemount, Diamond Cellar Gewurztraminer Riesling",로즈마운트 Rosemount,호주 Australia,사우스 오스트레일리아 South Australia,,,,Gewurztraminer,Riesling,...,,,SWEET2,ACIDITY2,BODY4,TANNIN1,,2012.0,750.0,호주 Australia


## 5.2. Two Strata Example  
Sample by the ratio of stratum 'new_group'  
The 'new_group' column is derived from the intersection of 'nation' and 'local1'.  
For example, '호주 Australia' has sub-regions such as '웨스턴 오스트레일리아 Western Australia' and '태즈매니아 Tasmania'. Setting cate_list to \['nation', 'local1'\] joins each region with its sub-region, giving '호주 Australia_웨스턴 오스트레일리아 Western Australia' and '호주 Australia_태즈매니아 Tasmania', which are assigned to a new column called 'new_group'.  

In [9]:
# Stratified sample of wine using two stratum columns; the result carries a
# 'new_group' column combining 'nation' and 'local1'.
# NOTE(review): 1000 and 10 are passed positionally and their meaning is not
# visible here — confirm against sabmodel.strat's signature.
sabmodel.strat(wine, 1000, ['nation', 'local1'],10)

Unnamed: 0,id,name,producer,nation,local1,local2,local3,local4,varieties1,varieties2,...,abv,degree,sweet,acidity,body,tannin,price,year,ml,new_group
14693,160768,Akakies Rose,끼르야니 Kir Yianni,그리스 Greece,마케도니아 Macedonia,아민테오 Amynteo,,,Xinomavro,,...,12,8~10,SWEET1,ACIDITY4,BODY3,TANNIN2,33000.0,2015.0,750.0,그리스 Greece_마케도니아 Macedonia
14213,159925,Meander Pink Moscato,유니 와인즈 Uni Wines,남아프리카 공화국 the Republic of South Africa,브리드 리버 밸리 Breede River Valley,브리덱루프 Breedekloof,,,Moscato,,...,5~6,6~8,SWEET4,ACIDITY2,BODY3,TANNIN1,3900.0,2014.0,275.0,남아프리카 공화국 the Republic of South Africa_브리드 리버 ...
8559,150368,"Nederburg, Rose",디스텔 Distell,남아프리카 공화국 the Republic of South Africa,웨스턴 케이프 Western Cape,,,,Pinotage,,...,12~13,12~14,SWEET2,ACIDITY4,BODY3,TANNIN2,25000.0,2018.0,750.0,남아프리카 공화국 the Republic of South Africa_웨스턴 케이프...
4644,145768,"Two Oceans, Pinotage",투 오션 Two Oceans,남아프리카 공화국 the Republic of South Africa,웨스턴 케이프 Western Cape,,,,Pinotage,,...,13.6,16~18,SWEET1,ACIDITY3,BODY2,TANNIN3,,2007.0,750.0,남아프리카 공화국 the Republic of South Africa_웨스턴 케이프...
4647,145771,"Two Oceans, Chardonnay",투 오션 Two Oceans,남아프리카 공화국 the Republic of South Africa,웨스턴 케이프 Western Cape,,,,Chardonnay,,...,13.8,10~12,SWEET1,ACIDITY4,BODY3,TANNIN1,,2007.0,750.0,남아프리카 공화국 the Republic of South Africa_웨스턴 케이프...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8188,149943,"Evans & Tate, Margaret River Shiraz",에반스 앤 테이트 와이너리 Evans & Tate Winery,호주 Australia,웨스턴 오스트레일리아 Western Australia,마가렛 리버 Margaret River,,,Syrah & Shiraz,,...,14.5,16~18,SWEET1,ACIDITY3,BODY4,TANNIN4,70000.0,2007.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
14603,160631,"Plantagenet, Chardonnay",플란타제넷 와인즈 Plantagenet Wines,호주 Australia,웨스턴 오스트레일리아 Western Australia,,,,Chardonnay,,...,13~14,8~10,SWEET1,ACIDITY3,BODY4,TANNIN1,60000.0,2014.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
7977,149700,Peppermint Grove The Growers' Unwooded Chardonnay,더 그로워즈 The Growers,호주 Australia,웨스턴 오스트레일리아 Western Australia,,,,Chardonnay,,...,14.5,10~12,SWEET1,ACIDITY4,BODY2,TANNIN1,,2008.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
10043,152398,"Plantagenet, OMRAH Pinot Noir",플란타제넷 와인즈 Plantagenet Wines,호주 Australia,웨스턴 오스트레일리아 Western Australia,,,,Pinot Noir,,...,14,16~18,SWEET1,ACIDITY4,BODY3,TANNIN3,,2012.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia


# 6. Cluster Sampling  
## 6.1. One-stage Cluster Sampling  
Returns all the elements in selected clusters as samples.

In [14]:
# Clusters to sample from, written as '<nation>_<local1>' labels.
locs = [
    '호주 Australia_웨스턴 오스트레일리아 Western Australia',
    '그리스 Greece_마케도니아 Macedonia',
]

# Build the cluster labels from 'nation' and 'local1'.
data_new = sabmodel.make_groups(wine, ['nation', 'local1'])

In [15]:
# One-stage cluster sampling over the clusters listed in locs, keyed by the
# 'new_group' column of data_new.
# NOTE(review): the meaning of 100 is not visible here — confirm against
# sabmodel.cluster.
sabmodel.cluster(data_new, 100, 'new_group', locs, stage='one')

Unnamed: 0,id,name,producer,nation,local1,local2,local3,local4,varieties1,varieties2,...,abv,degree,sweet,acidity,body,tannin,price,year,ml,new_group
1342,139522,"Moss Wood, Cabernet Sauvignon",모스우드 Moss Wood,호주 Australia,웨스턴 오스트레일리아 Western Australia,마가렛 리버 Margaret River,,,Cabernet Sauvignon,Syrah & Shiraz,...,14,16~18,SWEET1,ACIDITY4,BODY4,TANNIN5,,2006.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
1343,139524,"Moss Wood, Chardonnay",모스우드 Moss Wood,호주 Australia,웨스턴 오스트레일리아 Western Australia,마가렛 리버 Margaret River,,,Chardonnay,,...,14.5,10~12,SWEET1,ACIDITY4,BODY4,TANNIN1,,2016.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
1983,140422,"Cullen, Chardonnay",컬런 와인스 Cullen Wines,호주 Australia,웨스턴 오스트레일리아 Western Australia,마가렛 리버 Margaret River,,,Chardonnay,,...,14,10~12,SWEET1,ACIDITY3,BODY4,TANNIN1,,2016.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
1984,140423,"Cullen, Diana Madeline Cabernet Sauvignon Merlot",컬런 와인스 Cullen Wines,호주 Australia,웨스턴 오스트레일리아 Western Australia,마가렛 리버 Margaret River,,,Cabernet Sauvignon,Merlot,...,13.5,16~18,SWEET1,ACIDITY3,BODY4,TANNIN4,,2016.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
1985,140424,"Cullen, Ellen Bussell Red",컬런 와인스 Cullen Wines,호주 Australia,웨스턴 오스트레일리아 Western Australia,마가렛 리버 Margaret River,,,Cabernet Sauvignon,Merlot,...,13.5,16~18,SWEET1,ACIDITY3,BODY4,TANNIN4,,2016.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20258,167235,Kormilitsa Gold,찬탈리 Tsantali,그리스 Greece,마케도니아 Macedonia,,,,Cabernet Sauvignon,Limnio,...,13~14,16~18,SWEET1,ACIDITY4,BODY5,TANNIN5,1000000.0,2004.0,750.0,그리스 Greece_마케도니아 Macedonia
20615,167603,"L.A.S Vino, CBDB(The Chenin Blanc Dynamic Blanc)",라스 비노 L.A.S Vino,호주 Australia,웨스턴 오스트레일리아 Western Australia,마가렛 리버 Margaret River,,,Chenin Blanc,,...,14~15,10~12,SWEET1,ACIDITY4,BODY3,TANNIN1,,2018.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
20616,167604,"L.A.S Vino, Pirate Blend",라스 비노 L.A.S Vino,호주 Australia,웨스턴 오스트레일리아 Western Australia,마가렛 리버 Margaret River,,,Touriga Nacional,Tinta Cao,...,14~15,,SWEET1,ACIDITY3,BODY4,TANNIN4,,2018.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
20617,167605,L.A.S Vino Cabernet Sauvignon,라스 비노 L.A.S Vino,호주 Australia,웨스턴 오스트레일리아 Western Australia,마가렛 리버 Margaret River,,,Cabernet Sauvignon,,...,14~15,16~18,SWEET1,ACIDITY3,BODY5,TANNIN5,,2018.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia


## 6.2. Two-stage Cluster Sampling  
Returns elements sampled by ratio from the selected clusters.

In [13]:
# Two-stage cluster sampling over the clusters listed in locs, keyed by the
# 'new_group' column of data_new.
# Requires data_new and locs to be defined first; run the cell that builds
# them before this one.
# NOTE(review): the meaning of 100 is not visible here — confirm against
# sabmodel.cluster.
sabmodel.cluster(data_new, 100, 'new_group', locs, stage='two')

Unnamed: 0,id,name,producer,nation,local1,local2,local3,local4,varieties1,varieties2,...,abv,degree,sweet,acidity,body,tannin,price,year,ml,new_group
14694,160769,Akakies Rose Sparkling,끼르야니 Kir Yianni,그리스 Greece,마케도니아 Macedonia,아민테오 Amynteo,,,Xinomavro,,...,11.5,6~8,SWEET1,ACIDITY4,BODY3,TANNIN3,55000.0,2018.0,750.0,그리스 Greece_마케도니아 Macedonia
14639,160708,Areti Red,도멘 비블리아 호라 Domaine Biblia Chora,그리스 Greece,마케도니아 Macedonia,,,,Agiorgitiko,,...,14,16~18,SWEET1,ACIDITY3,BODY4,TANNIN3,99000.0,2009.0,750.0,그리스 Greece_마케도니아 Macedonia
14641,160710,Biblia Chora White,도멘 비블리아 호라 Domaine Biblia Chora,그리스 Greece,마케도니아 Macedonia,,,,Assyrtiko,Sauvignon blanc,...,13,8~10,SWEET1,ACIDITY3,BODY2,TANNIN1,69000.0,2014.0,750.0,그리스 Greece_마케도니아 Macedonia
20258,167235,Kormilitsa Gold,찬탈리 Tsantali,그리스 Greece,마케도니아 Macedonia,,,,Cabernet Sauvignon,Limnio,...,13~14,16~18,SWEET1,ACIDITY4,BODY5,TANNIN5,1000000.0,2004.0,750.0,그리스 Greece_마케도니아 Macedonia
14628,160697,Alpha Utopia,알파 에스테이트 Alpha Estate,그리스 Greece,마케도니아 Macedonia,아민테오 Amynteo,,,Tannat,Xinomavro,...,14.5,16~18,SWEET1,ACIDITY3,BODY4,TANNIN3,109000.0,2010.0,750.0,그리스 Greece_마케도니아 Macedonia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10043,152398,"Plantagenet, OMRAH Pinot Noir",플란타제넷 와인즈 Plantagenet Wines,호주 Australia,웨스턴 오스트레일리아 Western Australia,,,,Pinot Noir,,...,14,16~18,SWEET1,ACIDITY4,BODY3,TANNIN3,,2012.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
11922,155559,Finest Tingleup Riesling,테스코 파이니스트 Tesco Finest,호주 Australia,웨스턴 오스트레일리아 Western Australia,,,,Riesling,,...,,,SWEET3,ACIDITY3,BODY3,TANNIN1,43000.0,,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
2500,141313,"Salitage, Treehouse Pinot Noir",살리타지 Salitage,호주 Australia,웨스턴 오스트레일리아 Western Australia,펨버튼 Pemberton,,,Pinot Noir,,...,,,SWEET1,ACIDITY3,BODY3,TANNIN2,,2008.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
8187,149942,"Evans & Tate, Margaret River Cabernet Merlot",에반스 앤 테이트 와이너리 Evans & Tate Winery,호주 Australia,웨스턴 오스트레일리아 Western Australia,마가렛 리버 Margaret River,,,Cabernet Sauvignon,Merlot,...,14.3,16~18,SWEET1,ACIDITY3,BODY4,TANNIN4,70000.0,2007.0,750.0,호주 Australia_웨스턴 오스트레일리아 Western Australia
