# Sampling

## 0. 데이터 불러오기

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
os.chdir('/content/drive/MyDrive/기상청/데이터')

In [None]:
import numpy as np
import pandas as pd
import datetime
from datetime import timedelta
from tqdm.notebook import tqdm
tqdm.pandas()

import random

import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
%matplotlib inline

rc('font', family='MalgunGothic')
plt.rcParams['axes.unicode_minus'] = False

  from pandas import Panel


In [None]:
# Load the raw dataset from the working directory; the file is read as CP949-encoded text.
data = pd.read_csv('data.csv', encoding='CP949')

In [None]:
data.head()

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,cnt
0,20180101,F,20,식품,가공란,37,0.480964
1,20180101,F,30,식품,가공란,16,0.480964
2,20180101,F,40,식품,가공란,9,0.480964
3,20180101,F,50,식품,가공란,3,0.480964
4,20180101,M,20,식품,가공란,13,0.480964


## 1. 데이터 전처리

### 1-1. 시간 변수 생성

In [None]:
# Parse the yyyymmdd-formatted `date` column into pandas timestamps, then
# derive calendar features from it. The assignment order below determines the
# order in which the new columns are appended to the frame.
data['date'] = pd.to_datetime(data['date'], format='%Y%m%d')
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day
# "%A" yields the full weekday name (e.g. "Monday"); the text follows the active locale.
data['weekday'] = data['date'].dt.strftime("%A")

In [None]:
data.head(2)

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,cnt,year,month,day,weekday
0,2018-01-01,F,20,식품,가공란,37,0.480964,2018,1,1,Monday
1,2018-01-01,F,30,식품,가공란,16,0.480964,2018,1,1,Monday


In [None]:
data.tail(2)

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,cnt,year,month,day,weekday
2056897,2019-12-31,M,50,냉난방가전,히터,23,71.295163,2019,12,31,Tuesday
2056898,2019-12-31,M,60,냉난방가전,히터,10,71.295163,2019,12,31,Tuesday


### 1-2. groupby

In [None]:
# Average `qty` over every row that shares the same (date, big_cat, sm_cat),
# then turn the group keys back into ordinary columns with reset_index().
# NOTE(review): presumably this collapses the per-sex/per-age rows into one
# row per sub-category and day — confirm against the raw data.
df_groupby = data.groupby(['date','big_cat','sm_cat'])['qty'].mean().reset_index()

In [None]:
df_groupby.head(2)

Unnamed: 0,date,big_cat,sm_cat,qty
0,2018-01-01,냉난방가전,가열식 가습기,1.75
1,2018-01-01,냉난방가전,공기정화 용품,12.0


In [None]:
df_groupby.tail(2)

Unnamed: 0,date,big_cat,sm_cat,qty
267705,2019-12-31,식품,회,245.9
267706,2019-12-31,식품,흰우유,78.2


## 2. Sampling 함수

**Rule**
* test set 구성 규칙
* 매주(7일 단위) 하루를 랜덤하게 골라 test set으로 사용
* 같은 클러스터 내에서는 모든 소분류에 동일한 날짜 적용

**Idea**
* test set을 위한 index
* 클러스터끼리 모델링하므로 각 클러스터마다 아래 함수 사용할 것
* 모든 소분류 같은 요일 샘플링

In [None]:
def sampling(data, *, start=datetime.datetime(2018, 1, 1), n_weeks=730 // 7, seed=None):
    """Split ``data`` into train/test by holding out one random day per week.

    For each of ``n_weeks`` consecutive 7-day steps beginning at ``start``, one
    day offset in 1..7 is drawn at random and every row whose ``date`` equals
    that day goes to the test set; all remaining rows form the train set.
    Because the offsets are 1..7 (not 0..6), ``start`` itself is never sampled:
    with the default Monday start the candidate days of each week run from
    Tuesday through the following Monday.

    Rows are selected with a boolean mask on the ``date`` column, so the split
    does not depend on ``data`` having a default 0..n-1 index (e.g. it works on
    a filtered per-cluster subset) and both outputs keep the input row order.

    Args:
        data: DataFrame with a datetime-like ``date`` column.
        start: First day of the sampling calendar. Defaults to 2018-01-01.
        n_weeks: Number of weeks to draw a test day from. Defaults to
            ``730 // 7`` (104 weeks).
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` the module-level ``random`` state is
            used, so a prior ``random.seed(...)`` call still applies.

    Returns:
        A ``(train, test)`` tuple of DataFrames, each with a fresh 0..n-1 index.
    """
    rng = random if seed is None else random.Random(seed)

    # Pick one random day offset per week and convert it to a calendar date.
    day_offsets = [1, 2, 3, 4, 5, 6, 7]
    sample_days = [
        start + datetime.timedelta(weeks=week, days=rng.choice(day_offsets))
        for week in range(n_weeks)
    ]

    # A single membership mask replaces the per-date scans and the
    # label-vs-position mix-up of indexing `.iloc` with index labels.
    is_test = data['date'].isin(sample_days)
    train = data[~is_test].reset_index(drop=True)
    test = data[is_test].reset_index(drop=True)
    return train, test

In [None]:
train, test = sampling(df_groupby)

In [None]:
print(train.shape)
print(test.shape)

(229589, 4)
(38118, 4)


In [None]:
train.head(2)

Unnamed: 0,date,big_cat,sm_cat,qty
0,2018-01-01,냉난방가전,가열식 가습기,1.75
1,2018-01-01,냉난방가전,공기정화 용품,12.0


In [None]:
test.head(2)

Unnamed: 0,date,big_cat,sm_cat,qty
0,2018-01-07,냉난방가전,가열식 가습기,2.833333
1,2018-01-07,냉난방가전,공기정화 용품,11.111111


In [None]:
train.tail(2)

Unnamed: 0,date,big_cat,sm_cat,qty
229587,2019-12-31,식품,회,245.9
229588,2019-12-31,식품,흰우유,78.2


In [None]:
test.tail(2)

Unnamed: 0,date,big_cat,sm_cat,qty
38116,2019-12-29,식품,회,157.6
38117,2019-12-29,식품,흰우유,83.3


In [None]:
# Confirm that exactly one day was drawn per week: the number of distinct
# test dates should equal the number of sampled weeks (730 // 7 = 104).
len(test['date'].unique())

104