# Load Data

In [1]:
import pandas as pd

df_total = pd.read_csv('df_total.csv')
df_test = pd.read_csv('df_test.csv', sep=',', index_col=0)


---

# Data Preprocessing

In [2]:
df_total[['year','month','day','weekday']]

Unnamed: 0,year,month,day,weekday
0,2017,1,1,6
1,2017,1,1,6
2,2017,1,1,6
3,2017,1,1,6
4,2017,1,1,6
...,...,...,...,...
43507,2021,12,31,4
43508,2021,12,31,4
43509,2021,12,31,4
43510,2021,12,31,4


In [3]:
df_test[['year','month','day','weekday']]

Unnamed: 0_level_0,year,month,day,weekday
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2022,1,1,5
1,2022,1,1,5
2,2022,1,1,5
3,2022,1,1,5
4,2022,1,1,5
...,...,...,...,...
1411,2022,2,28,0
1412,2022,2,28,0
1413,2022,2,28,0
1414,2022,2,28,0


In [4]:
df_total.columns

Index(['index', '기준일ID', '시간대구분', '총생활인구수', '남자0세부터9세생활인구수', '남자10세부터14세생활인구수',
       '남자15세부터19세생활인구수', '남자20세부터24세생활인구수', '남자25세부터29세생활인구수',
       '남자30세부터34세생활인구수', '남자35세부터39세생활인구수', '남자40세부터44세생활인구수',
       '남자45세부터49세생활인구수', '남자50세부터54세생활인구수', '남자55세부터59세생활인구수',
       '남자60세부터64세생활인구수', '남자65세부터69세생활인구수', '남자70세이상생활인구수', '여자0세부터9세생활인구수',
       '여자10세부터14세생활인구수', '여자15세부터19세생활인구수', '여자20세부터24세생활인구수',
       '여자25세부터29세생활인구수', '여자30세부터34세생활인구수', '여자35세부터39세생활인구수',
       '여자40세부터44세생활인구수', '여자45세부터49세생활인구수', '여자50세부터54세생활인구수',
       '여자55세부터59세생활인구수', '여자60세부터64세생활인구수', '여자65세부터69세생활인구수', '여자70세이상생활인구수',
       'year', 'month', 'day', 'weekday'],
      dtype='object')

In [5]:
from copy import deepcopy

df_total2 = deepcopy(df_total[['시간대구분', '총생활인구수', 'weekday']])
df_test2 = deepcopy(df_test[['시간대구분', '총생활인구수', 'weekday']])

#### shift & rolling

In [6]:
df_total2['1d'] = df_total2['총생활인구수'].shift(24)
df_total2['7d'] = df_total2['총생활인구수'].shift(24*7)

df_test2['1d'] = df_test2['총생활인구수'].shift(24)
df_test2['7d'] = df_test2['총생활인구수'].shift(24*7)

In [7]:
df_total2['window_7'] = df_total2['총생활인구수'].rolling(window=7).mean() # gaussian
df_test2['window_7'] = df_test2['총생활인구수'].rolling(window=7).mean()

In [8]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)

col_names = df_total2.columns
total_filled = imputer.fit_transform(df_total2)
test_filled = imputer.fit_transform(df_test2)

In [9]:
df_total_filled = pd.DataFrame(total_filled, columns=col_names)
df_total_filled.head()

Unnamed: 0,시간대구분,총생활인구수,weekday,1d,7d,window_7
0,0.0,31535.22,6.0,32230.80934,31491.41964,32136.009549
1,1.0,31188.9174,6.0,31814.45848,31442.63326,32071.135806
2,2.0,31240.4974,6.0,31952.55834,32341.1284,31910.591469
3,3.0,31442.4314,6.0,32088.35098,31868.8995,31693.226077
4,4.0,31922.7751,6.0,32438.8985,32101.69762,31545.317817


In [10]:
df_test_filled = pd.DataFrame(test_filled, columns=col_names)
df_test_filled.head()

Unnamed: 0,시간대구분,총생활인구수,weekday,1d,7d,window_7
0,0.0,30509.7386,5.0,29937.63806,32575.95054,31271.790894
1,1.0,30759.5067,5.0,31435.26408,32297.2095,31164.148286
2,2.0,31048.8787,5.0,32147.83548,31972.12288,32633.510571
3,3.0,31076.1092,5.0,32062.94274,31942.58856,32431.509497
4,4.0,31714.9309,5.0,32096.02284,31769.95306,33212.895963


In [11]:
train_x = df_total_filled.drop('총생활인구수', axis=1)
train_y = df_total_filled['총생활인구수']

test_x = df_test_filled.drop('총생활인구수', axis=1)
test_y = df_test_filled['총생활인구수']

In [12]:
# CSV로 저장
train_x.to_csv('train_x.csv', index = False)
train_y.to_csv('train_y.csv', index = False)
test_x.to_csv('test_x.csv', index = False)
test_y.to_csv('test_y.csv', index = False)