In [1]:
import numpy as np
import pandas as pd
# pd.set_option('display.max_rows', None)

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from torch.utils.data import Dataset, DataLoader
import torch

import sys
import os
sys.path.append(os.path.abspath(".."))
from utils import preprocess

In [2]:
# Read the raw transaction / economy tables and pass them straight through the
# shared preprocessing step; only the preprocessed frames are kept.
transaction_df, economy_df = preprocess(
    pd.read_csv('../../데이터/Transaction/transaction_all.csv'),
    pd.read_excel('../../데이터/Economy/economy_all.xlsx'),
    window_size=5,
)

In [19]:
# Short working names used by the exploratory cells further down.
window_size = 5
df = transaction_df  # alias, not a copy

### RNN_Transaction_Dataset

In [5]:
def price_interpolate(group, start, end, method='linear'):
    """Expand one complex's transactions to a monthly series and fill price gaps.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single complex. Must contain the columns
        '계약년월', '동', '단지', '건축년도' and '제곱미터당 거래금액'.
        '계약년월' becomes the index, so it must not contain duplicates
        (``reindex`` rejects duplicate labels).
    start, end : datetime-like
        First and last month of the output index (month-start frequency,
        both ends inclusive).
    method : {'linear', 'quadratic', 'nearest', 'spline', 'polynomial'}
        Interpolation scheme for the price column. 'spline' uses order 2 and
        'polynomial' uses order 3. Every method except 'linear' is delegated
        by pandas to SciPy.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by month from ``start`` to ``end``. The
        descriptive columns are forward-filled and the price column is
        interpolated. ``group`` itself is not modified.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    interpolate_kwargs = {
        'linear': {'method': 'linear'},
        'quadratic': {'method': 'quadratic'},
        'nearest': {'method': 'nearest'},
        'spline': {'method': 'spline', 'order': 2},
        'polynomial': {'method': 'polynomial', 'order': 3},
    }
    if method not in interpolate_kwargs:
        raise ValueError(
            f"unsupported interpolation method {method!r}; "
            f"expected one of {sorted(interpolate_kwargs)}"
        )

    idx = pd.date_range(start, end, freq='MS')
    # set_index/reindex return a new frame, so the caller's group is untouched.
    group = group.set_index('계약년월').reindex(idx)

    # Months without a transaction inherit the identifying attributes.
    for column in ('동', '단지', '건축년도'):
        group[column] = group[column].ffill()

    # Interpolate exactly once, with the requested method. A preliminary
    # linear pass would leave no gaps for the chosen method to fill.
    group['제곱미터당 거래금액'] = group['제곱미터당 거래금액'].interpolate(
        **interpolate_kwargs[method]
    )

    return group

In [13]:
class RNN_Transaction_Dataset(Dataset):
    """Sliding-window samples over each apartment complex's monthly price series.

    Each complex (a '단지' within a '동') is expanded to a gap-free monthly
    series with ``price_interpolate`` (default linear method). A window of
    ``window_size`` consecutive months is the input and the following month
    is the target.

    Every sample is a 4-tuple of float tensors:
    ``(window prices, window month offsets, target price, target month offset)``.

    NOTE(review): the month offset is counted from the earliest '계약년월' in
    ``df``. This choice reproduces the 4-tensor layout of the recorded demo
    output, but the original time feature is not visible in this notebook.
    Confirm it against the model's expectations.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns '동', '단지', '계약년월' and '제곱미터당 거래금액',
        plus whatever ``price_interpolate`` requires. '계약년월' may be
        YYYYMM values or already datetime. The frame is not modified.
    window_size : int
        Number of input months per sample.
    """

    def __init__(self, df, window_size=5):
        # Work on a copy so the caller's frame keeps its raw YYYYMM column;
        # mutating it would make a second construction fail on the %Y%m parse.
        df = df.copy()
        if not pd.api.types.is_datetime64_any_dtype(df['계약년월']):
            df['계약년월'] = pd.to_datetime(df['계약년월'].astype(str), format='%Y%m')

        # Reference month for the time-offset feature.
        origin = df['계약년월'].min()

        dongs_x, dongs_y = [], []
        # sort=False keeps the order of first appearance for dongs and complexes.
        for _, dong_df in df.groupby('동', sort=False):
            for _, complex_df in dong_df.groupby('단지', sort=False):
                # Interpolation: the windows are cut from the gap-free monthly
                # series, not from the raw (possibly sparse) transaction rows.
                monthly = price_interpolate(
                    complex_df,
                    complex_df['계약년월'].min(),
                    complex_df['계약년월'].max(),
                )
                prices = monthly['제곱미터당 거래금액'].astype(float).tolist()
                offsets = [
                    float((month.year - origin.year) * 12 + (month.month - origin.month))
                    for month in monthly.index
                ]
                for idx in range(len(prices) - window_size):
                    target = idx + window_size
                    # Sample layout is [prices, month offsets], the structure
                    # that __getitem__ indexes with [0] and [1].
                    dongs_x.append([prices[idx:target], offsets[idx:target]])
                    dongs_y.append([prices[target:target + 1], offsets[target:target + 1]])
        self.dongs_x = dongs_x
        self.dongs_y = dongs_y
        self.len = len(dongs_x)

    # price_x, time_x, price_y, time_y
    def __getitem__(self, i):
        return torch.FloatTensor(self.dongs_x[i][0]), torch.FloatTensor(self.dongs_x[i][1]), torch.FloatTensor(self.dongs_y[i][0]), torch.FloatTensor(self.dongs_y[i][1])

    def __len__(self):
        return self.len

In [14]:
batch_size = 1
dataset = RNN_Transaction_Dataset(transaction_df)
dataloader = DataLoader(dataset, batch_size=batch_size)

# Peek at the first batch only: one tensor per line, in tuple order.
for x, y, z, w in dataloader:
    print(x, y, z, w, sep='\n')
    break

tensor([[ 7.6849,  8.6174, 10.3881,  7.7170, 10.3881]])
tensor([[3., 4., 5., 6., 7.]])
tensor([[10.3881]])
tensor([[8.]])


In [None]:
# Scratch replay of the RNN window construction, for inspecting intermediates.
# The month column is parsed on a copy: `df` is an alias of the loaded frame,
# and overwriting its YYYYMM column in place would make this cell (and any
# other cell that parses the same column) fail when run again.
dated_df = df.copy()
if not pd.api.types.is_datetime64_any_dtype(dated_df['계약년월']):
    dated_df['계약년월'] = pd.to_datetime(dated_df['계약년월'].astype(str), format='%Y%m')

# One gap-free monthly price series per (동, 단지); the unnamed third index
# level produced by apply() holds the month and is renamed accordingly.
interpolated_df = (
    dated_df.groupby(['동', '단지'])
    .apply(lambda group: price_interpolate(group, group['계약년월'].min(), group['계약년월'].max()))
    [['제곱미터당 거래금액']]
    .reset_index()
    .rename(columns={'level_2': '계약년월'})
)

dongs_x, dongs_y = [], []
# Cut the windows from the interpolated series, not from the raw rows.
for dong in interpolated_df['동'].unique():
    dong_df = interpolated_df[interpolated_df['동'] == dong]
    for apartment_complex in dong_df['단지'].unique():
        filtered_df = dong_df[dong_df['단지'] == apartment_complex]
        filtered_df_values = filtered_df['제곱미터당 거래금액'].values
        for idx in range(len(filtered_df_values) - window_size):
            dongs_x.append(filtered_df_values[idx:idx + window_size])
            dongs_y.append(filtered_df_values[idx + window_size:idx + window_size + 1])

### LSTM_Transformer_Dataset

In [7]:
def price_fill_0(df, start="20060101", end="20221201",
                 price_columns=('제곱미터당 거래금액(만원)', '제곱미터당 거래금액')):
    """Expand transactions to a full (complex, dong) x month grid with zero prices for gaps.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain '단지', '동', '계약년월' (YYYYMM values or datetime) and
        at least one of ``price_columns``. The frame is not modified.
    start, end : str or datetime-like
        First and last month of the grid (month-start frequency, inclusive).
    price_columns : sequence of str
        Candidate names of the price column. Every one that is present is
        zero-filled for months without a transaction. Both spellings used in
        this notebook are accepted by default.

    Returns
    -------
    pandas.DataFrame
        One row per (단지, 동, month) combination, ordered month by month,
        left-joined with the original columns.

    Raises
    ------
    KeyError
        If none of ``price_columns`` exists in ``df``.
    """
    months = pd.date_range(start=start, end=end, freq='MS')
    complex_city_combinations = df[['단지', '동']].drop_duplicates()

    # Month-major grid: all complexes for the first month, then the next month, ...
    combinations = pd.DataFrame({
        '단지': np.tile(complex_city_combinations['단지'], len(months)),
        '동': np.tile(complex_city_combinations['동'], len(months)),
        '계약년월': np.repeat(months, len(complex_city_combinations))
    })

    # Parse the month on a copy, and only when it is not already datetime, so
    # the caller's frame stays intact and repeated calls keep working.
    dated = df.copy()
    if not pd.api.types.is_datetime64_any_dtype(dated['계약년월']):
        dated['계약년월'] = pd.to_datetime(dated['계약년월'].astype(str), format='%Y%m')

    merged = pd.merge(combinations, dated, on=['단지', '계약년월', '동'], how='left')

    present = [column for column in price_columns if column in merged.columns]
    if not present:
        raise KeyError(f"none of the price columns {list(price_columns)} found in df")
    # Plain assignment instead of a chained inplace fillna, which does not
    # update the frame under pandas copy-on-write.
    merged[present] = merged[present].fillna(0)

    return merged

In [8]:
class LSTM_Transformer_Dataset(Dataset):
    """Per-dong sliding windows of complex prices paired with an economy series.

    For every dong and every window of ``window_size`` consecutive months
    between 2006-01 and 2022-12, a sample holds:

    * price_x   -- one row per complex of the dong (sorted by complex name),
                   ``window_size`` prices each, zero-padded with extra rows up
                   to ``max_apartment_complex``
    * economy_x -- the '통화량' values at the same row positions, nested as
                   one row
    * price_y   -- the price of the following month, one row per complex,
                   padded like price_x
    * economy_y -- the '통화량' value of the following month

    Parameters
    ----------
    transaction_data : pandas.DataFrame
        Passed to ``price_fill_0``; the result must contain '동', '단지',
        '계약년월' and '제곱미터당 거래금액'.
    economy_data : pandas.DataFrame
        Must contain '통화량'. Rows are sliced by position, so row 0 is
        assumed to correspond to 2006-01 (TODO confirm against preprocessing).
    window_size : int
        Number of input months per sample.
    max_apartment_complex : int or None
        Row count every price tensor is padded to (default 311, the value the
        notebook used). ``None`` derives it from the data as the largest
        number of complexes in any dong. Dongs with more complexes than this
        are not truncated.

    Raises
    ------
    ValueError
        If a complex does not have exactly one row per month after filling.
    """

    def __init__(self, transaction_data, economy_data, window_size=5, max_apartment_complex=311):
        price_col = '제곱미터당 거래금액'

        filled_data = price_fill_0(transaction_data)
        filled_data = filled_data[['동', '단지', '계약년월', price_col]].copy()
        # Guarantee zero prices for months without a transaction, whichever
        # price column the fill helper handled.
        filled_data[price_col] = filled_data[price_col].fillna(0)

        date_range = pd.date_range('20060101', '20221201', freq='MS')
        n_windows = len(date_range) - window_size
        filled_data = filled_data[filled_data['계약년월'].isin(date_range)]

        if max_apartment_complex is None:
            per_dong = filled_data.drop_duplicates(subset=['동', '단지']).groupby('동')['단지'].count()
            max_apartment_complex = int(per_dong.max()) if len(per_dong) else 0

        # The economy windows are identical for every dong: build them once.
        money = economy_data['통화량']
        economy_windows = [
            (
                [money.iloc[idx:idx + window_size].to_list()],
                [money.iloc[idx + window_size:idx + window_size + 1].to_list()],
            )
            for idx in range(n_windows)
        ]

        dongs_x, dongs_y = [], []
        for dong in filled_data['동'].unique():
            dong_rows = filled_data[filled_data['동'] == dong]
            # One (complexes x months) matrix per dong replaces a filter and
            # a groupby per window.
            price_matrix = self._complex_month_matrix(dong_rows, price_col, len(date_range))
            padding = max(max_apartment_complex - len(price_matrix), 0)
            for idx in range(n_windows):
                stop = idx + window_size
                prices_x = price_matrix[:, idx:stop].tolist()
                prices_y = price_matrix[:, stop:stop + 1].tolist()
                # Pad so every dong yields tensors with the same row count.
                prices_x.extend([0.0] * window_size for _ in range(padding))
                prices_y.extend([0.0] for _ in range(padding))
                economy_x, economy_y = economy_windows[idx]
                dongs_x.append([prices_x, economy_x])
                dongs_y.append([prices_y, economy_y])

        self.dongs_x = dongs_x
        self.dongs_y = dongs_y
        self.len = len(dongs_x)

    @staticmethod
    def _complex_month_matrix(dong_rows, price_col, n_months):
        """Return one dong's prices as a (complexes x months) float array, complexes sorted by name."""
        ordered = dong_rows.sort_values('계약년월')
        per_complex = ordered.groupby('단지')[price_col].agg(list)
        if len(per_complex) == 0:
            return np.zeros((0, n_months))
        if any(len(values) != n_months for values in per_complex):
            raise ValueError(
                "every complex must have exactly one row per month after filling; "
                "aggregate duplicate (단지, 계약년월) rows first"
            )
        return np.array(per_complex.to_list(), dtype=float)

    # real_estate_x, economy_x, real_estate_y, economy_y
    def __getitem__(self, i):
        return torch.FloatTensor(self.dongs_x[i][0]), torch.FloatTensor(self.dongs_x[i][1]), torch.FloatTensor(self.dongs_y[i][0]), torch.FloatTensor(self.dongs_y[i][1])

    def __len__(self):
        return self.len

In [9]:
batch_size = 1
# Use the frames produced by the load/preprocess cell: `transaction_all` and
# `economy_all` are never defined in this notebook, so the previous call
# raised NameError on a fresh kernel.
train_dataset = LSTM_Transformer_Dataset(transaction_df, economy_df)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

# Sanity check on the first batch: the economy target should be float32.
for x, y, z, w in train_loader:
    print(w.dtype)
    break

torch.float32
