In [1]:
import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader
import torch

In [2]:
data = pd.read_csv('../../데이터/Transaction/transaction_final.csv',index_col=0)
economy = pd.read_excel('../../데이터/Economy/economy_all.xlsx')

In [3]:
def price_per_pyeong_interpolate(group):
    idx = pd.date_range(group['계약년월'].min(), group['계약년월'].max(), freq='MS')
    group = group.set_index('계약년월').reindex(idx)
    group['단지명'] = group['단지명'].fillna(method='ffill')
    group['시군구'] = group['시군구'].fillna(method='ffill')
    group['평단가'] = group['평단가'].interpolate()
    return group

In [22]:
class RNN_Transaction_Dataset(Dataset):
    def __init__(self, data, sequence_length=5):
        data['계약년월'] = pd.to_datetime(data['계약년월'])
        interpolated_data = pd.DataFrame(data.groupby(['시군구', '단지명']).apply(price_per_pyeong_interpolate)['평단가']).reset_index().rename(columns={'level_2': '계약년월'})

        dongs_x, dongs_y = [], []
        for dong in interpolated_data['시군구'].unique():
            dong_x, dong_y = [], []
            for apartment_complex in interpolated_data[interpolated_data['시군구'] == dong]['단지명'].unique():
                filtered_interpolated_data_values = interpolated_data[interpolated_data['단지명'] == apartment_complex]['평단가'].values
                for idx in range(len(filtered_interpolated_data_values)-sequence_length):
                    apartment_complex_x = filtered_interpolated_data_values[idx:idx+sequence_length]
                    apartment_complex_y = filtered_interpolated_data_values[idx+sequence_length:idx+sequence_length+1]
                    dong_x.append(apartment_complex_x)
                    dong_y.append(apartment_complex_y)   
            dongs_x.append(dong_x)
            dongs_y.append(dong_y)

        self.dongs_x = dongs_x
        self.dongs_y = dongs_y
        self.len = len(dongs_x)

    def __getitem__(self, i):
        return torch.FloatTensor(self.dongs_x[i]), torch.FloatTensor(self.dongs_y[i])

    def __len__(self):
        return self.len

batch_size = 1
train_dataset = RNN_Transaction_Dataset(data)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

In [None]:
class ODE_Transaction_Dataset(Dataset):
    def __init__(self, data, sequence_length=5):
        data['계약년월'] = pd.to_datetime(data['계약년월']).dt.strftime('%Y%m%d')

        dongs_x, dongs_x_time, dongs_y, dongs_y_time = [], [], [], []
        for dong in data['시군구'].unique():
            dong_x, dong_x_time, dong_y, dong_y_time = [], [], [], []
            for apartment_complex in data[data['시군구'] == dong]['단지명'].unique():
                filtered_data_values = data[data['단지명'] == apartment_complex]['평단가'].values
                filtered_data_times = data[data['단지명'] == apartment_complex]['계약년월'].apply(lambda x: int(x)).values 
                for idx in range(len(filtered_data_values)-sequence_length):
                    apartment_complex_x = filtered_data_values[idx:idx+sequence_length]
                    apartment_complex_x_time = filtered_data_times[idx:idx+sequence_length]
                    apartment_complex_y = filtered_data_values[idx+sequence_length:idx+sequence_length+1]
                    apartment_complex_y_time = filtered_data_times[idx+sequence_length:idx+sequence_length+1]
                    dong_x.append(apartment_complex_x)
                    dong_x_time.append(apartment_complex_x_time)
                    dong_y.append(apartment_complex_y)
                    dong_y_time.append(apartment_complex_y_time)
            dongs_x.append(dong_x)
            dongs_x_time.append(dong_x_time)
            dongs_y.append(dong_y)
            dongs_y_time.append(dong_y_time)

        self.dongs_x = dongs_x
        self.dongs_x_time = dongs_x_time
        self.dongs_y = dongs_y
        self.dongs_y_time = dongs_y_time
        self.len = len(dongs_x)

    def __getitem__(self, i):
        return torch.FloatTensor(self.dongs_x[i]), torch.tensor(self.dongs_x_time[i], dtype=torch.int64), torch.FloatTensor(self.dongs_y[i]), torch.tensor(self.dongs_y_time[i], dtype=torch.int64)

    def __len__(self):
        return self.len

batch_size = 1
train_dataset = ODE_Transaction_Dataset(data)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

In [26]:
def price_per_pyeong_fill_0(df):
    months = pd.to_datetime(pd.date_range(start="20060101", end="20221201", freq='MS'))
    complex_city_combinations = df[['단지명', '시군구']].drop_duplicates()

    combinations = pd.DataFrame({
        '단지명': np.tile(complex_city_combinations['단지명'], len(months)),
        '시군구': np.tile(complex_city_combinations['시군구'], len(months)),
        '계약년월': np.repeat(months, len(complex_city_combinations))
    })
    
    df['계약년월'] = pd.to_datetime(df['계약년월'])
    df = pd.merge(combinations, df, on=['단지명', '계약년월', '시군구'], how='left')
    df['평단가'].fillna(0, inplace=True)

    return df

In [95]:
class RNN_Transformer_Dataset(Dataset):
    def __init__(self, data, sequence_length=5):
        all_dong_max_apartment_complex = data.drop_duplicates(subset=['시군구','단지명']).groupby(['시군구'])['단지명'].count().max()

        filled_data = price_per_pyeong_fill_0(data)
        filled_data = filled_data[['시군구', '단지명', '계약년월', '평단가']]        
        
        dongs_x, dongs_y = [], []
        for dong in filled_data['시군구'].unique():
            dong_x, dong_y = [], []
            filtered_filled_data = filled_data[filled_data['시군구'] == dong]
            date_range = pd.date_range('20060101', '20221201', freq='MS')
            for idx in range(len(date_range)-sequence_length):
                current_range_x = date_range[idx:idx+sequence_length]
                current_range_y = date_range[idx+sequence_length:idx+sequence_length+1]
                current_range_filled_x = filtered_filled_data[filtered_filled_data['계약년월'].isin(current_range_x)]
                current_range_filled_y = filtered_filled_data[filtered_filled_data['계약년월'].isin(current_range_y)]
                grouped_current_range_filled_x = current_range_filled_x.groupby('단지명').agg({'평단가': list}).reset_index()['평단가'].to_list()
                grouped_current_range_filled_y = current_range_filled_y.groupby('단지명').agg({'평단가': list}).reset_index()['평단가'].to_list()
                if len(grouped_current_range_filled_x) < all_dong_max_apartment_complex:
                    for _ in range(all_dong_max_apartment_complex-len(grouped_current_range_filled_x)):
                        grouped_current_range_filled_x.append([0.0]*sequence_length)
                        grouped_current_range_filled_y.append([0.0])
                dong_x.append(grouped_current_range_filled_x)
                dong_y.append(grouped_current_range_filled_y)
            dongs_x.append(dong_x)
            dongs_y.append(dong_y)

        self.dongs_x = dongs_x
        self.dongs_y = dongs_y
        self.len = len(dongs_x)

    def __getitem__(self, i):
        return torch.FloatTensor(self.dongs_x[i]), torch.FloatTensor(self.dongs_y[i])

    def __len__(self):
        return self.len

batch_size = 1
train_dataset = RNN_Transformer_Dataset(data)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

In [99]:
for x,y in train_loader:
    print(y)

tensor([[[[2608.6956],
          [   0.0000],
          [   0.0000],
          ...,
          [   0.0000],
          [   0.0000],
          [   0.0000]],

         [[   0.0000],
          [   0.0000],
          [   0.0000],
          ...,
          [   0.0000],
          [   0.0000],
          [   0.0000]],

         [[   0.0000],
          [   0.0000],
          [   0.0000],
          ...,
          [   0.0000],
          [   0.0000],
          [   0.0000]],

         ...,

         [[   0.0000],
          [   0.0000],
          [   0.0000],
          ...,
          [   0.0000],
          [   0.0000],
          [   0.0000]],

         [[   0.0000],
          [   0.0000],
          [9600.0000],
          ...,
          [   0.0000],
          [   0.0000],
          [   0.0000]],

         [[   0.0000],
          [   0.0000],
          [   0.0000],
          ...,
          [   0.0000],
          [   0.0000],
          [   0.0000]]]])
tensor([[[[2148.],
          [   0.],
          [   0.

In [None]:
sequence_length = 5

all_dong_max_apartment_complex = data.drop_duplicates(subset=['시군구','단지명']).groupby(['시군구'])['단지명'].count().max()

filled_data = price_per_pyeong_fill_0(data)
filled_data = filled_data[['시군구', '단지명', '계약년월', '평단가']]        

dongs_x, dongs_y = [], []
for dong in filled_data['시군구'].unique():
    dong_x, dong_y = [], []
    filtered_filled_data = filled_data[filled_data['시군구'] == dong]
    date_range = pd.date_range('20060101', '20221201', freq='MS')
    for idx in range(len(date_range)-sequence_length):
        
        current_range_x = date_range[idx:idx+sequence_length]
        current_range_y = date_range[idx+sequence_length:idx+sequence_length+1]
        current_range_filled_x = filtered_filled_data[filtered_filled_data['계약년월'].isin(current_range_x)]
        current_range_filled_y = filtered_filled_data[filtered_filled_data['계약년월'].isin(current_range_y)]
        grouped_current_range_filled_x = current_range_filled_x.groupby('단지명').agg({'평단가': list}).reset_index()['평단가'].to_list()
        grouped_current_range_filled_y = current_range_filled_y.groupby('단지명').agg({'평단가': list}).reset_index()['평단가'].to_list()
        if len(grouped_current_range_filled_x) < all_dong_max_apartment_complex:
            for _ in range(all_dong_max_apartment_complex-len(grouped_current_range_filled_x)):
                grouped_current_range_filled_x.append([0.0]*sequence_length)
                grouped_current_range_filled_y.append([0.0])
        dong_x.append(grouped_current_range_filled_x)
        dong_y.append(grouped_current_range_filled_y)
    dongs_x.append(dong_x)
    dongs_y.append(dong_y)

In [None]:
class RNN_Economy_Dataset(Dataset):
    def __init__(self, data, sequence_length=5):
        nation_economy_x, nation_economy_y, call_economy_x, call_economy_y = [], [], [], []
        for i in range(len(data)-sequence_length-1):
            nation_economy_x.append(data['국고채금리'][i:i+sequence_length].to_list())
            nation_economy_y.append(data['국고채금리'][i+sequence_length:i+sequence_length+1].to_list())
            call_economy_x.append(data['콜금리'][i:i+sequence_length].to_list())
            call_economy_y.append(data['콜금리'][i+sequence_length:i+sequence_length+1].to_list())

        self.nation_economy_x = torch.FloatTensor(nation_economy_x)
        self.nation_economy_y = torch.FloatTensor(nation_economy_y)
        self.call_economy_x = torch.FloatTensor(call_economy_x)
        self.call_economy_y = torch.FloatTensor(call_economy_y)
        self.len = len(nation_economy_x)

    def __getitem__(self, i):
        return self.nation_economy_x[i], self.nation_economy_y[i], self.call_economy_x[i], self.call_economy_y[i]

    def __len__(self):
        return self.len
    
batch_size = 1
train_dataset = RNN_Economy_Dataset(data)
train_loader = DataLoader(train_dataset, batch_size=batch_size)