In [1]:
import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader
import torch

In [2]:
# Load the transaction table; the first spreadsheet column is used as the index.
transaction_df = pd.read_excel(
    '../../데이터/Transaction/transaction_final.xlsx',
    index_col=0,
)

In [3]:
# Load the economy table and keep only the '국고채금리' column, so economy_df
# is a Series rather than a DataFrame.
economy_df = pd.read_excel('../../데이터/Economy/economy_all.xlsx')['국고채금리']

In [358]:
def price_per_pyeong_interpolate(group):
    """Expand one group's transactions to a gap-free monthly series.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single group with at least the columns '계약년월'
        (datetime-like), '단지명', '시군구' and '평단가'.

    Returns
    -------
    pandas.DataFrame
        Indexed by an unnamed monthly (month-start) DatetimeIndex running from
        the group's earliest to latest '계약년월'. Months that had no row get
        '단지명' / '시군구' forward-filled and '평단가' filled by
        ``Series.interpolate()`` (pandas' default method).

    Notes
    -----
    ``reindex`` needs unique index labels, so a group containing two rows with
    the same '계약년월' is expected to raise; de-duplicate upstream.
    """
    idx = pd.date_range(group['계약년월'].min(), group['계약년월'].max(), freq='MS')
    group = group.set_index('계약년월').reindex(idx)
    # Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
    group['단지명'] = group['단지명'].ffill()
    group['시군구'] = group['시군구'].ffill()
    group['평단가'] = group['평단가'].interpolate()
    return group

In [359]:
class RNN_Transaction_Dataset(Dataset):
    """Sliding-window dataset over monthly-interpolated price series.

    Each (시군구, 단지명) pair is expanded to a gap-free monthly '평단가' series
    with ``price_per_pyeong_interpolate``. Every run of ``sequence_length``
    consecutive months becomes one input, and the following month is its target.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns '시군구', '단지명', '계약년월' and '평단가'.
        The frame is not modified.
    sequence_length : int, default 5
        Number of months in each input window.
    """

    def __init__(self, data, sequence_length=5):
        # Work on a copy so the caller's DataFrame keeps its original column.
        data = data.copy()
        data['계약년월'] = pd.to_datetime(data['계약년월'])

        dongs_x, dongs_y = [], []
        # Group on BOTH keys: complex names are not unique across districts, and
        # filtering by name alone would merge unrelated complexes into one series.
        # Iterating the groupby (sorted keys) keeps the district -> complex order
        # and passes the key columns to the helper, which forward-fills them.
        for _, group in data.groupby(['시군구', '단지명']):
            monthly_prices = price_per_pyeong_interpolate(group)['평단가'].values
            for idx in range(len(monthly_prices) - sequence_length):
                dongs_x.append(monthly_prices[idx:idx + sequence_length])
                dongs_y.append(monthly_prices[idx + sequence_length:idx + sequence_length + 1])

        self.dongs_x = dongs_x
        self.dongs_y = dongs_y
        self.len = len(dongs_x)

    def __getitem__(self, i):
        """Return (window, target) as float32 tensors of shape (sequence_length,) and (1,)."""
        return (
            torch.tensor(self.dongs_x[i], dtype=torch.float32),
            torch.tensor(self.dongs_y[i], dtype=torch.float32),
        )

    def __len__(self):
        return self.len

# Build the RNN transaction dataset and wrap it in a loader of two windows per batch.
batch_size = 2
train_dataset = RNN_Transaction_Dataset(data=transaction_df)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size)

In [360]:
# Show only the first batch: inputs, a 100-character separator line, then targets.
for x, y in train_loader:
    print(x, '=' * 100, y, sep='\n')
    break

tensor([[2586.9565, 2913.0435, 2760.8696, 2608.6956, 2741.8479],
        [2913.0435, 2760.8696, 2608.6956, 2741.8479, 2875.0000]])
tensor([[2875.0000],
        [3008.1521]])


In [4]:
def price_per_pyeong_fill_0(df, start="20060101", end="20221201"):
    """Expand transactions to every (complex, district, month) and zero-fill prices.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns '단지명', '시군구', '계약년월' and '평단가'.
        The frame is not modified.
    start, end : str or datetime-like, optional
        First and last month of the grid (month-start frequency). The defaults
        reproduce the original fixed range 2006-01 .. 2022-12.

    Returns
    -------
    pandas.DataFrame
        One row per (month, complex/district pair) — months vary slowest —
        left-joined with ``df``. '평단가' is 0 where no transaction matched;
        other columns of ``df`` stay NaN there. Rows of ``df`` outside the grid
        are dropped, and duplicate keys in ``df`` yield duplicate output rows.
    """
    months = pd.date_range(start=start, end=end, freq='MS')
    complex_city_combinations = df[['단지명', '시군구']].drop_duplicates()

    # Cartesian product: the pair list is tiled once per month while each month
    # is repeated once per pair, so row i is (pair i % n_pairs, month i // n_pairs).
    combinations = pd.DataFrame({
        '단지명': np.tile(complex_city_combinations['단지명'], len(months)),
        '시군구': np.tile(complex_city_combinations['시군구'], len(months)),
        '계약년월': months.repeat(len(complex_city_combinations))
    })

    # Convert the date column on a copy so the caller's frame is left untouched.
    df = df.copy()
    df['계약년월'] = pd.to_datetime(df['계약년월'])
    df = pd.merge(combinations, df, on=['단지명', '계약년월', '시군구'], how='left')
    # Plain assignment instead of a chained inplace fillna, which is not
    # guaranteed to write through to the frame under copy-on-write.
    df['평단가'] = df['평단가'].fillna(0)

    return df

In [16]:
# Built with pandas.
class RNN_Transformer_Dataset(Dataset):
    """District-level sliding windows of zero-filled prices plus an economy series.

    For every district ('시군구') and every window of ``sequence_length``
    consecutive months in 2006-01 .. 2022-12, one sample is produced:

    * price_x   – one row per complex (sorted by '단지명'), ``sequence_length``
                  monthly '평단가' values, 0 where there was no transaction
    * economy_x – a single row with the economy values of the same months
    * price_y   – one row per complex with the next month's price
    * economy_y – a single row with the next month's economy value

    Price rows are zero-padded so every district has as many rows as the
    district with the most complexes.

    Parameters
    ----------
    transaction_data : pandas.DataFrame
        Needs '시군구', '단지명', '계약년월' and '평단가'. Not modified.
    economy_data : pandas.Series
        Read by position: element k is used for the k-th month of the range.
        Presumably it has at least one value per month — TODO confirm, since a
        shorter series would give ragged windows.
    sequence_length : int, default 5
        Number of months in each input window.
    """

    def __init__(self, transaction_data, economy_data, sequence_length=5):
        all_dong_max_apartment_complex = transaction_data.drop_duplicates(subset=['시군구','단지명']).groupby(['시군구'])['단지명'].count().max()

        # Pass a copy so the helper can never alter the caller's frame.
        filled_data = price_per_pyeong_fill_0(transaction_data.copy())
        filled_data = filled_data[['시군구', '단지명', '계약년월', '평단가']]
        # Several transactions of one complex in one month would give rows of
        # unequal length (and a failing tensor conversion); keep the first one,
        # the same policy NODE_Transformer_Dataset applies.
        filled_data = filled_data.drop_duplicates(subset=['시군구', '단지명', '계약년월'], keep='first')

        # Must match the month grid generated by price_per_pyeong_fill_0.
        date_range = pd.date_range('20060101', '20221201', freq='MS')

        dongs_x, dongs_y = [], []
        for dong in filled_data['시군구'].unique():
            # complexes x months matrix, built once per district instead of
            # re-filtering and re-grouping the frame for every window.
            price_matrix = (
                filled_data[filled_data['시군구'] == dong]
                .pivot(index='단지명', columns='계약년월', values='평단가')
                .reindex(columns=date_range)
                .fillna(0.0)
                .to_numpy(dtype=float)
            )
            missing_rows = all_dong_max_apartment_complex - len(price_matrix)

            for idx in range(len(date_range) - sequence_length):
                target = idx + sequence_length
                prices_x = price_matrix[:, idx:target].tolist()
                prices_y = price_matrix[:, target:target + 1].tolist()
                # Zero-pad up to the largest district (no-op when already full).
                for _ in range(missing_rows):
                    prices_x.append([0.0] * sequence_length)
                    prices_y.append([0.0])

                # .iloc keeps the slice positional whatever index the Series carries.
                economy_x = [economy_data.iloc[idx:target].to_list()]
                economy_y = [economy_data.iloc[target:target + 1].to_list()]

                dongs_x.append([prices_x, economy_x])
                dongs_y.append([prices_y, economy_y])

        self.dongs_x = dongs_x
        self.dongs_y = dongs_y
        self.len = len(dongs_x)

    # Returns: price_x, economy_x, price_y, economy_y
    def __getitem__(self, i):
        return torch.FloatTensor(self.dongs_x[i][0]), torch.FloatTensor(self.dongs_x[i][1]), torch.FloatTensor(self.dongs_y[i][0]), torch.FloatTensor(self.dongs_y[i][1])

    def __len__(self):
        return self.len

batch_size = 1
train_dataset_1 = RNN_Transformer_Dataset(transaction_df, economy_df)
# Wrap the dataset created on the previous line. The original passed
# `train_dataset`, a different dataset whose items do not have the four
# fields the inspection loop for train_loader_1 unpacks.
train_loader_1 = DataLoader(train_dataset_1, batch_size=batch_size)

In [None]:
# Smoke test: unpack the first batch into four tensors and print the last one.
# For RNN_Transformer_Dataset items the order is (price_x, economy_x, price_y,
# economy_y), so `w` is intended to be the economy target.
# NOTE(review): this only holds if train_loader_1 wraps an RNN_Transformer_Dataset.
for x,y,z,w in train_loader_1:
    print(w)
    break

In [4]:
class NODE_Transaction_Dataset(Dataset):
    """Sliding windows over each complex's raw (irregularly spaced) transactions.

    Unlike the interpolated RNN dataset, only observed transactions are used.
    Every window carries the month index of each observation, where
    2006-01 maps to 1.0, 2006-02 to 2.0, and so on.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs '시군구', '단지명', '계약년월' (datetime-like) and '평단가'.
        The frame is not modified.
    sequence_length : int, default 5
        Number of consecutive transactions in each input window.
    """

    def __init__(self, data, sequence_length=5):
        origin = pd.Timestamp('2006-01')
        dongs_x, dongs_y = [], []
        for dong in data['시군구'].unique():
            dong_data = data[data['시군구'] == dong]
            for apartment_complex in dong_data['단지명'].unique():
                # Select inside the district: complex names are not unique across
                # districts, and filtering by name alone mixes unrelated complexes.
                complex_data = dong_data[dong_data['단지명'] == apartment_complex]
                contract_months = pd.to_datetime(complex_data['계약년월'])
                # Windows must be chronological; the stable sort keeps the file
                # order of transactions that share a month.
                order = contract_months.argsort(kind='stable').values
                contract_months = contract_months.iloc[order]
                filtered_data_values = complex_data['평단가'].values[order]
                # Vectorized month index relative to 2006-01 (1-based).
                filtered_data_times = (
                    (contract_months.dt.year - origin.year) * 12
                    + (contract_months.dt.month - origin.month)
                    + 1
                ).astype(float).values
                for idx in range(len(filtered_data_values) - sequence_length):
                    stop = idx + sequence_length
                    dongs_x.append([filtered_data_values[idx:stop], filtered_data_times[idx:stop]])
                    dongs_y.append([filtered_data_values[stop:stop + 1], filtered_data_times[stop:stop + 1]])

        self.dongs_x = dongs_x
        self.dongs_y = dongs_y
        self.len = len(dongs_x)

    # Returns: price_x, time_x, price_y, time_y
    def __getitem__(self, i):
        prices_x, times_x = self.dongs_x[i]
        prices_y, times_y = self.dongs_y[i]
        return (
            torch.tensor(prices_x, dtype=torch.float32),
            torch.tensor(times_x, dtype=torch.float32),
            torch.tensor(prices_y, dtype=torch.float32),
            torch.tensor(times_y, dtype=torch.float32),
        )

    def __len__(self):
        return self.len

# Build the NODE transaction dataset and serve it one window at a time.
batch_size = 1
train_dataset = NODE_Transaction_Dataset(data=transaction_df)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size)

In [5]:
# Smoke test: unpack the first batch into four tensors and print the second.
# For NODE_Transaction_Dataset items the order is (price_x, time_x, price_y,
# time_y), so `y` here holds the month indices of the input window.
# NOTE(review): `train_loader` is rebound by several cells; this assumes it was
# last built from NODE_Transaction_Dataset.
for x,y,z,w in train_loader:
    print(y)
    break

tensor([[ 2.,  3.,  5.,  9., 11.]])


In [43]:
class NODE_Transformer_Dataset(Dataset):
    """District-level samples of irregular transactions, their times, and economy data.

    For every district ('시군구') and every target month t in the range
    2006-01 .. 2022-12 (starting at the ``sequence_length``-th month after the
    first), one sample is built from the complexes that have at least
    ``sequence_length`` transactions strictly before t:

    * price_x   – per complex, its last ``sequence_length`` prices before t
    * time_x    – month index of each of those transactions (2006-01 -> 1.0)
    * economy_x – one row: economy values of the ``sequence_length`` months before t
    * price_y   – per complex, its price in month t, or 0.0 if it did not trade
    * time_y    – per complex, the month index of t
    * economy_y – one row: the economy value of month t

    Complex rows are ordered by '단지명' in both x and y, then zero-padded up
    to ``max_apartment_complex`` rows (no truncation if there are more).

    Parameters
    ----------
    transaction_data : pandas.DataFrame
        Needs '시군구', '단지명', '계약년월' (datetime-like) and '평단가'.
        Not modified; duplicate (district, complex, month) rows are reduced to
        the first occurrence on an internal copy.
    economy_data : pandas.Series
        One value per month of the range, in chronological order. Not modified.
    sequence_length : int, default 5
        Number of past transactions per complex in each sample.
    max_apartment_complex : int, default 158
        Number of complex rows every sample is padded to. The default is the
        value that was hard-coded originally.

    Raises
    ------
    ValueError
        If ``economy_data`` does not have exactly one value per month.
    """

    def __init__(self, transaction_data, economy_data, sequence_length=5, max_apartment_complex=158):
        all_dong_max_apartment_complex = max_apartment_complex

        date_range = pd.date_range('20060101', '20221201', freq='MS')
        if len(economy_data) != len(date_range):
            raise ValueError(
                f"economy_data has {len(economy_data)} values, expected {len(date_range)} (one per month)"
            )
        # Read the economy series by position instead of overwriting the
        # caller's index with the month range.
        economy_values = np.asarray(economy_data, dtype=float)

        # De-duplicate on a copy rather than in place on the caller's frame.
        transactions = transaction_data.copy()
        transactions['계약년월'] = pd.to_datetime(transactions['계약년월'])
        transactions = transactions.drop_duplicates(subset=['시군구', '단지명', '계약년월'], keep='first')
        # Only transactions dated on a month of the range can ever be selected.
        in_range = transactions[transactions['계약년월'].isin(date_range)]

        origin = date_range[0]
        dongs_x, dongs_y = [], []

        # Walk district by district.
        for dong in transactions['시군구'].unique():
            dong_data = in_range[in_range['시군구'] == dong]

            # Per complex (sorted by name): chronologically ordered dates,
            # prices and month indices. Computed once per district.
            complex_series = []
            for _, rows in dong_data.groupby('단지명'):
                rows = rows.sort_values('계약년월')
                months = rows['계약년월']
                month_index = (
                    (months.dt.year - origin.year) * 12 + (months.dt.month - origin.month) + 1
                ).to_numpy(dtype=float)
                complex_series.append((months.to_numpy(), rows['평단가'].to_numpy(dtype=float), month_index))

            # One sample per target month. The original range had one extra
            # iteration whose clipped slice reproduced the previous sample.
            for idx in range(len(date_range) - sequence_length):
                target_pos = idx + sequence_length
                target_date = date_range[target_pos].to_datetime64()
                target_time = float(target_pos + 1)

                prices_x, times_x, prices_y, times_y = [], [], [], []
                for dates, prices, month_index in complex_series:
                    # Dates are unique and sorted, so this is the number of
                    # transactions strictly before the target month.
                    n_before = int(np.searchsorted(dates, target_date, side='left'))
                    if n_before < sequence_length:
                        continue
                    prices_x.append(prices[n_before - sequence_length:n_before].tolist())
                    times_x.append(month_index[n_before - sequence_length:n_before].tolist())
                    # x and y are appended in the same pass, so row k of both
                    # always refers to the same complex.
                    traded = n_before < len(dates) and dates[n_before] == target_date
                    prices_y.append([float(prices[n_before])] if traded else [0.0])
                    times_y.append([target_time])

                # Zero-pad up to the configured number of complexes.
                for _ in range(all_dong_max_apartment_complex - len(prices_x)):
                    prices_x.append([0.0] * sequence_length)
                    times_x.append([0.0] * sequence_length)
                    prices_y.append([0.0])
                    times_y.append([0.0])

                economy_x = [economy_values[idx:target_pos].tolist()]
                economy_y = [[float(economy_values[target_pos])]]

                # Bundle prices, times and economy for x and y.
                dongs_x.append([prices_x, times_x, economy_x])
                dongs_y.append([prices_y, times_y, economy_y])

        self.dongs_x = dongs_x
        self.dongs_y = dongs_y
        self.len = len(dongs_x)

    # Returns: price_x, time_x, economy_x, price_y, time_y, economy_y
    def __getitem__(self, i):
        sample_x, sample_y = self.dongs_x[i], self.dongs_y[i]
        return (
            torch.FloatTensor(sample_x[0]),
            torch.FloatTensor(sample_x[1]),
            torch.FloatTensor(sample_x[2]),
            torch.FloatTensor(sample_y[0]),
            torch.FloatTensor(sample_y[1]),
            torch.FloatTensor(sample_y[2]),
        )

    def __len__(self):
        return self.len

# Build the NODE transformer dataset and serve it one district-window at a time.
batch_size = 1
train_dataset = NODE_Transformer_Dataset(transaction_data=transaction_df, economy_data=economy_df)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size)

In [68]:
# Smoke test: unpack the first batch into six tensors and print the first.
# For NODE_Transformer_Dataset items the order is (price_x, time_x, economy_x,
# price_y, time_y, economy_y), so `x` is the zero-padded per-complex price matrix.
# NOTE(review): `train_loader` is rebound by several cells; this assumes it was
# last built from NODE_Transformer_Dataset.
for x,y,z,w,p,q in train_loader:
    print(x)
    break

tensor([[[2725.0000, 2750.0000, 2833.3333, 3395.8333, 3666.6667],
         [3590.9092, 3863.6365, 3818.1819, 4386.3638, 4545.4546],
         [3409.0908, 3681.8181, 3850.0000, 4512.0000, 4272.7271],
         [3062.5000, 3020.8333, 3041.6667, 3404.1667, 3625.0000],
         [2720.0000, 2640.0000, 2900.0000, 3320.0000, 3541.6667],
         [   0.0000,    0.0000,    0.0000,    0.0000,    0.0000],
         [   0.0000,    0.0000,    0.0000,    0.0000,    0.0000],
         [   0.0000,    0.0000,    0.0000,    0.0000,    0.0000],
         [   0.0000,    0.0000,    0.0000,    0.0000,    0.0000],
         [   0.0000,    0.0000,    0.0000,    0.0000,    0.0000],
         [   0.0000,    0.0000,    0.0000,    0.0000,    0.0000],
         [   0.0000,    0.0000,    0.0000,    0.0000,    0.0000],
         [   0.0000,    0.0000,    0.0000,    0.0000,    0.0000],
         [   0.0000,    0.0000,    0.0000,    0.0000,    0.0000],
         [   0.0000,    0.0000,    0.0000,    0.0000,    0.0000],
         [