In [15]:
import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader
import torch

In [16]:
# Load the transaction table (its first spreadsheet column becomes the index) and the
# economy indicator table. Both paths are relative to the notebook's working directory.
transaction_df = pd.read_excel('../../데이터/Transaction/transaction_final.xlsx',index_col=0)
economy_df = pd.read_excel('../../데이터/Economy/economy_all.xlsx')

In [30]:
def price_per_pyeong_interpolate(group):
    """Expand one group's rows to a gap-free month-start series.

    The frame is re-indexed on every month start between the group's earliest and
    latest '계약년월'. Months without a row get the previous row's '단지명' and
    '시군구' and an interpolated '평단가'.

    Args:
        group: DataFrame with the columns '계약년월' (datetime), '단지명', '시군구'
            and '평단가'. The caller's frame is not modified.

    Returns:
        A new DataFrame indexed by month start; '계약년월' is the (unnamed) index.

    Note:
        The re-index keeps only rows whose '계약년월' is one of the generated
        month-start timestamps, and it raises if a month appears more than once.
    """
    monthly_index = pd.date_range(group['계약년월'].min(), group['계약년월'].max(), freq='MS')
    monthly = group.set_index('계약년월').reindex(monthly_index)
    # Series.ffill() is the supported spelling; fillna(method='ffill') is deprecated.
    monthly['단지명'] = monthly['단지명'].ffill()
    monthly['시군구'] = monthly['시군구'].ffill()
    monthly['평단가'] = monthly['평단가'].interpolate()
    return monthly

In [31]:
class RNN_Transaction_Dataset(Dataset):
    """Sliding price windows over each apartment complex's interpolated monthly series.

    Every (시군구, 단지명) pair is interpolated to a gap-free monthly series with
    `price_per_pyeong_interpolate`. Each sample is `sequence_length` consecutive
    prices (x) and the single price that follows them (y).

    Args:
        data: DataFrame with the columns '시군구', '단지명', '계약년월' and '평단가'.
            It is copied, so the caller's frame is left untouched.
        sequence_length: Number of consecutive months in each input window.
    """

    def __init__(self, data, sequence_length=5):
        # Work on a copy: converting the date column must not leak into the caller's frame.
        data = data.copy()
        data['계약년월'] = pd.to_datetime(data['계약년월'])

        dongs_x, dongs_y = [], []
        # Group on BOTH keys so complexes that share a name in different districts are
        # never concatenated into one series. Groups come out sorted by (시군구, 단지명).
        for _, group in data.groupby(['시군구', '단지명']):
            prices = price_per_pyeong_interpolate(group)['평단가'].values
            for idx in range(len(prices) - sequence_length):
                split = idx + sequence_length
                dongs_x.append(prices[idx:split])
                dongs_y.append(prices[split:split + 1])

        self.dongs_x = dongs_x
        self.dongs_y = dongs_y
        self.len = len(dongs_x)

    def __getitem__(self, i):
        """Return (input window, next-month target) as float tensors."""
        return torch.FloatTensor(self.dongs_x[i]), torch.FloatTensor(self.dongs_y[i])

    def __len__(self):
        """Return the number of windows."""
        return self.len

# Build the interpolated-price dataset and wrap it in a loader.
# batch_size, train_dataset and train_loader are notebook globals that later cells rebind.
batch_size = 2
train_dataset = RNN_Transaction_Dataset(transaction_df)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

In [32]:
# Smoke test: print the input windows of the first batch only, then stop.
for x,y in train_loader:
    print(x)
    break

tensor([[2586.9565, 2913.0435, 2760.8696, 2608.6956, 2741.8479],
        [2913.0435, 2760.8696, 2608.6956, 2741.8479, 2875.0000]])


In [18]:
def price_per_pyeong_fill_0(df, start="20060101", end="20221201"):
    """Expand `df` to one row per (단지명, 시군구) pair and month, with missing prices set to 0.

    Args:
        df: DataFrame with at least the columns '단지명', '시군구', '계약년월' and
            '평단가'. The caller's frame is not modified.
        start: First month (inclusive) of the grid; anything `pd.date_range` accepts.
        end: Last month (inclusive) of the grid.

    Returns:
        A new DataFrame ordered month by month (all pairs for the first month, then
        all pairs for the next, ...). Months without a matching row get '평단가' 0;
        any other columns of `df` stay NaN for those months.

    Note:
        If `df` holds several rows for the same pair and month, all of them are
        kept, so that pair has more than one row for that month.
    """
    months = pd.date_range(start=start, end=end, freq='MS')
    pairs = df[['단지명', '시군구']].drop_duplicates()

    # Full pair x month grid, laid out month-major.
    grid = pd.DataFrame({
        '단지명': np.tile(pairs['단지명'].to_numpy(), len(months)),
        '시군구': np.tile(pairs['시군구'].to_numpy(), len(months)),
        '계약년월': months.repeat(len(pairs)),
    })

    # Convert the dates on a new frame instead of overwriting the caller's column.
    observed = df.assign(**{'계약년월': pd.to_datetime(df['계약년월'])})
    filled = pd.merge(grid, observed, on=['단지명', '계약년월', '시군구'], how='left')
    # Assign the result back: an in-place fillna on a selected column does not
    # update the frame under pandas Copy-on-Write.
    filled['평단가'] = filled['평단가'].fillna(0)

    return filled

In [59]:
class RNN_Transformer_Dataset(Dataset):
    """Per-district windows of zero-filled complex prices paired with economy indicators.

    For every district ('시군구') and every window of `sequence_length` months in the
    2006-01 .. 2022-12 range, a sample holds:
      * x prices: one list of `sequence_length` prices per complex in the district,
        padded with all-zero rows up to the largest complex count of any district;
      * x economy: the '국고채금리' and '콜금리' values sliced at the same positions;
      * y prices / y economy: the same quantities for the single following month.

    NOTE(review): economy rows are sliced by position, which assumes row 0 of
    `economy_data` corresponds to 2006-01 — confirm against the spreadsheet.
    """

    def __init__(self, transaction_data, economy_data, sequence_length=5):
        # Largest number of distinct complexes observed in any single district.
        complexes_per_dong = (
            transaction_data
            .drop_duplicates(subset=['시군구','단지명'])
            .groupby(['시군구'])['단지명']
            .count()
        )
        max_complexes = complexes_per_dong.max()

        filled = price_per_pyeong_fill_0(transaction_data)
        filled = filled[['시군구', '단지명', '계약년월', '평단가']]

        def prices_by_complex(rows):
            # One list of prices per complex, in groupby's sorted key order.
            return rows.groupby('단지명').agg({'평단가': list}).reset_index()['평단가'].to_list()

        def indicator_window(lo, hi):
            # Both indicator series sliced at rows lo..hi-1.
            return [
                economy_data['국고채금리'][lo:hi].to_list(),
                economy_data['콜금리'][lo:hi].to_list(),
            ]

        months = pd.date_range('20060101', '20221201', freq='MS')
        samples_x, samples_y = [], []
        for dong in filled['시군구'].unique():
            dong_rows = filled[filled['시군구'] == dong]
            for start in range(len(months) - sequence_length):
                stop = start + sequence_length
                rows_x = dong_rows[dong_rows['계약년월'].isin(months[start:stop])]
                rows_y = dong_rows[dong_rows['계약년월'].isin(months[stop:stop + 1])]
                price_x = prices_by_complex(rows_x)
                price_y = prices_by_complex(rows_y)

                # Pad both sides by the shortfall measured on the x side.
                shortfall = max_complexes - len(price_x)
                if shortfall > 0:
                    price_x.extend([0.0] * sequence_length for _ in range(shortfall))
                    price_y.extend([0.0] for _ in range(shortfall))

                economy_x = indicator_window(start, stop)
                economy_y = indicator_window(stop, stop + 1)
                samples_x.append([price_x, economy_x])
                samples_y.append([price_y, economy_y])

        self.dongs_x = samples_x
        self.dongs_y = samples_y
        self.len = len(samples_x)

    # real-estate x, economy x, real-estate y, economy y
    def __getitem__(self, i):
        """Return the four float tensors of sample `i`."""
        price_x, economy_x = self.dongs_x[i]
        price_y, economy_y = self.dongs_y[i]
        return (
            torch.FloatTensor(price_x),
            torch.FloatTensor(economy_x),
            torch.FloatTensor(price_y),
            torch.FloatTensor(economy_y),
        )

    def __len__(self):
        """Return the number of (district, window) samples."""
        return self.len

# Build the district-level dataset (prices + economy indicators) and its loader.
# This rebinds the notebook-global batch_size, train_dataset and train_loader.
batch_size = 1
train_dataset = RNN_Transformer_Dataset(transaction_df, economy_df)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

In [63]:
# Smoke test: print the fourth item of the first batch (the economy targets), then stop.
for x,y,z,w in train_loader:
    print(w)
    break

tensor([[[3.2380],
         [1.9710]]])


In [8]:
class NODE_Transaction_Dataset(Dataset):
    """Sliding windows of (price, timestamp) over each complex's observed rows.

    No interpolation is done: windows run over the rows that exist for each
    (시군구, 단지명) pair, so consecutive timestamps may be irregularly spaced.
    Timestamps are encoded as YYYYMMDD integers.

    Args:
        data: DataFrame with the columns '시군구', '단지명', '계약년월' (datetime, or
            anything `pd.to_datetime` parses) and '평단가'. The caller's frame is
            not modified.
        sequence_length: Number of consecutive observations in each input window.

    NOTE(review): rows are used in the order they appear in `data`; this assumes
    each complex's rows are already in chronological order.
    """

    def __init__(self, data, sequence_length=5):
        # Encode the dates on a private copy. Overwriting the caller's column with
        # strings would break re-running this constructor and any later code that
        # compares '계약년월' with dates.
        frame = data[['시군구', '단지명', '계약년월', '평단가']].copy()
        frame['계약년월'] = (
            pd.to_datetime(frame['계약년월']).dt.strftime('%Y%m%d').astype('int64')
        )

        dongs_x, dongs_y = [], []
        # Nested groupby with sort=False keeps first-appearance order and selects each
        # complex by district AND name, so same-named complexes in different districts
        # are never mixed into one series.
        for _, dong_rows in frame.groupby('시군구', sort=False):
            for _, complex_rows in dong_rows.groupby('단지명', sort=False):
                values = complex_rows['평단가'].values
                stamps = complex_rows['계약년월'].values
                for idx in range(len(values) - sequence_length):
                    split = idx + sequence_length
                    dongs_x.append([values[idx:split], stamps[idx:split]])
                    dongs_y.append([values[split:split + 1], stamps[split:split + 1]])

        self.dongs_x = dongs_x
        self.dongs_y = dongs_y
        self.len = len(dongs_x)

    # real-estate x, real-estate time x, real-estate y, real-estate time y
    def __getitem__(self, i):
        """Return (prices x, timestamps x, price y, timestamp y) for sample `i`."""
        values_x, stamps_x = self.dongs_x[i]
        values_y, stamps_y = self.dongs_y[i]
        return (
            torch.FloatTensor(values_x),
            torch.tensor(stamps_x, dtype=torch.int64),
            torch.FloatTensor(values_y),
            torch.tensor(stamps_y, dtype=torch.int64),
        )

    def __len__(self):
        """Return the number of windows."""
        return self.len

# Build the irregular-time dataset (observed prices + YYYYMMDD timestamps) and its loader.
# This rebinds the notebook-global batch_size, train_dataset and train_loader.
batch_size = 1
train_dataset = NODE_Transaction_Dataset(transaction_df)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

In [9]:
# Smoke test: print the second item of the first batch (the input-window timestamps), then stop.
for x,y,z,w in train_loader:
    print(y)
    break

tensor([[20060301, 20060401, 20060601, 20061001, 20061201]])


In [None]:
class NODE_Transformer_Dataset(Dataset):
    """District-level windows built from the raw (not zero-filled) transaction rows.

    For every district ('시군구') and every `sequence_length`-month window in the
    2006-01 .. 2022-12 range, this collects one price list per complex that has rows
    in the window, pads with all-zero rows, and attaches the economy indicators
    sliced at the same positions.

    NOTE(review): no month grid is filled in here, so each per-complex list holds
    only as many prices as that complex has rows in the window. Lists of unequal
    length would presumably make torch.FloatTensor fail in __getitem__ — confirm
    the intended sample shape before using this class.
    """
    def __init__(self, transaction_data, economy_data, sequence_length=5):
        # Largest number of distinct complexes found in any single district; used as the padded row count.
        all_dong_max_apartment_complex = transaction_data.drop_duplicates(subset=['시군구','단지명']).groupby(['시군구'])['단지명'].count().max()

        dongs_x, dongs_y = [], []
        for dong in transaction_data['시군구'].unique():
            filtered_filled_data = transaction_data[transaction_data['시군구'] == dong]
            date_range = pd.date_range('20060101', '20221201', freq='MS')
            for idx in range(len(date_range)-sequence_length):
                # Input window: sequence_length months; target window: the single month after it.
                current_range_x = date_range[idx:idx+sequence_length]
                current_range_y = date_range[idx+sequence_length:idx+sequence_length+1]
                # NOTE(review): matching against a DatetimeIndex assumes '계약년월' is
                # datetime-typed — confirm no earlier cell has converted it to strings.
                current_range_filled_x = filtered_filled_data[filtered_filled_data['계약년월'].isin(current_range_x)]
                current_range_filled_y = filtered_filled_data[filtered_filled_data['계약년월'].isin(current_range_y)]
                # One list of prices per complex present in the window (groupby key order).
                grouped_current_range_filled_x = current_range_filled_x.groupby('단지명').agg({'평단가': list}).reset_index()['평단가'].to_list()
                grouped_current_range_filled_y = current_range_filled_y.groupby('단지명').agg({'평단가': list}).reset_index()['평단가'].to_list()
                # Padding count is derived from the x side only and applied to both sides.
                # NOTE(review): the y side can hold a different number of complexes than
                # the x side here, so x and y row counts may disagree after padding.
                if len(grouped_current_range_filled_x) < all_dong_max_apartment_complex:
                    for _ in range(all_dong_max_apartment_complex-len(grouped_current_range_filled_x)):
                        grouped_current_range_filled_x.append([0.0]*sequence_length)
                        grouped_current_range_filled_y.append([0.0])
                # Economy indicators are sliced by row position idx.
                # NOTE(review): assumes row 0 of economy_data corresponds to 2006-01 — confirm.
                economy_x, economy_y = [], []
                economy_x.extend([
                    economy_data['국고채금리'][idx:idx+sequence_length].to_list(),
                    economy_data['콜금리'][idx:idx+sequence_length].to_list()
                ])
                economy_y.extend([
                    economy_data['국고채금리'][idx+sequence_length:idx+sequence_length+1].to_list(),
                    economy_data['콜금리'][idx+sequence_length:idx+sequence_length+1].to_list()
                ])
                # Each sample is [prices, economy] for x and for y.
                grouped_current_range_filled_and_economy_x = []
                grouped_current_range_filled_and_economy_x.extend([grouped_current_range_filled_x, economy_x])
                grouped_current_range_filled_and_economy_y = []
                grouped_current_range_filled_and_economy_y.extend([grouped_current_range_filled_y, economy_y])
                dongs_x.append(grouped_current_range_filled_and_economy_x)
                dongs_y.append(grouped_current_range_filled_and_economy_y)

        self.dongs_x = dongs_x
        self.dongs_y = dongs_y
        self.len = len(dongs_x)

    # real-estate x, economy x, real-estate y, economy y
    def __getitem__(self, i):
        """Return the four float tensors of sample `i` in the order listed above."""
        return torch.FloatTensor(self.dongs_x[i][0]), torch.FloatTensor(self.dongs_x[i][1]), torch.FloatTensor(self.dongs_y[i][0]), torch.FloatTensor(self.dongs_y[i][1])

    def __len__(self):
        """Return the number of (district, window) samples."""
        return self.len

# Build the district-level dataset from raw transactions and its loader.
# This rebinds the notebook-global batch_size, train_dataset and train_loader.
batch_size = 1
train_dataset = NODE_Transformer_Dataset(transaction_df, economy_df)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

In [13]:
# Scratch check: month starts from (start_date + 4 months) through 2022-12.
start_date = pd.Timestamp('20060101')
date_range = pd.date_range(start_date+pd.tseries.offsets.DateOffset(months=5-1), '20221201', freq='MS')
date_range

DatetimeIndex(['2006-05-01', '2006-06-01', '2006-07-01', '2006-08-01',
               '2006-09-01', '2006-10-01', '2006-11-01', '2006-12-01',
               '2007-01-01', '2007-02-01',
               ...
               '2022-03-01', '2022-04-01', '2022-05-01', '2022-06-01',
               '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01',
               '2022-11-01', '2022-12-01'],
              dtype='datetime64[ns]', length=200, freq='MS')

In [17]:
# Inspect the column dtypes and non-null counts of the transaction table.
transaction_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246685 entries, 0 to 246684
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   시군구     246685 non-null  object        
 1   단지명     246685 non-null  object        
 2   계약년월    246685 non-null  datetime64[ns]
 3   평단가     246685 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 9.4+ MB


In [26]:
# Scratch check: a range ending one month after start_date contains both month starts.
# Relies on start_date having been defined by another cell.
pd.date_range(start_date, start_date+pd.tseries.offsets.DateOffset(months=1), freq='MS')

DatetimeIndex(['2006-01-01', '2006-02-01'], dtype='datetime64[ns]', freq='MS')

In [32]:
# Exploratory cell: prototype of a growing-window sample builder. It only prints the
# first window of the first district (see the two `break`s) and appends nothing.
sequence_length = 5
# These are aliases, not copies: in-place changes below also affect transaction_df.
transaction_data = transaction_df
economy_data = economy_df

# Largest number of distinct complexes found in any single district.
all_dong_max_apartment_complex = transaction_data.drop_duplicates(subset=['시군구','단지명']).groupby(['시군구'])['단지명'].count().max()

# NOTE(review): inplace=True on the alias permanently drops rows from transaction_df as well.
transaction_data.drop_duplicates(subset=['시군구', '단지명', '계약년월'], keep='first', inplace=True) # why was this possible in rnn_transformer?
dongs_x, dongs_y = [], []
for dong in transaction_data['시군구'].unique():
    filtered_data = transaction_data[transaction_data['시군구'] == dong]
    start_date = pd.Timestamp('20060101')
    date_range = pd.date_range('20060101', '20221201', freq='MS')
    # NOTE(review): for the last idx, idx+sequence_length equals len(date_range), so
    # current_range_y is an empty slice.
    for idx in range(10,len(date_range)-sequence_length+1):
        # The input window always starts at start_date and spans sequence_length+idx months.
        current_range_x = pd.date_range(start_date, start_date+pd.tseries.offsets.DateOffset(months=sequence_length-1+idx), freq='MS')
        current_range_y = date_range[idx+sequence_length:idx+sequence_length+1]
        # Keep only complexes with at least sequence_length rows inside the window.
        current_range_filtered_x = filtered_data[filtered_data['계약년월'].isin(current_range_x)].groupby('단지명').filter(lambda x: len(x) >= sequence_length)
        # NOTE(review): the target window is a single month and rows were de-duplicated per
        # (시군구, 단지명, 계약년월) above, so with sequence_length=5 this filter drops every group.
        current_range_filtered_y = filtered_data[filtered_data['계약년월'].isin(current_range_y)].groupby('단지명').filter(lambda x: len(x) >= sequence_length)
        grouped_current_range_filtered_x = current_range_filtered_x.groupby('단지명').agg({'평단가': list}).reset_index()['평단가'].to_list()
        grouped_current_range_filtered_y = current_range_filtered_y.groupby('단지명').agg({'평단가': list}).reset_index()['평단가'].to_list()
        print(current_range_x)
        print(current_range_filtered_x)
        print(grouped_current_range_filtered_x)
        break
        # NOTE(review): the disabled code below uses the names grouped_current_range_filled_x/_y,
        # which this cell never defines (it defines ..._filtered_x/_y).
        # if len(grouped_current_range_filled_x) < all_dong_max_apartment_complex:
        #     for _ in range(all_dong_max_apartment_complex-len(grouped_current_range_filled_x)):
        #         grouped_current_range_filled_x.append([0.0]*sequence_length)
        #         grouped_current_range_filled_y.append([0.0])
        # economy_x, economy_y = [], []
        # economy_x.extend([
        #     economy_data['국고채금리'][idx:idx+sequence_length].to_list(),
        #     economy_data['콜금리'][idx:idx+sequence_length].to_list()
        # ])
        # economy_y.extend([
        #     economy_data['국고채금리'][idx+sequence_length:idx+sequence_length+1].to_list(),
        #     economy_data['콜금리'][idx+sequence_length:idx+sequence_length+1].to_list()
        # ])
        # grouped_current_range_filled_and_economy_x = []
        # grouped_current_range_filled_and_economy_x.extend([grouped_current_range_filled_x, economy_x])
        # grouped_current_range_filled_and_economy_y = []
        # grouped_current_range_filled_and_economy_y.extend([grouped_current_range_filled_y, economy_y])
        # dongs_x.append(grouped_current_range_filled_and_economy_x)
        # dongs_y.append(grouped_current_range_filled_and_economy_y)
    break

DatetimeIndex(['2006-01-01', '2006-02-01', '2006-03-01', '2006-04-01',
               '2006-05-01', '2006-06-01', '2006-07-01', '2006-08-01',
               '2006-09-01', '2006-10-01', '2006-11-01', '2006-12-01',
               '2007-01-01', '2007-02-01', '2007-03-01'],
              dtype='datetime64[ns]', freq='MS')
                 시군구             단지명       계약년월          평단가
0      서울특별시 강남구 개포동  개포2차현대아파트(220) 2006-03-01  2586.956522
1      서울특별시 강남구 개포동  개포2차현대아파트(220) 2006-04-01  2913.043478
2      서울특별시 강남구 개포동  개포2차현대아파트(220) 2006-06-01  2608.695652
3      서울특별시 강남구 개포동  개포2차현대아파트(220) 2006-10-01  3141.304348
4      서울특별시 강남구 개포동  개포2차현대아파트(220) 2006-12-01  3326.086957
...              ...             ...        ...          ...
86     서울특별시 강남구 개포동            포이벨리 2006-09-01  1180.000000
87     서울특별시 강남구 개포동            포이벨리 2006-10-01  1380.000000
17936  서울특별시 강남구 개포동  개포6차우성아파트1동~8동 2007-03-01  3875.000000
17939  서울특별시 강남구 개포동         개포주공6단지 2007-03-01  5000.000000
17952  서울

In [17]:
# Inspect the column dtypes and non-null counts of the transaction table
# (same check as the earlier In [17] cell).
transaction_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246685 entries, 0 to 246684
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   시군구     246685 non-null  object        
 1   단지명     246685 non-null  object        
 2   계약년월    246685 non-null  datetime64[ns]
 3   평단가     246685 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 9.4+ MB
