### Imputation with GAN

In [2]:
# Library
import copy
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from keras.layers import Dense, LSTM
from keras.layers import Conv1D, Flatten, Input, LeakyReLU, Reshape
from keras.models import Sequential
from keras.models import Model
from sklearn.impute import KNNImputer
from statsmodels.distributions.empirical_distribution import ECDF
from tqdm import tqdm

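train 데이터는 총 9가지 변수(temp, dp_temp, humid, wind, rain, tf_rain, land_temp, solar_amt, solar_time)에 대하여, test 데이터는 land_temp가 없으므로 나머지 8가지 변수에 대하여 GAN 기반 imputation을 진행해 보았다.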

In [3]:
class GANimputer():
    """Preprocess hourly weather observations and impute missing values with a GAN.

    Constructor argument:
        data : raw DataFrame to impute. A copy is stored on ``self.data`` so the
            caller's frame is never modified and cells can be re-run.

    Arguments shared by the methods:
        data_chr : kind of data, 'train' or anything else for the test layout
        var : list of the columns to impute
        epoch : number of GAN training epochs
        seed : random seed
    """

    def __init__(self, data):
        # Work on a private copy: initailize() renames and drops columns.
        self.data = data.copy()

    def initailize(self, data_chr):
        """Rename the columns, split ``mmddhh`` and turn sentinel values into NaN.

        data_chr : 'train' selects the 15-column layout that contains
            ``land_temp``; any other value selects the layout without it.

        Raises ValueError when an ``mmddhh`` value does not consist of at least
        five digits (month + 2-digit day + 2-digit hour).
        """
        # Positional names of the raw columns.
        col = ['idx', 'area', 'year', 'mmddhh', 'temp', 'dp_temp', 'humid',
               'wind', 'rain', 'tf_rain', 'fog', 'land_temp', 'solar_amt',
               'solar_time', 'snow']
        if data_chr != 'train':
            col.remove('land_temp')
        # Final column order: idx/mmddhh are dropped, month/day/hour are added.
        col2 = ['area', 'year', 'month', 'day', 'hour'] + col[4:]

        frame = self.data
        frame.columns = col
        frame['area'] = frame['area'].replace({'a': 1, 'b': 2, 'c': 3})

        # Split mmddhh: the last two digits are the hour, the two before them
        # the day, and whatever precedes them is the month.
        stamp = frame['mmddhh'].astype(str).str.extract(r'^(\d+)(\d{2})(\d{2})$')
        if stamp.isnull().any().any():
            raise ValueError("'mmddhh' values must consist of at least five digits")
        frame['month'] = stamp[0]
        frame['day'] = stamp[1]
        frame['hour'] = stamp[2]

        # Drop idx/mmddhh by selection, then mark the -99.9 / -99 sentinels as missing.
        frame = frame[col2].replace([-99.9, -99], np.nan)

        # Everything except the two categorical columns becomes float.
        numeric_cols = [name for name in col2 if name not in ('year', 'fog')]
        self.data = frame.astype({name: 'float' for name in numeric_cols})

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    initialize = initailize

    def preprocessing(self):
        """Fill the missing values that can be decided by rule, in place on ``self.data``.

        * ``solar_amt`` / ``solar_time`` that are missing while ``fog`` is not 'C'
          or the hour is at night (<= 6 or >= 20) are set to 0.
        * missing ``snow`` is set to 0.
        """
        hour = self.data['hour']
        no_sun = (self.data['fog'] != 'C') | (hour <= 6) | (hour >= 20)
        for column in ('solar_amt', 'solar_time'):
            self.data.loc[no_sun & self.data[column].isnull(), column] = 0
        self.data.loc[self.data['snow'].isnull(), 'snow'] = 0

    def fit(self, data, var, epoch, seed):
        """Train a GAN on ``data[var]`` and use its generator to fill the gaps.

        data : DataFrame holding the columns in ``var`` (rows in time order)
        var : list of the columns to impute
        epoch : number of training epochs
        seed : seed passed to ``np.random.seed``

        Returns ``(imputed_data_final, imputed_data_gan)``:
        ``imputed_data_final`` is ``data[var]`` with only the originally missing
        cells replaced; ``imputed_data_gan`` is the working array after
        imputation (mean-filled start values with generator output written into
        the missing cells). Missing cells in the first row keep the column
        mean because they have no previous row to use as context.
        """
        # Use only the variables that are to be imputed.
        new_df = data[var]
        dim = len(var)

        # NOTE(review): this seeds NumPy only; Keras weight initialisation is
        # not seeded here, so runs are not fully reproducible.
        np.random.seed(seed)

        # Remember where the missing values are.
        missing_mask = new_df.isnull()

        # Simple mean imputation as the starting point.
        imputed_data = new_df.fillna(new_df.mean())

        # Rows 1..n-1 serve as the pool of "real" samples (next time step).
        real_pool = imputed_data.values[1:]
        n_samples = real_pool.shape[0]
        if n_samples < 1:
            # Fewer than two rows: nothing to train on, keep the mean fill.
            return imputed_data.copy(), imputed_data

        # Generator: (dim, 1) sequence -> dim values.
        generator = Sequential()
        generator.add(LSTM(32, input_shape=(dim, 1)))
        generator.add(Dense(dim, activation='linear'))

        # Discriminator: takes flat (dim,) rows -- the shape of both the real
        # samples and the generator output -- and reshapes them to the 3D
        # input that Conv1D requires.
        discriminator = Sequential()
        discriminator.add(Reshape((dim, 1), input_shape=(dim,)))
        discriminator.add(Conv1D(64, 3, strides=2, padding='same'))
        discriminator.add(LeakyReLU(alpha=0.2))
        discriminator.add(Conv1D(128, 3, strides=2, padding='same'))
        discriminator.add(LeakyReLU(alpha=0.2))
        discriminator.add(Flatten())
        discriminator.add(Dense(1, activation='sigmoid'))
        discriminator.compile(loss='binary_crossentropy', optimizer='adam')

        # Freeze the discriminator inside the combined model so that the
        # generator step does not also update the discriminator weights.
        discriminator.trainable = False
        gan_input = Input(shape=(dim, 1))
        gan_output = discriminator(generator(gan_input))
        gan = Model(inputs=gan_input, outputs=gan_output)
        gan.compile(loss='binary_crossentropy', optimizer='adam')

        # Train the GAN model.
        # NOTE(review): the generator is trained on noise but is later fed the
        # previous row as context -- confirm this is the intended design.
        batch_size = 32
        real_labels = np.ones((batch_size, 1))
        fake_labels = np.zeros((batch_size, 1))
        for epoch_idx in range(epoch):
            # Train discriminator: real rows are labelled 1, generated rows 0.
            discriminator_loss = 0
            for _ in range(batch_size):
                idx = np.random.randint(0, n_samples, batch_size)
                real_samples = real_pool[idx]
                noise = np.random.normal(0, 1, (batch_size, dim, 1))
                generated_samples = generator.predict(noise, verbose=0)

                discriminator_loss += discriminator.train_on_batch(real_samples, real_labels)
                discriminator_loss += discriminator.train_on_batch(generated_samples, fake_labels)

            discriminator_loss /= batch_size

            # Train generator: it is rewarded when its output is judged real.
            noise = np.random.normal(0, 1, (batch_size, dim, 1))
            generator_loss = gan.train_on_batch(noise, real_labels)

            print(f"Epoch: {epoch_idx+1}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}")

        # GAN-based imputation: walk the rows in order so that an imputed row
        # can serve as context for the next one. One prediction per row is
        # enough because the context (previous row) is the same for every
        # missing cell of that row.
        imputed_array = imputed_data.values.copy()
        mask_values = missing_mask.values
        for i in np.flatnonzero(mask_values.any(axis=1)):
            if i == 0:
                continue  # no previous row; keep the mean fill
            context = imputed_array[i - 1].reshape(1, dim, 1)
            prediction = generator.predict(context, verbose=0).flatten()
            imputed_array[i, mask_values[i]] = prediction[mask_values[i]]

        imputed_data_gan = pd.DataFrame(imputed_array, columns=new_df.columns, index=new_df.index)

        # Replace only the originally missing cells; observed values stay untouched.
        imputed_data_final = new_df.mask(missing_mask, imputed_data_gan)

        return imputed_data_final, imputed_data_gan

    # Final imputer
    def impute(self, var, epoch, seed):
        """Impute ``var`` separately for every area and rebuild the full frame.

        var : list of the columns to impute
        epoch : number of GAN training epochs per area
        seed : random seed passed to ``fit``

        Returns a DataFrame with the columns area, year, month, day, hour,
        the imputed ``var`` columns, snow and fog, in the row order of
        ``self.data``.
        """
        parts = []
        for area in np.unique(self.data['area']):
            print('### area :', int(area))
            area_rows = self.data[self.data['area'] == area]
            imputed, _ = self.fit(area_rows.reset_index(drop=True), var, epoch, seed)
            # Restore the original row labels so the result can be aligned by
            # index even when the rows are not sorted by area.
            imputed.index = area_rows.index
            imputed.info()
            parts.append(imputed)

        if parts:
            result = pd.concat(parts, axis=0).reindex(self.data.index)
        else:
            result = pd.DataFrame(columns=var, index=self.data.index)

        df_final = pd.concat(
            [self.data[['area', 'year', 'month', 'day', 'hour']], result, self.data[['snow', 'fog']]],
            axis=1,
        )
        return df_final

    # Set negative values of the chosen columns to 0 (for variables such as
    # precipitation or sunshine amount/duration that cannot be negative).
    def replace_negative_values(self, df, col):
        """Return a copy of ``df`` in which the columns ``col`` are clipped at 0."""
        df_copy = df.copy()
        df_copy[col] = df_copy[col].clip(lower=0)
        return df_copy

In [4]:
# Load the raw training observations from CSV and display the frame.
path = 'surface_tp_train.csv'
df = pd.read_csv(path)
df

Unnamed: 0.1,Unnamed: 0,surface_tp_train.stn,surface_tp_train.year,surface_tp_train.mmddhh,surface_tp_train.ta,surface_tp_train.td,surface_tp_train.hm,surface_tp_train.ws,surface_tp_train.rn,surface_tp_train.re,surface_tp_train.ww,surface_tp_train.ts,surface_tp_train.si,surface_tp_train.ss,surface_tp_train.sn
0,1,1,A,20100,-9.9,-10.7,93.9,0.6,0.0,0,H,-1.3,-99.9,-99.9,-99.9
1,2,1,A,20101,-10.8,-11.6,93.8,0.6,0.0,0,H,-1.5,-99.9,-99.9,-99.9
2,3,1,A,20102,-11.4,-12.1,94.6,0.7,0.0,0,H,-1.7,-99.9,-99.9,-99.9
3,4,1,A,20103,-11.6,-12.5,93.4,0.6,0.0,0,H,-1.8,-99.9,-99.9,-99.9
4,5,1,A,20104,-11.8,-12.7,93.0,0.6,0.0,0,H,-2.0,-99.9,-99.9,-99.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438235,438236,10,F,13119,5.7,-7.2,39.2,3.9,0.0,0,C,3.6,-99.9,-99.9,-99.9
438236,438237,10,F,13120,5.2,-8.0,38.1,4.9,0.0,0,C,2.7,-99.9,-99.9,-99.9
438237,438238,10,F,13121,4.6,-7.9,39.8,6.6,0.0,0,C,2.2,-99.9,-99.9,-99.9
438238,438239,10,F,13122,3.6,-6.6,47.4,7.1,0.0,0,C,1.6,-99.9,-99.9,-99.9


In [5]:
# Changes to some RN (cumulative precipitation) values for the life-safety task of
# land-surface temperature prediction (announced 6/19): the listed values are
# overwritten with -99.9, which GANimputer.initailize later converts to NaN.
df.loc[df['surface_tp_train.rn'].isin([1.8, 3.3, 5.3, 19.5, 30.3, 623.5]), 'surface_tp_train.rn'] = -99.9

In [7]:
# Wrap the training frame, apply the 'train' column layout / type conversion,
# then run the rule-based fills for solar_amt, solar_time and snow.
imputer = GANimputer(df)
imputer.initailize('train')
imputer.preprocessing()

In [8]:
# Check dtypes and non-null counts of the preprocessed training frame.
imputer.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438240 entries, 0 to 438239
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   area        438240 non-null  float64
 1   year        438240 non-null  object 
 2   month       438240 non-null  float64
 3   day         438240 non-null  float64
 4   hour        438240 non-null  float64
 5   temp        437610 non-null  float64
 6   dp_temp     437594 non-null  float64
 7   humid       437662 non-null  float64
 8   wind        437233 non-null  float64
 9   rain        430245 non-null  float64
 10  tf_rain     436101 non-null  float64
 11  fog         438240 non-null  object 
 12  land_temp   437613 non-null  float64
 13  solar_amt   429221 non-null  float64
 14  solar_time  429865 non-null  float64
 15  snow        438240 non-null  float64
dtypes: float64(14), object(2)
memory usage: 53.5+ MB


In [9]:
# Summary statistics of the numeric columns after preprocessing.
imputer.data.describe()

Unnamed: 0,area,month,day,hour,temp,dp_temp,humid,wind,rain,tf_rain,land_temp,solar_amt,solar_time,snow
count,438240.0,438240.0,438240.0,438240.0,437610.0,437594.0,437662.0,437233.0,430245.0,436101.0,437613.0,429221.0,429865.0,438240.0
mean,5.5,6.523549,15.72782,11.5,13.812299,6.703887,65.647486,2.217351,0.148436,3.106656,15.433858,0.604583,0.289334,0.054271
std,2.872285,3.448537,8.799334,6.922194,10.174912,11.955404,21.625208,1.63826,1.225136,11.583692,12.640131,0.902354,0.42343,0.741158
min,1.0,1.0,1.0,0.0,-21.6,-41.5,3.0,0.0,0.0,0.0,-19.9,0.0,0.0,0.0
25%,3.0,4.0,8.0,5.75,5.8,-2.3,49.3,1.0,0.0,0.0,4.8,0.0,0.0,0.0
50%,5.5,7.0,16.0,11.5,14.7,7.3,66.6,1.8,0.0,0.0,15.6,0.02,0.0,0.0
75%,8.0,10.0,23.0,17.25,22.2,17.1,83.9,3.0,0.0,0.0,24.3,1.02,0.8,0.0
max,10.0,12.0,31.0,23.0,39.4,29.3,100.0,24.6,81.5,60.0,65.7,4.77,1.0,37.4


In [10]:
# Impute the nine training variables area by area (100 GAN epochs, seed 2023).
final = imputer.impute(var = ['temp', 'dp_temp', 'humid', 'wind', 'rain', 'tf_rain', 'land_temp', 'solar_amt',
               'solar_time'], epoch = 100, seed = 2023)

### area : 1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
 209/1370 [===>..........................] - ETA: 2s - loss: 9.3686

KeyboardInterrupt: 

In [None]:
# Clip imputed values below 0 to 0 for the variables that cannot be negative.
final = imputer.replace_negative_values(final, ['humid', 'wind', 'rain', 'tf_rain','solar_amt', 'solar_time'])

In [None]:
# Inspect dtypes and non-null counts of the imputed training frame.
final.info()

In [None]:
# Saving imputed training dataset
# index=False keeps the row index out of the file, so reading it back does not
# add an 'Unnamed: 0' column.
final.to_csv('train0624.csv', index=False)

In [None]:
# Read the saved training file back in.
dat1 = pd.read_csv('train0624.csv')

In [None]:
# Inspect dtypes and non-null counts of the reloaded training data.
dat1.info()

In [17]:
# test set imputation
# Load the test observations and run the same preparation steps as for the
# training data; any data_chr other than 'train' selects the layout without land_temp.
path = 'surface_tp_test.csv'
df_test = pd.read_csv(path)
model_test = GANimputer(df_test)
model_test.initailize('test')
model_test.preprocessing()

In [18]:
# Display the preprocessed test frame.
model_test.data

Unnamed: 0,area,year,month,day,hour,temp,dp_temp,humid,wind,rain,tf_rain,fog,solar_amt,solar_time,snow
0,1.0,F,2.0,1.0,0.0,0.6,-2.0,82.5,2.7,0.0,0.0,G,0.0,0.0,3.1
1,1.0,F,2.0,1.0,1.0,0.0,-5.2,68.3,3.2,0.0,0.0,R,0.0,0.0,3.1
2,1.0,F,2.0,1.0,2.0,-0.3,-6.4,63.7,2.7,0.0,0.0,C,0.0,0.0,3.1
3,1.0,F,2.0,1.0,3.0,-1.0,-4.5,77.2,2.1,0.2,7.0,R,0.0,0.0,4.1
4,1.0,F,2.0,1.0,4.0,-1.4,-3.1,88.3,2.9,0.6,3.0,R,0.0,0.0,4.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26275,3.0,G,1.0,31.0,19.0,7.1,-1.9,53.0,5.7,0.0,0.0,C,,,0.0
26276,3.0,G,1.0,31.0,20.0,6.7,-0.5,60.1,4.7,0.0,0.0,C,0.0,0.0,0.0
26277,3.0,G,1.0,31.0,21.0,6.2,-0.1,63.9,3.7,0.0,0.0,C,0.0,0.0,0.0
26278,3.0,G,1.0,31.0,22.0,6.5,0.8,67.1,4.8,0.0,0.0,C,0.0,0.0,0.0


In [19]:
# Check dtypes and non-null counts of the preprocessed test frame.
model_test.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26280 entries, 0 to 26279
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area        26280 non-null  float64
 1   year        26280 non-null  object 
 2   month       26280 non-null  float64
 3   day         26280 non-null  float64
 4   hour        26280 non-null  float64
 5   temp        26258 non-null  float64
 6   dp_temp     26245 non-null  float64
 7   humid       26248 non-null  float64
 8   wind        26240 non-null  float64
 9   rain        25275 non-null  float64
 10  tf_rain     26153 non-null  float64
 11  fog         26280 non-null  object 
 12  solar_amt   25709 non-null  float64
 13  solar_time  25711 non-null  float64
 14  snow        26280 non-null  float64
dtypes: float64(13), object(2)
memory usage: 3.0+ MB


In [29]:
# Impute the eight test variables (no land_temp in the test layout) area by
# area, with the same epoch count and seed as for the training data.
df_test_gan = model_test.impute(var = ['temp', 'dp_temp', 'humid', 'wind', 'rain', 'tf_rain', 'solar_amt',
               'solar_time'], epoch = 100, seed = 2023)

In [30]:
# Clip impossible negative imputations to 0 for the same variables as in the
# training pipeline, so both splits are post-processed identically.
df_test_final = model_test.replace_negative_values(
    df_test_gan, ['humid', 'wind', 'rain', 'tf_rain', 'solar_amt', 'solar_time'])

In [31]:
# Summary statistics of the imputed test frame.
df_test_final.describe()

In [32]:
# Save the imputed test data without the row index.
df_test_final.to_csv('imputed_test_data0624.csv', index=False)

In [21]:
# Read the saved test file back in.
dat2 = pd.read_csv('imputed_test_data0624.csv')

In [22]:
# Inspect dtypes and non-null counts of the reloaded test data.
dat2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26280 entries, 0 to 26279
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area        26280 non-null  float64
 1   year        26280 non-null  object 
 2   month       26280 non-null  float64
 3   day         26280 non-null  float64
 4   hour        26280 non-null  float64
 5   temp        26280 non-null  float64
 6   dp_temp     26280 non-null  float64
 7   humid       26280 non-null  float64
 8   wind        26280 non-null  float64
 9   rain        26280 non-null  float64
 10  tf_rain     26280 non-null  float64
 11  solar_amt   26280 non-null  float64
 12  solar_time  26280 non-null  float64
 13  snow        26280 non-null  float64
 14  fog         26280 non-null  object 
dtypes: float64(13), object(2)
memory usage: 3.0+ MB
