# nan 처리된 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch

# Load the NaN-containing train/test feature tables. Each CSV is indexed by its
# parsed 'datetime' column, and every column is coerced to numeric right away;
# values that cannot be parsed become NaN (errors='coerce').
train_X_data = pd.read_csv(
    './train_set/train_X_data_nan.csv',
    index_col='datetime',
    parse_dates=True,
).apply(pd.to_numeric, errors='coerce')
test_X_data = pd.read_csv(
    './test_set/test_X_data_nan.csv',
    index_col='datetime',
    parse_dates=True,
).apply(pd.to_numeric, errors='coerce')

# 선형보간

In [6]:
# Replace missing values by linear interpolation along each column.
# limit_direction='both' is required for a complete fill: with the pandas
# default ('forward'), NaNs located before a column's first valid observation
# are left untouched, so the "imputed" files could still contain missing
# values. With 'both', leading and trailing gaps are filled as well.
# NOTE(review): method='linear' treats rows as equally spaced and ignores the
# datetime index values; consider method='time' if sampling is irregular.
train_X_data_impute = train_X_data.interpolate(method='linear', limit_direction='both')
test_X_data_impute = test_X_data.interpolate(method='linear', limit_direction='both')

# Sanity check: interpolation must not change the table shapes.
print("Train X data shape:", train_X_data_impute.shape)
print("Test X data shape:", test_X_data_impute.shape)

# Persist the interpolated tables (the index is written as the first column).
train_X_data_impute.to_csv('./train_set/train_X_data_linear.csv')
test_X_data_impute.to_csv('./test_set/test_X_data_linear.csv')

Train X data shape: (40911, 35)
Test X data shape: (2920, 35)


# Iterative

In [3]:
from sklearn.experimental import enable_iterative_imputer  # 실험적 기능 활성화
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

In [14]:
# Impute missing values with IterativeImputer.
# The imputer is fitted on the training table only (fit_transform) and the
# already-fitted imputer is then applied to the test table (transform), so no
# test-set statistics are used for fitting. random_state=42 fixes the seed.
imputerIterative = IterativeImputer(random_state=42)
X_train_imputed = imputerIterative.fit_transform(train_X_data)
X_test_imputed = imputerIterative.transform(test_X_data)



In [18]:
# Wrap the imputer outputs in DataFrames that carry the same index and column
# labels as the corresponding input tables.
X_train_imputed_df = pd.DataFrame(
    X_train_imputed,
    index=train_X_data.index,
    columns=train_X_data.columns,
)
X_test_imputed_df = pd.DataFrame(
    X_test_imputed,
    index=test_X_data.index,
    columns=test_X_data.columns,
)

In [19]:
# Sanity-check the shapes of the imputed tables.
print("Train X data shape:", X_train_imputed_df.shape)
print("Test X data shape:", X_test_imputed_df.shape)

# Persist the IterativeImputer results (the index is written as the first column).
X_train_imputed_df.to_csv('./train_set/train_X_data_iterative.csv')
X_test_imputed_df.to_csv('./test_set/test_X_data_iterative.csv')

Train X data shape: (40911, 35)
Test X data shape: (2920, 35)


# KNN

In [20]:
# Impute missing values with KNNImputer (10 neighbours).
# The imputer is fitted once on the training table; both splits are then
# passed through the same fitted imputer.
imputerKNN = KNNImputer(n_neighbors=10).fit(train_X_data)
X_train_imputed = imputerKNN.transform(train_X_data)
X_test_imputed = imputerKNN.transform(test_X_data)

In [21]:
# Wrap the imputer outputs in DataFrames that carry the same index and column
# labels as the corresponding input tables.
X_train_imputed_df = pd.DataFrame(
    X_train_imputed,
    index=train_X_data.index,
    columns=train_X_data.columns,
)
X_test_imputed_df = pd.DataFrame(
    X_test_imputed,
    index=test_X_data.index,
    columns=test_X_data.columns,
)

In [22]:
# Sanity-check the shapes of the imputed tables.
print("Train X data shape:", X_train_imputed_df.shape)
print("Test X data shape:", X_test_imputed_df.shape)

# Persist the KNNImputer results (the index is written as the first column).
X_train_imputed_df.to_csv('./train_set/train_X_data_KNN.csv')
X_test_imputed_df.to_csv('./test_set/test_X_data_KNN.csv')

Train X data shape: (40911, 35)
Test X data shape: (2920, 35)


# Mean

In [23]:
# Impute missing values with SimpleImputer: each NaN is replaced by the mean
# of its column. The column means are learned from the training table only and
# the same fitted imputer is applied to both splits.
imputerSimple = SimpleImputer(strategy='mean').fit(train_X_data)
X_train_imputed = imputerSimple.transform(train_X_data)
X_test_imputed = imputerSimple.transform(test_X_data)

In [24]:
# Wrap the imputer outputs in DataFrames that carry the same index and column
# labels as the corresponding input tables.
X_train_imputed_df = pd.DataFrame(
    X_train_imputed,
    index=train_X_data.index,
    columns=train_X_data.columns,
)
X_test_imputed_df = pd.DataFrame(
    X_test_imputed,
    index=test_X_data.index,
    columns=test_X_data.columns,
)

In [25]:
# Sanity-check the shapes of the imputed tables.
print("Train X data shape:", X_train_imputed_df.shape)
print("Test X data shape:", X_test_imputed_df.shape)

# Persist the mean-imputed results (the index is written as the first column).
X_train_imputed_df.to_csv('./train_set/train_X_data_mean.csv')
X_test_imputed_df.to_csv('./test_set/test_X_data_mean.csv')

Train X data shape: (40911, 35)
Test X data shape: (2920, 35)


# Most Frequent

In [26]:
# Impute missing values with SimpleImputer: each NaN is replaced by the most
# frequent value of its column (strategy='most_frequent', not the mean).
# The fill values are learned from the training table only and the same fitted
# imputer is applied to both splits.
imputerSimple = SimpleImputer(strategy='most_frequent').fit(train_X_data)
X_train_imputed = imputerSimple.transform(train_X_data)
X_test_imputed = imputerSimple.transform(test_X_data)

In [27]:
# Wrap the imputer outputs in DataFrames that carry the same index and column
# labels as the corresponding input tables.
X_train_imputed_df = pd.DataFrame(
    X_train_imputed,
    index=train_X_data.index,
    columns=train_X_data.columns,
)
X_test_imputed_df = pd.DataFrame(
    X_test_imputed,
    index=test_X_data.index,
    columns=test_X_data.columns,
)

In [28]:
# Sanity-check the shapes of the imputed tables.
print("Train X data shape:", X_train_imputed_df.shape)
print("Test X data shape:", X_test_imputed_df.shape)

# Persist the most-frequent-imputed results (the index is written as the first column).
X_train_imputed_df.to_csv('./train_set/train_X_data_MF.csv')
X_test_imputed_df.to_csv('./test_set/test_X_data_MF.csv')

Train X data shape: (40911, 35)
Test X data shape: (2920, 35)


# Random Sampling

In [1]:
from scipy.stats import norm

# Estimate the distribution of the non-missing data
def estimate_distribution(column):
    """Fit a normal distribution to the observed values of one column.

    Args:
        column: pandas Series; NaN entries are dropped before fitting.

    Returns:
        Tuple ``(mu, std)``: the location and scale estimated by
        ``scipy.stats.norm.fit`` on the non-NaN values.
    """
    non_na_data = column.dropna()
    mu, std = norm.fit(non_na_data)
    return mu, std


# Replace missing values with samples drawn from the estimated distribution
def fill_missing_with_distribution(df, random_state=None):
    """Return a copy of ``df`` with NaNs replaced by normal random samples.

    For every column that contains missing values, a normal distribution is
    fitted to the column's observed values and one sample per missing cell is
    drawn from it. Columns without missing values are left untouched. The
    input frame is never modified.

    Args:
        df: pandas DataFrame of numeric columns.
        random_state: Optional seed (int), ``numpy.random.Generator`` or
            ``numpy.random.RandomState`` that makes the sampling reproducible.
            ``None`` (default) keeps the previous behaviour of drawing from
            NumPy's global random state.

    Returns:
        A new DataFrame with the same shape, index and columns as ``df``.

    Raises:
        ValueError: If a column consists only of missing values, because no
            distribution can be estimated for it.
    """
    # Build ONE generator up front and share it across columns. Passing the
    # same integer seed to every norm.rvs call would restart the stream each
    # time and give all columns identical (perfectly correlated) draws.
    if random_state is None or isinstance(
        random_state, (np.random.Generator, np.random.RandomState)
    ):
        rng = random_state
    else:
        rng = np.random.default_rng(random_state)

    filled_df = df.copy()  # work on a copy so the caller's frame is untouched
    for column in df.columns:
        missing_mask = df[column].isna()  # positions of the missing cells
        n_missing = int(missing_mask.sum())
        if n_missing == 0:
            # Nothing to fill: skip the (pointless) distribution fit.
            continue
        if n_missing == len(df):
            # Fitting on zero observations yields NaN parameters and a
            # cryptic scipy "Domain error"; fail with a clear message instead.
            raise ValueError(
                f"column {column!r} has no observed values to fit a distribution on"
            )
        mu, std = estimate_distribution(df[column])  # mean, standard deviation
        # Sample one value per missing cell from the fitted distribution.
        filled_df.loc[missing_mask, column] = norm.rvs(
            loc=mu, scale=std, size=n_missing, random_state=rng
        )
    return filled_df

In [3]:
# Impute missing values by random sampling.
# NOTE(review): each table is filled from distributions fitted on that same
# table, so the test set is not imputed from training-set statistics here
# (unlike the sklearn imputers, which are fitted on the training table only).
train_X = fill_missing_with_distribution(train_X_data)
test_X = fill_missing_with_distribution(test_X_data)

In [6]:
# Sanity-check the shapes of the imputed tables, then persist the
# random-sampling results (the index is written as the first column).
print("Train X data shape:", train_X.shape)
print("Test X data shape:", test_X.shape)
train_X.to_csv('./train_set/train_X_data_RS.csv')
test_X.to_csv('./test_set/test_X_data_RS.csv')

Train X data shape: (40911, 35)
Test X data shape: (2920, 35)


# Median


In [5]:
# Impute missing values with SimpleImputer: each NaN is replaced by the median
# of its column. The medians are learned from the training table only and the
# same fitted imputer is applied to both splits.
imputerSimple = SimpleImputer(strategy='median').fit(train_X_data)
X_train_imputed = imputerSimple.transform(train_X_data)
X_test_imputed = imputerSimple.transform(test_X_data)

In [6]:
# Wrap the imputer outputs in DataFrames that carry the same index and column
# labels as the corresponding input tables.
X_train_imputed_df = pd.DataFrame(
    X_train_imputed,
    index=train_X_data.index,
    columns=train_X_data.columns,
)
X_test_imputed_df = pd.DataFrame(
    X_test_imputed,
    index=test_X_data.index,
    columns=test_X_data.columns,
)

In [7]:
# Sanity-check the shapes of the imputed tables.
print("Train X data shape:", X_train_imputed_df.shape)
print("Test X data shape:", X_test_imputed_df.shape)

# Persist the median-imputed results (the index is written as the first column).
X_train_imputed_df.to_csv('./train_set/train_X_data_median.csv')
X_test_imputed_df.to_csv('./test_set/test_X_data_median.csv')

Train X data shape: (40911, 35)
Test X data shape: (2920, 35)
