In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Locations of the input CSV files (relative to the notebook's working directory)
weather_path = '../data_preprocessing/normalized_analyze_abnormal.csv'  # weather table
pollutants_path = '../../NewData/Weekly_Air_Pollutants.csv'  # air-pollutant table

In [None]:
# Transformer model definition
class WeatherTransformer(nn.Module):
    """Sequence-to-one regressor built on ``nn.Transformer``.

    Each time step is projected to ``d_model`` features, a learned positional
    table is added, the sequence is run through the transformer, and the
    representation of the last time step is mapped to ``output_dim`` values.

    Args:
        input_dim: Number of features per time step in the input.
        output_dim: Number of values predicted per sequence.
        d_model: Width of the transformer embeddings.
        nhead: Number of attention heads.
        num_layers: Number of encoder layers.
        sequence_length: Maximum sequence length; size of the positional table.
        dropout: Dropout probability passed to ``nn.Transformer``.
    """

    def __init__(self, input_dim, output_dim, d_model, nhead, num_layers, sequence_length, dropout=0.1):
        super().__init__()
        self.input_embedding = nn.Linear(input_dim, d_model)
        # Learned positional table, initialised to zeros: one row per time step.
        self.positional_encoding = nn.Parameter(torch.zeros(1, sequence_length, d_model))
        # NOTE(review): only the encoder depth follows `num_layers`; the decoder
        # depth is left at the nn.Transformer default - confirm this is intended.
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead, num_encoder_layers=num_layers, dropout=dropout
        )
        self.fc = nn.Linear(d_model, output_dim)

    def forward(self, x):
        """Predict ``output_dim`` values for each sequence in the batch.

        Args:
            x: Tensor of shape (batch, sequence, input_dim). The sequence may be
                shorter than ``sequence_length``; the leading rows of the
                positional table are used in that case.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        # Slice the positional table to the actual input length so that shorter
        # sequences work; for full-length input this is the whole table.
        seq_len = x.size(1)
        x = self.input_embedding(x) + self.positional_encoding[:, :seq_len, :]
        x = x.permute(1, 0, 2)  # (batch, sequence, features) -> (sequence, batch, features)
        # The same tensor is fed as both source and target, so the full
        # encoder-decoder stack runs (no mask is applied).
        x = self.transformer(x, x)
        x = x.permute(1, 0, 2)  # (sequence, batch, features) -> (batch, sequence, features)
        x = x[:, -1, :]  # keep only the last time step
        x = self.fc(x)
        return x

# Load each CSV and convert its 'datetime' column to real timestamps
pollutants_data = pd.read_csv(pollutants_path)
pollutants_data['datetime'] = pd.to_datetime(pollutants_data['datetime'])

weather_data = pd.read_csv(weather_path)
weather_data['datetime'] = pd.to_datetime(weather_data['datetime'])

# Join the two tables on timestamp; the inner join keeps only timestamps found in both
merged_data = weather_data.merge(pollutants_data, on='datetime', how='inner')

# Columns used as model input (X) and as prediction target (Y)
input_features = ['CO', 'Nox', 'Sox', 'TSP', 'PM-10', 'VOCs', 'NH3']
target_features = ['temp', 'humidity', 'precip', 'windspeed']

Y = merged_data[target_features].values
X = merged_data[input_features].values

# Window size (number of consecutive rows) for the sliding-window sequences
sequence_length = 7
def create_sequences(data, target, sequence_length):
    """Cut `data` into overlapping windows and pair each with the following target row.

    Window `k` is `data[k:k + sequence_length]`; its label is
    `target[k + sequence_length]`, i.e. the step right after the window.

    Args:
        data: Indexable sequence of rows (e.g. a 2-D array).
        target: Sequence aligned with `data`, or None to build windows only.
        sequence_length: Number of rows in each window.

    Returns:
        `(windows, labels)` as numpy arrays, or `(windows, None)` when
        `target` is None.
    """
    n_windows = len(data) - sequence_length
    windows = np.array([data[start:start + sequence_length] for start in range(n_windows)])
    if target is None:
        return windows, None
    labels = np.array([target[start + sequence_length] for start in range(n_windows)])
    return windows, labels

# Build (window, next-step target) pairs from the merged series
X_seq, Y_seq = create_sequences(X, Y, sequence_length)

# Train/test split.
# Consecutive windows overlap, so a shuffled split would put near-identical
# sequences on both sides and make the test loss look better than it is.
# shuffle=False keeps the split chronological: the last 20% is held out.
X_train, X_test, Y_train, Y_test = train_test_split(X_seq, Y_seq, test_size=0.2, shuffle=False)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
Y_test_tensor = torch.tensor(Y_test, dtype=torch.float32)

# Fixed seed so weight initialisation and batch shuffling are repeatable
torch.manual_seed(42)

# DataLoaders (only the training batches are shuffled)
batch_size = 32
train_loader = DataLoader(TensorDataset(X_train_tensor, Y_train_tensor), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test_tensor, Y_test_tensor), batch_size=batch_size, shuffle=False)

# Model hyperparameters
input_dim = len(input_features)
output_dim = len(target_features)
d_model = 64
nhead = 4
num_layers = 2
dropout = 0.1
learning_rate = 0.001
epochs = 20

# Model, loss and optimizer
model = WeatherTransformer(input_dim, output_dim, d_model, nhead, num_layers, sequence_length, dropout)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for X_batch, Y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, Y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Evaluate on the held-out windows (dropout off, no gradients)
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for X_batch, Y_batch in test_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, Y_batch)
            test_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss / len(train_loader):.4f}, Test Loss: {test_loss / len(test_loader):.4f}")

# Predict for every window, in original order, so that row k of `predictions`
# lines up with row k + sequence_length of `merged_data`.
model.eval()
full_loader = DataLoader(
    TensorDataset(torch.tensor(X_seq, dtype=torch.float32)), batch_size=batch_size, shuffle=False
)
with torch.no_grad():
    predictions = torch.cat([model(X_batch) for (X_batch,) in full_loader]).numpy()

# Compare against the actual values.
# The first `sequence_length` rows have no prediction because they only serve
# as history for the first window. This MSE covers training and held-out
# windows alike; the per-epoch "Test Loss" above is the held-out figure.
actual_weather = merged_data[target_features].values[sequence_length:]
mse = mean_squared_error(actual_weather, predictions)
print(f"Mean Squared Error (MSE) between actual and predicted weather: {mse:.4f}")

# Side-by-side table of actual and predicted values
comparison_df = pd.DataFrame({
    'datetime': merged_data['datetime'].iloc[sequence_length:].reset_index(drop=True),
    'Actual Temp': actual_weather[:, 0],
    'Predicted Temp': predictions[:, 0],
    'Actual Humidity': actual_weather[:, 1],
    'Predicted Humidity': predictions[:, 1],
    'Actual Precip': actual_weather[:, 2],
    'Predicted Precip': predictions[:, 2],
    'Actual Windspeed': actual_weather[:, 3],
    'Predicted Windspeed': predictions[:, 3]
})

print(comparison_df.head())

TypeError: 'int' object is not callable