In [37]:
import pandas as pd
import numpy as np
import torch
from torch import nn, Tensor, optim, cuda
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

In [38]:
# Load the raw training table from disk and report its (rows, columns) shape.
train_set = pd.read_csv(filepath_or_buffer="./train_data.csv")
print(train_set.shape)

(39457, 294)


In [39]:
# Select the compute device and seed the random number generators.
# Preference order: Apple MPS, then CUDA, then CPU. `device` is always bound,
# so later cells that call `.to(device)` do not fail with a NameError on
# machines without MPS.
if torch.backends.mps.is_available():
    device = 'mps'
else:
    print ("MPS device not found.")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Kept under its original name for any cell that refers to the torch.device object.
mps_device = torch.device(device)
# Silently ignored by PyTorch when CUDA is not available.
torch.cuda.manual_seed_all(777)
print(device)

torch.manual_seed(777)

mps


<torch._C.Generator at 0x14082db70>

In [40]:
# Remove uninformative columns: keep only the columns whose number of distinct
# non-null values is not exactly 1.
# NOTE(review): a column that is entirely NaN has nunique() == 0 and is therefore kept.
# .copy() makes the result an independent frame, so later in-place writes do not
# target a selection of `train_set` (avoids SettingWithCopyWarning).
# The `data` alias is preserved because the original cell bound it as well.
train_set_cleaned = data = train_set.loc[:, train_set.nunique() != 1].copy()

# Check the result
print(train_set_cleaned.shape)

(39457, 204)


In [41]:
# Fill missing numeric values on even-id rows from the neighbouring rows.
def fill_missing_values(data):
    """Fill NaNs on rows whose 'id' is even, using the rows directly above and below.

    Only columns of dtype float64/int64 other than 'id' are processed; columns
    of any other dtype are left untouched.

    For a NaN cell on an even-id row:
      * first row  -> value of the next row,
      * last row   -> value of the previous row,
      * otherwise  -> mean of the previous and next rows (the result is still
        NaN when either neighbour is NaN).

    Rows are addressed by position, so the frame may carry any index (the
    previous label-based lookups required a 0..n-1 index). Frames with fewer
    than two rows have no neighbour to copy from and are returned unchanged.

    Args:
        data: DataFrame containing an 'id' column. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
    columns_to_process = [col for col in numeric_columns if col != 'id']

    n_rows = len(data)
    if n_rows < 2:
        return data

    # Positional mask of the rows that are eligible for filling.
    even_id_rows = data['id'].to_numpy() % 2 == 0

    for col in columns_to_process:
        values = data[col].to_numpy(dtype='float64', copy=True)
        gap_positions = np.flatnonzero(even_id_rows & np.isnan(values))
        if gap_positions.size == 0:
            # Nothing to fill; skipping also keeps int64 columns from being upcast.
            continue

        # Ascending order: a cell filled earlier may serve as the "previous"
        # neighbour of a later gap, as in a row-by-row pass.
        for pos in gap_positions:
            if pos == 0:
                values[pos] = values[pos + 1]
            elif pos == n_rows - 1:
                values[pos] = values[pos - 1]
            else:
                values[pos] = (values[pos - 1] + values[pos + 1]) / 2

        data[col] = values
    return data

# Run the gap-filling step on the cleaned training frame.
filled_trained_set_subset = fill_missing_values(data=train_set_cleaned)

# Keep only the rows that have a non-null 'cloudcoverBelfast_weather' value.
has_cloudcover = train_set_cleaned['cloudcoverBelfast_weather'].notnull()
cloudcover_belfast_rows = train_set_cleaned[has_cloudcover]

In [42]:
# Preview the id / cloud-cover pair for the first rows, then report the shape
# and type of the gap-filled training frame.
preview_columns = ['id', 'cloudcoverBelfast_weather']
print(cloudcover_belfast_rows[preview_columns].head())
print(filled_trained_set_subset.shape)
print(type(filled_trained_set_subset))

   id  cloudcoverBelfast_weather
0   0                       72.0
1   1                       72.0
2   2                       81.0
3   3                       90.0
4   4                       95.0
(39457, 204)
<class 'pandas.core.frame.DataFrame'>


In [43]:
# Build the training tensors from the gap-filled frame.
data = filled_trained_set_subset
# Columns from position 3 onward are used as features and the column at
# position 2 as the target.
# .copy() makes x_data an independent frame, so the scaled values below are
# not written into a slice of `data` (avoids SettingWithCopyWarning).
x_data = data.iloc[:, 3:].copy()
y_data = data.iloc[:, [2]]

# Select the numeric columns only and scale each of them to the [0, 1] range.
numeric_columns = x_data.select_dtypes(include=[np.number]).columns
scaler = MinMaxScaler()
x_data[numeric_columns] = scaler.fit_transform(x_data[numeric_columns])
# Remaining missing values become 0 (rebinding instead of inplace=True).
x_data = x_data.fillna(0)
# Explicit float32 tensors (the dtype torch.Tensor(...) produced before).
x_train = torch.tensor(x_data[numeric_columns].to_numpy(), dtype=torch.float32).to(device)
y_train = torch.tensor(y_data.to_numpy(), dtype=torch.float32).to(device)
print(x_data.shape)
print(y_data.shape)

(39457, 201)
(39457, 1)


In [44]:
class BatteryOutputPredictionModel(nn.Module):
    """Fully connected regressor: in_features -> 150 -> 100 -> 50 -> 1.

    Each of the three hidden linear layers is followed by BatchNorm1d and
    ReLU; the final linear layer produces a single value per sample with no
    normalisation or activation. All linear weights use Xavier-uniform
    initialisation.

    Args:
        in_features: Number of input features per sample. Defaults to 201,
            the width that was previously hard-coded.
    """

    def __init__(self, in_features: int = 201):
        super().__init__()
        self.linear1 = nn.Linear(in_features, 150)
        self.batch_norm1 = nn.BatchNorm1d(150)
        self.linear2 = nn.Linear(150, 100)
        self.batch_norm2 = nn.BatchNorm1d(100)
        self.linear3 = nn.Linear(100, 50)
        self.batch_norm3 = nn.BatchNorm1d(50)
        self.linear4 = nn.Linear(50, 1)

        # Initialize weights using Xavier uniform initialization
        # (same layer order as before, so seeded runs draw identical weights).
        for layer in (self.linear1, self.linear2, self.linear3, self.linear4):
            torch.nn.init.xavier_uniform_(layer.weight)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return predictions of shape (batch, 1) for x of shape (batch, in_features)."""
        x = self.relu(self.batch_norm1(self.linear1(x)))
        x = self.relu(self.batch_norm2(self.linear2(x)))
        x = self.relu(self.batch_norm3(self.linear3(x)))
        # No batch norm or activation after the last linear layer.
        return self.linear4(x)

In [45]:
# Create the network and move it to the selected device.
model = BatteryOutputPredictionModel()
model = model.to(device)

# Mean-squared-error objective, optimised with plain SGD at learning rate 0.1.
criterion = nn.MSELoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.1)

In [46]:
# Mini-batch size
batch_size = 64

# Pair features with targets and serve them in shuffled mini-batches.
dataset = TensorDataset(x_train, y_train)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

In [47]:
def train(dataloader, model, optimizer, criterion, num_epochs=1000):
    """Train `model` on `dataloader` and print the average batch loss per epoch.

    Args:
        dataloader: Iterable yielding (X, Y) batches; len(dataloader) must be
            the number of batches per epoch.
        model: Module to train; it is switched to training mode.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss function called as criterion(prediction, Y).
        num_epochs: Number of passes over the dataloader. Defaults to 1000,
            the value that was previously hard-coded.

    Returns:
        None. Progress is reported on stdout, one line per epoch.
    """
    total_batch = len(dataloader)
    # Move batches to wherever the model's parameters live instead of relying
    # on a notebook-global `device` variable.
    model_device = next(model.parameters()).device
    model.train()

    for epoch in range(num_epochs):
        avg_cost = 0.0

        for X, Y in dataloader:
            X = X.to(model_device)
            Y = Y.to(model_device)

            optimizer.zero_grad()
            prediction = model(X)
            loss = criterion(prediction, Y)
            loss.backward()
            optimizer.step()

            # .item() yields a plain Python float, so the running average does
            # not keep a reference to each batch's autograd history.
            avg_cost += loss.item() / total_batch

        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

In [48]:
# Run the training loop on the training batches.
train(dataloader=dataloader, model=model, optimizer=optimizer, criterion=criterion)

Epoch: 0001 cost = 4.581576347
Epoch: 0002 cost = 4.478846550
Epoch: 0003 cost = 4.437811375
Epoch: 0004 cost = 4.418032646
Epoch: 0005 cost = 4.386753559
Epoch: 0006 cost = 4.370287895
Epoch: 0007 cost = 4.345763683
Epoch: 0008 cost = 4.310221672
Epoch: 0009 cost = 4.297514915
Epoch: 0010 cost = 4.248674393
Epoch: 0011 cost = 4.221480370
Epoch: 0012 cost = 4.209295750
Epoch: 0013 cost = 4.169816971
Epoch: 0014 cost = 4.146825314
Epoch: 0015 cost = 4.116108418
Epoch: 0016 cost = 4.063551426
Epoch: 0017 cost = 4.051523209
Epoch: 0018 cost = 4.022755623
Epoch: 0019 cost = 4.007181168
Epoch: 0020 cost = 3.991583347
Epoch: 0021 cost = 3.949930191
Epoch: 0022 cost = 3.927198172
Epoch: 0023 cost = 3.870541811
Epoch: 0024 cost = 3.863215685
Epoch: 0025 cost = 3.870408058
Epoch: 0026 cost = 3.847440481
Epoch: 0027 cost = 3.800487995
Epoch: 0028 cost = 3.817695856
Epoch: 0029 cost = 3.749864817
Epoch: 0030 cost = 3.779456615
Epoch: 0031 cost = 3.703594446
Epoch: 0032 cost = 3.679924726
Epoch: 0

In [49]:
# Load the raw test table from disk and report its (rows, columns) shape.
test_set = pd.read_csv(filepath_or_buffer="./test_data.csv")
print(test_set.shape)

(9888, 293)


In [50]:
# Remove uninformative columns: keep only the columns whose number of distinct
# non-null values is not exactly 1.
# NOTE(review): the columns are chosen from the test data alone, so they may
# differ from the columns kept for the training data - verify they match.
# .copy() makes the result an independent frame, so later in-place writes do not
# target a selection of `test_set` (avoids SettingWithCopyWarning).
# The `data` alias is preserved because the original cell bound it as well.
test_set_cleaned = data = test_set.loc[:, test_set.nunique() != 1].copy()

# Check the result
print(test_set_cleaned.shape)

(9888, 203)


In [51]:
# Fill missing numeric values on even-id rows from the neighbouring rows.
# NOTE(review): this notebook defines fill_missing_values in two cells; the
# later definition shadows the earlier one. Consider keeping a single definition.
def fill_missing_values(data):
    """Fill NaNs on rows whose 'id' is even, using the rows directly above and below.

    Only columns of dtype float64/int64 other than 'id' are processed; columns
    of any other dtype are left untouched.

    For a NaN cell on an even-id row:
      * first row  -> value of the next row,
      * last row   -> value of the previous row,
      * otherwise  -> mean of the previous and next rows (the result is still
        NaN when either neighbour is NaN).

    Rows are addressed by position, so the frame may carry any index (the
    previous label-based lookups required a 0..n-1 index). Frames with fewer
    than two rows have no neighbour to copy from and are returned unchanged.

    Args:
        data: DataFrame containing an 'id' column. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
    columns_to_process = [col for col in numeric_columns if col != 'id']

    n_rows = len(data)
    if n_rows < 2:
        return data

    # Positional mask of the rows that are eligible for filling.
    even_id_rows = data['id'].to_numpy() % 2 == 0

    for col in columns_to_process:
        values = data[col].to_numpy(dtype='float64', copy=True)
        gap_positions = np.flatnonzero(even_id_rows & np.isnan(values))
        if gap_positions.size == 0:
            # Nothing to fill; skipping also keeps int64 columns from being upcast.
            continue

        # Ascending order: a cell filled earlier may serve as the "previous"
        # neighbour of a later gap, as in a row-by-row pass.
        for pos in gap_positions:
            if pos == 0:
                values[pos] = values[pos + 1]
            elif pos == n_rows - 1:
                values[pos] = values[pos - 1]
            else:
                values[pos] = (values[pos - 1] + values[pos + 1]) / 2

        data[col] = values
    return data

# Run the gap-filling step on the cleaned test frame.
filled_test_set_subset = fill_missing_values(data=test_set_cleaned)

# Keep only the rows that have a non-null 'cloudcoverBelfast_weather' value.
has_cloudcover = test_set_cleaned['cloudcoverBelfast_weather'].notnull()
cloudcover_belfast_rows = test_set_cleaned[has_cloudcover]

# Results: preview of the id / cloud-cover pair, then shape and type of the filled frame.
preview_columns = ['id', 'cloudcoverBelfast_weather']
print(cloudcover_belfast_rows[preview_columns].head())
print(filled_test_set_subset.shape)
print(type(filled_test_set_subset))

      id  cloudcoverBelfast_weather
0  39457                      100.0
1  39458                      100.0
2  39459                      100.0
3  39460                      100.0
4  39461                      100.0
(9888, 203)
<class 'pandas.core.frame.DataFrame'>


In [52]:
# Build the test feature tensor from the gap-filled test frame.
data = filled_test_set_subset
# Columns from position 2 onward are taken as candidate features; .copy()
# keeps later operations off a slice of `data`.
x_data = data.iloc[:, 2:].copy()

# Scale with the scaler that was already fitted on the training features.
# Fitting a fresh MinMaxScaler on the test data would map the same raw value
# to a different scaled value than the model saw during training.
# NOTE(review): relies on `scaler` still being the train-fitted MinMaxScaler and
# on scikit-learn exposing `feature_names_in_` (set when fit on a DataFrame).
numeric_columns = pd.Index(scaler.feature_names_in_)
missing_columns = numeric_columns.difference(x_data.columns)
if len(missing_columns) > 0:
    raise ValueError(
        "Test data is missing features used for training: "
        + ", ".join(map(str, missing_columns))
    )

# Select the training features in training order, scale them, and replace
# the remaining missing values with 0.
x_data = pd.DataFrame(
    scaler.transform(x_data[numeric_columns]),
    columns=numeric_columns,
    index=x_data.index,
).fillna(0)
x_data = torch.tensor(x_data.to_numpy(), dtype=torch.float32).to(device)
print(x_data.shape)

torch.Size([9888, 201])


In [53]:
# Mini-batch size
batch_size = 64

# Serve the test features in their original order, keeping the final partial batch.
dataset = TensorDataset(x_data)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
)

In [54]:
def evaluate_and_save(dataloader, model, file_path):
    """Run `model` over `dataloader` and write the predictions to a CSV file.

    Args:
        dataloader: Iterable yielding one-element batches `(X,)`.
        model: Module used for prediction; it is switched to evaluation mode
            and left in that mode.
        file_path: Destination of the CSV file. It has a single 'Prediction'
            column and no index column.

    Returns:
        None.
    """
    model.eval()  # evaluation mode
    # Send batches to the device the model's parameters are on, instead of
    # relying on a notebook-global `device`; fall back to CPU for a model
    # without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    predictions = []
    with torch.no_grad():  # gradients are not needed for inference
        for (X,) in dataloader:
            output = model(X.to(target_device))
            # One array per sample, in dataloader order.
            predictions.extend(output.cpu().numpy())

    # Convert the predictions to a DataFrame and save it as CSV.
    df = pd.DataFrame(predictions, columns=['Prediction'])
    df.to_csv(file_path, index=False)


In [55]:
# Call the function: predict on the test batches and write the results to CSV.
file_path = 'predictions.csv'
evaluate_and_save(dataloader=dataloader, model=model, file_path=file_path)