In [1]:
!unzip '/content/drive/MyDrive/HD현대 AI Challenge/HD현대AI챌린지.zip'

Archive:  /content/drive/MyDrive/HD현대 AI Challenge/HD현대AI챌린지.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

train = pd.read_csv('train.csv').drop(columns=['SAMPLE_ID'])
test = pd.read_csv('test.csv').drop(columns=['SAMPLE_ID'])

# datetime 컬럼 처리
train['ATA'] = pd.to_datetime(train['ATA'])
test['ATA'] = pd.to_datetime(test['ATA'])

# datetime을 여러 파생 변수로 변환
for df in [train, test]:
    df['year'] = df['ATA'].dt.year
    df['month'] = df['ATA'].dt.month
    df['day'] = df['ATA'].dt.day
    df['hour'] = df['ATA'].dt.hour
    df['minute'] = df['ATA'].dt.minute
    df['weekday'] = df['ATA'].dt.weekday

# datetime 컬럼 제거
train.drop(columns='ATA', inplace=True)
test.drop(columns='ATA', inplace=True)

# Categorical 컬럼 인코딩
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
encoders = {}

for feature in tqdm(categorical_features, desc="Encoding features"):
    le = LabelEncoder()
    train[feature] = le.fit_transform(train[feature].astype(str))
    le_classes_set = set(le.classes_)
    test[feature] = test[feature].map(lambda s: '-1' if s not in le_classes_set else s)
    le_classes = le.classes_.tolist()
    bisect.insort_left(le_classes, '-1')
    le.classes_ = np.array(le_classes)
    test[feature] = le.transform(test[feature].astype(str))
    encoders[feature] = le

# 결측치 처리
train.fillna(train.mean(), inplace=True)
test.fillna(train.mean(), inplace=True)

Encoding features: 100%|██████████| 6/6 [00:01<00:00,  3.02it/s]


In [8]:
# 학습용 데이터 정답 분리
X = train.drop('CI_HOUR', axis=1)
y = train['CI_HOUR']

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [12]:
# dataset
# loader
# for each 학습
# 평가

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,dataloader

In [15]:
# 토치텐서로 변환
x_train = torch.tensor(x_train.values,dtype=torch.float32)
y_train = torch.tensor(y_train.values,dtype=torch.float32)
x_test = torch.tensor(x_test.values,dtype=torch.float32)
y_test = torch.tensor(y_test.values,dtype=torch.float32)

In [17]:
# 다층 퍼셉트론
class MLP(nn.Module):
  def __init__(self):
    super(MLP,self).__init__()
    self.fc1 = nn.Linear(in_features=len(X.columns), out_features=64)
    self.fc2 = nn.Linear(in_features=64, out_features=32)
    self.fc3 = nn.Linear(in_features=32, out_features=1)
    self.relu = nn.ReLU()
  def forward(self,x):
    x = self.relu(self.fc1(x))
    x = self.relu(self.fc2(x))
    x = self.fc3(x)
    return x
model = MLP()

In [None]:
y_train.reshape(-1,1)


In [26]:
train_dataset = torch.utils.data.TensorDataset(x_train,y_train.reshape(-1,1))
train_loader = torch.utils.data.DataLoader(train_dataset,batch_size=64, shuffle=True)

opt = optim.Adam(model.parameters(), lr = 1e-3)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model.to(device)
for epoch in range(10):
  iter = tqdm(train_loader)
  for data, label in iter:
    opt.zero_grad()
    pred = model(data.to(device))
    loss = nn.MSELoss()(pred,label.to(device))
    loss.backward()
    opt.step()
    iter.set_description(f"epoch{epoch} loss:{loss.item()}")
torch.save(model.state_dict(), 'MLP.pth')

epoch0 loss:5299.3203125: 100%|██████████| 4593/4593 [00:28<00:00, 161.29it/s]
epoch1 loss:21702.97265625: 100%|██████████| 4593/4593 [00:27<00:00, 166.11it/s]
epoch2 loss:8080.6572265625: 100%|██████████| 4593/4593 [00:30<00:00, 151.86it/s]
epoch3 loss:48804.4140625: 100%|██████████| 4593/4593 [00:30<00:00, 152.20it/s]
epoch4 loss:44927.9921875: 100%|██████████| 4593/4593 [00:28<00:00, 163.49it/s]
epoch5 loss:8953.5537109375: 100%|██████████| 4593/4593 [00:26<00:00, 171.01it/s]
epoch6 loss:5133.26025390625: 100%|██████████| 4593/4593 [00:26<00:00, 173.16it/s]
epoch7 loss:32146.376953125: 100%|██████████| 4593/4593 [00:26<00:00, 172.17it/s]
epoch8 loss:7804.5595703125: 100%|██████████| 4593/4593 [00:31<00:00, 143.65it/s]
epoch9 loss:35622.2421875: 100%|██████████| 4593/4593 [00:28<00:00, 163.71it/s]


In [27]:
# 모델 불러오기
model.load_state_dict(torch.load('MLP.pth', map_location=device))

<All keys matched successfully>

In [29]:
test_data =  torch.tensor(test.values,dtype=torch.float32)

In [30]:
with torch.no_grad():
  pred = model(test_data)


In [33]:
pred.numpy().shape

(244989, 1)

In [34]:
submit = pd.read_csv('./sample_submission.csv')
submit['CI_HOUR'] = pred.numpy()
submit.to_csv('./baseline_submit.csv', index=False)

In [35]:
submit.head()

Unnamed: 0,SAMPLE_ID,CI_HOUR
0,TEST_000000,22.175913
1,TEST_000001,115.257484
2,TEST_000002,-3.475577
3,TEST_000003,7.330209
4,TEST_000004,72.969231
