## Import

In [None]:
import os, random
import numpy as np
import pandas as pd

from ase.io import read

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, Dataset, DataLoader

from tqdm.auto import tqdm

# Print NumPy arrays in full instead of eliding the middle with "...".
# NOTE(review): presumably needed so the per-structure force arrays stored in
# the submission frame are written out completely by to_csv — confirm.
np.set_printoptions(threshold=np.inf)

In [2]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to `random`, NumPy and PyTorch.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects Python processes started after this point (e.g. workers);
    # the hash seed of the running interpreter was fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Without these two flags cuDNN may autotune and pick nondeterministic
    # kernels, so GPU runs can differ even with identical seeds.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)  # fix the seed for the whole notebook

## Pre-Processing

In [3]:
# Load every frame (index=':') of the train and test extended-XYZ files,
# plus the submission template that will be filled in later.
train, test = (
    read(xyz_path, index=':', format='extxyz')
    for xyz_path in ('./train.xyz', './test.xyz')
)
sample = pd.read_csv('./sample_submission.csv')

In [4]:
# Report how many training structures were read, then display the first one.
print(f"The number of data: {len(train)}")
train[0]

The number of data: 22510


Atoms(symbols='N24Si24', pbc=True, cell=[8.52238831, 8.52238831, 8.52238831], forces=..., calculator=SinglePointCalculator(...))

In [5]:
# Flatten the training structures into one row per atom.
sequence_train = []  # number of atoms in each structure, in file order
symbols = []         # never filled by this cell; kept so the name stays defined
positions_x, positions_y, positions_z = [], [], []
forces = []          # one length-3 force vector per atom (label 1)
energies = []        # one total energy per structure (label 2)

for mole in train:
    sequence_train.append(len(mole))
    energies.append(mole.get_total_energy())

    position = mole.get_positions()  # indexed below as [atom, xyz component]
    force = mole.get_forces()

    # Extend column-wise per structure instead of visiting atoms one by one;
    # the previous inner loop also built an unused Atom object for every atom.
    positions_x.extend(position[:, 0])
    positions_y.extend(position[:, 1])
    positions_z.extend(position[:, 2])
    forces.extend(force)  # iterating the 2-D array yields one row per atom

train_df = pd.DataFrame({
    'position_x': positions_x,
    'position_y': positions_y,
    'position_z': positions_z,
    'force': forces,
})
train_df.head()

Unnamed: 0,position_x,position_y,position_z,force
0,1.591737,4.200483,7.832245,"[-1.9364797, -2.75540073, 0.90898967]"
1,5.640802,2.305094,4.606757,"[1.77046974, -0.17350153, -1.99398617]"
2,6.672786,8.483263,2.981881,"[-2.05488716, -0.29381591, -0.89173793]"
3,1.908548,0.147931,1.741693,"[-0.89207197, -0.8143158, -1.36426899]"
4,4.37565,6.837884,1.948188,"[-4.65938123, -0.77685475, -3.07403915]"


In [6]:
# Flatten the test structures into one row per atom (no labels available).
sequence_test = []  # number of atoms in each structure, in file order
positions_x, positions_y, positions_z = [], [], []

for mole in test:
    sequence_test.append(len(mole))

    position = mole.get_positions()  # indexed below as [atom, xyz component]

    # Extend column-wise per structure instead of visiting atoms one by one;
    # the previous inner loop also built an unused Atom object for every atom.
    positions_x.extend(position[:, 0])
    positions_y.extend(position[:, 1])
    positions_z.extend(position[:, 2])

# 'force' starts out empty and is filled with model predictions later on.
test_df = pd.DataFrame({
    'position_x': positions_x,
    'position_y': positions_y,
    'position_z': positions_z,
    'force': None,
})
test_df.head()

Unnamed: 0,position_x,position_y,position_z,force
0,9.671275,8.734431,6.151755,
1,1.676806,2.238918,5.27045,
2,10.358608,4.824889,9.174357,
3,4.37062,5.391541,9.812298,
4,2.453404,10.449967,9.906622,


## [Force] Hyperparameter Setting

In [7]:
# 하이퍼파라미터
input_size = 3  # feature 개수
hidden_size = 256
output_size = 3 # target 개수
num_epochs = 3
batch_size = 256
learning_rate = 0.001

## [Force] Dataset

In [8]:
class ForceDataset(Dataset):
    """Per-atom dataset: position (x, y, z) -> force vector.

    The frame is converted to tensors once at construction time. Samples are
    then addressed by row position, so the frame's index labels do not matter
    (a filtered or re-ordered frame works), and no pandas lookup happens per
    sample. Changes made to `df` after construction are not picked up.

    Args:
        df: Frame with 'position_x', 'position_y', 'position_z' columns and,
            unless mode is 'test', a 'force' column of equal-length vectors.
        mode: 'test' yields inputs only; any other value yields
            (inputs, label) pairs.
    """

    def __init__(self, df, mode='test'):
        self.df = df
        self.mode = mode

        self.inputs = torch.tensor(
            df[['position_x', 'position_y', 'position_z']].to_numpy(dtype=np.float32)
        )
        if self.mode != 'test':
            # The 'force' cells hold one vector each; stack them into (n, dim).
            self.labels = torch.tensor(
                np.array(df['force'].tolist(), dtype=np.float32)
            )
        else:
            # The 'force' column is not read in test mode.
            self.labels = None

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        inputs = self.inputs[idx]

        if self.mode != 'test':
            return inputs, self.labels[idx]
        return inputs

In [9]:
# Wrap the per-atom frames for the force model.
train_dataset = ForceDataset(train_df, mode='train')
test_dataset = ForceDataset(test_df, mode='test')

# Only the training batches are shuffled; test batches keep the row order of
# test_df so predictions can be written back positionally.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)

## [Force] Model

In [10]:
class ForceModel(nn.Module):
    """MLP regressor: three Linear/BatchNorm/activation/Dropout stages + a head.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the first hidden layer (the next two are 128, 64).
        output_size: Number of regression targets. Defaults to 3, the value
            that used to be hard-coded.
        dropout_rate: Dropout probability after each hidden stage. Defaults
            to 0.5, the value that used to be hard-coded.
    """

    def __init__(self, input_size, hidden_size, output_size=3, dropout_rate=0.5):
        super().__init__()

        # Layer order is unchanged, so state_dict keys stay 'layers.0' .. 'layers.12'.
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(hidden_size, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(0.01),
            nn.Dropout(dropout_rate),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(64, output_size)
        )

    def forward(self, x):
        """Map a (batch, input_size) tensor to (batch, output_size) predictions."""
        return self.layers(x)

In [11]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"current device is {device}")

# Force regressor with an MSE objective and the Adam optimiser.
model = ForceModel(input_size, hidden_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

current device is cuda


## [Force] Train

In [12]:
print("Training Start!")

model.train()  # enable dropout and batch-statistics BatchNorm
for epoch in range(num_epochs):
    print(f"{epoch+1}/{num_epochs} epoch..")
    loss_sum, n_seen = 0.0, 0
    for inputs, labels in tqdm(train_loader):
        optimizer.zero_grad()

        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # The loss was previously computed but never reported, so there was no
        # way to tell whether training converged. `loss` is a batch mean, so
        # weight it by batch size to get a correct epoch mean.
        loss_sum += loss.item() * inputs.size(0)
        n_seen += inputs.size(0)
    print(f"train loss (MSE): {loss_sum / max(n_seen, 1):.6f}")

print("Training Complete!")

Training Start!
1/3 epoch..


100%|██████████| 5020/5020 [01:18<00:00, 64.02it/s]


2/3 epoch..


100%|██████████| 5020/5020 [00:57<00:00, 86.83it/s]


3/3 epoch..


100%|██████████| 5020/5020 [00:57<00:00, 88.05it/s]

Training Complete!





## [Force] Inference

In [13]:
print("Inference Start!")

model.eval()  # inference behaviour for Dropout / BatchNorm

# Collect one prediction row per atom, in test_loader order.
preds = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        batch_out = model(batch.to(device))
        preds.extend(batch_out.detach().cpu().numpy())

print("Inference Complete!")
len(preds)

Inference Start!


100%|██████████| 1154/1154 [00:06<00:00, 190.91it/s]

Inference Complete!





295234

## [Force] Submission

In [14]:
# Write the per-atom force predictions back next to their positions; this
# relies on `preds` being in the same row order as test_df.
test_df['force'] = preds # store the predictions

In [15]:
# 한 분자가 몇 개의 원자로 이루어져 있는지에 따라 범위를 생성
bundles_train, bundles_test = [], []

flag = 0
for size in sequence_train:
    bundles_train.append((flag, flag+size))
    flag += size

flag = 0
for size in sequence_test:
    bundles_test.append((flag, flag+size))
    flag += size    

In [16]:
# Regroup the flat per-atom predictions into one 2-D array per test structure.
preds_force = [np.vstack(preds[start:end]) for start, end in bundles_test]

sample['force'] = preds_force
sample

Unnamed: 0,ID,energy,force
0,TEST_0000,0,"[[0.00852358, 0.004790641, 0.00026761368], [0...."
1,TEST_0001,0,"[[0.008528535, 0.003925374, 0.0015593885], [0...."
2,TEST_0002,0,"[[0.008528535, 0.003925374, 0.0015593885], [0...."
3,TEST_0003,0,"[[0.008525672, 0.004425193, 0.0008131983], [0...."
4,TEST_0004,0,"[[0.008522416, 0.0049937684, -3.5639503e-05], ..."
...,...,...,...
4096,TEST_4096,0,"[[0.008237457, 0.0040524383, 0.0022960037], [0..."
4097,TEST_4097,0,"[[0.008528535, 0.003925374, 0.0015593885], [0...."
4098,TEST_4098,0,"[[0.008528535, 0.003925374, 0.0015593885], [0...."
4099,TEST_4099,0,"[[0.008528535, 0.003925374, 0.0015593885], [0...."


## [Energy] Preprocessing

In [17]:
# 'force' 컬럼의 값을 분해하여 각각의 행으로 만듦
force_df = train_df['force'].apply(pd.Series)
force_df.columns = [f'force_{i}' for i in range(3)]

# 분해한 'force' 컬럼을 추가
train_df = train_df.drop('force', axis=1).join(force_df)

# 'force' 컬럼의 값을 분해하여 각각의 행으로 만듦
force_df = test_df['force'].apply(pd.Series)
force_df.columns = [f'force_{i}' for i in range(3)]

# 분해한 'force' 컬럼을 추가
test_df = test_df.drop('force', axis=1).join(force_df)
test_df.head()

Unnamed: 0,position_x,position_y,position_z,force_0,force_1,force_2
0,9.671275,8.734431,6.151755,0.008524,0.004791,0.000268
1,1.676806,2.238918,5.27045,0.009385,0.004395,-0.000745
2,10.358608,4.824889,9.174357,0.010001,0.003093,-0.001962
3,4.37062,5.391541,9.812298,0.010793,0.006565,-0.007119
4,2.453404,10.449967,9.906622,0.008954,0.019782,-0.02357


In [18]:
# 데이터프레임에서 값 추출
sequences_train = [train_df.iloc[start:end].values for start, end in bundles_train]
sequences_test = [test_df.iloc[start:end].values for start, end in bundles_test]

## [Energy] Hyperparameter Setting

In [19]:
# Hyperparameters for the per-structure energy regressor.
input_size = 6   # number of features per atom
output_size = 1  # number of targets
hidden_size = 256
batch_size = 64
num_epochs = 1
learning_rate = 1e-3

## [Energy] Dataset

In [20]:
# 패딩을 사용하여 모든 시퀀스의 길이를 동일하게 만듦
max_len = max(seq.shape[0] for seq in sequences_train)
padded_sequences = [np.vstack([seq, np.zeros((max_len - seq.shape[0], 6))]) for seq in sequences_train]

# 패딩된 시퀀스를 2차원 배열로 변환
padded_array_train = np.stack(padded_sequences)
X_tensor_train = torch.tensor(padded_array_train, dtype=torch.float32)
y_tensor_train = torch.tensor(energies, dtype=torch.float32).view(-1, 1)
train_dataset = TensorDataset(X_tensor_train, y_tensor_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# 패딩을 사용하여 모든 시퀀스의 길이를 동일하게 만듦
max_len = max(seq.shape[0] for seq in sequences_test)
padded_sequences = [np.vstack([seq, np.zeros((max_len - seq.shape[0], 6))]) for seq in sequences_test]

# 패딩된 시퀀스를 2차원 배열로 변환
padded_array_test = np.stack(padded_sequences)
X_tensor_test = torch.tensor(padded_array_test, dtype=torch.float32)
test_dataset = TensorDataset(X_tensor_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## [Energy] Model

In [21]:
# BiLSTM 모델 정의
class EnergyModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, dropout_rate=0.5):
        super(EnergyModel, self).__init__()
        
        # Bidirectional LSTM with Dropout
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, 
                            batch_first=True, 
                            dropout=dropout_rate,
                            bidirectional=True)
        
        # Bidirectional LSTM이므로 hidden_size 조정
        self.linear = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, 1)
        )
        
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        energy = self.linear(lstm_out[:, -1, :])
        return energy

In [None]:
# 모델, 손실 함수, 옵티마이저 초기화
model = EnergyModel(input_size, hidden_size).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

## [Energy] Train

In [23]:
print("Training Start!!")

# Training
model.train()  # enable dropout and batch-statistics BatchNorm
for epoch in range(num_epochs):
    print(f"{epoch+1}/{num_epochs} epoch..")
    loss_sum, n_seen = 0.0, 0
    for inputs, labels in tqdm(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # The loss was previously computed but never reported, so there was no
        # way to tell whether training converged. `loss` is a batch mean, so
        # weight it by batch size to get a correct epoch mean.
        loss_sum += loss.item() * inputs.size(0)
        n_seen += inputs.size(0)
    print(f"train loss (MSE): {loss_sum / max(n_seen, 1):.6f}")

print("Training Complete!")

Training Start!!
1/1 epoch..


100%|██████████| 352/352 [00:17<00:00, 20.69it/s]

Training Complete!





## [Energy] Inference

In [24]:
print("Inference Start!")

model.eval()  # inference behaviour for Dropout / BatchNorm

# Collect one prediction per test structure, in test_loader order.
preds = []
with torch.no_grad():
    for (batch,) in tqdm(test_loader):  # the loader yields one-element batches
        batch_out = model(batch.to(device))
        preds.extend(batch_out.detach().cpu().numpy())

print("Inference Complete!")
len(preds)

Inference Start!


100%|██████████| 65/65 [00:00<00:00, 107.76it/s]

Inference Complete!





4101

## [Energy] Submission

In [25]:
# Each entry of `preds` is a one-element array; store plain Python floats.
# `preds` is deliberately not rebound: overwriting it with floats made a
# second run of this cell fail, because floats have no .item().
sample['energy'] = [pred.item() for pred in preds]
sample

Unnamed: 0,ID,energy,force
0,TEST_0000,-43.936798,"[[0.00852358, 0.004790641, 0.00026761368], [0...."
1,TEST_0001,-43.936798,"[[0.008528535, 0.003925374, 0.0015593885], [0...."
2,TEST_0002,-43.936798,"[[0.008528535, 0.003925374, 0.0015593885], [0...."
3,TEST_0003,-43.936810,"[[0.008525672, 0.004425193, 0.0008131983], [0...."
4,TEST_0004,-43.936821,"[[0.008522416, 0.0049937684, -3.5639503e-05], ..."
...,...,...,...
4096,TEST_4096,-43.936844,"[[0.008237457, 0.0040524383, 0.0022960037], [0..."
4097,TEST_4097,-43.936844,"[[0.008528535, 0.003925374, 0.0015593885], [0...."
4098,TEST_4098,-43.936844,"[[0.008528535, 0.003925374, 0.0015593885], [0...."
4099,TEST_4099,-43.936829,"[[0.008528535, 0.003925374, 0.0015593885], [0...."


In [26]:
# Write the filled-in submission frame to disk without the row index.
sample.to_csv('baseline_submission.csv', index=False)