In [1]:
import os
import sys
from pathlib import Path
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch

# The project-local imports below (`src.*`, `constants`) are resolved via the
# parent of the current working directory, so that directory is put on sys.path.
parent_dir = Path().resolve().parent
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))
from src.data.loader import load_excel
from src.data.AbstractDataset import AbstractDataset
from src.encoders.bert import get_model, get_tokenizer
from src.regressors.TextRegressor import TextRegressor
from constants import DataSetConfig as DataSetConfig
from src.train.train import train

In [2]:

# Run configuration: the tunable values used by the cells below.
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 10  # passed to train()
LR = 0.001  # passed to train()
# NOTE(review): this is handed to train_test_split as `test_size`, so 0.8 holds
# out 80% of the rows for testing and trains on only 20%. Confirm this is
# intended and not meant to be 0.2.
TEST_SIZE = 0.8
RANDOM_STATE = 42
# Seed PyTorch's global RNG with the same value used for the data split, so
# DataLoader shuffling (and presumably model weight initialisation) is
# repeatable after Restart Kernel -> Run All.
torch.manual_seed(RANDOM_STATE)
BATCH_SIZE = 16

## Load and split

In [3]:
# Locate the spreadsheet relative to the project root and load the configured columns.
file_path = os.path.join(parent_dir, DataSetConfig.PATH)
df = load_excel(file_path, DataSetConfig.COLUMNS)

# The target column becomes y; every other column goes into X, with missing
# cells replaced by empty strings.
y = df[DataSetConfig.FEATURES_TARGET]
X = df.drop(columns=[DataSetConfig.FEATURES_TARGET]).fillna('')

# Seeded split so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

## Train

In [4]:

# Hand the training texts, their targets and the tokenizer to AbstractDataset,
# then serve the result in shuffled mini-batches.
tokenizer = get_tokenizer()
X_train_dataset = AbstractDataset(
    X_train[DataSetConfig.FEATURES_BODY].values,
    y_train,
    tokenizer,
)
train_loader = DataLoader(dataset=X_train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# TextRegressor is constructed with the encoder's hidden size and moved to DEVICE.
bert_model = get_model(DEVICE)
hidden_size = bert_model.config.hidden_size
model = TextRegressor(hidden_size).to(DEVICE)

train(model, train_loader, bert_model, DEVICE, EPOCHS, LR)

Epoch 1/10:   0%|          | 0/48 [01:38<?, ?it/s, loss=0.8824] 

Epoch 1/10, Avg Loss: 2.8719



Epoch 1/10:   0%|          | 0/48 [01:38<?, ?it/s, loss=0.8824]

Epoch 2/10:   0%|          | 0/48 [00:02<?, ?it/s, loss=1.1678][A
Epoch 2/10:   0%|          | 0/48 [00:04<?, ?it/s, loss=0.9398][A
Epoch 2/10:   0%|          | 0/48 [00:06<?, ?it/s, loss=0.6960][A
Epoch 2/10:   0%|          | 0/48 [00:08<?, ?it/s, loss=1.6200][A
Epoch 2/10:   0%|          | 0/48 [00:10<?, ?it/s, loss=1.2896][A
Epoch 2/10:   0%|          | 0/48 [00:12<?, ?it/s, loss=2.4554][A
Epoch 2/10:   0%|          | 0/48 [00:14<?, ?it/s, loss=1.5521][A
Epoch 2/10:   0%|          | 0/48 [00:16<?, ?it/s, loss=1.1066][A
Epoch 2/10:   0%|          | 0/48 [00:18<?, ?it/s, loss=2.2068][A
Epoch 2/10:   0%|          | 0/48 [00:20<?, ?it/s, loss=2.4327][A
Epoch 2/10:   0%|          | 0/48 [00:22<?, ?it/s, loss=2.0155][A
Epoch 2/10:   0%|          | 0/48 [00:24<?, ?it/s, loss=1.4670][A
Epoch 2/10:   0%|          | 0/48 [00:27<?, ?it/s, loss=0.8518][A
Epoch 2/10:   0%|          | 0/48 [00:29<?, ?it/s, loss=1.2259]

Epoch 2/10, Avg Loss: 1.4509


Epoch 2/10:   0%|          | 0/48 [01:42<?, ?it/s, loss=0.6697]
Epoch 3/10:   0%|          | 0/48 [00:13<?, ?it/s, loss=1.1340]

KeyboardInterrupt: 

## Evaluate

In [None]:
# Build the held-out dataset the same way as the training dataset: body texts,
# targets, and the tokenizer created in the training cell. The original call
# left the tokenizer out, unlike the training dataset.
test_dataset = AbstractDataset(X_test[DataSetConfig.FEATURES_BODY].values, y_test, tokenizer)
# No shuffling: evaluation batches keep the order of the test split.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)