# Iteration 01

In [1]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch

from data.loader import load_excel
from data.AbstractDataset import AbstractDataset
from encoders.bert import getModel, getTokenizer
from regressors.TextRegressor import TextRegressor
from constants import DataSetConfig as DataSetConfig
from train.train import train

In [2]:

# Run configuration shared by the cells below.

# Prefer the GPU when PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Training-loop settings.
EPOCHS, LR = 10, 1e-3
BATCH_SIZE = 16

# Train/test split settings.
TEST_SIZE, RANDOM_STATE = 0.2, 42

## Load and split

In [3]:
# Load the configured columns, then separate the target from the features.
target_column = DataSetConfig.FEATURES_TARGET
df = load_excel(DataSetConfig.PATH, DataSetConfig.COLUMNS)

y = df[target_column]
# Missing feature cells are replaced with empty strings.
X = df.drop(columns=[target_column]).fillna('')

# Hold out a TEST_SIZE fraction of the rows; RANDOM_STATE makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

## Train

In [None]:

tokenizer = getTokenizer()

# train_test_split shuffles the rows but y_train keeps its original index
# labels, whereas the texts are handed over as a plain positional array
# (`.values`). Resetting the index makes label-based and position-based
# lookups of the targets agree with the position of each text.
# NOTE(review): AbstractDataset's own indexing is not visible here - confirm
# it reads targets by integer position.
X_train_dataset = AbstractDataset(
    X_train[DataSetConfig.FEATURES_BODY].values,
    y_train.reset_index(drop=True),
    tokenizer,
)
# Reshuffle the training samples on every epoch.
train_loader = DataLoader(X_train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# The regression head is sized from the encoder's hidden dimension.
bert_model = getModel(DEVICE)
model = TextRegressor(bert_model.config.hidden_size).to(DEVICE)

train(model, train_loader, bert_model, DEVICE, EPOCHS, LR)

## Evaluate

In [None]:
# Build the held-out dataset with the same arguments as the training one.
# The tokenizer created in the training cell is passed as the third argument;
# the original call here omitted it.
# The target index is reset because train_test_split leaves the original row
# labels on y_test while the texts are a positional array (`.values`).
# NOTE(review): AbstractDataset's own indexing is not visible here - confirm
# it reads targets by integer position.
test_dataset = AbstractDataset(
    X_test[DataSetConfig.FEATURES_BODY].values,
    y_test.reset_index(drop=True),
    tokenizer,
)
# Keep evaluation order deterministic: no shuffling.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)