# Iteration 01

In [6]:
import os
import sys
from pathlib import Path
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch

# Make the directory one level above the current working directory importable,
# so the project-local imports below can be resolved from there.
parent_dir = Path().resolve().parent  # absolute path, one level up

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))



from src.data.loader import load_excel
from src.data.AbstractDataset import AbstractDataset
from src.encoders.bert import get_model, get_tokenizer
from src.regressors.TextRegressor import TextRegressor
from constants import DataSetConfig as DataSetConfig
from src.train.train import train

In [7]:

# Run configuration: everything a reader might want to tune lives in this cell.

# Use the GPU when PyTorch reports one as available, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Optimisation settings handed to the training routine.
EPOCHS = 10
LR = 0.001

# Fraction of rows held out for testing, and the seed shared by every
# source of randomness configured in this notebook.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Seed PyTorch as well, so that DataLoader shuffling and model weight
# initialisation are repeatable across "Restart & Run All".
torch.manual_seed(RANDOM_STATE)

# Number of samples per batch for the DataLoaders.
BATCH_SIZE = 16

## Load and split

In [8]:
# Locate the dataset relative to parent_dir and load the configured columns.
file_path = os.path.join(parent_dir, DataSetConfig.PATH)
df = load_excel(file_path, DataSetConfig.COLUMNS)

# Separate the target column from the remaining columns; missing values in
# the remaining columns are replaced with empty strings.
y = df[DataSetConfig.FEATURES_TARGET]
X = df.drop(columns=[DataSetConfig.FEATURES_TARGET])
X = X.fillna('')

# Hold out TEST_SIZE of the rows; RANDOM_STATE fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

## Train

In [9]:

# Build the training dataset from the body-text column and its targets.
tokenizer = get_tokenizer()
train_texts = X_train[DataSetConfig.FEATURES_BODY].values
# NOTE(review): y_train presumably still carries the shuffled pandas index from
# train_test_split, while the texts are passed as a positional array — confirm
# that AbstractDataset indexes the targets positionally.
X_train_dataset = AbstractDataset(train_texts, y_train, tokenizer)
train_loader = DataLoader(
    dataset=X_train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# The regressor is sized from the encoder's hidden dimension and moved to DEVICE.
bert_model = get_model(DEVICE)
hidden_size = bert_model.config.hidden_size
model = TextRegressor(hidden_size).to(DEVICE)

# Run the training routine for EPOCHS epochs with learning rate LR.
train(model, train_loader, bert_model, DEVICE, EPOCHS, LR)

Epoch 1/10:   0%|          | 0/190 [01:10<?, ?it/s, loss=1.4233] 

KeyboardInterrupt: 

## Evaluate

In [None]:
# Wrap the held-out body texts and targets in the same dataset class used for
# training. The tokenizer created in the training cell is passed explicitly so
# that the test split is constructed the same way as the training split.
# NOTE(review): y_test presumably still carries the shuffled pandas index from
# train_test_split — confirm that AbstractDataset indexes targets positionally.
test_dataset = AbstractDataset(
    X_test[DataSetConfig.FEATURES_BODY].values, y_test, tokenizer
)

# Keep the original order for evaluation; no shuffling is needed here.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)