SyntaxError: invalid syntax (3037475004.py, line 1)

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import youtokentome as yttm
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

df = pd.read_csv("jobs_with_target.csv").dropna()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["description"], df["salary_avg"], test_size=0.2, random_state=42
)

with open("train_texts.txt", "w", encoding="utf-8") as f:
    for text in train_texts:
        f.write(text + "\n")

BPE_MODEL = "bpe_model.model"
yttm.BPE.train(data="train_texts.txt", vocab_size=5000, model=BPE_MODEL)
bpe = yttm.BPE(model=BPE_MODEL)

def encode_texts(texts, max_len=100):
    return [bpe.encode(text, output_type=yttm.OutputType.ID, bos=True, eos=True)[:max_len] for text in texts]

train_tokens = encode_texts(train_texts)
test_tokens = encode_texts(test_texts)

def pad_sequences(sequences, max_len=100):
    return [seq + [0] * (max_len - len(seq)) for seq in sequences]

train_tokens = pad_sequences(train_tokens)
test_tokens = pad_sequences(test_tokens)

train_tokens = torch.tensor(train_tokens, dtype=torch.long)
test_tokens = torch.tensor(test_tokens, dtype=torch.long)
train_labels = torch.tensor(train_labels.values, dtype=torch.float32)
test_labels = torch.tensor(test_labels.values, dtype=torch.float32)

class JobDataset(Dataset):
    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.tokens[idx], self.labels[idx]

train_dataset = JobDataset(train_tokens, train_labels)
test_dataset = JobDataset(test_tokens, test_labels)

BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

#Embedding + LSTM + Linear
class SalaryPredictor(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, lstm_units=128, hidden_units=64):
        super(SalaryPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, lstm_units, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(lstm_units, hidden_units),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_units, hidden_units // 2),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_units // 2, 1)
        )

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :] #последний LSTM-выход
        x = self.fc(x)
        return x.squeeze(1)

VOCAB_SIZE = 5000
model = SalaryPredictor(VOCAB_SIZE)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

#Тренировка модели
EPOCHS = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for tokens, labels in train_loader:
        tokens, labels = tokens.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(tokens)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

def evaluate(model, dataloader):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for tokens, labels in dataloader:
            tokens, labels = tokens.to(device), labels.to(device)
            outputs = model(tokens).cpu().numpy()
            predictions.extend(outputs)
            targets.extend(labels.cpu().numpy())

    mse = mean_squared_error(targets, predictions)
    mae = mean_absolute_error(targets, predictions)
    r2 = r2_score(targets, predictions)
    return mse, mae, r2

train_mse, train_mae, train_r2 = evaluate(model, train_loader)
test_mse, test_mae, test_r2 = evaluate(model, test_loader)

print(f"\nTrain Metrics:\n MSE: {train_mse:.2f}, MAE: {train_mae:.2f}, R2: {train_r2:.2f}")
print(f"\nTest Metrics:\n MSE: {test_mse:.2f}, MAE: {test_mae:.2f}, R2: {test_r2:.2f}")


Epoch 1, Loss: 9756496384.0
Epoch 2, Loss: 9756471808.0
Epoch 3, Loss: 9756446720.0
Epoch 4, Loss: 9756426240.0
Epoch 5, Loss: 9756390400.0

Train Metrics:
 MSE: 9756358866.94, MAE: 91277.71, R2: -5.85

Test Metrics:
 MSE: 11925065702.04, MAE: 99007.08, R2: -4.62
