In [2]:
ls

[0m[01;32mTest_AI_2022_en.pdf[0m*   test_preds_reg.csv  [01;32mtrain_set_data.csv[0m*
[01;34mtest_checkpoints_reg[0m/  [01;32mtest_set_data.csv[0m*


In [3]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split

2023-07-04 17:37:01.331834: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-04 17:37:01.334214: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-04 17:37:01.375285: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-04 17:37:01.376430: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Load the training and test data
train_data = pd.read_csv('train_set_data.csv')
test_data = pd.read_csv('test_set_data.csv')

# Split the training data into features (smiles) and target variable (activity)
train_features = train_data['SMILES'].values
train_targets = train_data['Activity'].values

# Split the training data into train and validation sets
train_features, val_features, train_targets, val_targets = train_test_split(train_features, train_targets, test_size=0.2, random_state=42)


In [5]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the training features
train_encodings = tokenizer.batch_encode_plus(train_features.tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')

# Tokenize the validation features
val_encodings = tokenizer.batch_encode_plus(val_features.tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')

# Tokenize the test features
test_encodings = tokenizer.batch_encode_plus(test_data['SMILES'].tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')

# Create torch datasets
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_targets))
val_dataset = torch.utils.data.TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_targets))
test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])

In [6]:
class RegressionModel(nn.Module):
    def __init__(self, num_labels):
        super(RegressionModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.linear(pooled_output)
        return logits

# Set the number of regression labels (1 in this case)
num_labels = 1

# Initialize the regression model
model = RegressionModel(num_labels)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = nn.MSELoss()

# Set the device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, targets = batch
        input_ids, attention_mask, targets = input_ids.to(device), attention_mask.to(device), targets.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs.squeeze(), targets.float())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)

    # Evaluation on validation set
    model.eval()
    val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, targets = batch
            input_ids, attention_mask, targets = input_ids.to(device), attention_mask.to(device), targets.to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs.squeeze(), targets.float())
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)

    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(f'Training Loss: {avg_loss:.4f} | Validation Loss: {avg_val_loss:.4f}')

# Predict on test data
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        outputs = model(input_ids, attention_mask)
        predictions.extend(outputs.squeeze().tolist())

# Convert predictions to pandas DataFrame
test_predictions = pd.DataFrame({'Predicted Activity': predictions})

# Save predictions to a CSV file
test_predictions.to_csv('predictions.csv', index=False)

Epoch 1/10:
Training Loss: 1.4131 | Validation Loss: 1.2507
Epoch 2/10:
Training Loss: 1.1079 | Validation Loss: 1.1447
Epoch 3/10:
Training Loss: 0.9920 | Validation Loss: 0.9793
Epoch 4/10:
Training Loss: 0.8743 | Validation Loss: 0.9146
Epoch 5/10:
Training Loss: 0.7558 | Validation Loss: 0.8552
Epoch 6/10:
Training Loss: 0.6389 | Validation Loss: 0.8790
Epoch 7/10:
Training Loss: 0.5640 | Validation Loss: 0.7800
Epoch 8/10:
Training Loss: 0.5247 | Validation Loss: 0.7906
Epoch 9/10:
Training Loss: 0.4425 | Validation Loss: 0.7601
Epoch 10/10:
Training Loss: 0.4157 | Validation Loss: 0.7848


In [9]:
ls

predictions.csv       [0m[01;34mtest_checkpoints_reg[0m/  [01;32mtest_set_data.csv[0m*
[01;32mTest_AI_2022_en.pdf[0m*  test_preds_reg.csv     [01;32mtrain_set_data.csv[0m*


In [10]:
test_predictions

Unnamed: 0,Predicted Activity
0,2.103446
1,-0.493939
2,3.518135
3,3.127757
4,2.582865
...,...
832,2.019315
833,1.929239
834,2.856727
835,2.834792
