In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

In [None]:
# Define a function to preprocess data and tokenize text records
def preprocess_data(data):
    """Clean a labelled DataFrame and tokenize its text column for BERT.

    Rows with a missing ``text_record`` or ``ocms`` value are discarded before
    tokenization: a missing text makes the tokenizer reject the whole batch and
    a missing label cannot be converted with ``int()``. The caller's DataFrame
    is not modified.

    Args:
        data: pandas DataFrame with a ``text_record`` column (the text to
            encode) and an ``ocms`` column (labels convertible with ``int()``).

    Returns:
        A tuple ``(input_ids, attention_mask, labels)``. The first two are the
        ``'input_ids'`` and ``'attention_mask'`` entries produced by the
        ``bert-base-uncased`` tokenizer with ``return_tensors='pt'``; ``labels``
        is a list of ``int`` aligned row-for-row with them.

    Raises:
        KeyError: if either required column is absent from ``data``.
    """
    # dropna returns a new frame, so the input stays intact for the caller.
    complete_rows = data.dropna(subset=['text_record', 'ocms'])

    # The tokenizer only accepts str items; coerce any stray non-string cells.
    texts = complete_rows['text_record'].astype(str).tolist()

    # Tokenize text records using BERT tokenizer (padded, cut at 512 tokens).
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    encoded_data = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
    input_ids = encoded_data['input_ids']
    attention_mask = encoded_data['attention_mask']

    # Labels come from the same filtered rows so they stay aligned with texts.
    labels = [int(label) for label in complete_rows['ocms'].tolist()]

    return input_ids, attention_mask, labels

In [None]:
# Define a function to train a BERT model on the data
def train_model(input_ids, attention_mask, labels, batch_size=16, num_epochs=5, random_state=None):
    """Fine-tune ``bert-base-uncased`` as a two-class classifier and return it.

    The samples are split 80/20 into training and validation subsets. Each
    epoch runs one shuffled pass over the training subset in mini-batches and
    then prints the validation loss and accuracy.

    Args:
        input_ids: token-id tensor (or nested list) with one row per sample.
        attention_mask: mask with the same number of rows as ``input_ids``.
        labels: class indices, one per sample. A plain Python list (as returned
            by ``preprocess_data``) or a tensor; converted to an int64 tensor.
        batch_size: number of samples per forward/backward pass.
        num_epochs: number of passes over the training subset.
        random_state: forwarded to ``train_test_split`` to make the split
            repeatable; ``None`` keeps the split random.

    Returns:
        The fine-tuned model, left in evaluation mode.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 or the three inputs do
            not contain the same number of samples.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    input_ids = torch.as_tensor(input_ids)
    attention_mask = torch.as_tensor(attention_mask)
    # CrossEntropyLoss needs class indices as an int64 tensor, not a list.
    labels = torch.as_tensor(labels, dtype=torch.long)

    num_samples = input_ids.shape[0]
    if attention_mask.shape[0] != num_samples or labels.shape[0] != num_samples:
        raise ValueError("input_ids, attention_mask and labels must have the same number of samples")

    # Split sample indices rather than the tensors themselves, so the result
    # does not depend on how scikit-learn indexes torch tensors.
    train_rows, val_rows = train_test_split(list(range(num_samples)), test_size=0.2, random_state=random_state)
    train_rows = torch.as_tensor(train_rows, dtype=torch.long)
    val_rows = torch.as_tensor(val_rows, dtype=torch.long)

    # Load BERT model and set up optimizer and loss function
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        # Train: one shuffled pass over the training subset in mini-batches.
        model.train()
        shuffled_rows = train_rows[torch.randperm(len(train_rows))]
        for start in range(0, len(shuffled_rows), batch_size):
            rows = shuffled_rows[start:start + batch_size]
            optimizer.zero_grad()
            # Labels are not passed to the model: the loss is computed once,
            # below, instead of once inside the model and once here.
            outputs = model(input_ids=input_ids[rows], attention_mask=attention_mask[rows])
            loss = loss_fn(outputs.logits, labels[rows])
            loss.backward()
            optimizer.step()

        # Evaluate the model on the validation set, also in mini-batches.
        model.eval()
        loss_sum = 0.0
        num_correct = 0
        with torch.no_grad():
            for start in range(0, len(val_rows), batch_size):
                rows = val_rows[start:start + batch_size]
                logits = model(input_ids=input_ids[rows], attention_mask=attention_mask[rows]).logits
                targets = labels[rows]
                # Weight each batch mean by its size so the total is a true
                # per-sample mean even when the last batch is short.
                loss_sum += loss_fn(logits, targets).item() * len(rows)
                num_correct += (logits.argmax(1) == targets).sum().item()

        val_loss = loss_sum / len(val_rows)
        val_acc = num_correct / len(val_rows)
        print(f"Epoch {epoch+1}: validation loss = {val_loss:.4f}, validation accuracy = {val_acc:.4f}")

    return model

In [None]:
# Define a function to make predictions using the trained BERT model
def predict(model, text_records, batch_size=32):
    """Predict a class index for each text record with a trained BERT model.

    Args:
        model: classifier that is called as ``model(input_ids, attention_mask)``
            and returns an object whose ``logits`` has one row per record.
        text_records: a single string (treated as one record) or any iterable
            of strings, e.g. a list, tuple, pandas Series or generator.
        batch_size: number of records tokenized and scored per forward pass.

    Returns:
        A list with one predicted class index per record, in input order.
        An empty input yields an empty list.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string is one record, not an iterable of characters.
    if isinstance(text_records, str):
        texts = [text_records]
    else:
        # Materialise so Series/generators work and the input can be sliced.
        texts = list(text_records)

    # Nothing to score: skip loading the tokenizer and calling the model.
    if not texts:
        return []

    # Tokenize new text records using BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    predictions = []
    model.eval()
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            encoded_data = tokenizer(chunk, padding=True, truncation=True, max_length=512, return_tensors='pt')
            outputs = model(encoded_data['input_ids'], encoded_data['attention_mask'])
            # The highest-scoring column of each logits row is the prediction.
            predictions.extend(outputs.logits.argmax(1).tolist())

    return predictions

# ---- End-to-end run: load, encode, fine-tune, predict ----

# Read the labelled records from disk
data = pd.read_csv("dataset.csv")

# Turn the frame into BERT inputs plus integer labels
input_ids, attention_mask, labels = preprocess_data(data=data)

# Fine-tune the classifier on the encoded records
model = train_model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=labels,
)

# Score a handful of unseen records and show the predicted classes
new_data = [
    'text record 1',
    'text record 2',
    'text record 3',
]
predictions = predict(model=model, text_records=new_data)
print(predictions)