## Training models to predict age

In [1]:
# Import libraries
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW
from tqdm.notebook import tqdm
import torch


# 1. Bert

### Define the class

In [17]:
trainFilename = "../all_posts_train.txt"
testFilename = "../all_posts_test.txt"

# Define a mapping for the age groups to integer labels
label_mapping = {10: 0, 20: 1, 30: 2, 40: 3}

# Define a custom dataset class for handling the age group data
class AgeGroupDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=128):
        self.tokenizer = tokenizer  # Tokenizer for encoding the text
        self.max_length = max_length  # Maximum sequence length
        self.texts = []  # List to store text samples
        self.labels = []  # List to store corresponding labels
        
        # Open the file and read line by line
        with open(file_path, 'r') as file:
            for line in file:
                words = line.rstrip().split()  # Split the line into words
                category = int(words[0])  # Convert the first word to integer as category
                words = words[1:]  # Rest of the words represent the text
                # Join the words into a text string, considering only words with 3 parts (using word.split("/"))
                text = " ".join([word.split("/")[0] for word in words if len(word.split("/")) == 3])
                self.texts.append(text)  # Append the text to texts list
                self.labels.append(label_mapping[category])  # Append the mapped label to labels list

    def __len__(self):
        return len(self.texts)  # Return the total number of samples

    def __getitem__(self, idx):
        text = self.texts[idx]  # Retrieve the text at index idx
        label = self.labels[idx]  # Retrieve the corresponding label
        # Encode the text using the tokenizer, truncating if necessary, and padding to max_length
        encoding = self.tokenizer.encode_plus(text, truncation=True, max_length=self.max_length, padding='max_length')
        # Return the encoding and label as a PyTorch tensor
        return {key: torch.tensor(val) for key, val in encoding.items()}, torch.tensor(label)


# Load the pre-trained BERT model for sequence classification.
def prepare_model(num_labels):
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
    # The parameter "bert-base-uncased" specifies the pre-trained BERT model with uncased text processing.
    # The num_labels parameter is used to specify the number of output classes (labels) for the classification task.
    
    return model


from torch.optim import AdamW
from tqdm.notebook import tqdm

def train_model(model, train_loader, val_loader, epochs=3):
    optimizer = AdamW(model.parameters(), lr=1e-5)
    for epoch in range(epochs):
        model.train()
        for batch, (inputs, labels) in tqdm(enumerate(train_loader)):
            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
        
        # Validation step
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch, (inputs, labels) in tqdm(enumerate(val_loader)):
                outputs = model(**inputs, labels=labels)
                val_loss += outputs.loss.item()
        print(f"Validation Loss: {val_loss / len(val_loader)}")

def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch, (inputs, labels) in tqdm(enumerate(test_loader)):
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    print(f"Accuracy: {accuracy * 100}%")


### Train the model

In [21]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset = AgeGroupDataset(trainFilename, tokenizer)
test_dataset = AgeGroupDataset(testFilename, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

num_labels = len(set(train_dataset.labels))  # Number of unique age group categories
model = prepare_model(num_labels)
train_model(model, train_loader, train_loader)

# Check accuracy
evaluate_model(model, test_loader)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.087371581052182


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 0.9527522752270896


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 0.827935048990701


In [22]:
# Check accuracy
evaluate_model(model, test_loader)


0it [00:00, ?it/s]

Accuracy: 58.4%


## Modify Hyperparameters

#### - Increase Sequence Length <br>- Split Data into Training and Validation Sets <br>- Modify Hyperparameters <br>- Update the Training and Evaluation Calls

In [23]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW
from tqdm.notebook import tqdm
import torch

# Define file paths
trainFilename = "/Users/jocelyn/Desktop/ARP/all_posts_train.txt"
testFilename = "/Users/jocelyn/Desktop/ARP/all_posts_test.txt"

# Define a mapping for the age groups
label_mapping = {10: 0, 20: 1, 30: 2, 40: 3}

# Define max_length
max_length = 256

# Define learning rate
learning_rate = 2e-5

class AgeGroupDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = []
        self.labels = []
        
        with open(file_path, 'r') as file:
            for line in file:
                words = line.rstrip().split()
                category = int(words[0])
                words = words[1:]
                text = " ".join([word.split("/")[0] for word in words if len(word.split("/")) == 3])
                self.texts.append(text)
                self.labels.append(label_mapping[category])

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(text, truncation=True, max_length=self.max_length, padding='max_length')
        return {key: torch.tensor(val) for key, val in encoding.items()}, torch.tensor(label)

def prepare_model(num_labels):
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
    return model

def train_model(model, train_loader, val_loader, epochs=5): # Increased epochs
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    for epoch in range(epochs):
        model.train()
        for batch, (inputs, labels) in tqdm(enumerate(train_loader)):
            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        # Validation step
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch, (inputs, labels) in tqdm(enumerate(val_loader)):
                outputs = model(**inputs, labels=labels)
                val_loss += outputs.loss.item()
        print(f"Validation Loss: {val_loss / len(val_loader)}")

def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch, (inputs, labels) in tqdm(enumerate(test_loader)):
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    print(f"Accuracy: {accuracy * 100}%")


# Load and tokenize data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset = AgeGroupDataset(trainFilename, tokenizer, max_length=max_length)
test_dataset = AgeGroupDataset(testFilename, tokenizer, max_length=max_length)

# Split training data into training and validation sets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = random_split(train_dataset, [train_size, val_size])

train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Prepare and train the model
num_labels = len(label_mapping)
model = prepare_model(num_labels)
train_model(model, train_loader, val_loader)

# Evaluate the model
evaluate_model(model, test_loader)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.134809607968611


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.0929678275304682


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.0994654297828674


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.1388374777401196


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.1777905278346117


0it [00:00, ?it/s]

Accuracy: 59.4%


In [24]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW
from tqdm.notebook import tqdm
import torch
from transformers import BertConfig


# Define file paths
trainFilename = "/Users/jocelyn/Desktop/ARP/all_posts_train.txt"
testFilename = "/Users/jocelyn/Desktop/ARP/all_posts_test.txt"

# Define a mapping for the age groups
label_mapping = {10: 0, 20: 1, 30: 2, 40: 3}

# Define learning rate
learning_rate = 2e-5

class AgeGroupDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = []
        self.labels = []
        
        with open(file_path, 'r') as file:
            for line in file:
                words = line.rstrip().split()
                category = int(words[0])
                words = words[1:]
                text = " ".join([word.split("/")[0] for word in words if len(word.split("/")) == 3])
                self.texts.append(text)
                self.labels.append(label_mapping[category])

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(text, truncation=True, max_length=self.max_length, padding='max_length')
        return {key: torch.tensor(val) for key, val in encoding.items()}, torch.tensor(label)

def prepare_model(num_labels, dropout_rate=0.1):
    # Define the configuration with the specified dropout rate
    config = BertConfig.from_pretrained("bert-base-uncased", num_labels=num_labels, hidden_dropout_prob=dropout_rate)
    # Load the model with the custom configuration
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", config=config)
    return model

def train_model(model, train_loader, val_loader, epochs=5): # Increased epochs
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    for epoch in range(epochs):
        model.train()
        for batch, (inputs, labels) in tqdm(enumerate(train_loader)):
            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        # Validation step
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch, (inputs, labels) in tqdm(enumerate(val_loader)):
                outputs = model(**inputs, labels=labels)
                val_loss += outputs.loss.item()
        print(f"Validation Loss: {val_loss / len(val_loader)}")

def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch, (inputs, labels) in tqdm(enumerate(test_loader)):
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    print(f"Accuracy: {accuracy * 100}%")


# Load and tokenize data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset = AgeGroupDataset(trainFilename, tokenizer, max_length=max_length)
test_dataset = AgeGroupDataset(testFilename, tokenizer, max_length=max_length)

# Split training data into training and validation sets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = random_split(train_dataset, [train_size, val_size])

train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Prepare and train the model
num_labels = len(label_mapping)
model = prepare_model(num_labels)
train_model(model, train_loader, val_loader)

# Evaluate the model
evaluate_model(model, test_loader)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.1234658255296595


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.0545869890381308


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.0554057833026438


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.0862387629116284


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.171978308874018


0it [00:00, ?it/s]

Accuracy: 56.8%


## 2. Roberta 

In [26]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from transformers import BertConfig


# Define file paths
trainFilename = "/Users/jocelyn/Desktop/ARP/all_posts_train.txt"
testFilename = "/Users/jocelyn/Desktop/ARP/all_posts_test.txt"

# Define a mapping for the age groups
label_mapping = {10: 0, 20: 1, 30: 2, 40: 3}

# Define learning rate
learning_rate = 2e-5

class AgeGroupDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = []
        self.labels = []
        
        with open(file_path, 'r') as file:
            for line in file:
                words = line.rstrip().split()
                category = int(words[0])
                words = words[1:]
                text = " ".join([word.split("/")[0] for word in words if len(word.split("/")) == 3])
                self.texts.append(text)
                self.labels.append(label_mapping[category])

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(text, truncation=True, max_length=self.max_length, padding='max_length')
        return {key: torch.tensor(val) for key, val in encoding.items()}, torch.tensor(label)

def prepare_model(num_labels, dropout_rate=0.1):
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)
    return model

# Function to train the model
def train_model(model, train_loader, val_loader, epochs=5, class_weights=None):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    loss_function = nn.CrossEntropyLoss(weight=class_weights)

    for epoch in range(epochs):
        model.train()
        for batch, (inputs, labels) in tqdm(enumerate(train_loader)):
            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = loss_function(outputs.logits, labels)
            loss.backward()
            optimizer.step()

        # Validation step
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch, (inputs, labels) in tqdm(enumerate(val_loader)):
                outputs = model(**inputs, labels=labels)
                val_loss += outputs.loss.item()
        print(f"Validation Loss: {val_loss / len(val_loader)}")

def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch, (inputs, labels) in tqdm(enumerate(test_loader)):
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    print(f"Accuracy: {accuracy * 100}%")


# Load and tokenize data
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
train_dataset = AgeGroupDataset(trainFilename, tokenizer, max_length=max_length)
test_dataset = AgeGroupDataset(testFilename, tokenizer, max_length=max_length)

# Split the training data into training and validation subsets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = random_split(train_dataset, [train_size, val_size])

train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Calculate class weights (you should implement this based on your data distribution)
class_weights = None  # Replace with the computed class weights

# Prepare and train the model
num_labels = len(label_mapping)
model = prepare_model(num_labels)
train_model(model, train_loader, val_loader, class_weights=class_weights)

# Evaluate the model on the test data
evaluate_model(model, test_loader)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.198749642161762


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.1328245506567114


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.147370994091034


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.2195333926116718


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.3643779018346


0it [00:00, ?it/s]

Accuracy: 60.6%


## Fine-tuning Roberta

Calculate Class Weights and use it in Loss Function

In [29]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from collections import Counter
from transformers import BertConfig


# Define file paths
trainFilename = "/Users/jocelyn/Desktop/ARP/all_posts_train.txt"
testFilename = "/Users/jocelyn/Desktop/ARP/all_posts_test.txt"

# Define a mapping for the age groups
label_mapping = {10: 0, 20: 1, 30: 2, 40: 3}

max_length = 128 

# Define learning rate
learning_rate = 2e-5

# AgeGroupDataset class
class AgeGroupDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = []
        self.labels = []
        
        with open(file_path, 'r') as file:
            for line in file:
                words = line.rstrip().split()
                category = int(words[0])
                words = words[1:]
                text = " ".join([word.split("/")[0] for word in words if len(word.split("/")) == 3])
                self.texts.append(text)
                self.labels.append(label_mapping[category])

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(text, truncation=True, max_length=self.max_length, padding='max_length')
        return {key: torch.tensor(val) for key, val in encoding.items()}, torch.tensor(label)


# Calculate class weights
label_counter = Counter(train_dataset.labels)
total_samples = len(train_dataset.labels)
class_weights = {k: total_samples / (len(label_mapping) * v) for k, v in label_counter.items()}
class_weights_tensor = torch.tensor([class_weights[i] for i in range(len(class_weights))], dtype=torch.float)

def prepare_model(num_labels, dropout_rate=0.1):
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)
    return model


# Function to train the model
def train_model(model, train_loader, val_loader, epochs=5):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    loss_function = nn.CrossEntropyLoss(weight=class_weights_tensor)  # Using class_weights_tensor directly

    for epoch in range(epochs):
        model.train()
        for batch, (inputs, labels) in tqdm(enumerate(train_loader)):
            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = loss_function(outputs.logits, labels)
            loss.backward()
            optimizer.step()

        # Validation step
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch, (inputs, labels) in tqdm(enumerate(val_loader)):
                outputs = model(**inputs, labels=labels)
                val_loss += outputs.loss.item()
        print(f"Validation Loss: {val_loss / len(val_loader)}")


def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch, (inputs, labels) in tqdm(enumerate(test_loader)):
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    print(f"Accuracy: {accuracy * 100}%")


# Load and tokenize data
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
train_dataset = AgeGroupDataset(trainFilename, tokenizer, max_length=max_length)
test_dataset = AgeGroupDataset(testFilename, tokenizer, max_length=max_length)

# Split the training data into training and validation subsets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = random_split(train_dataset, [train_size, val_size])

train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Prepare and train the model
num_labels = len(label_mapping)
model = prepare_model(num_labels)
train_model(model, train_loader, val_loader)

# Evaluate the model on the test data
evaluate_model(model, test_loader)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.1751519161112167


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.1696149864617515


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.1680370937375462


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.1454929989926956


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation Loss: 1.2326692430412067


0it [00:00, ?it/s]

Accuracy: 57.199999999999996%
