In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import AutoModel, BertTokenizerFast, AdamW
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Load your dataset
df = pd.read_csv("../data/train/text_dataset_50.csv")
df = df.dropna(subset=['text'])
df.head()

Unnamed: 0,text,location,interest
0,attempt hari tu act like smart deep tech entre...,Unknown,Technology
1,mosti cradle ni clueless whats going tech tal...,Unknown,Technology
2,hightech export country 942b hong kong 431b g...,Unknown,Technology
3,tbh many tech talent dont actually need focus...,Unknown,Technology
4,make 180000 tech consultant london grew extrem...,Unknown,Technology


In [3]:
# Preprocess data and split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['interest'])

In [4]:
# Load BERT model and tokenizer
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_df['text'].tolist(),
    max_length=25,
    pad_to_max_length=True,
    truncation=True
)

# Tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_df['text'].tolist(),
    max_length=25,
    pad_to_max_length=True,
    truncation=True
)



In [5]:
# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels for the training data
train_labels_encoded = label_encoder.fit_transform(train_df['interest'])

# Encode labels for the test data
test_labels_encoded = label_encoder.transform(test_df['interest'])

# Convert data to PyTorch tensors
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_labels = torch.tensor(train_labels_encoded, dtype=torch.long)

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_labels = torch.tensor(test_labels_encoded, dtype=torch.long)

# Create DataLoader for training set
batch_size = 32
train_data = TensorDataset(train_seq, train_mask, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_seq, test_mask, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=train_sampler, batch_size=batch_size)

In [6]:
# Define your neural network architecture
class BERT_Arch(nn.Module):
    def __init__(self, bert, num_classes):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [7]:
# Create the model
num_classes = len(df['interest'].unique())
model = BERT_Arch(bert, num_classes)

# Set the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Calculate class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels_encoded), y=train_labels_encoded)
weights = torch.tensor(class_weights, dtype=torch.float32)

# Define the loss function
cross_entropy = nn.NLLLoss(weight=weights)

# Number of training epochs
epochs = 3



In [8]:
# Training loop
for epoch in range(epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch
        optimizer.zero_grad()
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        loss.backward()
        optimizer.step()

In [19]:
# Evaluation on the test set
model.eval()
true_labels = []
predicted_labels = []

with torch.no_grad():
    for step, batch in enumerate(test_dataloader):
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch

        # Print some debugging information
        print(f"Step: {step}, Batch Size: {len(labels)}, Index Range: {step * batch_size} - {(step + 1) * batch_size}")

        preds = model(sent_id, mask)
        preds_labels = torch.argmax(preds, axis=1)
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(preds_labels.cpu().numpy())

# Check if the length of true_labels and predicted_labels matches
if len(true_labels) != len(predicted_labels):
    print("Length mismatch between true_labels and predicted_labels.")
    print(f"Length of true_labels: {len(true_labels)}, Length of predicted_labels: {len(predicted_labels)}")
else:
    # Print the classification report
    print(classification_report(true_labels, predicted_labels))

IndexError: index 14177 is out of bounds for dimension 0 with size 5293

In [18]:
predicted_labels

[]