<a href="https://colab.research.google.com/github/hugotomita1201/yachay.ai_project/blob/main/bertweet_classification_grouped_df.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

# %%
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig, AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import csv
import time
import pickle


# Use the GPU if available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Source table; preprocess_data() reads its 'language', 'text' and 'group'
# columns.
df = pd.read_csv('grouped_data_langid.csv')

# Single source of truth for the checkpoint id, so the config, the encoder
# and the tokenizer below can never drift apart.
model_name = 'vinai/bertweet-large'

config = RobertaConfig.from_pretrained(model_name)
config.num_labels = 10  # number of regions for classification
# NOTE(review): `config` is not passed to AutoModel.from_pretrained() here, so
# the num_labels override above does not reach `bert_model`. The name is kept
# at module level because code later in the file may still read it.

# Load the pre-trained encoder (moved to `device`) and its tokenizer.
bert_model = AutoModel.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, normalize=True)

# preprocess data with function


def preprocess_data(csv_file, df):
    """Filter, split, tokenize and tensorize the rows of ``df``.

    Only rows whose ``'language'`` column equals ``0`` are kept. The kept rows
    are split 80/20 (``random_state=42``) into train and test sets, the
    ``'text'`` column is tokenized with the ``vinai/bertweet-large`` tokenizer
    (padding to the longest sequence of each split, truncation at 512 tokens)
    and the ``'group'`` column becomes a ``torch.long`` label tensor.

    Args:
        csv_file: Unused. The function works only on ``df``; the parameter is
            kept so existing callers keep working.
        df: DataFrame with ``'language'``, ``'text'`` and ``'group'`` columns.

    Returns:
        An 8-tuple ``(train_encodings, test_encodings, train_input_ids,
        test_input_ids, train_attention_mask, test_attention_mask,
        train_labels, test_labels)``. The two encodings are the raw tokenizer
        outputs; the remaining six tensors have been moved to the
        module-level ``device``.
    """
    selected = df[df['language'] == 0]

    text_train, text_test, group_train, group_test = train_test_split(
        selected['text'], selected[['group']], test_size=0.2, random_state=42)

    bertweet_tokenizer = AutoTokenizer.from_pretrained(
        "vinai/bertweet-large", normalize=True)

    def _encode(texts):
        # Tokenize one split of raw strings into padded PyTorch tensors.
        return bertweet_tokenizer(
            texts.tolist(),
            padding=True,
            truncation=True,
            return_tensors='pt',
            max_length=512,
        )

    def _labels_on_device(frame):
        # Turn the single-column 'group' frame into a long tensor on `device`.
        return torch.tensor(frame['group'].values, dtype=torch.long).to(device)

    train_encodings = _encode(text_train)
    test_encodings = _encode(text_test)

    return (
        train_encodings,
        test_encodings,
        # Token ids of each split.
        train_encodings['input_ids'].to(device),
        test_encodings['input_ids'].to(device),
        # Attention masks mark which positions are real tokens vs. padding.
        train_encodings['attention_mask'].to(device),
        test_encodings['attention_mask'].to(device),
        _labels_on_device(group_train),
        _labels_on_device(group_test),
    )


# Time the preprocessing step (filter, split, tokenize, move to device).
start_time = time.time()
(
    train_encodings,
    test_encodings,
    train_input_ids,
    test_input_ids,
    train_attention_mask,
    test_attention_mask,
    train_labels,
    test_labels,
) = preprocess_data('grouped_dataset.csv', df)
end_time = time.time()
print(f"Preprocessing took {end_time - start_time} seconds")

# Report the shape of the token-id tensor of each split.
print(train_input_ids.shape)
print(test_input_ids.shape)

# Optional pickle cache for the tokenizer output.
# The three triple-quoted string literals below hold disabled code: as bare
# string expressions they are evaluated and discarded, so nothing inside them
# runs. They are kept verbatim for reference.

# Disabled helper: pickle the train and test encodings to two .pkl files.
'''
def save_encodings(train_encodings, test_encodings):
    with open('train_encodings_classification.pkl', 'wb') as f:
        pickle.dump(train_encodings, f)
    with open('test_encodings_classification.pkl', 'wb') as f:
        pickle.dump(test_encodings, f)
'''

# Disabled call of the helper above.
'''
# call on function
save_encodings(train_encodings, test_encodings)
'''

# Disabled helper: reload the pickled encodings and rebuild the tensors.
# NOTE(review): this loader reads a 'labels' key from the unpickled encodings,
# but the disabled save_encodings() above pickles the encodings unchanged and
# preprocess_data() does not visibly add labels to them -- confirm before
# re-enabling. pickle.load must only be used on files from a trusted source.
'''
# load encodings from pickle file with a function
def load_encodings():
    with open('train_encodings_classification.pkl', 'rb') as f:
        train_encodings = pickle.load(f)
    with open('test_encodings_classification.pkl', 'rb') as f:
        test_encodings = pickle.load(f)

    train_input_ids = train_encodings['input_ids']
    test_input_ids = test_encodings['input_ids']

    # make attention masks to let know which tokens are real and which are padding
    train_attention_mask = train_encodings['attention_mask']
    test_attention_mask = test_encodings['attention_mask']

    train_labels = train_encodings['labels']
    test_labels = test_encodings['labels']

    train_labels = torch.tensor(
        train_labels.values, dtype=torch.long)
    test_labels = torch.tensor(
        test_labels.values, dtype=torch.long)

    return train_encodings, test_encodings, train_input_ids, test_input_ids, train_attention_mask, test_attention_mask, train_labels, test_labels


# call on function
train_encodings, test_encodings, train_input_ids, test_input_ids, train_attention_mask, test_attention_mask, train_labels, test_labels = load_encodings()

'''
# Define the BERT-based classification model


class BERTClassifier(nn.Module):
    """Linear classification head on top of a pre-trained encoder.

    The wrapped encoder is called with ``input_ids`` and ``attention_mask``;
    its ``pooler_output`` goes through dropout (p=0.1) and a single linear
    layer that produces one logit per region.

    Args:
        bert_model: Encoder module. It must expose ``config.hidden_size`` and
            return an object with a ``pooler_output`` attribute when called.
        num_regions: Number of output classes (size of the logit vector).
    """

    def __init__(self, bert_model, num_regions):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder that was actually passed in instead
        # of a module-level `config`, so the class has no hidden global
        # dependency and stays correct if a different encoder is supplied.
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_regions)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class logits for a batch.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            Tensor whose last dimension has ``num_regions`` entries, one logit
            per class (no softmax applied).
        """
        bert_outputs = self.bert(
            input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(bert_outputs.pooler_output)
        return self.classifier(pooled_output)


# Number of target classes (regions) the classification head predicts.
num_regions = 10

# Wrap the pre-trained encoder with the classification head and place the
# combined module on the selected device.
bert_classifier = BERTClassifier(
    bert_model, num_regions=num_regions).to(device)

# Cross-entropy over the region logits, optimised with Adam over every
# parameter of the wrapped model.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=bert_classifier.parameters(), lr=1e-5)

# %%
# Train the model
num_epochs = 10
batch_size = 16
test_batch_size = 8  # loop-invariant, so defined once up front

# Test-set predictions of the most recently evaluated epoch; written to disk
# after training. Initialised so the name exists even if num_epochs is 0.
predicted_labels = []

# set a timer to see how long it takes to train the model
start_time = time.time()
# print starting time
print(f"Training started at {time.ctime(start_time)}")
for epoch in range(num_epochs):
    # Training mode: dropout is active while the weights are being updated.
    bert_classifier.train()
    for i in range(0, len(train_input_ids), batch_size):
        input_ids_batch = train_input_ids[i:i+batch_size].to(device)
        attention_mask_batch = train_attention_mask[i:i+batch_size].to(device)
        region_ids_batch = train_labels[i:i+batch_size].to(device)

        optimizer.zero_grad()

        logits = bert_classifier(input_ids_batch, attention_mask_batch)
        loss = loss_fn(logits, region_ids_batch)

        loss.backward()
        optimizer.step()

    # Evaluate the model on the test set after each epoch.
    # Evaluation mode switches dropout off so the reported metrics are
    # deterministic and reflect the model that would be used for inference.
    bert_classifier.eval()
    total_test_loss = 0.0
    total_correct = 0
    num_test_examples = 0
    epoch_predictions = []

    with torch.no_grad():
        for i in range(0, len(test_input_ids), test_batch_size):
            test_input_ids_batch = test_input_ids[i:i+test_batch_size].to(device)
            test_attention_mask_batch = test_attention_mask[i:i+test_batch_size].to(device)
            test_labels_batch = test_labels[i:i+test_batch_size].to(device)
            batch_len = test_labels_batch.size(0)

            test_logits_batch = bert_classifier(
                test_input_ids_batch, test_attention_mask_batch)
            test_predictions_batch = test_logits_batch.argmax(dim=1)

            # loss_fn returns the mean over the batch; scale it back to a sum
            # so a short final batch is not over-weighted in the average.
            total_test_loss += loss_fn(
                test_logits_batch, test_labels_batch).item() * batch_len
            total_correct += (
                test_predictions_batch == test_labels_batch).sum().item()
            num_test_examples += batch_len
            epoch_predictions.extend(test_predictions_batch.tolist())

    # Per-example averages over the entire test set.
    avg_test_loss = total_test_loss / num_test_examples
    avg_test_accuracy = total_correct / num_test_examples

    # Keep the predictions for the whole test set (previously only the logits
    # of the last training batch were turned into predictions).
    predicted_labels = epoch_predictions

    print(
        f"Epoch {epoch+1}: Test loss={avg_test_loss:.4f}, Test accuracy={avg_test_accuracy:.4f}")


end_time = time.time()
print(f"Training took {end_time - start_time} seconds")

# %%
# Persist the weights of the trained classifier.
torch.save(
    bert_classifier.state_dict(),
    'bert_classifier_for_grouped_data.pth',
)

# Write whatever `predicted_labels` holds as a single-column CSV:
# one label per row, no header.
output_file = 'predicted_labels_for_grouped_dataset.csv'

with open(output_file, 'w', newline='') as f:
    csv.writer(f).writerows([label] for label in predicted_labels)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Some weights of the model checkpoint at vinai/bertweet-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Preprocessing took 13.408267498016357 seconds
torch.Size([33217, 512])
torch.Size([8305, 512])
Training started at Fri Apr 21 04:46:52 2023
Epoch 1: Test loss=1.4945, Test accuracy=0.4490
Epoch 2: Test loss=1.5525, Test accuracy=0.4471
Epoch 3: Test loss=1.7125, Test accuracy=0.4380
Epoch 4: Test loss=2.2142, Test accuracy=0.4153
Epoch 5: Test loss=2.6841, Test accuracy=0.4017
Epoch 6: Test loss=3.0244, Test accuracy=0.4283
Epoch 7: Test loss=3.3204, Test accuracy=0.4084
Epoch 8: Test loss=3.5851, Test accuracy=0.4081
Epoch 9: Test loss=3.4424, Test accuracy=0.4139
Epoch 10: Test loss=3.4303, Test accuracy=0.4169
Training took 24278.79972267151 seconds
