<a href="https://colab.research.google.com/github/hugotomita1201/yachay.ai_project/blob/main/distilbert_model_classification_ungrouped.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

# %%
import torch
import torch.nn as nn
from transformers import BertConfig
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
import pandas as pd
import csv
import time
import pickle
import os

torch.cuda.empty_cache()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

df = pd.read_csv('Clusters_Coordinates_Language_Labeled.csv')

# Load pre-trained model tokenizer (vocabulary)
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
config = BertConfig.from_pretrained(model_name)
config.num_labels = 10  # number of regions for classification
bert_model = DistilBertModel.from_pretrained(model_name, config=config).to(device)

# %%

def preprocess_data(df, tokenizer, language_id=0, test_size=0.2):
    #start time 
    start_time = time.time()
    # Filter the data based on language_id
    df = df[df['language'] == language_id]

    # Split the data into train and test
    train_features, test_features, train_labels, test_labels = train_test_split(
        df['text'], df['group'], test_size=test_size, random_state=42)

    # Tokenize the training and test text features
    train_encodings = tokenizer(train_features.tolist(),
                                 padding=True, truncation=True, return_tensors='pt')
    test_encodings = tokenizer(test_features.tolist(),
                                padding=True, truncation=True, return_tensors='pt')

    # Extract input_ids and attention masks from the encodings
    train_input_ids = train_encodings['input_ids'].to(device)
    test_input_ids = test_encodings['input_ids'].to(device)
    train_attention_mask = train_encodings['attention_mask'].to(device)
    test_attention_mask = test_encodings['attention_mask'].to(device)

    # Tokenize the training and test labels (targets)
    train_labels = torch.tensor(train_labels.values, dtype=torch.long).to(device)
    test_labels = torch.tensor(test_labels.values, dtype=torch.long).to(device)


    # Print the time taken to preprocess the data
    print("--- %s seconds ---" % (time.time() - start_time))

    # Save encodings using pickle
    with open('train_encodings_distilbert.pickle', 'wb') as handle:
        pickle.dump(train_encodings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('test_encodings_distilbert.pickle', 'wb') as handle:
        pickle.dump(test_encodings, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return train_input_ids, train_attention_mask, train_labels, test_input_ids, test_attention_mask, test_labels


train_input_ids, train_attention_mask, train_labels, test_input_ids, test_attention_mask, test_labels = preprocess_data(
    df, tokenizer, language_id=0, test_size=0.2)


# Define the BERT-based classification model
class BERTClassifier(nn.Module):
    def __init__(self, bert_model, num_regions):
        super(BERTClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(config.hidden_size, num_regions)

    def forward(self, input_ids, attention_mask):
        bert_outputs = self.bert(
            input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


num_regions = 10

# Initialize the BERT-based classification model
bert_classifier = BERTClassifier(bert_model, num_regions).to(device)

# Define the optimizer and the loss function
optimizer = torch.optim.Adam(bert_classifier.parameters(), lr=2e-4)
loss_fn = nn.CrossEntropyLoss()

# %%
# Train the model
num_epochs = 10
batch_size = 64
test_batch_size = 64

#start time 
start_time = time.time()
print("Start time: ", start_time)

for epoch in range(num_epochs):
    for i in range(0, len(train_input_ids), batch_size):
        input_ids_batch = train_input_ids[i:i+batch_size]
        attention_mask_batch = torch.ones_like(input_ids_batch)
        region_ids_batch = train_labels[i:i+batch_size]

        bert_classifier.zero_grad()

        logits = bert_classifier(
            input_ids_batch, attention_mask_batch)
        loss = loss_fn(logits, region_ids_batch)

        loss.backward()
        optimizer.step()

    test_batch_size = 32 

    # Evaluate the model on the test set after each epoch
    with torch.no_grad():
      test_losses = []
      test_accuracies = []
    
      for i in range(0, len(test_input_ids), test_batch_size):
          test_input_ids_batch = test_input_ids[i:i+test_batch_size]
          test_attention_mask_batch = test_attention_mask[i:i+test_batch_size]
          test_labels_batch = test_labels[i:i+test_batch_size]

          test_logits_batch = bert_classifier(test_input_ids_batch, test_attention_mask_batch)
          test_loss_batch = loss_fn(test_logits_batch, test_labels_batch)
          test_accuracy_batch = (test_logits_batch.argmax(dim=1) == test_labels_batch).float().mean()

          test_losses.append(test_loss_batch.item())
          test_accuracies.append(test_accuracy_batch.item())

    # Calculate the average test loss and accuracy for the entire test set
    avg_test_loss = sum(test_losses) / len(test_losses)
    avg_test_accuracy = sum(test_accuracies) / len(test_accuracies)

    print(
        f"Epoch {epoch+1}: Test loss={avg_test_loss:.4f}, Test accuracy={avg_test_accuracy:.4f}")


#end time
end_time = time.time()
print("Time to train model: ", end_time - start_time)

predicted_labels = torch.argmax(logits, dim=1).tolist()

# %%

torch.save(bert_classifier.state_dict(),
           'bc_distilbert.pth')

# save to output file
output_file = 'predicted_labels_distil_bert.csv'

with open(output_file, 'w', newline='') as f:
    writer = csv.writer(f)
    for label in predicted_labels:
        writer.writerow([label])

# %%h


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


--- 142.78659868240356 seconds ---
Start time:  1682162586.0701044
Epoch 1: Test loss=1.8001, Test accuracy=0.2593
Epoch 2: Test loss=1.7979, Test accuracy=0.2593
Epoch 3: Test loss=1.7966, Test accuracy=0.2593
Epoch 4: Test loss=1.7986, Test accuracy=0.2593
Epoch 5: Test loss=1.7979, Test accuracy=0.2593
Epoch 6: Test loss=1.7975, Test accuracy=0.2593
Epoch 7: Test loss=1.7973, Test accuracy=0.2593
Epoch 8: Test loss=1.7970, Test accuracy=0.2593
Epoch 9: Test loss=1.7971, Test accuracy=0.2593
Epoch 10: Test loss=1.7969, Test accuracy=0.2593
Time to train model:  7911.8390147686005
