<a href="https://colab.research.google.com/github/hugotomita1201/yachay.ai_project/blob/main/electra_regression_ungrouped_cumulative_count_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
# %%
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
import time
from transformers import AutoTokenizer
from transformers import AutoModel
import torch.nn as nn
import torch
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def haversine_distance(lat1, lon1, lat2, lon2):
    earth_radius = 6371  # Earth radius in km
    pi = torch.tensor(3.141592653589793, dtype=torch.float,requires_grad=False)

    lat1, lon1, lat2, lon2 = [x * (pi / 180) for x in [lat1, lon1, lat2, lon2]]

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = torch.sin(dlat / 2) ** 2 + torch.cos(lat1) * \
        torch.cos(lat2) * torch.sin(dlon / 2) ** 2
    c = 2 * torch.atan2(torch.sqrt(a), torch.sqrt(1 - a))

    distance = earth_radius * c
    return distance


class HaversineLoss(nn.Module):
    def __init__(self):
        super(HaversineLoss, self).__init__()

    def forward(self, y_pred, y_true):
        lat1, lon1 = y_pred[:, 0], y_pred[:, 1]
        lat2, lon2 = y_true[:, 0], y_true[:, 1]

        distance = haversine_distance(lat1, lon1, lat2, lon2)
        loss = distance.mean()
        return loss


haversine_loss = HaversineLoss()
# Replace the existing loss function in the training loop with haversine_loss
loss_fn = haversine_loss


# %%
# Train the model
num_epochs = 10
batch_size = 64

# %%
df = pd.read_csv('grouped_100_classification.csv')


# split the data into train and test
tokenizer = AutoTokenizer.from_pretrained(
    "google/electra-small-discriminator", do_lower_case=True)
model = AutoModel.from_pretrained(
    'google/electra-small-discriminator', num_labels=2).to(device)
config = model.config


# split data into train and test


def preprocess_data(df, tokenizer):

    # load start time
    start_time = time.time()
    # print that the data is being loaded
    print("Loading data...")

    # Filter and sample data
    df = df[df['language'] == 0]

    # Split into train and test sets
    train_features, test_features, train_labels, test_labels, train_cumulative_count, test_cumulative_count = train_test_split(
        df['text'], df[['lat', 'lng']], df['cumulative_count'], test_size=0.20, random_state=42)

  
    # Load tokenizer
    tokenizer = tokenizer

    # Tokenize the training and test text features
    train_encodings = tokenizer(train_features.tolist(
    ), padding=True, truncation=True, return_tensors='pt')
    test_encodings = tokenizer(test_features.tolist(
    ), padding=True, truncation=True, return_tensors='pt')

    # Tokenize the training and test text features
    train_input_ids = train_encodings['input_ids']
    test_input_ids = test_encodings['input_ids']

    # make attention masks to let know which tokens are real and which are padding
    train_attention_mask = train_encodings['attention_mask']
    test_attention_mask = test_encodings['attention_mask']

    # Convert labels to PyTorch tensors with float datatype
    train_labels = torch.tensor(
        train_labels[['lat', 'lng']].values, dtype=torch.float)
    test_labels = torch.tensor(
        test_labels[['lat', 'lng']].values, dtype=torch.float)
    
    #total cumulative count
    train_cumulative_count = torch.tensor(train_cumulative_count.values, dtype=torch.float).unsqueeze(1)
    test_cumulative_count = torch.tensor(test_cumulative_count.values, dtype=torch.float).unsqueeze(1) 

    # Print the time taken to load the data
    print("Time taken to load data: {} seconds".format(
        time.time() - start_time))

    return train_encodings, test_encodings, train_input_ids, test_input_ids, train_attention_mask, test_attention_mask, train_labels, test_labels, train_cumulative_count, test_cumulative_count


train_encodings, test_encodings, train_input_ids, test_input_ids, train_attention_mask, test_attention_mask, train_labels, test_labels, train_cumulative_count, test_cumulative_count = preprocess_data(
    df, tokenizer)


# Define the BERT-based reg model
class BERTRegressor(nn.Module):
    def __init__(self, model, num_inputs, num_outputs):
        super(BERTRegressor, self).__init__()
        self.bert = model
        self.dropout = nn.Dropout(0.15)
        self.regressor = nn.Linear(num_inputs * 2, num_outputs)

        #put in another layer for cumulative count
        self.cumulative_count_layer = nn.Linear(1,num_inputs)

    def forward(self, input_ids, attention_mask, cumulative_count):
        bert_outputs = self.bert(
            input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = bert_outputs.last_hidden_state
        cls_token = last_hidden_state[:, 0, :]

        #create a cumulative count layer
        cumulative_count_output = self.cumulative_count_layer(cumulative_count)

        #combine the layers with torch.cat
        combined_input = torch.cat((cls_token, cumulative_count_output), dim=1)
        combined_input = self.dropout(combined_input) #put dropout here

        logits = self.regressor(combined_input)
        return logits


num_outputs = 2
num_inputs = 256
cumulative_count = 1

# Initialize the BERT-based classification model
bert_regressor = BERTRegressor(model, num_inputs, num_outputs).to(device)

# Define the optimizer and the loss function
optimizer = torch.optim.AdamW(bert_regressor.parameters(), lr=2e-4, weight_decay=1e-9)

# %%
# Define model arguments
# Train the model
num_epochs = 10
batch_size = 64

torch.autograd.set_detect_anomaly(True)

# set a timer to see how long it takes to train the model
start_time = time.time()
for epoch in range(num_epochs):
    for i in range(0, len(train_input_ids), batch_size):
        input_ids_batch = train_input_ids[i:i+batch_size].to(device)
        attention_mask_batch = torch.ones_like(input_ids_batch).to(device)
        region_ids_batch = train_labels[i:i+batch_size].to(device)

        bert_regressor.zero_grad()

        logits = bert_regressor(
            input_ids_batch, attention_mask_batch, train_cumulative_count[i:i+batch_size].to(device))
        loss = loss_fn(logits, region_ids_batch)

        loss.backward()
        optimizer.step()

    # Evaluate the model on the test set after each epoch
    with torch.no_grad():
        test_distances = []
        for i in range(0, len(test_input_ids), batch_size):
            test_input_ids_batch = test_input_ids[i:i+batch_size].to(device)
            test_attention_mask_batch = test_attention_mask[i:i+batch_size].to(device)
            test_labels_batch = test_labels[i:i+batch_size].to(device)

            test_logits_batch = bert_regressor(
                test_input_ids_batch, test_attention_mask_batch, test_cumulative_count[i:i+batch_size].to(device))
           
            test_distance_batch = haversine_distance(
                test_logits_batch[:, 0], test_logits_batch[:, 1],
                test_labels_batch[:, 0], test_labels_batch[:, 1]
            ).mean()
            test_distances.append(test_distance_batch.item())

        test_distance = np.mean(test_distances)
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Train loss: {loss}")
        print(f"Test distance: {test_distance}")
        print()


end_time = time.time()
print(f"Training time: {end_time - start_time}")


predicted_coords = logits.tolist()
# %%
torch.save(bert_regressor.state_dict(),
           'electra_regressor.pth')

# save to output file
output_file = 'predicted_labels.csv'

with open(output_file, 'w', newline='') as f:
    writer = csv.writer(f)
    for coord in predicted_coords:
        writer.writerow([coord])
# %%

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading data...
Time taken to load data: 97.37684369087219 seconds
Epoch 1/10
Train loss: 1487.2919921875
Test distance: 1437.6474093764575

Epoch 2/10
Train loss: 1503.8245849609375
Test distance: 1414.4339980395873

Epoch 3/10
Train loss: 1478.60400390625
Test distance: 1398.4499300373134

Epoch 4/10
Train loss: 1523.2249755859375
Test distance: 1396.7306310852962

Epoch 5/10
Train loss: 1480.136474609375
Test distance: 1404.5912912852727



KeyboardInterrupt: ignored