<a href="https://colab.research.google.com/github/hugotomita1201/yachay.ai_project/blob/main/twhin_ungrouped_conc_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentencepiece
!pip install transformers


# %%
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig, AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer
from transformers import BertConfig
import pandas as pd
import csv
import time
import pickle
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
import numpy as np


import os
# CUDA_LAUNCH_BLOCKING=1 requests synchronous CUDA kernel launches, so that
# device-side errors surface at the call that caused them. This is a debugging
# aid and is expected to slow GPU execution.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# use gpu if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Dataset; later code reads columns such as 'text_3', 'group' and the
# group-centroid latitude/longitude columns from this frame.
df = pd.read_csv('df_10_ungrouped_conc_text.csv')

# Hugging Face checkpoint name used for the config, encoder and tokenizer
model_name = 'Twitter/twhin-bert-base'

# NOTE(review): this config object is not passed to AutoModel.from_pretrained
# below, so setting num_labels here does not affect the loaded encoder.
config = BertConfig.from_pretrained(model_name)
config.num_labels = 10  # number of regions for classification
# Encoder weights are loaded and moved to the selected device immediately.
bert_model = AutoModel.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(
    model_name)

# Earlier RoBERTa/BERTweet variant, left commented out by the author.
#bert_model = RobertaModel.from_pretrained(model_name, config=config)
#tokenizer = BertweetTokenizer.from_pretrained(model_name)

# preprocess data with function


def haversine_distance(lat1, lon1, lat2, lon2):
    """Return element-wise great-circle distances in kilometres.

    Args:
        lat1: Latitudes of the first points, in degrees.
        lon1: Longitudes of the first points, in degrees.
        lat2: Latitudes of the second points, in degrees.
        lon2: Longitudes of the second points, in degrees.

        Each argument may be a ``torch.Tensor`` (kept on its device and in
        its autograd graph), an object exposing ``to_numpy()`` such as a
        pandas ``Series``, or any array-like / scalar accepted by
        ``torch.as_tensor``.

    Returns:
        A ``torch.float`` tensor of haversine distances in km, computed with
        an Earth radius of 6371 km.
    """
    earth_radius = 6371  # Earth radius in km

    def _to_radians(values):
        # Tensors are used as-is so gradients can flow through the result;
        # everything else is converted to a float tensor first.
        if isinstance(values, torch.Tensor):
            degrees = values.to(torch.float)
        elif hasattr(values, 'to_numpy'):
            degrees = torch.as_tensor(values.to_numpy(), dtype=torch.float)
        else:
            degrees = torch.as_tensor(values, dtype=torch.float)
        return torch.deg2rad(degrees)

    lat1, lon1, lat2, lon2 = (
        _to_radians(x) for x in (lat1, lon1, lat2, lon2))

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = torch.sin(dlat / 2) ** 2 + torch.cos(lat1) * \
        torch.cos(lat2) * torch.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make sqrt(1 - a) return NaN.
    a = a.clamp(0.0, 1.0)
    c = 2 * torch.atan2(torch.sqrt(a), torch.sqrt(1 - a))

    return earth_radius * c


class HaversineLoss(nn.Module):
    """Loss equal to the mean great-circle distance (km) between two point sets.

    The distance is computed with torch operations on the input tensors so the
    result remains differentiable with respect to ``y_pred``.
    """

    # Earth radius in km
    EARTH_RADIUS_KM = 6371

    def __init__(self):
        super().__init__()

    def forward(self, y_pred, y_true):
        """Return the mean haversine distance between predictions and targets.

        Args:
            y_pred: Tensor indexed as ``[:, 0]`` for latitude and ``[:, 1]``
                for longitude, both in degrees.
            y_true: Tensor with the same layout as ``y_pred``.

        Returns:
            A zero-dimensional tensor holding the mean distance in km.
        """
        lat1, lon1, lat2, lon2 = (
            torch.deg2rad(column.to(torch.float))
            for column in (y_pred[:, 0], y_pred[:, 1],
                           y_true[:, 0], y_true[:, 1]))

        dlat = lat2 - lat1
        dlon = lon2 - lon1

        a = torch.sin(dlat / 2) ** 2 + torch.cos(lat1) * \
            torch.cos(lat2) * torch.sin(dlon / 2) ** 2
        # Guard against rounding pushing `a` outside [0, 1], which would
        # produce NaN in sqrt(1 - a).
        a = a.clamp(0.0, 1.0)
        c = 2 * torch.atan2(torch.sqrt(a), torch.sqrt(1 - a))

        distance = self.EARTH_RADIUS_KM * c
        return distance.mean()


def preprocess_data(df, tokenizer):
    """Split ``df`` into train/test sets and tokenize the text column.

    Args:
        df: DataFrame with the columns ``'text_3'`` (input text), ``'group'``
            (integer class label), ``'group_centroid_lat'`` and
            ``'group_centroid_lng'``.
        tokenizer: Callable tokenizer invoked with a list of strings and the
            keyword arguments ``padding``, ``truncation``, ``return_tensors``
            and ``max_length``; its result is indexed with ``'input_ids'``
            and ``'attention_mask'``.

    Returns:
        A 9-tuple ``(train_encodings, test_encodings, train_input_ids,
        test_input_ids, train_attention_mask, test_attention_mask,
        train_labels, test_labels, test_labels_centroids)``. The id, mask and
        label tensors are moved to the module-level ``device``;
        ``test_labels_centroids`` is the test-split DataFrame of centroid
        coordinates.
    """
    # One call splits text, labels and centroids with the same 80/20
    # partition, so the three test sets are row-aligned by construction.
    (train_features, test_features,
     train_labels, test_labels,
     _, test_labels_centroids) = train_test_split(
        df['text_3'],
        df[['group']],
        df[['group_centroid_lat', 'group_centroid_lng']],
        test_size=0.2, random_state=12345)

    def _encode(texts):
        # Pad to the longest sequence in the set and truncate at 512 tokens.
        return tokenizer(texts.values.tolist(), padding=True,
                         truncation=True, return_tensors='pt', max_length=512)

    train_encodings = _encode(train_features)
    test_encodings = _encode(test_features)

    train_input_ids = train_encodings['input_ids'].to(device)
    test_input_ids = test_encodings['input_ids'].to(device)

    # make attention masks to let know which tokens are real and which are padding
    train_attention_mask = train_encodings['attention_mask'].to(device)
    test_attention_mask = test_encodings['attention_mask'].to(device)

    # Convert labels to PyTorch tensors with long datatype
    train_labels = torch.tensor(
        train_labels['group'].values, dtype=torch.long).to(device)
    test_labels = torch.tensor(
        test_labels['group'].values, dtype=torch.long).to(device)

    return (train_encodings, test_encodings, train_input_ids, test_input_ids,
            train_attention_mask, test_attention_mask, train_labels,
            test_labels, test_labels_centroids)


# Time the one-off split + tokenization pass over the whole dataset
start_time = time.time()
(
    train_encodings,
    test_encodings,
    train_input_ids,
    test_input_ids,
    train_attention_mask,
    test_attention_mask,
    train_labels,
    test_labels,
    test_labels_centroids,
) = preprocess_data(df, tokenizer)
end_time = time.time()
print(f"Preprocessing took {end_time - start_time} seconds")

# Report the shapes of the tokenized train and test id tensors
print(train_input_ids.shape)
print(test_input_ids.shape)

# Define the BERT-based classification model


class BERTClassifier(nn.Module):
    """Encoder followed by dropout and a linear classification head.

    Args:
        bert_model: Encoder module. It must expose ``config.hidden_size`` and,
            when called with ``input_ids`` and ``attention_mask``, return a
            sequence whose first element is indexable as
            ``[batch, position, hidden]``.
        num_regions: Number of output classes.
    """

    def __init__(self, bert_model, num_regions):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.10)
        # Size the head from the encoder that was actually passed in rather
        # than from a module-level config, so the two can never disagree.
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_regions)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores of shape ``(batch, num_regions)``."""
        bert_outputs = self.bert(
            input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at the first sequence position as the
        # sequence representation (the [CLS] position for BERT-style models).
        pooled_output = bert_outputs[0][:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


# Number of target classes for the classification head
num_regions = 10

# Wrap the pretrained encoder with the classification head, then place the
# combined module on the selected device
bert_classifier = BERTClassifier(bert_model, num_regions)
bert_classifier = bert_classifier.to(device)

# Cross-entropy over the class logits, optimised with AdamW
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(
    bert_classifier.parameters(),
    lr=5e-5,
    weight_decay=1e-9,
)

# %%
# Train the model
num_epochs = 5
batch_size = 8
test_batch_size = 4

# One row per group with its centroid coordinates. This does not change
# between epochs, so build the lookup table once.
group_centroids = df[['group', 'group_centroid_lat', 'group_centroid_lng']].drop_duplicates(
    subset=['group'])

# set a timer to see how long it takes to train the model
start_time = time.time()
# print starting time
print(f"Training started at {time.ctime(start_time)}")
for epoch in range(num_epochs):
    # Pretrained encoders are returned in eval mode; switch the whole model to
    # training mode so dropout is active while fine-tuning.
    bert_classifier.train()
    for i in range(0, len(train_input_ids), batch_size):
        input_ids_batch = train_input_ids[i:i + batch_size].to(device)
        attention_mask_batch = train_attention_mask[i:i + batch_size].to(device)
        region_ids_batch = train_labels[i:i + batch_size].to(device)

        optimizer.zero_grad()

        logits = bert_classifier(input_ids_batch, attention_mask_batch)
        loss = loss_fn(logits, region_ids_batch)

        loss.backward()
        optimizer.step()

    # Evaluate the model on the test set after each epoch, with dropout
    # disabled so the metrics are deterministic.
    bert_classifier.eval()
    test_losses = []
    test_predictions = []
    test_actual_labels = []
    with torch.no_grad():
        for i in range(0, len(test_input_ids), test_batch_size):
            test_input_ids_batch = test_input_ids[i:i + test_batch_size].to(device)
            test_attention_mask_batch = test_attention_mask[i:i + test_batch_size].to(
                device)
            test_labels_batch = test_labels[i:i + test_batch_size].to(device)

            test_logits_batch = bert_classifier(
                test_input_ids_batch, test_attention_mask_batch)
            test_loss_batch = loss_fn(test_logits_batch, test_labels_batch)

            test_losses.append(test_loss_batch.item())

            # collect predicted and actual class ids for this batch
            test_predictions.append(torch.argmax(
                test_logits_batch, dim=1).cpu().numpy())
            test_actual_labels.append(test_labels_batch.cpu().numpy())

    # make arrays into dataframes to merge with the centroid lookup table
    test_predictions_df = pd.DataFrame(np.concatenate(
        test_predictions, axis=0).reshape(-1, 1), columns=['group'])
    test_actual_labels_df = pd.DataFrame(np.concatenate(
        test_actual_labels, axis=0).reshape(-1, 1), columns=['group'])

    # map predicted and actual groups to their centroid coordinates
    predicted_centroids = test_predictions_df.merge(
        group_centroids, on='group', how='left')[['group_centroid_lat', 'group_centroid_lng']]
    labels_centroids = test_actual_labels_df.merge(
        group_centroids, on='group', how='left')[['group_centroid_lat', 'group_centroid_lng']]

    # haversine distance between predicted and actual group centroids
    haversine_distances = haversine_distance(
        predicted_centroids['group_centroid_lat'],
        predicted_centroids['group_centroid_lng'],
        labels_centroids['group_centroid_lat'],
        labels_centroids['group_centroid_lng'])

    # Reduce on the tensor instead of summing element by element in Python.
    avg_distance = haversine_distances.mean().item()

    avg_test_loss = sum(test_losses) / len(test_losses)
    test_accuracy = accuracy_score(
        test_actual_labels_df, test_predictions_df)
    print(
        f"Epoch {epoch+1}: Test loss={avg_test_loss:.4f}, Test accuracy={test_accuracy:.4f}, Avg distance={avg_distance:.4f} km")


end_time = time.time()
print(f"Training took {end_time - start_time} seconds")


# %%
# Persist the fine-tuned weights (state dict only, not the module object)
torch.save(
    bert_classifier.state_dict(),
    'bert_classifier_for_grouped_data.pth',
)

'''
@misc{he2021debertav3,
      title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing}, 
      author={Pengcheng He and Jianfeng Gao and Weizhu Chen},
      year={2021},
      eprint={2111.09543},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
'''
# %%


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m108.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting t

Downloading (…)lve/main/config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at Twitter/twhin-bert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at Twitter/twhin-bert-base and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bi

Downloading (…)okenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Preprocessing took 104.29016637802124 seconds
torch.Size([483364, 292])
torch.Size([120842, 177])
Training started at Sat May 13 08:14:46 2023
Epoch 1: Test loss=1.9927, Test accuracy=0.2288, Avg distance=1834.0698 km


KeyboardInterrupt: ignored