In [1]:
import os
# Restrict this process to a single GPU. These variables must be set before
# CUDA is initialised (i.e. before the first CUDA call made through torch).
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # enumerate GPUs in PCI bus order
    "CUDA_VISIBLE_DEVICES": "0",         # expose only the first GPU of that ordering
})

In [2]:
# Shell escape: print the NVIDIA driver's per-GPU status table (memory use and utilisation).
!nvidia-smi

Tue Mar  7 15:22:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:01:00.0 Off |                    0 |
| N/A   29C    P0    61W / 400W |   3872MiB / 40960MiB |      4%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  On   | 00000000:41:00.0 Off |                    0 |
| N/A   58C    P0   327W / 400W |  22942MiB / 40960MiB |     97%      Default |
|       

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch.nn as nn
import json
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import wandb
# Start a Weights & Biases run; the wandb.log(...) calls in the training cells report to it.
wandb.init(project='misc', entity='haonanl5')

[34m[1mwandb[0m: Currently logged in as: [33mhaonanl5[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Data preparation

In [5]:
# Read the place records (one JSON object per index key) into a DataFrame and
# discard incomplete rows in a single expression.
with open('/l/users/haonan.li/LAMB/data/TourQue_Knowledge_Sel.json') as f:
    # remove the lines without latlong
    # NOTE(review): dropna() is called without `subset`, so a row missing ANY
    # column is dropped, not only rows missing the coordinates - confirm this
    # is intended.
    data = pd.read_json(f, orient='index').dropna()

# Create a PyTorch dataset
class PlacesDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with place coordinates.

    Args:
        encodings: mapping from field name to an indexable collection with one
            entry per sample (every field is converted with ``torch.tensor``).
        latlongs: indexable collection with one coordinate entry per sample.
    """

    def __init__(self, encodings, latlongs):
        self.encodings = encodings
        self.latlong = latlongs

    def __len__(self):
        # The number of samples is defined by the coordinate list.
        return len(self.latlong)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``: every encoding field plus 'latlong'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['latlong'] = torch.tensor(self.latlong[idx])
        return sample

### Model preparation

In [6]:
# Number of transformer blocks to keep from the pretrained DistilBERT encoder.
n_layers = 2
model = AutoModel.from_pretrained('distilbert-base-uncased')
model.transformer.layer = model.transformer.layer[:n_layers] # keep only n_layers
# Keep the config in sync with the truncated architecture. Without this,
# save_pretrained() writes a config that still advertises the original depth,
# and reloading the checkpoint would rebuild the missing blocks with freshly
# initialised weights.
model.config.n_layers = n_layers
# Some transformers versions also cache the depth on the Transformer module;
# update it only if the attribute exists (TODO confirm for the pinned version).
if hasattr(model.transformer, 'n_layers'):
    model.transformer.n_layers = n_layers
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Hold out 10% of the rows for evaluation; the split is seeded for repeatability.
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)


def _encode_names(frame):
    """Tokenize the 'name' column of ``frame`` (padded, truncated to 64 tokens)."""
    names = frame['name'].tolist()
    return tokenizer(names, padding=True, truncation=True, max_length=64)


train_encodings = _encode_names(train_data)
test_encodings = _encode_names(test_data)

# Wrap the encodings and their coordinates into datasets, then into loaders.
# Only the training loader shuffles.
train_dataset = PlacesDataset(train_encodings, train_data['lat_long'].tolist())
test_dataset = PlacesDataset(test_encodings, test_data['lat_long'].tolist())

train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=16)


### Training preparation

In [8]:
# Hyperparameters.
# NOTE(review): in the cells shown here, batch_size, num_epochs and max_seq_len
# are never read - the data loaders use a literal batch size of 16, the
# tokenizer call a literal max_length of 64, and the training cell sets its
# own `epochs`. Confirm whether these values are meant to be wired in.
learning_rate = 2e-5
batch_size = 8
num_epochs = 3
max_seq_len = 64
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# AdamW over all model parameters, plus an exponential LR decay schedule.
# NOTE(review): no scheduler.step() call appears in the visible cells, so the
# decay has no effect unless it is stepped elsewhere.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
# Placeholder criterion; a later cell rebinds `criterion` to PlaceDistanceLoss.
criterion = torch.nn.MSELoss()


In [9]:
import math
import torch
import torch.nn as nn
import numpy as np

class PlaceDistanceLoss(nn.Module):
    """Triplet ranking loss tying embedding similarity to geographic proximity.

    For every anchor ``i`` and every unordered pair ``{j, k}`` of other batch
    items, the gap between the cosine similarities ``sim(i, j) - sim(i, k)`` is
    pushed towards the gap between the ground-truth similarities
    ``gt(i, j) - gt(i, k)``. The ground-truth similarity is a Gaussian kernel
    over the normalised great-circle distance between the two places.

    Args:
        margin: stored on the instance for interface compatibility only;
            ``forward`` does not read it. The ground-truth similarity gap acts
            as the (adaptive) margin of each triplet.
    """

    def __init__(self, margin):
        super(PlaceDistanceLoss, self).__init__()
        self.margin = margin

    def forward(self, embeddings, coords, device):
        """Compute the mean hinge loss over all triplets of the batch.

        Args:
            embeddings: ``(N, D)`` tensor of place-name representations.
            coords: ``(N, 2)`` tensor of ``[latitude, longitude]`` in degrees.
            device: device the returned loss is placed on (expected to be the
                device ``embeddings`` already lives on).

        Returns:
            Scalar tensor. For ``N < 3`` there are no triplets, so a zero that
            is still attached to the autograd graph is returned; this keeps
            ``loss.backward()`` valid for a short final batch.
        """
        N = embeddings.size(0)
        if N < 3:
            # No (anchor, j, k) triplet exists; dividing by the triplet count
            # would yield NaN and a loss without a graph.
            return (embeddings.sum() * 0.0).to(device)

        # --- Ground truth: pairwise great-circle distance (Haversine) -------
        coordinates_rad = torch.deg2rad(coords)
        lat = coordinates_rad[:, 0]
        lon = coordinates_rad[:, 1]
        dlat = lat[:, None] - lat[None, :]
        dlon = lon[:, None] - lon[None, :]

        a = torch.sin(dlat / 2) ** 2 + torch.cos(lat)[:, None] * torch.cos(lat)[None, :] * torch.sin(dlon / 2) ** 2
        # Round-off can push `a` marginally outside [0, 1], which would make
        # sqrt(1 - a) NaN for (near-)antipodal points.
        a = a.clamp(0.0, 1.0)
        c = 2 * torch.atan2(torch.sqrt(a), torch.sqrt(1 - a))
        # `c` is the central angle in [0, pi]; dividing by pi gives a unit-free
        # distance in [0, 1] (no Earth radius is applied).
        distances = c / math.pi

        # --- Predicted: pairwise cosine similarity of the embeddings --------
        similarities = torch.nn.functional.cosine_similarity(embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=-1)

        # Convert distances to similarities using a Gaussian kernel whose width
        # is the spread of the batch's distance matrix. The lower bound avoids
        # 0/0 when every place in the batch has the same coordinates.
        sigma = distances.std().clamp_min(1e-8)
        similarities_gt = torch.exp(-distances ** 2 / (2 * sigma ** 2))

        # --- Triplet hinge, evaluated for all (i, j, k) at once -------------
        # gap[i, j, k] = value[i, j] - value[i, k]
        sim_gap = similarities.unsqueeze(2) - similarities.unsqueeze(1)
        gt_gap = similarities_gt.unsqueeze(2) - similarities_gt.unsqueeze(1)
        # If j is the less similar place in the ground truth (gt_gap < 0) the
        # predicted gap must not exceed gt_gap; otherwise it must reach it.
        violation = torch.where(gt_gap < 0, sim_gap - gt_gap, gt_gap - sim_gap)
        hinge = torch.max(violation, torch.zeros_like(violation))

        # Each unordered pair {j, k} is counted once per anchor: k < j, both != i.
        idx = torch.arange(N, device=embeddings.device)
        anchor = idx.view(N, 1, 1)
        first = idx.view(1, N, 1)
        second = idx.view(1, 1, N)
        valid = (first != anchor) & (second != anchor) & (second < first)

        # Normalize the loss by the number of triplets
        num_triplets = N * (N - 1) * (N - 2) / 2
        loss = hinge[valid].sum() / num_triplets

        return loss.to(device)
criterion = PlaceDistanceLoss(margin=2)

In [10]:
# Define the training loop
def train(model, train_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``train_loader``.

    Each batch is moved to ``device`` and encoded by ``model``; the hidden state
    at sequence position 0 is scored by ``criterion`` against the batch
    coordinates. Every batch loss is logged to wandb as "Train Loss".

    Returns:
        Sum of the batch losses divided by ``len(train_loader)``.
    """
    model.train()
    batch_losses = []
    for batch in train_loader:
        tokens, mask, coords = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'latlong')
        )
        optimizer.zero_grad()
        hidden = model(tokens, attention_mask=mask).last_hidden_state
        first_token = hidden[:, 0, :]
        loss = criterion(first_token, coords.float(), device)
        loss.backward()
        optimizer.step()
        step_loss = loss.item()
        wandb.log({"Train Loss": step_loss})
        batch_losses.append(step_loss)
    return sum(batch_losses) / len(train_loader)


# Define the evaluation loop
def evaluate(model, test_loader, criterion, device):
    """Score ``model`` on ``test_loader`` without tracking gradients.

    The model is switched to eval mode; for each batch the hidden state at
    sequence position 0 is passed to ``criterion`` with the batch coordinates.

    Returns:
        Sum of the batch losses divided by ``len(test_loader)``.
    """
    model.eval()
    running = 0
    with torch.no_grad():
        for batch in test_loader:
            on_device = {
                field: batch[field].to(device)
                for field in ('input_ids', 'attention_mask', 'latlong')
            }
            encoded = model(on_device['input_ids'], attention_mask=on_device['attention_mask'])
            first_token = encoded.last_hidden_state[:, 0, :]
            running += criterion(first_token, on_device['latlong'].float(), device).item()
    return running / len(test_loader)

In [11]:
# Train the model
# Move the model to the GPU when one is available, then run `epochs` passes of
# train + evaluate, logging the per-epoch averages to wandb.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
epochs = 1
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    test_loss = evaluate(model, test_loader, criterion, device)
    wandb.log({"Avg Train Loss":train_loss, "Test Loss": test_loss})
    # The path does not depend on `epoch`, so each epoch overwrites the previous checkpoint.
    # NOTE(review): save_pretrained expects a directory, so despite the `.pth`
    # suffix this presumably creates a folder of that name - confirm.
    model.save_pretrained(f'/l/users/haonan.li/LAMB/data/tmp/loc_{n_layers}layer.pth')

Exception in thread SystemMonitor:
Traceback (most recent call last):
  File "/home/haonan.li/.conda/envs/torch13cu117/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/haonan.li/.conda/envs/torch13cu117/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/home/haonan.li/.conda/envs/torch13cu117/lib/python3.10/site-packages/wandb/sdk/internal/system/system_monitor.py", line 118, in _start
    asset.start()
  File "/home/haonan.li/.conda/envs/torch13cu117/lib/python3.10/site-packages/wandb/sdk/internal/system/assets/cpu.py", line 166, in start
    self.metrics_monitor.start()
  File "/home/haonan.li/.conda/envs/torch13cu117/lib/python3.10/site-packages/wandb/sdk/internal/system/assets/interfaces.py", line 168, in start
    logger.info(f"Started {self._process.name}")
AttributeError: 'NoneType' object has no attribute 'name'


In [12]:
# Bare expression: the notebook renders the module's repr, listing its sub-modules.
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i