In [4]:
import json
import pandas as pd

# LOCATION OF THE OSM DATA FOR FINE-TUNING

data_ny = 'sample_datasets/osm_ny.csv'

data_mn = 'sample_datasets/osm_mn.csv'

data = data_ny

In [5]:
# DEFAULT iS NEW YORK DATASET RUN THIS CELL TO USE THE MINNESOTA DATASET
data = data_mn

In [6]:
## CONSTRUCT DATASET FOR FINE TUNING ##

# Read data from .csv data file

state_frame = pd.read_csv(data)


# construct list of names and coordinates from data
name_list = []
coordinate_list = []
for i, item in state_frame.iterrows():
    name = item[1]
    lat = item[2]
    lng =item[3]
    name_list.append(name)
    coordinate_list.append([lng,lat])


# construct KDTree out of coordinates list for when we make the neighbor lists
import scipy.spatial as scp

ordered_neighbor_coordinate_list = scp.KDTree(coordinate_list)

In [7]:
state_frame

Unnamed: 0.1,Unnamed: 0,name,latitude,longitude
0,0,Duluth,-92.125122,46.772932
1,1,Walker,-94.585026,47.101709
2,2,Discount Liquor,-94.886617,47.490564
3,3,Laporte,-94.753356,47.214110
4,4,Avoca,-95.643418,43.949721
...,...,...,...,...
42974,42974,Glencoe Veterans Memorial,-94.132288,44.768892
42975,42975,Elephant Lake Lookout Tower,-92.750538,48.179819
42976,42976,Charles Cabinet Co,-93.187145,45.034487
42977,42977,SAC Towing & Recovery,-92.903192,44.490295


In [9]:

# Get top 20 nearest neighbors for each entity in dataset
with open('sample_datasets/SPABERT_finetuning_data.json', 'w') as out_f:
    for i, item in state_frame.iterrows():
        name = item[1]
        lat = item[2]
        lng = item[3]
        coordinates = [lng,lat]

        _, nearest_neighbors_idx = ordered_neighbor_coordinate_list.query([coordinates], k=21)

        # we want to store their names and coordinates

        nearest_neighbors_name = []
        nearest_neighbors_coords = []
        
        # iterate over nearest neighbors list
        for idx in nearest_neighbors_idx[0]:
            # get name and coordinate of neighbor
            neighbor_name = name_list[idx]
            neighbor_coords = coordinate_list[idx]
            nearest_neighbors_name.append(neighbor_name)
            nearest_neighbors_coords.append({"coordinates": neighbor_coords})
        
        # construct neighbor info dictionary object for SpaBERT embedding construction
        neighbor_info = {"name_list":nearest_neighbors_name, "geometry_list":nearest_neighbors_coords}


        # construct full dictionary object for SpaBERT embedding construction
        place = {"info":{"name":name, "geometry":{"coordinates": coordinates}}, "neighbor_info":neighbor_info}

        out_f.write(json.dumps(place))
        out_f.write('\n')

In [11]:
### FINE-TUNE SPABERT
import sys
from transformers.models.bert.modeling_bert import BertForMaskedLM
from transformers import BertTokenizer
sys.path.append("../")
from models.spatial_bert_model import SpatialBertConfig
from utils.common_utils import load_spatial_bert_pretrained_weights
from models.spatial_bert_model import  SpatialBertForMaskedLM

# load dataset we just created

dataset = 'sample_datasets/SPABERT_finetuning_data.json'

# load pre-trained spabert model

pretrained_model = 'sample_datasets/mlm_mem_keeppos_ep9_iter02799_0.2372.pth'


# load bert model and tokenizer as well as the SpaBERT config
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
config = SpatialBertConfig()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# load pre-trained spabert model
import torch
model = SpatialBertForMaskedLM(config)

model.load_state_dict(bert_model.state_dict() , strict = False) 

#model = load_spatial_bert_pretrained_weights(model, pretrained_model)

pre_trained_model = torch.load(pretrained_model)

model_keys = model.state_dict()
cnt_layers = 0
for key in model_keys:
    if key in pre_trained_model:
        model_keys[key] = pre_trained_model[key]
        cnt_layers += 1
    else:
        print("No weight for", key)
print(cnt_layers, 'layers loaded')

model.load_state_dict(model_keys)

205 layers loaded


<All keys matched successfully>

In [19]:
from datasets.osm_sample_loader import PbfMapDataset
from torch.utils.data import DataLoader
# load fine-tning dataset with data loader

fine_tune_dataset = PbfMapDataset(data_file_path = dataset, 
                                        tokenizer = tokenizer, 
                                        max_token_len = 300, 
                                        distance_norm_factor = 0.0001, 
                                        spatial_dist_fill = 20, 
                                        with_type = False,
                                        sep_between_neighbors = False, 
                                        label_encoder = None,
                                        mode = None)
#initialize data loader
train_loader = DataLoader(fine_tune_dataset, batch_size=12, num_workers=5, shuffle=False, pin_memory=True, drop_last=True)



In [20]:
import torch
# cast our loaded model to a gpu if one is available, otherwise use the cpu
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# set model to training mode
model.train()

SpatialBertForMaskedLM(
  (bert): SpatialBertModel(
    (embeddings): SpatialEmbedding(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (sent_position_embedding): Embedding(512, 768)
      (spatial_position_embedding): ContinuousSpatialPositionalEmbedding()
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out

In [21]:
### FINE TUNING PROCEDURE ###
from tqdm import tqdm 
from transformers import AdamW
# initialize optimizer
optim = AdamW(model.parameters(), lr = 5e-5)

# setup loop with TQDM and dataloader
epoch = tqdm(train_loader, leave=True)
iter = 0
for batch in epoch:
    # initialize calculated gradients from previous step
    optim.zero_grad()

    # pull all tensor batches required for training
    input_ids = batch['masked_input'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    position_list_x = batch['norm_lng_list'].to(device)
    position_list_y = batch['norm_lat_list'].to(device)
    sent_position_ids = batch['sent_position_ids'].to(device)

    labels = batch['pseudo_sentence'].to(device)

    # get outputs of model
    outputs = model(input_ids, attention_mask = attention_mask, sent_position_ids = sent_position_ids,
                position_list_x = position_list_x, position_list_y = position_list_y, labels = labels)
    

    # calculate loss
    loss = outputs.loss

    # perform backpropigation
    loss.backward()

    optim.step()
    epoch.set_postfix({'loss':loss.item()})


    iter += 1
torch.save(model.state_dict(), "sample_datasets/fine-tuned_model.pth")


  2%|▏         | 54/3581 [00:15<17:22,  3.38it/s, loss=0.402]


KeyboardInterrupt: 

In [5]:
# perform entity linking
from transformers import BertTokenizer
import torch
from whg_dataset_loader import WhgDataset
from spabert.datasets.usgs_os_sample_loader import USGS_MapDataset
from spabert.datasets.wikidata_sample_loader import Wikidata_Geocoord_Dataset, Wikidata_Random_Dataset
from spabert.models.spatial_bert_model import SpatialBertModel
from spabert.models.spatial_bert_model import SpatialBertConfig

from spabert.utils.find_closest import find_ref_closest_match, sort_ref_closest_match
from spabert.utils.common_utils import load_spatial_bert_pretrained_weights, get_spatialbert_embedding, get_bert_embedding, write_to_csv
from spabert.utils.baseline_utils import get_baseline_model



# entity linking function

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        
config = SpatialBertConfig()
model = SpatialBertModel(config)

model.to(device)
model.eval()

#model = load_spatial_bert_pretrained_weights(model, 'data/fine-tuned_model.pth')

sep_between_neighbors = False
wikidata_dict_per_map = {}
wikidata_dict_per_map['wikidata_emb_list'] = []
wikidata_dict_per_map['wikidata_qid_list'] = []
wikidata_dict_per_map['names'] = []


whg_dataset = WhgDataset(
    data_file_path = 'data/whg.json',
    tokenizer = tokenizer,
    max_token_len = 512, 
    distance_norm_factor = 25, 
    spatial_dist_fill=100,
    sep_between_neighbors = sep_between_neighbors)

wikidata_dataset = WhgDataset(
    data_file_path='data/wikidata.json',
    tokenizer=tokenizer,
    max_token_len=512,
    distance_norm_factor=50000,
    spatial_dist_fill=20,
    sep_between_neighbors=sep_between_neighbors)


matched_wikid_dataset = []
for i in range(len(wikidata_dataset)):
    emb = wikidata_dataset[i]
    matched_wikid_dataset.append(emb)
    max_dist_lng = max(emb['norm_lng_list'])
    max_dist_lat = max(emb['norm_lat_list'])


In [10]:
from spabert.experiments.entity_matching.data_processing import request_wrapper
import scipy.spatial as sp
import numpy as np

# disambigufy
def disambiguify(model, model_name, usgs_dataset, wikidata_dict_list, candset_mode = 'all_map', if_use_distance = True, select_indices = None): 

    if select_indices is None: 
        select_indices = range(0, len(wikidata_dict_list))


    assert(candset_mode in ['all_map','per_map'])
    wikidata_emb_list = wikidata_dict_list['wikidata_emb_list']#[wikidata_dict['wikidata_emb_list'] for wikidata_dict in wikidata_dict_list]
    wikidata_qid_list = wikidata_dict_list['wikidata_qid_list'] #for wikidata_dict in wikidata_dict_list]
    wikidata_name_list = wikidata_dict_list['names']
    wrapper = request_wrapper.RequestWrapper()
    ret_list = []
    for i in range(len(usgs_dataset)):
        if (i % 1000) == 0:
            print("disambigufy at " + str((i/len(usgs_dataset))*100)+"%")
            #wikidata_des_list = wikidata_des_list[i]
        if model_name == 'spatial_bert-base' or model_name == 'spatial_bert-large':
            usgs_emb = get_spatialbert_embedding(usgs_dataset[i], model, use_distance = if_use_distance)
        else:
            usgs_emb = get_bert_embedding(usgs_dataset[i], model)
        sim_matrix = 1 - sp.distance.cdist(np.array(wikidata_emb_list), np.array([usgs_emb]), 'cosine')
        closest_match_qid = sort_ref_closest_match(sim_matrix, wikidata_qid_list)
        #closest_match_des = sort_ref_closest_match(sim_matrix, wikidata_des_list)

            
        sorted_sim_matrix = np.sort(sim_matrix, axis = 0)[::-1] # descending order

        ret_dict = dict()
        ret_dict['pivot_name'] = usgs_dataset[i]['pivot_name']
        ret_dict['sorted_match_qid'] = [a[0] for a in closest_match_qid]
        #ret_dict['sorted_match_des'] = [a[0] for a in closest_match_des]
        ret_dict['sorted_sim_matrix'] = [a[0] for a in sorted_sim_matrix]

        ret_list.append(ret_dict)

    return ret_list 


candset_mode = 'all_map'
for i in range(0, len(matched_wikid_dataset)):
    if (i % 1000) == 0:
        print("processing at: "+ str(i/len(matched_wikid_dataset)*100) + "%")
        wikidata_emb = get_spatialbert_embedding(matched_wikid_dataset[i], model)

        wikidata_dict_per_map['wikidata_emb_list'].append(wikidata_emb)
        wikidata_dict_per_map['wikidata_qid_list'].append(matched_wikid_dataset[i]['qid'])
        wikidata_dict_per_map['names'].append(wikidata_dataset[i]['pivot_name'])

    ret_list = disambiguify(model, 'spatial_bert-base', whg_dataset, wikidata_dict_per_map, candset_mode= candset_mode, if_use_distance = not False, select_indices = None)
    write_to_csv('data/', "output.csv", ret_list)


processing at: 0.0%
disambigufy at 0.0%




disambigufy at 21.62629757785467%
disambigufy at 43.25259515570934%
disambigufy at 64.87889273356402%
disambigufy at 86.50519031141869%
disambigufy at 0.0%
disambigufy at 21.62629757785467%
disambigufy at 43.25259515570934%
disambigufy at 64.87889273356402%
disambigufy at 86.50519031141869%
disambigufy at 0.0%
disambigufy at 21.62629757785467%
disambigufy at 43.25259515570934%
disambigufy at 64.87889273356402%
disambigufy at 86.50519031141869%
disambigufy at 0.0%
disambigufy at 21.62629757785467%
disambigufy at 43.25259515570934%
disambigufy at 64.87889273356402%
disambigufy at 86.50519031141869%


KeyboardInterrupt: 