<a href="https://colab.research.google.com/github/izabellakacprzak/twitter-location-ner/blob/master/DataGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#######################################
#              CONSTANTS              #
#######################################

# Number of top-ranked BERT candidates kept for each masked word
# (used to slice the sorted predictions in get_predictions).
PREDICTIONS_LIMIT = 10
# Maximum number of Nominatim search results turned into new sentences
# for a single location.
SEARCH_LIMIT = 3
# Base URL of the OpenStreetMap Nominatim service and its search endpoint.
NOMINATIM_API_URL = "https://nominatim.openstreetmap.org"
NOMINATIM_SEARCH_ENDPOINT = f"{NOMINATIM_API_URL}/search"

In [None]:
#######################################
#               IMPORTS               #
#######################################

!pip install transformers
!pip install pytorch-pretrained-bert
from transformers import BertTokenizer
from pytorch_pretrained_bert import BertForMaskedLM
import torch
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

import requests
from typing import Dict
import csv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#######################################
#   LOCATIONS GENERATION USING BERT   #
#######################################

# get PREDICTIONS_LIMIT number of predictions of masked words
def get_predictions(original_sentence, masked_sentence):
  """Return the most likely replacement tokens for the [MASK] in a sentence.

  Args:
    original_sentence: the unmasked sentence. Currently unused; kept so that
      existing callers do not need to change.
    masked_sentence: the sentence with exactly the word to replace swapped
      for "[MASK]" (the caller also adds the "[CLS]" / "[SEP]" markers).

  Returns:
    A list of at most PREDICTIONS_LIMIT token strings, ordered from the
    highest-scoring prediction to the lowest.

  Raises:
    IndexError: if masked_sentence contains no "[MASK]" token.
  """
  # Vocabulary id of the mask token, used below to locate it in the input.
  mask_token = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('[MASK]'))[0]

  # Tokenize the sentence and map the word pieces to vocabulary ids.
  tokenized_text = tokenizer.tokenize(masked_sentence)
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

  # Single-sentence input: every token belongs to segment 0.
  segments_ids = [0] * len(tokenized_text)

  # Convert inputs to PyTorch tensors (batch of one sentence).
  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensors = torch.tensor([segments_ids])

  # Score every vocabulary entry at every position; no gradients needed.
  with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)

  # Position of the first masked token inside the (single) sentence.
  mask_index = (tokens_tensor == mask_token).nonzero()[0][1].item()

  # Ids of the best-scoring vocabulary entries at the masked position.
  ranked_ids = torch.argsort(predictions[0, mask_index], descending=True)
  top_ids = ranked_ids[:PREDICTIONS_LIMIT].tolist()

  return tokenizer.convert_ids_to_tokens(top_ids)

# Load the tokenizer and the masked-language model that get_predictions reads
# as module-level globals.
# NOTE(review): the tokenizer is loaded from 'bert-base-cased' while the model
# is 'bert-large-cased' — presumably both checkpoints share the same
# vocabulary; confirm, or load both from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForMaskedLM.from_pretrained('bert-large-cased')
# Switch the model to evaluation mode; it is only used for inference here.
model.eval()

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

100%|██████████| 1242874899/1242874899 [01:39<00:00, 12437909.32B/s]


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
      

In [None]:
#######################################
#   SENTENCE GENERATION USING BERT    #
#######################################

# Generate new sentences based on input text
# substituting existing locations with new
# synthetically generated locations, generated using Bert.
def generate_new_sentences(text, tags):
  """Create variants of a sentence with BERT-predicted replacement locations.

  Each word tagged "I-LOC" is masked in turn and BERT proposes replacements;
  a proposal is kept only when spaCy's first entity for it is labelled "GPE".

  Args:
    text: whitespace-separated sentence.
    tags: whitespace-separated labels, one per word of text.

  Returns:
    A pair (new_sentences, new_labels) of equally long lists. Every generated
    sentence replaces exactly one word with one predicted token, so its
    labels are the unchanged tags string. Both lists are empty when nothing
    could be generated or when text and tags have different lengths.
  """
  tokens = text.split()
  labels = tags.split()
  # Same guard as the Nominatim variant: without it a short tag list would
  # raise IndexError and a long one would silently emit misaligned labels.
  if len(tokens) != len(labels):
    print(f"Skipping sentence: {len(tokens)} words but {len(labels)} labels")
    return [], []

  new_sentences = []
  new_labels = []
  for idx, label in enumerate(labels):
    # Only substitute existing locations
    if label != "I-LOC":
      continue
    masked = tokens[:]
    masked[idx] = "[MASK]"
    masked_sentence = "[CLS] " + " ".join(masked) + " . [SEP]"
    for location in get_predictions(text, masked_sentence):
      # Keep only candidates that spaCy recognises as a geopolitical entity.
      doc = nlp(location)
      if len(doc.ents) > 0 and doc.ents[0].label_ == "GPE":
        new_sentence = tokens[:]
        new_sentence[idx] = location
        new_sentences.append(" ".join(new_sentence))
        new_labels.append(tags)
  return new_sentences, new_labels

In [None]:
#######################################
# SENTENCE GENERATION USING NOMINATIM #
#######################################

# Search Nominatim endpoint for query location
def search_query(query: str, params: Dict[str, int]) -> "list | dict":
  """Look a location string up through the Nominatim search endpoint.

  Queries containing any digit are not sent at all.

  Args:
    query: the search string; spaces are expected to be already replaced
      by "+" (as get_location_from_nominatim does). Any other reserved
      character is percent-encoded here.
    params: extra query-string parameters, may be empty.

  Returns:
    The decoded JSON response (a list of result objects for a successful
    search), or an empty dict when the query is skipped, the request fails
    or the response is not valid JSON.
  """
  from urllib.parse import quote, urlencode

  if any(char.isdigit() for char in query):
    return {}

  # Keep "+" literal so it still stands for a space; encode everything else.
  encoded_query = quote(query, safe="+")
  # Drop empty parts so that empty params no longer yield "...&&format=json".
  url_parts = [f"q={encoded_query}", urlencode(params), "format=json"]
  query_string = "&".join(part for part in url_parts if part)
  request_url = f"{NOMINATIM_SEARCH_ENDPOINT}?{query_string}"
  print(request_url)

  try:
    response = requests.get(
        request_url,
        # The Nominatim usage policy asks clients to identify themselves;
        # the stock HTTP-library user agent is not accepted as identification.
        headers={"User-Agent": "twitter-location-ner-data-generation"},
        # Never let one stalled request hang the whole generation run.
        timeout=10,
    )
    response.raise_for_status()
    return response.json()
  except (requests.exceptions.RequestException, ValueError):
    # Best effort: network errors, HTTP errors and undecodable bodies all
    # mean "no suggestions for this location".
    return {}

# Look up locations similar to the input location on Nominatim
def get_location_from_nominatim(location):
  """Return the Nominatim search result for a location string.

  Hash signs are removed and spaces are turned into "+" before the
  string is handed to search_query with no extra parameters.
  """
  without_hashes = location.replace("#", "")
  query = without_hashes.replace(" ", "+")
  return search_query(query=query, params={})

# Returns a list of tokens generated from the input sentence
# (a token is a non-location word or a location string)
# and a list of ids of tokens which are locations
def cluster_words_in_tokens(sent, labels):
  """Merge consecutive location words of a sentence into single tokens.

  Args:
    sent: whitespace-separated sentence.
    labels: whitespace-separated labels, one per word of sent.

  Returns:
    A pair (tokens, location_idxs). tokens holds every non-location word
    on its own and every run of consecutive "I-LOC" words joined by spaces.
    location_idxs holds the positions WITHIN tokens of the location tokens
    (one entry per run, not one per word).

  Raises:
    IndexError: if labels has fewer entries than sent has words.
  """
  location_idxs = []
  tokens = []
  words = sent.split()
  labels = labels.split()
  idx = 0
  while idx < len(words):
    if labels[idx] == "I-LOC":
      # Consume the whole run of location words as one token.
      start = idx
      while idx < len(words) and labels[idx] == "I-LOC":
        idx += 1
      # Record the token position, not the word positions: callers index
      # into `tokens`, which is shorter than `words` after clustering.
      location_idxs.append(len(tokens))
      tokens.append(" ".join(words[start:idx]))
    else:
      tokens.append(words[idx])
      idx += 1

  return tokens, location_idxs

# Generate new sentences based on input text
# substituting existing locations with new
# synthetically generated locations, pulled from the Nominatim API.
def generate_new_sentences_nominatim(text, tags):
  """Create variants of a sentence with locations suggested by Nominatim.

  Every (possibly multi-word) location of the sentence is looked up and
  replaced, one at a time, by the display name of each of the first
  SEARCH_LIMIT search results.

  Args:
    text: whitespace-separated sentence.
    tags: whitespace-separated labels, one per word of text.

  Returns:
    A pair (new_sentences, new_labels) of equally long lists of strings.
    In each generated sentence the replaced location is tagged with one
    "I-LOC" per word of the new name; all other words keep their tags.
    Both lists are empty when text and tags differ in length or when no
    location produced a result.
  """
  if len(text.split()) != len(tags.split()):
    print("bad stuff")
    return [], []
  tokens, location_ids = cluster_words_in_tokens(text, tags)

  # Group the word-level tags per token so that tokens[i] and token_tags[i]
  # stay aligned even when a token spans several words.
  word_tags = tags.split()
  token_tags = []
  position = 0
  for token in tokens:
    width = len(token.split())
    token_tags.append(word_tags[position:position + width])
    position += width

  new_sentences = []
  new_labels = []
  for idx in location_ids:
    locations = get_location_from_nominatim(tokens[idx])
    # An empty dict (failed/skipped lookup) or empty list means no results.
    if not locations:
      continue
    for location in locations[:SEARCH_LIMIT]:
      name = location["display_name"]
      new_sentence = tokens.copy()
      new_sentence[idx] = name
      new_tags = token_tags.copy()
      new_tags[idx] = ["I-LOC"] * len(name.split())
      new_sentences.append(" ".join(new_sentence))
      new_labels.append(" ".join(tag for group in new_tags for tag in group))
  return new_sentences, new_labels

In [None]:
#######################################
#         SENTENCE GENERATION         #
#######################################

# For each sentence in csvs/existing_sentences.csv generate new sentences
# with artificially computed new locations.
# Generated sentences are saved in csvs/generated.csv
# To switch between using Bert and the Nominatim API to generate
# sentences set the "source" variable to "BERT" or "NOMINATIM" accordingly
    
source = "NOMINATIM"
generated_texts = []
# The output is opened first and in "x" mode so the run aborts before doing
# any work if csvs/generated.csv already exists. Both files are closed by the
# with-statement even when generation fails halfway through.
with open("csvs/generated.csv", "x", newline="", encoding="utf-8") as generated, \
     open('csvs/existing_sentences.csv', 'r', newline="", encoding="utf-8") as file:
  reader = csv.reader(file)
  # csv.writer quotes fields when needed: Nominatim display names contain
  # commas, which corrupted the rows when they were written by hand.
  writer = csv.writer(generated, lineterminator="\n")
  for i, row in enumerate(reader):
    # Progress indicator
    if i % 100 == 0:
      print(i)
    # Skip blank or incomplete rows instead of crashing on row[1].
    if len(row) < 2:
      continue
    text = row[0]
    labels = row[1]
    if source == "NOMINATIM":
      new_sentences, new_labels = generate_new_sentences_nominatim(text, labels)
    else:
      new_sentences, new_labels = generate_new_sentences(text, labels)
    # The original sentence is always kept alongside the generated ones.
    new_sentences.append(text)
    new_labels.append(labels)

    writer.writerows(zip(new_sentences, new_labels))

0
https://nominatim.openstreetmap.org/search?q=Ireland&&format=json
https://nominatim.openstreetmap.org/search?q=Minneapolis&&format=json
https://nominatim.openstreetmap.org/search?q=page&&format=json
https://nominatim.openstreetmap.org/search?q=Miami&&format=json
https://nominatim.openstreetmap.org/search?q=Paris+Austria+germany+Australia&&format=json
https://nominatim.openstreetmap.org/search?q=Iran_Maryam&&format=json
https://nominatim.openstreetmap.org/search?q=DC&&format=json
https://nominatim.openstreetmap.org/search?q=Seattle&&format=json
https://nominatim.openstreetmap.org/search?q=Russia&&format=json
https://nominatim.openstreetmap.org/search?q=Chattanooga&&format=json
https://nominatim.openstreetmap.org/search?q=USA&&format=json
https://nominatim.openstreetmap.org/search?q=Atlanta&&format=json
https://nominatim.openstreetmap.org/search?q=Sacramento&&format=json
https://nominatim.openstreetmap.org/search?q=San+Bernardino&&format=json
https://nominatim.openstreetmap.org/search?