# Address2Building: Deep Learning-Based Building Name Extraction

### Other names:

1. **BERT-Based Building Name Extractor (BBNE)**
2. **Address2Building: Deep Learning-Based Building Name Extraction**
3. **BERTex: BERT Enhanced Text Extractor for Building Names**
4. **BuildBERT: Address Parsing and Building Name Recognition**
5. **AddressNet: Building Name Extraction Using BERT**


# Model: BERT

## Step 1: Imports

In [None]:
# imports 
import data_preprocessor
import string

# Abbreviation table read from JSON by the project helper. In the visible
# notebook it is only referenced by the commented-out standardisation step
# inside clean_address.
abbreviations = data_preprocessor.load_json("abbreviations.json")
# CSV file with the address / building-name pairs used for training.
fname = 'Data/buildings/Buildings_Dataset.csv'

## Step 2: Data Preparation

In [None]:
# Load the dataset through the project helper. The result is indexed by
# column name below, so pandas=True presumably yields a DataFrame -- TODO confirm.
df = data_preprocessor.load_corpus(fname, pandas = True, header = True)
# Disabled: dropping ticket-style columns ('Title', 'Created', 'Close Time', 'Queue').
# df = df.drop(columns=['Title', 'Created', 'Close Time', 'Queue'], axis=1)

# Parallel lists: addresses[i] is the raw address text and building_names[i]
# is the building name labelled for it.
addresses = df['Address'].tolist()
building_names = df['Building Name'].tolist()

# Number of rows loaded.
print(len(df))

# Last expression of the cell: the notebook renders the first rows as a preview.
df.head()

3550


Unnamed: 0,Address,Building Name
0,"House # B6, Block-B Floor Aftab Sultan residen...",Aftab Sultan Complex
1,"Apartment/Suite# B-3 , Building Block B, Aftab...",Aftab Sultan Complex
2,"Apartment/Suite# B-1 1st Floor, Building Block...",Aftab Sultan Complex
3,House # Aftab Sultan Resedention complex Appt ...,Aftab Sultan Complex
4,House # St 20 fL B2 Aftab sultan near postoffi...,Aftab Sultan Complex


In [None]:
# building_names = data_preprocessor.load_corpus('karachi_buildings.txt')

In [None]:
# # Sample data without commas; replace with your actual dataset
# addresses = [
#     "123 Elm St Windsor Building Apt 5A",
#     "456 Oak Rd Maple Complex Level 2",
#     "789 Pine Ave Cedar Towers Block B"
# ]

# building_names = [
#     "Windsor Building",
#     "Maple Complex",
#     "Cedar Towers"
# ]

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into 80% training and 20% testing.
# Addresses and building names are passed together so the pairs stay aligned;
# random_state=42 makes the split reproducible.
addresses_train, addresses_test, building_names_train, building_names_test = train_test_split(addresses, building_names, test_size=0.2, random_state=42)

# Further split the training data into 80% training and 20% validation.
# The *_train names are rebound here, so the final proportions of the full
# dataset are 64% train / 16% validation / 20% test.
addresses_train, addresses_val, building_names_train, building_names_val = train_test_split(addresses_train, building_names_train, test_size=0.2, random_state=42)


## Step 3: Model Initialization

In [None]:
# Initializing BERT Model
# Both the tokenizer and the model are loaded from the 'bert-base-uncased'
# checkpoint so that their vocabularies match.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


from transformers import BertForQuestionAnswering

# Span-prediction (question answering) variant: it outputs start/end logits.
# As the load-time warning in this notebook reports, the qa_outputs layer is
# newly initialised and must be trained before the predictions mean anything.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step 4: Data Preprocessing

In [None]:
def clean_address(address):
    """Return *address* lowercased with all whitespace normalised.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines) becomes a single space.

    Punctuation removal and abbreviation standardisation are intentionally
    not applied here.
    """
    # str.split() without a separator already ignores leading/trailing
    # whitespace and splits on whitespace runs, so joining the pieces back
    # with one space performs the whole normalisation.
    words = address.lower().split()
    return ' '.join(words)


def tokenize_for_bert(address, max_length=512):
    """Encode one address with the notebook-level ``tokenizer``.

    Args:
        address: Address text to encode.
        max_length: Fixed output length; shorter inputs are padded and longer
            ones are truncated to this many tokens.

    Returns:
        The tokenizer's encoding; callers read its ``'input_ids'`` and
        ``'attention_mask'`` entries.
    """
    return tokenizer.encode_plus(
        address,
        add_special_tokens=True,
        max_length=max_length,
        # `pad_to_max_length=True` is deprecated; this is the supported
        # spelling of the same fixed-length padding.
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
    )



def convert_labels_to_spans(address, building_name, tokenizer, max_length=512):
    """Locate the building name inside the tokenized address.

    Matching strategy, in order of preference:

    1. The first position where *all* building-name tokens appear
       contiguously in the address tokens.
    2. Otherwise, the first occurrence of the first building-name token,
       extended by the name's length (the original heuristic, kept so that
       approximately matching names still receive a label). The end index is
       clamped to the last address token.
    3. Otherwise ``(0, 0)``.

    Args:
        address: Address text (already cleaned by the caller).
        building_name: Building name to look for.
        tokenizer: Object exposing ``tokenize(text) -> list of tokens``.
        max_length: Accepted for interface compatibility; not used.

    Returns:
        ``(start_idx, end_idx)``, inclusive indices into
        ``tokenizer.tokenize(address)``. An empty building name or a name
        whose first token is absent yields ``(0, 0)``.

    NOTE(review): the indices do not count any special tokens that
    ``encode_plus(..., add_special_tokens=True)`` inserts into ``input_ids``.
    extract_building_names uses the same convention; confirm it is intended.
    """
    address_tokens = tokenizer.tokenize(address)
    building_name_tokens = tokenizer.tokenize(building_name)

    name_length = len(building_name_tokens)
    if name_length == 0:
        # Nothing to search for (previously raised IndexError).
        return 0, 0

    first_token = building_name_tokens[0]

    # Preferred: an exact, contiguous match of the whole name.
    for start_idx in range(len(address_tokens) - name_length + 1):
        if (address_tokens[start_idx] == first_token
                and address_tokens[start_idx:start_idx + name_length] == building_name_tokens):
            return start_idx, start_idx + name_length - 1

    # Fallback: anchor on the first token only, as the original code did.
    try:
        start_idx = address_tokens.index(first_token)
    except ValueError:
        return 0, 0

    # Never point past the last address token.
    end_idx = min(start_idx + name_length - 1, len(address_tokens) - 1)
    return start_idx, end_idx

In [None]:
def preprocess_data(addresses, building_names):
    """Turn raw address / building-name pairs into model-ready lists.

    Each address is cleaned with clean_address and encoded with
    tokenize_for_bert; the label span for each pair is then computed with
    convert_labels_to_spans on the cleaned address.

    Returns:
        A 4-tuple ``(input_ids, attention_masks, start_positions,
        end_positions)`` of parallel lists.
    """
    cleaned_addresses = []
    input_ids = []
    attention_masks = []
    for raw_address in addresses:
        cleaned = clean_address(raw_address)
        encoding = tokenize_for_bert(cleaned)
        cleaned_addresses.append(cleaned)
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])

    start_positions = []
    end_positions = []
    for cleaned, name in zip(cleaned_addresses, building_names):
        span = convert_labels_to_spans(cleaned, name, tokenizer)
        start_positions.append(span[0])
        end_positions.append(span[1])

    return input_ids, attention_masks, start_positions, end_positions


# Build model inputs and span labels for the training and validation splits.
# The held-out test split (addresses_test) is not preprocessed in this cell.
input_ids_train, attention_masks_train, start_positions_train, end_positions_train = preprocess_data(addresses_train, building_names_train)
input_ids_val, attention_masks_val, start_positions_val, end_positions_val = preprocess_data(addresses_val, building_names_val)




## Step 5: Model Training

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Convert data into torch tensors.
# torch.as_tensor builds a tensor from a list exactly like torch.tensor, but
# returns an existing tensor unchanged, so re-running this cell does not
# trigger PyTorch's "copy construct from a tensor" warning.
input_ids_train = torch.as_tensor(input_ids_train)
attention_masks_train = torch.as_tensor(attention_masks_train)
start_positions_train = torch.as_tensor(start_positions_train)
end_positions_train = torch.as_tensor(end_positions_train)

# Create a DataLoader that yields shuffled batches of
# (input_ids, attention_mask, start_position, end_position).
train_data = TensorDataset(input_ids_train, attention_masks_train, start_positions_train, end_positions_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=16)  # You can adjust batch size as needed

In [None]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the values the transformers
# optimizer used by default, so the optimisation behaviour is unchanged
# (torch.optim.AdamW would otherwise default to eps=1e-8, weight_decay=0.01).
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)
# Linear decay to zero with no warm-up. The total step count assumes 3 epochs
# and must be kept in sync with the epoch count of the training loop.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3)  # Assuming 3 epochs



In [None]:
# Move the model to the GPU if available; otherwise everything runs on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop.
# The epoch count (3) matches the step budget the scheduler was created with
# (len(train_dataloader) * 3); change both together.
for epoch in range(3):  # you can adjust the number of epochs
    model.train()
    total_train_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Load batch data onto the selected device; tensor order follows the
        # TensorDataset: input ids, attention mask, start label, end label.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_attention_mask, b_start_positions, b_end_positions = batch

        # Clear any previously calculated gradients
        model.zero_grad()

        # Forward pass; the gold start/end positions are passed in and the
        # first output element is used as the loss.
        outputs = model(b_input_ids, attention_mask=b_attention_mask, start_positions=b_start_positions, end_positions=b_end_positions)
        loss = outputs[0]

        # Backward pass
        loss.backward()

        # Gradient clipping to a max norm of 1.0 (optional, can help prevent exploding gradients)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters, then advance the learning-rate schedule by one step
        optimizer.step()
        scheduler.step()

        # Accumulate loss as a plain Python number
        total_train_loss += loss.item()

    # Calculate the average per-batch loss over the epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Training loss: {avg_train_loss:.2f}")


""" Do note: Training a BERT model can be resource-intensive. Ideally, this should be run on a machine with a good GPU. Adjust 
the batch size and learning rate according to the resources available and monitor for any potential issues during training."""

## Step 6: Model Evaluation

In [None]:
from torch.utils.data import SequentialSampler

# Convert validation data into torch tensors.
# torch.as_tensor builds a tensor from a list exactly like torch.tensor, but
# returns an existing tensor unchanged, so re-running this cell no longer
# triggers PyTorch's "copy construct from a tensor" warning.
input_ids_val = torch.as_tensor(input_ids_val)
attention_masks_val = torch.as_tensor(attention_masks_val)
start_positions_val = torch.as_tensor(start_positions_val)
end_positions_val = torch.as_tensor(end_positions_val)

# Create a DataLoader for validation data.
# Evaluation gains nothing from shuffling; a sequential sampler keeps the
# batch composition (and therefore the reported loss) reproducible.
val_data = TensorDataset(input_ids_val, attention_masks_val, start_positions_val, end_positions_val)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=16)  # Adjust batch size as needed


  input_ids_val = torch.tensor(input_ids_val)
  attention_masks_val = torch.tensor(attention_masks_val)
  start_positions_val = torch.tensor(start_positions_val)
  end_positions_val = torch.tensor(end_positions_val)


In [None]:
# Put the model in evaluation mode before scoring the validation set.
model.eval()

total_eval_loss = 0

# Gold and predicted span boundaries, accumulated batch by batch so that the
# four lists stay index-aligned.
all_start_positions = []
all_end_positions = []
all_pred_start_positions = []
all_pred_end_positions = []

for batch in val_dataloader:
    # Load batch data onto the selected device
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_attention_mask, b_start_positions, b_end_positions = batch

    # Tell the model not to compute gradients
    with torch.no_grad():
        # Forward pass; gold positions are supplied so a loss is returned too
        outputs = model(b_input_ids, attention_mask=b_attention_mask, start_positions=b_start_positions, end_positions=b_end_positions)
        
    # Per-position scores for the span start and the span end
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Predicted position = index of the highest score along dim 1, per example
    pred_start_positions = torch.argmax(start_logits, dim=1)
    pred_end_positions = torch.argmax(end_logits, dim=1)

    # The first output element is used as the batch loss
    loss = outputs[0]
    total_eval_loss += loss.item()

    all_start_positions.extend(b_start_positions.tolist())
    all_end_positions.extend(b_end_positions.tolist())
    all_pred_start_positions.extend(pred_start_positions.tolist())
    all_pred_end_positions.extend(pred_end_positions.tolist())

# Average of the per-batch losses
avg_eval_loss = total_eval_loss / len(val_dataloader)
print(f"Validation Loss: {avg_eval_loss:.2f}")


Validation Loss: 5.86


In [None]:
def compute_exact_match(true_starts, true_ends, pred_starts, pred_ends):
    """Count the examples whose predicted start AND end both equal the gold ones.

    The four sequences are walked in lockstep. The return value is a raw
    count, not a ratio; divide by the number of examples to get a score.
    """
    exact_matches = 0
    for gold_start, gold_end, guess_start, guess_end in zip(true_starts, true_ends, pred_starts, pred_ends):
        if gold_start == guess_start and gold_end == guess_end:
            exact_matches += 1
    return exact_matches

# EM_score holds the COUNT of exactly matching spans (compute_exact_match
# returns a sum); it is turned into a percentage only inside the print below.
EM_score = compute_exact_match(all_start_positions, all_end_positions, all_pred_start_positions, all_pred_end_positions)
print(f"Exact Match (EM) Score on Validation Set: {EM_score / len(all_start_positions):.2%}")


Exact Match (EM) Score on Validation Set: 0.00%


## Step 7: Deployment & Usage (Simplified for Direct Extraction)

In [None]:
# Paths where the model and tokenizer were saved
model_save_path = './model_save/'
tokenizer_save_path = './tokenizer_save/'

In [None]:
# Persist the model and the tokenizer to their save directories. As the last
# expression of the cell, the tokenizer call's return value (the written file
# paths) is what the notebook displays.
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

('./tokenizer_save/tokenizer_config.json',
 './tokenizer_save/special_tokens_map.json',
 './tokenizer_save/vocab.txt',
 './tokenizer_save/added_tokens.json')

In [None]:
# Reload the saved model and tokenizer from disk, rebinding the notebook-level
# `model` and `tokenizer` names to the reloaded objects.
model = BertForQuestionAnswering.from_pretrained(model_save_path)
tokenizer = BertTokenizer.from_pretrained(tokenizer_save_path)

# If you have a GPU, let's put the model there for faster computation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Last expression of the cell: the notebook prints the model architecture.
model.to(device)


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [None]:
# def extract_building_name_from_span(address, start_idx, end_idx, tokenizer):
#     tokens = tokenizer.tokenize(address)
#     building_name_tokens = tokens[start_idx: end_idx+1]
#     building_name = tokenizer.convert_tokens_to_string(building_name_tokens)
#     return building_name


In [None]:
def extract_building_names(addresses, model, tokenizer):
    """Predict the building-name substring of each address.

    Args:
        addresses: Iterable of address strings.
        model: Span-prediction model whose output exposes ``start_logits``
            and ``end_logits`` (e.g. ``BertForQuestionAnswering``).
        tokenizer: The tokenizer matching ``model``.

    Returns:
        One string per input address, in the same order. A prediction whose
        end precedes its start, or that lies outside the address tokens,
        yields an empty string. An empty input yields an empty list.

    NOTE(review): predicted indices are applied to ``tokenizer.tokenize(address)``,
    which contains no special tokens, whereas the model input is encoded with
    ``add_special_tokens=True``. This mirrors the labelling convention of
    convert_labels_to_spans; confirm it is intended.
    """
    addresses = list(addresses)
    if not addresses:
        # Nothing to run the model on.
        return []

    # Follow the model's own device rather than relying on a global.
    target_device = next(model.parameters()).device

    # One batched tokenizer call: explicit truncation removes the
    # "Truncation was not explicitly activated" warning, the attention mask
    # comes from the tokenizer itself, and padding only to the longest
    # address in the batch avoids running the model over 512 positions
    # for short inputs.
    encoded = tokenizer(
        addresses,
        add_special_tokens=True,
        max_length=512,
        padding=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids'].to(target_device)
    attention_masks = encoded['attention_mask'].to(target_device)

    # Run in evaluation mode, then restore whatever mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)
    finally:
        model.train(was_training)

    predicted_start = outputs.start_logits.argmax(dim=1).tolist()
    predicted_end = outputs.end_logits.argmax(dim=1).tolist()

    # Post-process: slice the predicted token span and turn it back into text.
    building_names = []
    for address, start_idx, end_idx in zip(addresses, predicted_start, predicted_end):
        tokens = tokenizer.tokenize(address)
        building_name_tokens = tokens[start_idx:end_idx + 1]
        building_names.append(tokenizer.convert_tokens_to_string(building_name_tokens))

    return building_names


In [None]:
# Sample usage:
addresses_list = [
    "123 Elm St Windsor Building Apt 5A",
    "456 Oak Rd Maple Complex Level 2",
    "789 Pine Ave Cedar Towers Block B"
]

predicted_building_names = extract_building_names(addresses_list, model, tokenizer)
print(predicted_building_names)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


['windsor building apt 5a', 'maple complex', 'cedar towers block b']
