# Predict

In [1]:
def find_b_loc_i_loc_sequences(lst):  
    sequences = []  
    current_sequence = None  
    for i, tag in enumerate(lst):  
        if tag == 'B-LOC' and current_sequence is None:  
            current_sequence = [i]  # Start a new sequence  
        elif tag == 'I-LOC' and current_sequence is not None:  
            current_sequence.append(i)  # Append to the current sequence  
        elif tag != 'I-LOC' and current_sequence is not None:  
            sequences.append((current_sequence[0], current_sequence[-1]))  # Save the sequence  
            current_sequence = None  # Reset the current sequence  
    # Check if there is a final sequence to addif current  
    if current_sequence is not None:  
        sequences.append((current_sequence[0], current_sequence[-1]))  
    return sequences  

In [2]:
import torch  
from transformers import BertTokenizer  
from torch.nn import functional as F  
from transformers import BertTokenizer, BertForTokenClassification

# Load the saved model  
tokenizer = BertTokenizer.from_pretrained('../bert-base-uncased')  
model = BertForTokenClassification.from_pretrained('../bert-base-uncased', num_labels=3)  
max_seq_length = 100

# Load the pretrained model parameters  
model.load_state_dict(torch.load('bert_ner_model.pt'))  

# Set the model to evaluation mode  
model.eval()

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at ../bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [3]:
# New text for prediction
test_texts = [
    'Finsbury Park is a friendly melting pot community composed of Turkish, French, Spanish, Middle Eastern, Irish and English families. <br />We have a wonderful variety of international restaurants directly under us on Stroud Green Road. And there are many shops and large Tescos supermarket right next door. <br /><br />But you can also venture up to Crouch End and along Greens Lanes where there will endless choice of Turkish and Middle Eastern cuisines.'
]

# Convert text to token IDs
input_ids = []
attention_masks = []

for text in test_texts:
    encoded_text = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_seq_length)
    input_ids.append(encoded_text['input_ids'].squeeze(0))
    attention_masks.append(encoded_text['attention_mask'].squeeze(0))

input_ids = torch.stack(input_ids, dim=0)
attention_masks = torch.stack(attention_masks, dim=0)

# If GPU is available, move the model and data to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
input_ids = input_ids.to(device)
attention_masks = attention_masks.to(device)

# Make predictions
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_masks)
    predictions = outputs.logits

# Get predicted label IDs
predicted_ids = torch.argmax(F.log_softmax(predictions, dim=2), dim=2)

# Decode predicted label IDs to original labels
dic = {"O": 0, "B-LOC": 1, "I-LOC": 2}
inverse_dic = {v: k for k, v in dic.items()}

decoded_labels = []
for predicted_id in predicted_ids.cpu().numpy():
    decoded_label = [inverse_dic[id] for id in predicted_id]
    decoded_labels.append(decoded_label)


In [4]:
# Align labels with the original text
decoded_texts = []
for text, decoded_label in zip(test_texts, decoded_labels):
    word_tokens = tokenizer.tokenize(text)
    sequences = find_b_loc_i_loc_sequences(decoded_label[1:])
    end = []
    for j in sequences:
        if j[1] <= len(word_tokens):
            s = ''
            for idx in range(j[0], j[1] + 1):
                if '##' in word_tokens[idx]:
                    s += word_tokens[idx].replace('##', '')
                else:
                    s += ' ' + word_tokens[idx]
            end.append(s.strip())
    decoded_texts.append(end)
decoded_texts

[['finsbury park', 'stroud green road', 'crouch end', 'greens lanes']]

# Apply to dataset

In [5]:
from tqdm import tqdm
import re

def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters except for alphanumeric, spaces, periods, and commas. Note that we included \. and , in the exclusion set, so they will not be removed
    text = re.sub(r'[^\w\s.,]', '', text)
    # Remove numbers (if needed)
    # text = re.sub(r'\d+', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Remove leading and trailing spaces
    text = text.strip()
    return text

def get_token(test_texts):
    # Convert text to token IDs
    input_ids = []
    attention_masks = []

    print('tokenizer')
    ok_test_texts = []
    for text in tqdm(test_texts):
        try:
            text = clean_text(text)
            encoded_text = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_seq_length)
            input_ids.append(encoded_text['input_ids'].squeeze(0))
            attention_masks.append(encoded_text['attention_mask'].squeeze(0))
            ok_test_texts.append(text)
        except:
            encoded_text = tokenizer('', return_tensors='pt', padding='max_length', truncation=True, max_length=max_seq_length)
            input_ids.append(encoded_text['input_ids'].squeeze(0))
            attention_masks.append(encoded_text['attention_mask'].squeeze(0))
            ok_test_texts.append('')
    
    input_ids = torch.stack(input_ids, dim=0)
    attention_masks = torch.stack(attention_masks, dim=0)
    return ok_test_texts, input_ids, attention_masks

def pre(ok_test_texts, input_ids, attention_masks, model):
    # If GPU is available, move the model and data to GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    input_ids = input_ids.to(device)
    attention_masks = attention_masks.to(device)

    # input_ids and attention_masks are your input data lists, each element is a batch of inputs
    batch_size = 32  # Define the batch size
    num_batches = len(input_ids) // batch_size  # Calculate the total number of complete batches

    # If the length of input data is not divisible by the batch size, consider the last batch
    if len(input_ids) % batch_size != 0:
        num_batches += 1

    # Perform batch predictions
    print('Performing predictions')
    all_decoded_texts = []
    with torch.no_grad():
        for i in tqdm(range(num_batches)):
            start_idx = i * batch_size
            end_idx = min((i + 1) * batch_size, len(input_ids))  # Ensure not to exceed the data range

            # Get the current batch of input data
            batch_test_texts = ok_test_texts[start_idx:end_idx]
            batch_input_ids = input_ids[start_idx:end_idx]
            batch_attention_masks = attention_masks[start_idx:end_idx]

            # Perform predictions
            outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
            batch_predictions = outputs.logits

            # Get predicted label IDs
            predicted_ids = torch.argmax(F.log_softmax(batch_predictions, dim=2), dim=2)
            
            # Decode predicted label IDs to original labels
            dic = {"O": 0, "B-LOC": 1, "I-LOC": 2}
            inverse_dic = {v: k for k, v in dic.items()}
            
            decoded_labels = []
            for predicted_id in predicted_ids.cpu().numpy():
                decoded_label = [inverse_dic[id] for id in predicted_id]
                decoded_labels.append(decoded_label)
            
            # Align labels with the original text
            decoded_texts = []
            for text, decoded_label in zip(batch_test_texts, decoded_labels):
                word_tokens = tokenizer.tokenize(text)
                sequences = find_b_loc_i_loc_sequences(decoded_label[1:])
                end = []
                for j in sequences:
                    if j[1] < len(word_tokens):
                        s = ''
                        for idx in range(j[0], j[1] + 1):
                            if '##' in word_tokens[idx]:
                                s += word_tokens[idx].replace('##', '')
                            else:
                                s += ' ' + word_tokens[idx]
                        end.append(s.strip())
                decoded_texts.append(end)
            all_decoded_texts += decoded_texts
    return all_decoded_texts


In [6]:
test_texts = [
    'Finsbury Park is a friendly melting pot community composed of Turkish, French, Spanish, Middle Eastern, Irish and English families. <br />We have a wonderful variety of international restaurants directly under us on Stroud Green Road. And there are many shops and large Tescos supermarket right next door. <br /><br />But you can also venture up to Crouch End and along Greens Lanes where there will endless choice of Turkish and Middle Eastern cuisines.'
             ] 
ok_test_texts, input_ids, attention_masks = get_token(test_texts)
decoded_texts = pre(ok_test_texts, input_ids, attention_masks, model)
decoded_texts

tokenizer


100%|██████████| 1/1 [00:00<00:00, 720.42it/s]


进行预测


100%|██████████| 1/1 [00:00<00:00, 17.48it/s]


[['finsbury park', 'stroud green road', 'crouch end', 'greens lanes']]

In [7]:
import pandas as pd
csv_data = pd.read_csv('listings_d.csv')
text_Data = csv_data['neighborhood_overview'].tolist()

In [8]:
ok_test_texts, input_ids, attention_masks = get_token(text_Data)

tokenizer


100%|██████████| 91778/91778 [00:27<00:00, 3329.30it/s]


In [9]:
decoded_texts = pre(ok_test_texts, input_ids, attention_masks, model)

进行预测


100%|██████████| 2869/2869 [27:32<00:00,  1.74it/s]


In [10]:
csv_data['output'] = decoded_texts
csv_data.to_csv('listings_d_out.csv')

In [20]:
pd.set_option('display.max_colwidth', 3000) 

In [21]:
see_data = pd.read_csv('listings_d_out.csv')
see_data.dropna(subset=['neighborhood_overview', 'output'], inplace=True)
see_data[['neighborhood_overview','output']].head(30)

  see_data = pd.read_csv('listings_d_out.csv')


Unnamed: 0,neighborhood_overview,output
0,"I live in Barking town centre, at one time the capital of England (!) and where Captain Cook got married! It also had the largest fishing fleet in England and one of the largest monasteries until King Henry burned it down in the 1600s (the church still remains and can be seen from your window). It's currently undergoing a lot of redevelopment with plans for a nearby film studio, brand new shopping centre and a new station. There are two gyms within walking distance, a multi screen cinema, three supermarkets and local shops.<br /><br />Historial Barking Abbey.<br />Barking Park<br />Queen Elizabeth Olympic Park<br />Westfield shopping centre<br />the 02<br />Emirates Air Line<br />Local gay scene in nearby Limehouse features a gay club, pub sauna!",['barking']
1,Peaceful and friendly.,['friendly']
2,"We have a unique cinema called the Phoenix which is supposed to be the oldest continually running cinema in the country. There are nearby parks including Kenwood where part of the movie "" Nottinghill"" was filmed with Julia Roberts and Hugh Grant starred.",['julia']
4,"Finsbury Park is a friendly melting pot community composed of Turkish, French, Spanish, Middle Eastern, Irish and English families. <br />We have a wonderful variety of international restaurants directly under us on Stroud Green Road. And there are many shops and large Tescos supermarket right next door. <br /><br />But you can also venture up to Crouch End and along Greens Lanes where there will endless choice of Turkish and Middle Eastern cuisines.","['finsbury park', 'stroud green road', 'crouch end', 'greens lanes']"
6,"Residential, quiet and green neighbourhood with 10 min walk to local restaurants, shops and amenities. You will love the view of the horizon from the flat, 5 min walk to Gladstone Park when on a good, no clouds day you will see a Wembley Stadium, then 15 min walk you will get to the nature reserve- my personal favourite, hidden spot of London!<br /><br />High Street is full of ethnical food- favouirte is authentic Sri Lankan: Lihiniya and Persian Zeyneb.","['gladstone park', 'wembley stadium']"
7,Peaceful and quite with beautiful tree lined street also near Earls Court tube station that is in zone 1 about a 3 minute walk away.,[]
8,The neighbourhood is safe and many new cafes and bars are appearing all the time. I am also very close to the famous Brixton Village and Crystal Palace,"['brixton village', 'crystal palace']"
10,"We're very close to the Emirates stadium, so if you're an Arsenal fan it's a great location.","['emirates', 'arsenal']"
11,"Our neighbourhood is fun and lively. Camden Lock is ten minutes away. There are lots of shops, bars, restaurants, theatres and cafes. Also parks and a zoo.<br />Camden is famous for its music venues too.",[]
12,"For the lovers of long walks or bike rides - 15 min walk (5 min ride) away is Hammersmith bridge and gorgeous greenery the Thames path, which will take you to Kew Gardens and Richmond along the river. <br /><br />For shorter outdoor activity- You can easily walk to Holland Park (just over the roundabout) and enjoy the tranquil green space or go for a run, go have coffee or lunch around Holland Park area (15-20 min walk max) or even get to Portobello on foot. If you chose to stay very locally - crossing the road to Hammersmith Grove will get you to some nice cafes (in about 5 minutes), like Raoul's deli and cafe, for example, the Grove restaurant and many more. There is a yoga studio as well. Should you wish to shop till you drop - within 10 minutes walk is a Westfield shopping centre, with a mind-blowing array of cafes, bars and restaurants, plus a great number of shops, a cinema and a great deal more. Right next to us is a Shepherds Bush market, an open air affair for fresh veggies a","['hammersmith bridge', 'thames', 'kew gardens', 'richmond', 'holland park']"
