In [1]:
# standard library imports
import sys
sys.path.append('../BERT_geoparser/')
# third party imports
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from geopy import distance
from shapely.geometry import box, Point
from tqdm import tqdm
# local imports
from data import Data, Phrase
from tokenizer import Tokenizer
from model import BertModel
from analysis import Results
from retagger import Retagger


# Notebook 2: building a target location identification model
This notebook goes through the steps neccessary for using a trained NER-tagging model (as built in notebook 1) to build a target/incidental location BERT model. This will involve 3 steps:

1. Open the yelp review dataset and link each review to the geographic location of the business.
2. Use a trained BERT model to NER tag the text in each review.
3. Retag the data by assessing the proximity of any tokens (or strings of tokens) tagged as locations to the true location identified in step 1.
4. Use this re-tagged data to retrain the BERT-model on the task of identifying target and incidental locations mentioned in text.

By doing this, the goal is to produce a model which can parse locations in text in a sensible way - identifying locations which the text is directly referencing ('target' locations) and other locations which are not related to the business being reviewed ('incidental' locations). For example,  the sentence:

 <p style="text-align: center;"> "Donnies Pizza Heaven in <span style="color:green">Inidanapolis</span> do the best deep pan pizza outside of <span style="color:red">Chicago</span>." </p>

 would receive the tags:

  <p style="text-align: center;"> "[O] [O] [O] [O] <span style="color:green">[B-tar]</span> [O] [O] ... <span style="color:red">[B-inc]</span>." </p>

  ## Step 1: adding locations to review data
  The first step is to add the locations to the yelp review data. We can do this by linking the `yelp_academic_dataset_review.json` data with the `yelp_academic_dataset_business.json` data using the `business_id` column.

In [91]:
# Get the review dataset and the accompanying business info dataset
review_df = pd.read_json('../data/yelp_academic_dataset_review.json', lines=True, nrows=20000)#[10000:]
business_df = pd.read_json('../data/yelp_academic_dataset_business.json', lines=True)

In [92]:
# add locations to reviews by linking to business
def get_coords(business_ids):
    coords = []
    for id in tqdm(business_ids):
        business = business_df[business_df.business_id==id]
        lat = business.latitude.iloc[0]
        lon = business.longitude.iloc[0]
        coords.append((lat, lon))
    return coords

review_df['coordinates'] = get_coords(review_df.business_id.values)

  0%|          | 0/20000 [00:00<?, ?it/s]

100%|██████████| 20000/20000 [07:56<00:00, 42.01it/s]


In [93]:
review_df.to_csv('data/step_2/yelp_reviews_with_location.csv')

## Step 2: parsing the reviews using the BERT model
We will load a pre-trained BERT model using the `Data`, `Tokenizer` and `BERTModel` classes and use this to parse the text in the review dataset. This will output a dataframe set up in a way that makes it easy to use to re-train a new model on in the future. 

In [6]:
# Load the dataset using the BERT_geoparser Data.py module
data_csv = r'../data/ner_dataset.csv'
tokenizer = Tokenizer(size='base', cased=False)
data = Data(data_path=data_csv, 
            tokenizer=tokenizer,
            max_len=125)

model = BertModel(saved_model='20230808_bert_model_large.hdf5', data=data)
model.model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 125)]        0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 125)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 125)]        0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_3[0][0]',            

In [5]:
results = model.results_dataframe(texts=review_df.text.values, include_best=True)

100%|██████████| 20000/20000 [29:25<00:00, 11.33it/s]


## Step 3: re-tagging the reviews with target/incidental locations
We can now use the `Retagger` class to tag every location identified in step 2 with a new 'target' or 'incidental' tag. We will consider locations as being anything with a tag containing either `geo`, `org` or `gpe`. Everything elase will be given an `O` tag. 

We will complete proximity checks using a bounding box around the matched locations. This is acheived by setting `threshold='bbox'`. This will mean that any tokens or phrase for which the true location is within the bounding box of any locations matched to the word or phrase will be tagged with `tar` and other will be tagged `inc`. Alternatively, we could set a minimum distance (in KM) for these proximity checks by setting `threshold=<float>`.

In [7]:
retagger = Retagger(results)
retagger.retag(['geo', 'gpe', 'org'], threshold='bbox', review_df=review_df)
retagged_data = retagger.df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.loc[:, 'sequential_group'] = groups
100%|██████████| 23365/23365 [3:18:28<00:00,  1.96it/s]  


In [8]:
retagged_data.to_csv('data/step_4/test_yelp_tar_inc_tagged_bbx_org_geo_gpe.csv', index=False)

In [13]:
# get some info about distribution of targets
O = retagged_data[retagged_data.Tag == 'O']
B_inc = retagged_data[retagged_data.Tag == 'B-inc']
B_tar = retagged_data[retagged_data.Tag == 'B-tar']
I_inc = retagged_data[retagged_data.Tag == 'I-inc']
I_tar = retagged_data[retagged_data.Tag == 'I-tar']

print(f'O tags : {len(O)}')
print(f'B-tar tags : {len(B_tar)}')
print(f'I-tar tags : {len(I_tar)}')
print(f'B-inc tags : {len(B_inc)}')
print(f'I-inc tags : {len(I_inc)}')

O tags : 1695422
B-tar tags : 4866
I-tar tags : 1460
B-inc tags : 26075
I-inc tags : 14763


## Step 4: Retrain a new BERT-model on the new tags


In [20]:
data_path = r'data/step_4/test_yelp_tar_inc_tagged_bbx_org_geo_gpe.csv'
tokenizer = Tokenizer(size='base', cased=False)
data = Data(data_path=data_path, tokenizer=tokenizer, max_len=125)
tar_model = BertModel(saved_model=False, data=data)
tar_model.model.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 125)]        0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 125)]        0           []                               
                                                                                                  
 input_5 (InputLayer)           [(None, 125)]        0           []                               
                                                                                                  
 tf_bert_model_2 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_4[0][0]',                
                                thPoolingAndCrossAt               'input_6[0][0]',          

  super().__init__(name, **kwargs)


In [48]:
tar_model.train(save_as='20230926_tar_tagged_bert_model_large.hdf5', n_epochs=4, batch_size=16, validation_split=0.1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [49]:
# Get the review dataset and the accompanying business info dataset
review_df = pd.read_json('../data/yelp_academic_dataset_review.json', lines=True, nrows=25000)[20000:]
business_df = pd.read_json('../data/yelp_academic_dataset_business.json', lines=True)

In [50]:
# add locations to reviews by linking to business
def get_coords(business_ids):
    coords = []
    for id in tqdm(business_ids):
        business = business_df[business_df.business_id==id]
        lat = business.latitude.iloc[0]
        lon = business.longitude.iloc[0]
        coords.append((lat, lon))
    return coords

review_df['coordinates'] = get_coords(review_df.business_id.values)

100%|██████████| 5000/5000 [01:53<00:00, 43.97it/s]


In [51]:
# Load the dataset using the BERT_geoparser Data.py module
data_csv = r'../data/ner_dataset.csv'
tokenizer = Tokenizer(size='base', cased=False)
data = Data(data_path=data_csv, 
            tokenizer=tokenizer,
            max_len=125)

model = BertModel(saved_model='20230808_bert_model_large.hdf5', data=data)
results = model.results_dataframe(texts=review_df.text.values, include_best=True)

100%|██████████| 5000/5000 [07:45<00:00, 10.75it/s]


In [52]:
results = results.reset_index()

In [53]:
retagger = Retagger(results)
review_df = review_df.reset_index()
retagger.retag(['geo', 'gpe', 'org'], threshold='bbox', review_df=review_df)
test_data = retagger.df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  groups.append(current_group)
100%|██████████| 5800/5800 [48:59<00:00,  1.97it/s]  


In [54]:
test_data.to_csv('data/step_4/test_yelp_dataset.csv', index=False)

In [4]:
data_csv = 'data/step_4/test_yelp_dataset.csv'
tokenizer = Tokenizer(size='base', cased=False)
data = Data(data_path=data_csv, 
            tokenizer=tokenizer,
            max_len=125)
tar_model = BertModel(saved_model='20230926_tar_tagged_bert_model_large.hdf5', data=data)
X_tokens, y_pred, y_true = tar_model.test('data/step_4/test_yelp_dataset.csv', return_tokens=True)



In [5]:
res = Results(y_true, y_pred)
for cat in ['O', 'tar', 'inc']:
    print(f'"{cat}" accuracy : {np.round(res.categorical_accuracy(cat),3)}')
    print(f'"{cat}" precision : {np.round(res.categorical_precision(cat),3)}')
    print(f'"{cat}" recall : {np.round(res.categorical_recall(cat),3)}')
    print('=======================')
print(f'macro average recall : {np.round(res.macro_average_recall(), 3)}')
print(f'macro average precision : {np.round(res.macro_average_precision(),3)}')
print(f'micro average recall : {np.round(res.micro_average_recall(),3)}')
print(f'micro average precision : {np.round(res.micro_average_precision(),3)}')

"O" accuracy : 0.994
"O" precision : 0.991
"O" recall : 0.994
"tar" accuracy : 0.799
"tar" precision : 0.822
"tar" recall : 0.802
"inc" accuracy : 0.679
"inc" precision : 0.773
"inc" recall : 0.691
macro average recall : 0.733
macro average precision : 0.826
micro average recall : 0.983
micro average precision : 0.983


In [9]:
rows = {'Sentence #': [], 'Word':[], 'Tag':[], 'Predicted Tag':[]}
for i, (X, y, y_hat) in enumerate(zip(X_tokens, y_true, y_pred)):
    rows['Sentence #'].extend([i]*len(X))
    rows['Word'].extend(X)
    rows['Tag'].extend(y)
    rows['Predicted Tag'].extend(y_hat)
results_df = pd.DataFrame(rows)

In [12]:
results_df.to_csv('data/predictions/20230927_tar_inc_predictions.csv', index=False)

## Model improvement
While these results are promising, there is a serious flaw in the datset the model has been trained on. Specifically, the reviews are from businesses from a small number of locations. As such, there is a danger that the model is simply consistently assigning the `tar` tag to tokens matching those places. To investigate we will build a new dataset with all the tokens tagged as `tar` replaced with random draws from a list of place names. 

In [31]:
# open the test data
test_data = pd.read_csv('data/step_4/test_yelp_dataset.csv')
# extract only lines tagged as target
target_only_data = test_data[test_data.Tag.str.contains('tar')]
# use the Retagger class to add a 'sequential group' column to this data.
retagger = Retagger(target_only_data)
retagger.add_sequential_groups()
retagger.df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.loc[:, 'sequential_group'] = groups


Unnamed: 0,index,Sentence #,Word,B-art,B-eve,B-geo,B-gpe,B-nat,B-org,B-per,...,I-geo,I-gpe,I-nat,I-org,I-per,I-tim,O,old_tag,Tag,sequential_group
66,66,0,phil,0.0,0.0,0.981,0.001,0.0,0.007,0.0,...,0.001,0.0,0.0,0.0,0.0,0.0,0.005,B-geo,B-tar,0
67,67,0,##ly,0.0,0.0,0.987,0.001,0.0,0.006,0.0,...,0.002,0.0,0.0,0.0,0.0,0.0,0.002,B-geo,B-tar,0
897,897,9,shell,0.037,0.003,0.469,0.002,0.001,0.358,0.047,...,0.011,0.0,0.001,0.015,0.002,0.002,0.041,B-geo,B-tar,1
898,898,9,key,0.004,0.002,0.061,0.001,0.001,0.004,0.009,...,0.583,0.003,0.001,0.212,0.048,0.004,0.027,I-geo,I-tar,1
899,899,9,island,0.002,0.0,0.011,0.001,0.0,0.002,0.002,...,0.619,0.002,0.001,0.218,0.023,0.003,0.084,I-geo,I-tar,1


In [53]:
import random
from BERT_geoparser.utils import flatten

In [66]:
new_ids = np.asarray(flatten(data.build_input_from_text('Penrith')[0]))
new_ids = new_ids[~np.isin(new_ids, (101,102,0))]

In [63]:
flatten(data.build_input_from_text('Penrith')[0])[~np.isin(101,102,0)]

  flatten(data.build_input_from_text('Penrith')[0])[~np.isin(101,102,0)]


7279

In [70]:
[data.tokenizer.id_to_token(i) for i in new_ids]

['pen', '##rith']

In [83]:
world_cities_df = pd.read_csv('data/model_improvement/worldcities.csv')
us_cities_df = world_cities_df[world_cities_df.iso3=='USA']
us_cities = []
for i, city in us_cities_df.iterrows():
    name = city.city_ascii
    lat = city.lat
    lng = city.lng
    us_cities.append((name, (lng,lat)))

In [84]:
replacement_dict = {}

for i, group in retagger.df.groupby('sequential_group'):
    phrase = Phrase('', tag=None)
    for token, tag in zip(group['Word'].values, group['Tag'].values):
        phrase.add_token(token=token, tag=tag)
    if phrase.text not in replacement_dict.keys():
        new_cities = random.choices(us_cities, k=2)
        # make sure we're not replacing with the same city
        new_city = new_cities[0]
        if new_city == phrase.text:
            new_city == new_cities[1]
        replacement_dict.update({phrase.text:new_city})  

In [90]:
# Get the review dataset and the accompanying business info dataset
review_df = pd.read_json('../data/yelp_academic_dataset_review.json', lines=True, nrows=25000)[20000:]
business_df = pd.read_json('../data/yelp_academic_dataset_business.json', lines=True)

Unnamed: 0,Sentence #,Word,B-art,B-eve,B-geo,B-gpe,B-nat,B-org,B-per,B-tim,I-art,I-eve,I-geo,I-gpe,I-nat,I-org,I-per,I-tim,O
0,0,[CLS],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,if,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,you,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,decide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,to,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855018,19996,out,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
855019,19996,things,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
855020,19996,on,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
855021,19996,your,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
