In [1]:
# Standard library imports
import random
# third party imports
import pandas as pd
import numpy as np
from tqdm import tqdm
# local imports
from BERT_geoparser.data import Data, Phrase
from BERT_geoparser.tokenizer import Tokenizer
from BERT_geoparser.model import BertModel
from BERT_geoparser.analysis import Results
from BERT_geoparser.retagger import Retagger
from BERT_geoparser.utils import flatten

## Test the model on randomized locations
While these results are promising, there is a serious flaw in the datset the model has been trained on. Specifically, the reviews are from businesses from a small number of locations. As such, there is a danger that the model is simply consistently assigning the `tar` tag to tokens matching those places. To investigate we will build a new dataset with all the tokens tagged as `tar` replaced with random draws from a list of place names. 

One approach to doing this would be to go through the test data and replace anything tagged as a target location with a new location. However, this will cause problems with indexing if the token representation of the new location is longer or shorter than the previous location (e.g. ['New', 'York'] -> ['LA']). 

To get around any problems with indexing we willgenerate a new reviews dataset, with the text and coordinates altered when a place tagged as `tar` is mentioned. We will then go through the process of NER tagging and `tar`/`inc` tagging again to produce a new dataset.

In [2]:
# open the test data
test_data = pd.read_csv('data/step_4/test_yelp_dataset.csv')
# extract only lines tagged as target
target_only_data = test_data[test_data.Tag.str.contains('tar')]
# use the Retagger class to add a 'sequential group' column to this data.
retagger = Retagger(target_only_data)
retagger.add_sequential_groups()
retagger.df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.loc[:, 'sequential_group'] = groups


Unnamed: 0,index,Sentence #,Word,B-art,B-eve,B-geo,B-gpe,B-nat,B-org,B-per,...,I-geo,I-gpe,I-nat,I-org,I-per,I-tim,O,old_tag,Tag,sequential_group
66,66,0,phil,0.0,0.0,0.981,0.001,0.0,0.007,0.0,...,0.001,0.0,0.0,0.0,0.0,0.0,0.005,B-geo,B-tar,0
67,67,0,##ly,0.0,0.0,0.987,0.001,0.0,0.006,0.0,...,0.002,0.0,0.0,0.0,0.0,0.0,0.002,B-geo,B-tar,0
897,897,9,shell,0.037,0.003,0.469,0.002,0.001,0.358,0.047,...,0.011,0.0,0.001,0.015,0.002,0.002,0.041,B-geo,B-tar,1
898,898,9,key,0.004,0.002,0.061,0.001,0.001,0.004,0.009,...,0.583,0.003,0.001,0.212,0.048,0.004,0.027,I-geo,I-tar,1
899,899,9,island,0.002,0.0,0.011,0.001,0.0,0.002,0.002,...,0.619,0.002,0.001,0.218,0.023,0.003,0.084,I-geo,I-tar,1


Load a dataset of world cities, and produce a list of US cities and a (lat,long) coordinate for that city. This will be important for re-tagging the dataset as `tar` and `inc`.

In [2]:
world_cities_df = pd.read_csv('data/model_improvement/worldcities.csv')
us_cities_df = world_cities_df[world_cities_df.iso3=='USA']
us_cities = []
for i, city in us_cities_df.iterrows():
    name = city.city_ascii
    lat = city.lat
    lng = city.lng
    us_cities.append({'name':name, 'coords':str((lat,lng))})

We now want to loop over the test locations in the test dataset (i.e. groups of sequentially indexed tokens tagged as `tar`) and create a dictionary which maps the review number (`Sentence #`) to a copy of the review text with the location replaced with a random draw from the cities dataset and the set of coordinates related to the new location. 

In [4]:
review_df = pd.read_csv('data/step_2/25k_yelp_reviews_with_location.csv', nrows=25000)[20000:]
review_df = review_df.reset_index()

In [5]:
review_df.loc[0].text

"Blue Claws can be pricey but phenomenal. Stick with the larges i/o jumbo, sizes very close and save some $. Great with fries and beer. I've never had anything but those 3 things so couldn't tell you much else but worth the visit if you're looking for crabs in Philly."

So we want the word 'Philly' to be replaced with another random city. 

In [6]:
# Create a dictionary for replacements, mapping a review index to a new  
replacement_dict = {}

for i, group in retagger.df.groupby('sequential_group'):
    # build the phrase for this group of words
    old_name = Phrase('', tag=None)
    for token, tag in zip(group['Word'].values, group['Tag'].values):
        old_name.add_token(token=token, tag=tag)
    # get a new city and coordinate pair
    new_city = random.choice(us_cities)
    # get the old text and replace the city name
    review_num = group['Sentence #'].iloc[0]
    old_text = review_df.loc[review_num].text.lower()
    new_text = old_text.replace(old_name.text, new_city['name'])
    # update the review dataframe
    review_df.loc[review_num, 'text'] = new_text
    review_df.loc[review_num, 'coordinates'] = new_city['coords']

In [7]:
review_df.loc[0].text

"blue claws can be pricey but phenomenal. stick with the larges i/o jumbo, sizes very close and save some $. great with fries and beer. i've never had anything but those 3 things so couldn't tell you much else but worth the visit if you're looking for crabs in Yelm."

That seems to have worked! We can now build a new test datset out of this randomized data and see if the model is able to acheive an acceptable degree of accuracy.

In [8]:
#### NER Tagging ###
data_csv = r'../data/ner_dataset.csv'
tokenizer = Tokenizer(size='base', cased=False)
data = Data(data_path=data_csv, 
            tokenizer=tokenizer,
            max_len=125)

model = BertModel(saved_model='20230808_bert_model_large.hdf5', data=data)
model.model.summary()

results = model.results_dataframe(texts=review_df.text.values, include_best=True)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 125)]        0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 125)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 125)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_3[0][0]',            

100%|██████████| 5000/5000 [15:12<00:00,  5.48it/s]


In [9]:
### tar/inc tagging
retagger = Retagger(results)
retagger.retag(['geo', 'gpe', 'org'], threshold='bbox', review_df=review_df)
retagged_data = retagger.df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.loc[:, 'sequential_group'] = groups
100%|██████████| 5744/5744 [48:44<00:00,  1.96it/s]  


In [16]:
retagged_data[retagged_data['Sentence #']==0]

Unnamed: 0,Sentence #,Word,B-art,B-eve,B-geo,B-gpe,B-nat,B-org,B-per,B-tim,...,I-eve,I-geo,I-gpe,I-nat,I-org,I-per,I-tim,O,old_tag,Tag
0,0,[CLS],0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.000,O,O
1,0,blue,0.220,0.008,0.057,0.004,0.004,0.288,0.043,0.005,...,0.001,0.003,0.001,0.001,0.004,0.002,0.001,0.357,O,O
2,0,claws,0.010,0.001,0.007,0.001,0.001,0.018,0.022,0.001,...,0.006,0.046,0.001,0.003,0.216,0.024,0.001,0.597,O,O
3,0,can,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.000,O,O
4,0,be,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.000,O,O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,0,in,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.000,O,O
66,0,ye,0.006,0.000,0.908,0.002,0.000,0.045,0.005,0.010,...,0.000,0.007,0.000,0.000,0.001,0.000,0.001,0.014,B-geo,B-tar
67,0,##lm,0.012,0.001,0.803,0.002,0.001,0.084,0.010,0.023,...,0.001,0.006,0.000,0.000,0.001,0.001,0.002,0.053,B-geo,B-tar
68,0,.,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.000,O,O


In [12]:
retagged_data.to_csv('data/step_4/test_yelp_dataset_randomized_locations.csv', index=False)

In [17]:
data_csv = 'data/step_4/test_yelp_dataset_randomized_locations.csv'
tokenizer = Tokenizer(size='base', cased=False)
data = Data(data_path=data_csv, 
            tokenizer=tokenizer,
            max_len=125)
tar_model = BertModel(saved_model='20230926_tar_tagged_bert_model_large.hdf5', data=data)
X_tokens, y_pred, y_true = tar_model.test('data/step_4/test_yelp_dataset_randomized_locations.csv', return_tokens=True)



In [18]:
res = Results(y_true, y_pred)
for cat in ['O', 'tar', 'inc']:
    print(f'"{cat}" accuracy : {np.round(res.categorical_accuracy(cat),3)}')
    print(f'"{cat}" precision : {np.round(res.categorical_precision(cat),3)}')
    print(f'"{cat}" recall : {np.round(res.categorical_recall(cat),3)}')
    print('=======================')
print(f'macro average recall : {np.round(res.macro_average_recall(), 3)}')
print(f'macro average precision : {np.round(res.macro_average_precision(),3)}')
print(f'micro average recall : {np.round(res.micro_average_recall(),3)}')
print(f'micro average precision : {np.round(res.micro_average_precision(),3)}')

"O" accuracy : 0.994
"O" precision : 0.991
"O" recall : 0.994
"tar" accuracy : 0.354
"tar" precision : 0.528
"tar" recall : 0.354
"inc" accuracy : 0.68
"inc" precision : 0.734
"inc" recall : 0.69
macro average recall : 0.575
macro average precision : 0.698
micro average recall : 0.981
micro average precision : 0.981


This is not good. The model clearly struggles with the randomized location data, indicating that the high accuracy acheived on the previous dataset was likely a product of the limited location, rather than any understanding of sentence structure. 

## Re-train the model on randomized location data
The first approach to this should be to retrain the model on the new data. This will help the model generalise a little better about locations, but may result in some of the finer nuance in the data being lost. Contextual information, such as foods, activities or sights specific to a particular location, will be lost. This highlights one of the key limitations of the yelp data. With data which is already more generalised this contextualisation might be possible. 


In [3]:
# build a new training set
# open the test data
train_data = pd.read_csv('data/step_4/test_yelp_dataset.csv')
# extract only lines tagged as target
target_only_data = train_data[train_data.Tag.str.contains('tar')]
# use the Retagger class to add a 'sequential group' column to this data.
retagger = Retagger(target_only_data)
retagger.add_sequential_groups()
retagger.df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.loc[:, 'sequential_group'] = groups


Unnamed: 0,index,Sentence #,Word,B-art,B-eve,B-geo,B-gpe,B-nat,B-org,B-per,...,I-geo,I-gpe,I-nat,I-org,I-per,I-tim,O,old_tag,Tag,sequential_group
66,66,0,phil,0.0,0.0,0.981,0.001,0.0,0.007,0.0,...,0.001,0.0,0.0,0.0,0.0,0.0,0.005,B-geo,B-tar,0
67,67,0,##ly,0.0,0.0,0.987,0.001,0.0,0.006,0.0,...,0.002,0.0,0.0,0.0,0.0,0.0,0.002,B-geo,B-tar,0
897,897,9,shell,0.037,0.003,0.469,0.002,0.001,0.358,0.047,...,0.011,0.0,0.001,0.015,0.002,0.002,0.041,B-geo,B-tar,1
898,898,9,key,0.004,0.002,0.061,0.001,0.001,0.004,0.009,...,0.583,0.003,0.001,0.212,0.048,0.004,0.027,I-geo,I-tar,1
899,899,9,island,0.002,0.0,0.011,0.001,0.0,0.002,0.002,...,0.619,0.002,0.001,0.218,0.023,0.003,0.084,I-geo,I-tar,1


In [4]:
review_df = pd.read_csv('data/step_2/25k_yelp_reviews_with_location.csv', nrows=25000)
review_df = review_df.reset_index()

In [5]:
# Create a dictionary for replacements, mapping a review index to a new  
replacement_dict = {}

for i, group in retagger.df.groupby('sequential_group'):
    # build the phrase for this group of words
    old_name = Phrase('', tag=None)
    for token, tag in zip(group['Word'].values, group['Tag'].values):
        old_name.add_token(token=token, tag=tag)
    # get a new city and coordinate pair
    new_city = random.choice(us_cities)
    # get the old text and replace the city name
    review_num = group['Sentence #'].iloc[0]
    old_text = review_df.loc[review_num].text.lower()
    new_text = old_text.replace(old_name.text, new_city['name'])
    # update the review dataframe
    review_df.loc[review_num, 'text'] = new_text
    review_df.loc[review_num, 'coordinates'] = new_city['coords']

In [7]:
#### NER Tagging ###
data_csv = r'../data/ner_dataset.csv'
tokenizer = Tokenizer(size='base', cased=False)
data = Data(data_path=data_csv, 
            tokenizer=tokenizer,
            max_len=125)

model = BertModel(saved_model='20230808_bert_model_large.hdf5', data=data)
model.model.summary()

results = model.results_dataframe(texts=review_df.text.values, include_best=True)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 125)]        0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 125)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 125)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_3[0][0]',            

 71%|███████   | 17666/25000 [25:22<10:31, 11.62it/s]