In [None]:
# Standard library imports
import random
# third party imports
import pandas as pd
import numpy as np
from tqdm import tqdm
# local imports
from BERT_geoparser.data import Data, Phrase
from BERT_geoparser.tokenizer import Tokenizer
from BERT_geoparser.model import BertModel
from BERT_geoparser.analysis import Results
from BERT_geoparser.retagger import Retagger
from BERT_geoparser.utils import flatten

## Test the model on randomized locations
While these results are promising, there is a serious flaw in the dataset the model has been trained on. Specifically, the reviews are from businesses in a small number of locations. As such, there is a danger that the model is simply assigning the `tar` tag to any token matching one of those places. To investigate, we will build a new dataset in which every token tagged as `tar` is replaced with a random draw from a list of place names.

One approach to doing this would be to go through the test data and replace anything tagged as a target location with a new location. However, this will cause problems with indexing if the token representation of the new location is longer or shorter than the previous location (e.g. ['New', 'York'] -> ['LA']). 

To get around any problems with indexing we will generate a new reviews dataset, with the text and coordinates altered when a place tagged as `tar` is mentioned. We will then go through the process of NER tagging and `tar`/`inc` tagging again to produce a new dataset.

In [None]:
# Load the token-level test split from data/step_4.
test_data = pd.read_csv('data/step_4/test_yelp_dataset.csv')
# Keep only the rows whose tag contains 'tar' (the target-location tokens).
is_target = test_data['Tag'].str.contains('tar')
target_only_data = test_data.loc[is_target]
# Let the Retagger label runs of consecutive target tokens (the
# 'sequential_group' column used further down), then preview the result.
retagger = Retagger(target_only_data)
retagger.add_sequential_groups()
retagger.df.head()

Load a dataset of world cities, and produce a list of US cities and a (lat,long) coordinate for that city. This will be important for re-tagging the dataset as `tar` and `inc`.

In [None]:
# Read the world-cities table and keep only the rows whose iso3 code is 'USA'.
world_cities_df = pd.read_csv('data/model_improvement/worldcities.csv')
us_cities_df = world_cities_df.loc[world_cities_df['iso3'].eq('USA')]
# One record per US city: its ASCII name plus the "(lat, lng)" tuple rendered
# as a string, which is the form later written into review_df['coordinates'].
us_cities = [
    {'name': city_name, 'coords': str((latitude, longitude))}
    for city_name, latitude, longitude in zip(
        us_cities_df['city_ascii'], us_cities_df['lat'], us_cities_df['lng']
    )
]

We now want to loop over the test locations in the test dataset (i.e. groups of sequentially indexed tokens tagged as `tar`) and create a dictionary which maps the review number (`Sentence #`) to a copy of the review text with the location replaced with a random draw from the cities dataset and the set of coordinates related to the new location. 

In [None]:
# Read the first 25,000 reviews, keep the rows from position 20000 onwards and
# renumber them from 0 (reset_index() keeps the old labels in an 'index' column).
review_df = (
    pd.read_csv('data/step_2/25k_yelp_reviews_with_location.csv', nrows=25000)
    .iloc[20000:]
    .reset_index()
)

In [None]:
# Show the text of review 0 before any location is replaced.
review_df.loc[0, 'text']

So we want the word 'Philly' to be replaced with another random city. 

In [None]:
# Map each review index to the single replacement city drawn for that review.
# Drawing once per review (instead of once per tagged group) keeps the text and
# the coordinates in agreement: str.replace() below already swaps *every*
# occurrence of the old name, so a second draw for a repeated mention would
# leave the text untouched while overwriting the coordinates with a city that
# never appears in it.
# NOTE: seed `random` beforehand if the draws need to be reproducible.
replacement_dict = {}

for _, group in retagger.df.groupby('sequential_group'):
    # build the phrase for this group of words
    old_name = Phrase('', tag=None)
    for token, tag in zip(group['Word'].values, group['Tag'].values):
        old_name.add_token(token=token, tag=tag)
    # every token in a group belongs to the same review, so the first row is enough
    review_num = group['Sentence #'].iloc[0]
    # reuse the city already chosen for this review, or draw a new one
    if review_num not in replacement_dict:
        replacement_dict[review_num] = random.choice(us_cities)
    new_city = replacement_dict[review_num]
    # the review text is lower-cased before matching against the phrase text
    # (presumably Phrase.text is the lower-case surface form - confirm in Phrase)
    old_text = review_df.loc[review_num, 'text'].lower()
    new_text = old_text.replace(old_name.text, new_city['name'])
    # update the review dataframe
    review_df.loc[review_num, 'text'] = new_text
    review_df.loc[review_num, 'coordinates'] = new_city['coords']

In [None]:
# Show review 0 again, now that the replacement loop has run.
review_df.loc[0, 'text']

That seems to have worked! We can now build a new test dataset out of this randomized data and see if the model is able to achieve an acceptable degree of accuracy.

In [None]:
#### NER Tagging ###
# CSV handed to Data below. Note this path is relative to the parent directory
# ('../data'), unlike the other 'data/...' paths used in this notebook.
data_csv = r'../data/ner_dataset.csv'
tokenizer = Tokenizer(size='base', cased=False)
data = Data(data_path=data_csv, 
            tokenizer=tokenizer,
            max_len=125)

# Load the saved NER model weights and print the layer summary.
model = BertModel(saved_model='20230808_bert_model_large.hdf5', data=data)
model.model.summary()

# Tag every (location-randomised) review text; the resulting frame is what the
# Retagger consumes in the next cell.
results = model.results_dataframe(texts=review_df.text.values, include_best=True)

In [None]:
### tar/inc tagging
# Re-tag the NER output for the 'geo', 'gpe' and 'org' tags, passing review_df
# (which now carries the replaced text and coordinates).
# NOTE(review): presumably threshold='bbox' decides tar vs inc by comparing each
# place with the review's coordinates - confirm in Retagger.retag.
retagger = Retagger(results)
retagger.retag(['geo', 'gpe', 'org'], threshold='bbox', review_df=review_df)
retagged_data = retagger.df

In [None]:
# Inspect the re-tagged tokens belonging to review 0.
first_review_mask = retagged_data['Sentence #'].eq(0)
retagged_data.loc[first_review_mask]

In [None]:
# Persist the location-randomised, re-tagged test set (row index not written).
retagged_data.to_csv('data/step_4/test_yelp_dataset_randomized_locations.csv', index=False)

In [None]:
# Evaluate the previously trained tar/inc model on the location-randomised test set.
data_csv = 'data/step_4/test_yelp_dataset_randomized_locations.csv'
tokenizer = Tokenizer(size='base', cased=False)
data = Data(data_path=data_csv, tokenizer=tokenizer, max_len=125)
tar_model = BertModel(
    saved_model='20230926_tar_tagged_bert_model_large.hdf5',
    data=data,
)
# The same CSV is used for testing, so pass the path variable instead of
# repeating the literal.
X_tokens, y_pred, y_true = tar_model.test(data_csv, return_tokens=True)

In [None]:
# Print per-class and averaged metrics for the predictions above, rounded to 3 decimals.
res = Results(y_true, y_pred)
for cat in ['O', 'tar', 'inc']:
    for metric in ('accuracy', 'precision', 'recall'):
        # look up res.categorical_<metric> and evaluate it for this class
        score = getattr(res, f'categorical_{metric}')(cat)
        print(f'"{cat}" {metric} : {np.round(score,3)}')
    print('=======================')
for average in ('macro', 'micro'):
    for metric in ('recall', 'precision'):
        # look up res.<average>_average_<metric>
        score = getattr(res, f'{average}_average_{metric}')()
        print(f'{average} average {metric} : {np.round(score, 3)}')

This is not good. The model clearly struggles with the randomized location data, indicating that the high accuracy achieved on the previous dataset was likely a product of the limited set of locations, rather than any understanding of sentence structure.

## Re-train the model on randomized location data
The first approach to this should be to retrain the model on the new data. This will help the model generalise a little better about locations, but may result in some of the finer nuance in the data being lost. Contextual information, such as foods, activities or sights specific to a particular location, will be lost. This highlights one of the key limitations of the yelp data. With data which is already more generalised this contextualisation might be possible. 


In [None]:
# Build the token table used to randomise locations for the new training set.
# NOTE(review): this reads the *test* split (test_yelp_dataset.csv) even though
# the cell is meant to build training data - confirm whether a training CSV
# should be loaded here instead.
train_data = pd.read_csv('data/step_4/test_yelp_dataset.csv')
# Keep only the rows whose tag contains 'tar' (the target-location tokens).
is_target = train_data['Tag'].str.contains('tar')
target_only_data = train_data.loc[is_target]
# Let the Retagger label runs of consecutive target tokens (the
# 'sequential_group' column), then preview the result.
retagger = Retagger(target_only_data)
retagger.add_sequential_groups()
retagger.df.head()

In [None]:
# Training reviews: only the first 20,000 rows of the file. The rows from
# position 20000 onwards are the ones the test section of this notebook loads,
# so they are left out here to keep the evaluation reviews out of the training data.
review_df = pd.read_csv('data/step_2/25k_yelp_reviews_with_location.csv', nrows=20000)
# reset_index() keeps the original row labels in an 'index' column.
review_df = review_df.reset_index()

In [None]:
# Map each review index to the single replacement city drawn for that review.
# Drawing once per review (instead of once per tagged group) keeps the text and
# the coordinates in agreement: str.replace() below already swaps *every*
# occurrence of the old name, so a second draw for a repeated mention would
# leave the text untouched while overwriting the coordinates with a city that
# never appears in it.
# NOTE: seed `random` beforehand if the draws need to be reproducible.
replacement_dict = {}

for _, group in retagger.df.groupby('sequential_group'):
    # build the phrase for this group of words
    old_name = Phrase('', tag=None)
    for token, tag in zip(group['Word'].values, group['Tag'].values):
        old_name.add_token(token=token, tag=tag)
    # every token in a group belongs to the same review, so the first row is enough
    review_num = group['Sentence #'].iloc[0]
    # reuse the city already chosen for this review, or draw a new one
    if review_num not in replacement_dict:
        replacement_dict[review_num] = random.choice(us_cities)
    new_city = replacement_dict[review_num]
    # the review text is lower-cased before matching against the phrase text
    # (presumably Phrase.text is the lower-case surface form - confirm in Phrase)
    old_text = review_df.loc[review_num, 'text'].lower()
    new_text = old_text.replace(old_name.text, new_city['name'])
    # update the review dataframe
    review_df.loc[review_num, 'text'] = new_text
    review_df.loc[review_num, 'coordinates'] = new_city['coords']

In [None]:
#### NER Tagging ###
# CSV handed to Data below. Note this path is relative to the parent directory
# ('../data'), unlike the other 'data/...' paths used in this notebook.
data_csv = r'../data/ner_dataset.csv'
tokenizer = Tokenizer(size='base', cased=False)
data = Data(data_path=data_csv, 
            tokenizer=tokenizer,
            max_len=125)

# Load the saved NER model weights and print the layer summary.
model = BertModel(saved_model='20230808_bert_model_large.hdf5', data=data)
model.model.summary()

# Tag every review text in review_df (after the location replacement above);
# the resulting frame is what the Retagger consumes in the next cell.
results = model.results_dataframe(texts=review_df.text.values, include_best=True)

In [None]:
### tar/inc tagging
# Re-tag the NER output for the 'geo', 'gpe' and 'org' tags, passing review_df
# (which now carries the replaced text and coordinates).
# NOTE(review): presumably threshold='bbox' decides tar vs inc by comparing each
# place with the review's coordinates - confirm in Retagger.retag.
retagger = Retagger(results)
retagger.retag(['geo', 'gpe', 'org'], threshold='bbox', review_df=review_df)
retagged_data = retagger.df

In [None]:
# Persist the location-randomised, re-tagged training set (row index not written).
retagged_data.to_csv('data/model_improvement/randomised_location_tarinc_train.csv', index=False)

In [None]:
#### Build a fresh tar/inc model on the randomised training data ###
# Point Data at the training CSV written in the previous cell.
data_csv = r'data/model_improvement/randomised_location_tarinc_train.csv'
tokenizer = Tokenizer(size='base', cased=False)
data = Data(data_path=data_csv, 
            tokenizer=tokenizer,
            max_len=125)

# saved_model=None: no saved weights file is loaded; this model is trained in a later cell.
model = BertModel(saved_model=None, data=data)
model.model.summary()

In [None]:
from sklearn.utils import class_weight
# Read the tag column under its own name rather than rebinding `data`, which
# still refers to the Data object the model was built with.
train_tags_df = pd.read_csv(data_csv)
# compute_class_weight documents `classes` as an ndarray (and recent
# scikit-learn releases validate the type), so wrap the label list in np.array.
tag_classes = np.array(['B-inc', 'B-tar', 'I-inc', 'I-tar', 'O'])
class_weights_list = class_weight.compute_class_weight('balanced',
                                                       classes=tag_classes,
                                                       y=train_tags_df.Tag.values)

# Key each weight by its position in tag_classes.
# NOTE(review): this assumes the model encodes the tags in this same order - confirm in Data.
class_weights = {i:w for i,w in enumerate(class_weights_list)}
# Extra entry for index 5 with a very small weight (presumably the padding class - confirm).
class_weights.update({5:0.01})

In [None]:
# Train the fresh model for two epochs with the class weights computed above,
# holding out 10% of the data for validation, and save the weights under the given name.
model.train(save_as='20230929_tar_model_randomised_locations.hdf5', 
            n_epochs=2,
            batch_size=16, 
            validation_split=0.1, 
            class_weights=class_weights)

In [None]:
# Evaluate the model retrained on randomised locations against the randomised test set.
data_csv = 'data/step_4/test_yelp_dataset_randomized_locations.csv'
tokenizer = Tokenizer(size='base', cased=False)
data = Data(data_path=data_csv, tokenizer=tokenizer, max_len=125)
tar_model = BertModel(
    saved_model='20230929_tar_model_randomised_locations.hdf5',
    data=data,
)
# The same CSV is used for testing, so pass the path variable instead of
# repeating the literal.
X_tokens, y_pred, y_true = tar_model.test(data_csv, return_tokens=True)

In [None]:
# Print per-class and averaged metrics for the retrained model, rounded to 3 decimals.
res = Results(y_true, y_pred)
for cat in ['O', 'tar', 'inc']:
    for metric in ('accuracy', 'precision', 'recall'):
        # look up res.categorical_<metric> and evaluate it for this class
        score = getattr(res, f'categorical_{metric}')(cat)
        print(f'"{cat}" {metric} : {np.round(score,3)}')
    print('=======================')
for average in ('macro', 'micro'):
    for metric in ('recall', 'precision'):
        # look up res.<average>_average_<metric>
        score = getattr(res, f'{average}_average_{metric}')()
        print(f'{average} average {metric} : {np.round(score, 3)}')

In [None]:
from sklearn.utils import class_weight
# Balanced weights, one per distinct tag present in the re-tagged data.
class_weights_list = class_weight.compute_class_weight('balanced',
                                                 classes=np.unique(retagged_data.Tag),
                                                 y=retagged_data.Tag.values)

# Key each weight by its position in the sorted unique-tag array. This must
# enumerate the freshly computed list: enumerating the previous `class_weights`
# dict would iterate its keys and silently produce weights 0, 1, 2, ...
# NOTE(review): unlike the earlier weights cell, no entry is added for index 5 - confirm.
class_weights = {i:w for i,w in enumerate(class_weights_list)}

In [None]:
# Train `model` for one more epoch and save under the same file name as the earlier training cell.
# NOTE(review): the class_weights recomputed in the preceding cell are not passed
# here, unlike the earlier model.train call - confirm whether that is intentional.
model.train(save_as='20230929_tar_model_randomised_locations.hdf5', n_epochs=1, batch_size=16, validation_split=0.1) 