In [9]:
# standard library imports
import json
import sys
# third party imports
from geopy.distance import distance
import numpy as np
from tqdm import tqdm
# local imports
sys.path.append('..')
from geo_llama.main import GeoLlama
from geo_llama.model import TopoModel, RAGModel

# Testing the GeoLlama model on the News2024 dataset
this notbook goes through th eprocess of testing the GeoLlama model on the News2024 dataset. The model is very computational intensive, even with Unsloth and quantization. The model uses roughly 12Gb of GPU RAM. For efficiency, it is recommended to be run on the L4 GPU available on colab pro. 
## 1. Loading the dataset
We'll load the dataset and remove the articles which have no toponyms. This same process has been done with other LLMs tested.

In [13]:
with open('../data/test_data/News2024.json', 'r') as f:
    true_data = json.load(f)
    
# remove any articles without topnoyms
true_data = [d for d in true_data if len(d['toponyms'])>0]   
print(f'Total number of articles : {len(true_data)}')

Total number of articles : 50


## 2. Geoparse using GeoLlama
We'll now use the `geoparse` method in GeoLlama to parse each text. Note that we are not using the translation module in this instance as all texts are in english.

In [None]:
# set up the model
topo_model = TopoModel(model_name='JoeShingleton/GeoLlama_7b_toponym', 
                       prompt_path='data/prompt_templates/prompt_template.txt',
                       instruct_path='data/prompt_templates/topo_instruction.txt',
                       input_path=None,
                       config_path='data/config_files/model_config.json')

rag_model = RAGModel(model_name='JoeShingleton/GeoLlama_7b_RAG',
                       prompt_path='data/prompt_templates/prompt_template.txt',
                       instruct_path='data/prompt_templates/rag_instruction.txt',
                       input_path='data/prompt_templates/rag_input.txt',
                       config_path='data/config_files/model_config.json')

geo_llama = GeoLlama(topo_model, rag_model)


results = []
for d in tqdm(true_data):
  results.append(geo_llama.geoparse(d['text']))
# save the results
with open('geollama_news2024_results.json', 'w') as f:
  json.dump(results, f)


## 3. Analyse the results
We will consider the toponym extraction accuracy and the toponym resolution accuracy. For toponym extraction we are interested in the proportion of toponyms in the text the model identifies. Note that we are expecting the model to tell us where in the text the toponym occurs, so this may be a different metric to other reported metrics. For toponym resolution we are interested in the distance betweeen true-positive toponyms resolved by the model and the assigned location in the dataset. We use Geopy's standard geodesic distance method for this. 

In [10]:
with open(r'../data/results/geollama_news2024_results.json', 'r') as f:
    pred_data = json.load(f)

In [34]:

topo_precision = []
topo_recall = []
topo_f1 = []
topo_distance = []

def get_toponym_metrics(true_toponyms:list, pred_toponyms:list)->dict[str,int]:
    true_positives = len([t for t in pred_toponyms if t in true_toponyms])
    false_positives = len([t for t in pred_toponyms if t not in true_toponyms])
    false_negatives = len([t for t in true_toponyms if t not in pred_toponyms])
    return {'TP':true_positives, 'FP':false_positives, 'FN':false_negatives}

def get_accuracy_metrics(topo_metrics: dict[str, int]) -> dict[str, float]:
    tp = topo_metrics['TP']
    fp = topo_metrics['FP']
    fn = topo_metrics['FN']

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return {'precision': precision, 'recall': recall, 'F1': f1}

for true, pred in zip(true_data, pred_data):
    # skip if no toponyms in text
    if len(true['toponyms'])==0:
        continue
    # get toponyms
    true_toponyms = [t['word'] for t in true['toponyms']]
    pred_toponyms = [t['name'] for t in pred]
    # get true/false positives/negatives
    toponym_metrics = get_toponym_metrics(true_toponyms, pred_toponyms)
    # get prec, rec, f1
    accuracy_metrics = get_accuracy_metrics(toponym_metrics)
    topo_precision.append(accuracy_metrics['precision'])
    topo_recall.append(accuracy_metrics['recall'])
    topo_f1.append(accuracy_metrics['F1'])
    ### distances
    for pred_topo in pred:
        if pred_topo['name'] not in true_toponyms:
            continue
        pred_coords = (float(pred_topo['latitude']), float(pred_topo['longitude']))
        true_location = [t for t in true['toponyms'] if t['word']==pred_topo['name']][0]
        true_coords = (float(true_location['lat']), float(true_location['lon']))
        topo_distance.append(distance(true_coords, pred_coords))
    

In [35]:
# print the results
macro_precision = np.mean(topo_precision)
macro_recall = np.mean(topo_recall)
macro_f1 = np.mean(topo_f1)

mean_d = np.mean([d.km for d in topo_distance])
median_d = np.median([d.km for d in topo_distance])
acc_1km = len([d for d in topo_distance if d.km<=1])/len(topo_distance)
acc_10km = len([d for d in topo_distance if d.km<=10])/len(topo_distance)
acc_50km = len([d for d in topo_distance if d.km<=50])/len(topo_distance)
acc_80km = len([d for d in topo_distance if d.km<=80])/len(topo_distance)
acc_161km = len([d for d in topo_distance if d.km<=161])/len(topo_distance)

print(f'Macro precision: {macro_precision:.3f}')
print(f'Macro recall: {macro_recall:.3f}')
print(f'Macro F1: {macro_f1:.3f}')
print('#####################')
print(f'Mean distance : {mean_d:.1f}')
print(f'Median distance : {median_d:.3f}')
print(f'Acc@1km : {acc_1km:.3f}')
print(f'Acc@10km : {acc_10km:.3f}')
print(f'Acc@50km : {acc_50km:.3f}')
print(f'Acc@80km : {acc_80km:.3f}')
print(f'Acc@161km : {acc_161km:.3f}')

Macro precision: 0.761
Macro recall: 0.716
Macro F1: 0.719
#####################
Mean distance : 255.5
Median distance : 0.011
Acc@1km : 0.685
Acc@10km : 0.802
Acc@50km : 0.858
Acc@80km : 0.873
Acc@161km : 0.898


In [36]:
from geo_llama.gazetteer import Gazetteer

In [39]:
nominatim = Gazetteer(polygon=False)

nominatim.query('Zion National Park', user_agent='UA_test')

[{'place_id': 278439064,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
  'osm_type': 'relation',
  'osm_id': 5758583,
  'lat': '37.3247408',
  'lon': '-113.0048035752838',
  'class': 'boundary',
  'type': 'national_park',
  'place_rank': 25,
  'importance': 0.4973522186722954,
  'addresstype': 'national_park',
  'name': 'Zion National Park',
  'display_name': 'Zion National Park, Washington County, Utah, United States',
  'boundingbox': ['37.1413497', '37.5042917', '-113.2282863', '-112.8631483']}]