In [None]:
from lxml import objectify
from xml.etree import ElementTree as ET
import numpy as np
import random
import json
import pandas as pd
import sys
sys.path.append('..')
from geo_llama.data import GeoVirusArticle, LGLArticle, WikTorArticle
from tqdm import tqdm
import matplotlib.pyplot as plt

# Building a fine-tuning dataset for the Llama-3 geoparser
We construct a dataset from the LGL and GeoVirus datasets and use it to fine-tune a custom Llama-3 model. The model will then be tested on the News2024 dataset to assess its accuracy.

In [None]:
# Prompt template with three positional `{}` slots, placed under the
# "Instruction", "Input" and "Response" headings.
# NOTE(review): `geollama_prompt` is not referenced by any other visible cell.
# NOTE(review): "specfic" is a typo, but it is part of the runtime prompt text
# and is therefore left untouched here.
geollama_prompt ="""Below is an instruction that describes a task, paired with an input that provides a specfic example which the task should be applied to. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}
"""

# Instruction stored in every record produced by `build_ft_data`.
# NOTE(review): the name `geoparse_instruction` is rebound by later cells (RAG
# and toponym-extraction sections); `build_ft_data` reads the global at call
# time, so its output depends on which of those cells ran last.
# NOTE(review): "ajdectives" is a typo inside the emitted instruction text; it
# is left as-is because changing it changes the generated dataset.
geoparse_instruction = """Extract all toponyms from the provided text and estimate their geolocations. Include the name of every toponym in the text and its decimal latitude and longitude coordinates. Do not consider ajdectives (e.g. 'English', 'Iranian') as toponyms. Format the output in JSON, strictly adhering to the specified template. Be very concise and output only the JSON data inside a code block. Do not provide any explanation or reasoning.

JSON Template for output:

{"toponyms": [
        {
          "name": "<string : toponym name exactly as it appears in the text>",
          "latitude": <float : latitude in decimal degrees>,
          "longitude": <float : longitude in decimal degrees>
        },
        // More toponyms from the text can follow
      ]
}
"""

In [None]:
### LGL data
# open the lgl dataset using xml
# NOTE(review): this module-level `dataset` is not read by any visible cell;
# `get_data` and `build_ft_data` both take their own `dataset` parameter.
dataset = 'lgl'

def get_data(dataset, data_dir="../data/fine_tuning_data"):
    """Load ``<data_dir>/<dataset>.xml`` and return it as an lxml objectify tree.

    Parameters
    ----------
    dataset : str
        File stem of the XML file to load (e.g. ``'lgl'``, ``'GeoVirus'``).
    data_dir : str, optional
        Directory containing the XML file. Defaults to the notebook's
        fine-tuning data folder, so existing ``get_data(name)`` calls behave
        exactly as before.

    Returns
    -------
    The root object produced by ``objectify.fromstring``.
    """
    xml_path = f"{data_dir}/{dataset}.xml"
    xml_root = ET.parse(xml_path).getroot()

    # The document is parsed with ElementTree first and then re-serialised, so
    # that objectify receives a plain string rather than a file.
    xml_str = ET.tostring(xml_root, method='xml').decode()
    return objectify.fromstring(xml_str)

def build_ft_data(xml_obj, dataset):
    """Convert a parsed dataset into instruction/input/response records.

    Parameters
    ----------
    xml_obj :
        Parsed XML root. Its ``article`` children are iterated for ``'lgl'``
        and ``'GeoVirus'``, its ``page`` children for ``'WikToR'``.
    dataset : str
        One of ``'lgl'``, ``'GeoVirus'`` or ``'WikToR'``; selects both the
        child element to iterate and the wrapper class applied to each item.

    Returns
    -------
    list[dict]
        One record per article with the keys ``"instruction"`` (the current
        module-level ``geoparse_instruction``), ``"input"`` (article text as
        ``str``) and ``"response"`` (``{"toponyms": [...]}``, each toponym
        holding ``name``, ``latitude`` and ``longitude``).

    Raises
    ------
    ValueError
        If ``dataset`` is not a supported name. Previously this surfaced as an
        unbound-variable error on ``articles``.
    """
    # Validate up front so an unsupported name fails with a clear message
    # before any data is touched.
    if dataset not in ('lgl', 'GeoVirus', 'WikToR'):
        raise ValueError(
            f"Unsupported dataset {dataset!r}; expected 'lgl', 'GeoVirus' or 'WikToR'."
        )

    if dataset == 'WikToR':
        articles = xml_obj.page
        article_cls = WikTorArticle
    elif dataset == 'GeoVirus':
        articles = xml_obj.article
        article_cls = GeoVirusArticle
    else:
        articles = xml_obj.article
        article_cls = LGLArticle

    ft_data = []
    for article_xml in articles:
        article = article_cls(article_xml)
        text = article.text

        toponym_entries = []
        for toponym in article.toponyms:
            try:
                entry = {"name": str(toponym.phrase),
                         "latitude": float(toponym.latitude),
                         "longitude": float(toponym.longitude)}
            except Exception:
                # Best effort: keep the toponym name even when its coordinates
                # are missing or not numeric. `Exception` (rather than a bare
                # `except`) lets KeyboardInterrupt/SystemExit propagate.
                entry = {"name": str(toponym.phrase),
                         "latitude": None,
                         "longitude": None}
            toponym_entries.append(entry)

        ft_data.append({"instruction": geoparse_instruction,
                        "input": str(text),
                        "response": {"toponyms": toponym_entries}})

    return ft_data
        
    

In [None]:
# Parse the two source corpora.
lgl_xml = get_data('lgl')
geovirus_xml = get_data('GeoVirus')

# Convert each corpus into instruction/input/response records.
lgl_ft_data = build_ft_data(lgl_xml, 'lgl')
geovirus_ft_data = build_ft_data(geovirus_xml, 'GeoVirus')

# LGL records first, followed by the GeoVirus records.
ft_data = lgl_ft_data + geovirus_ft_data

# Persist the combined dataset as a single JSON array.
with open('../data/fine_tuning_data/llama3_ft_data.json', 'w') as f:
    json.dump(ft_data, f)

In [None]:
# NOTE(review): looks like a scratch cell - `d` is never read in the visible
# notebook before a later loop rebinds it to a distance value.
d = get_data('TR-News')

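# Building a fine-tuning dataset for the RAG-based Llama-3 geoparser
We construct a dataset from the LGL, GeoVirus and TR-News datasets with which we will fine-tune a custom Llama-3 model. Each example pairs a text and one of its toponyms with the candidate locations returned by an OpenStreetMap (Nominatim) search, and the model has to pick the candidate that best fits the context. The model will be tested on the News2024 dataset to assess the accuracy.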

In [8]:
# Parse the three source corpora.
lgl_xml = get_data('lgl')
geovirus_xml = get_data('GeoVirus')
trnews_xml = get_data('TR-News')

# Wrap every <article> element in its article class, keeping the order
# LGL -> GeoVirus -> TR-News. TR-News articles are wrapped with LGLArticle.
ft_articles = []
for source_xml, article_cls in ((lgl_xml, LGLArticle),
                                (geovirus_xml, GeoVirusArticle),
                                (trnews_xml, LGLArticle)):
    for article_xml in source_xml.article:
        ft_articles.append(article_cls(article_xml))

In [9]:
# Prompt template with three positional `{}` slots, placed under the
# "Instruction", "Input" and "Response" headings.
# NOTE(review): in the visible notebook it is only used to build `ft_prompt`,
# which is never read afterwards.
# NOTE(review): "specfic" is a typo inside the runtime string; left unchanged.
RAG_prompt = """Below is an instruction that describes a task, paired with an input that provides a specfic example which the task should be applied to. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}
"""


In [10]:
# Instruction for the match-selection (RAG) task. This rebinds
# `geoparse_instruction`, replacing the value assigned in the earlier
# geoparsing section.
# NOTE(review): the text tells the model to flag its own estimates with
# {"RAG":false}, while the template below (and the responses built later in
# this notebook) use the key "RAG_estimated". "tmeplate" is also a typo. Both
# are part of the emitted training text, so they are left unchanged here -
# fix them together with whatever prompt is used at inference time.
geoparse_instruction = """You will be given a piece of text, a toponym found within that text, and a JSON detailing the matched locations when that toponym is searched on OpenStreetMaps. 

Your task is to identify the matched location which is most likely to be the true location of the toponym, given the context of the text.

If the list of matches is empty, or you do not think any match accurately represents the toponym, you are permitted to assign your best estimate for a latitude and longitude. This should be highlighted in your response by setting {"RAG":false}.

Your output should strictly conform to the following tmeplate:

{"name" : <(str) name of toponym as it appears in the text>,
 "latitude": <(float) latitude as it appears in the matched locations>,
 "longitude": <(float) longitude as it appears in the matched locations>,
 "RAG_estimated": <(bool) true if a matched location was used>
}
"""

# Template for the "input" field: three positional slots filled with the
# article text, the toponym and the candidate matches, in that order.
# It is a raw string, so the backslashes in the closing tags are kept as-is.
# NOTE(review): the closing tags are written `<\...>` rather than `</...>` -
# confirm this is intended.
input_prompt = r"""<text> {} <\text>

<toponym> {} <\toponym>

<matches> {} <\matches>
"""

In [11]:
# Load previously stored gazetteer results; `get_matches` looks entries up by
# toponym string.
# NOTE(review): `get_matches` adds new entries to `cache` in memory, but no
# visible cell writes the updated cache back to this file.
with open('../data/fine_tuning_data/nominatim_cache.json', 'r') as f:
    cache = json.load(f)

In [12]:
from geopy import distance
from shapely.geometry import Point

In [17]:
import sys
sys.path.append('.')
from geo_llama.gazetteer import Gazetteer

In [21]:
# Gazetteer client queried by `get_matches` whenever a toponym is not cached.
nominatim = Gazetteer(gazetteer_source='nominatim', polygon=False)

In [29]:
def get_matches(toponym, cache):
    """Return the gazetteer matches for ``toponym`` together with the cache.

    A cached entry is returned as-is. Otherwise the module-level ``nominatim``
    gazetteer is queried with a randomised ``GeoLlama_<number>`` user agent,
    the result is stored in ``cache`` under ``toponym`` and then returned.

    Returns
    -------
    tuple
        ``(matches, cache)`` - ``cache`` is the same object that was passed
        in (mutated on a miss).
    """
    if toponym in cache:
        return cache[toponym], cache

    agent_name = f'GeoLlama_{random.uniform(1000,10000)}'
    results = nominatim.query(toponym, agent_name)
    cache[toponym] = results
    return results, cache

In [31]:
# A gazetteer match further than this from the gold coordinates is rejected,
# unless its address type is one of the coarse types listed below.
MAX_MATCH_DISTANCE_KM = 20
COARSE_ADDRESS_TYPES = ['country', 'state', 'county', 'region']

ft_data = []

for article in tqdm(ft_articles):

    for toponym in article.toponyms:
        # Toponyms without gold coordinates cannot be scored against matches.
        # NOTE(review): a falsy latitude (e.g. a numeric 0) is skipped too -
        # confirm that is acceptable for the toponym type used here.
        if not toponym.latitude:
            continue
        true_point = (float(toponym.latitude), float(toponym.longitude))
        matches, cache = get_matches(str(toponym.phrase), cache)

        # Find the candidate closest to the gold coordinates. Distances are
        # compared as plain kilometre floats; ties keep the earliest candidate.
        best_match = None
        best_km = np.inf
        for match in matches:
            match_point = (float(match['lat']), float(match['lon']))
            match_km = distance.distance(match_point, true_point).km
            if match_km < best_km:
                best_match = match
                best_km = match_km

        # Accept the closest candidate when it is near enough, or when it is a
        # coarse administrative unit. With no candidates at all, fall back to
        # the gold coordinates.
        use_match = bool(
            best_match is not None
            and (best_km <= MAX_MATCH_DISTANCE_KM
                 or best_match['addresstype'] in COARSE_ADDRESS_TYPES)
        )
        if use_match:
            latitude, longitude = best_match['lat'], best_match['lon']
        else:
            latitude, longitude = toponym.latitude, toponym.longitude
        response = {'name':toponym.phrase,
                    'latitude':latitude,
                    'longitude':longitude,
                    'RAG_estimated':use_match}

        match_info = [{'name':m['name'], 'lat':m['lat'], 'lon':m['lon'], 'address':m['display_name']} for m in matches]
        # Named `prompt_input` so the builtin `input` is not shadowed.
        prompt_input = input_prompt.format(article.text, toponym.phrase, match_info)

        # NOTE(review): `str(response)` stores the Python repr of the dict
        # (single quotes, True/False), not JSON; kept unchanged so the
        # generated dataset stays identical.
        ft_data.append({"instruction":geoparse_instruction,
                        "input":prompt_input,
                        "response":str(response)})
               
        
        

100%|██████████| 935/935 [01:51<00:00,  8.36it/s] 


In [34]:
# deduplicate
# dict.fromkeys keeps the first occurrence of every record and preserves the
# original order. The previous set-based version produced the same records but
# in an order that changes between interpreter runs (string hashes are
# randomised per process), so the saved file was not reproducible.
ft_data = [dict(items) for items in dict.fromkeys(tuple(record.items()) for record in ft_data)]

len(ft_data)

4682

In [35]:
# Persist the de-duplicated RAG records as a single JSON array.
with open('../data/fine_tuning_data/llama3_RAG_geoparsing_ft_new.json', 'w') as f:
    json.dump(ft_data, f)

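# Building a fine-tuning dataset for Llama-3 toponym extraction
We construct a dataset from the articles loaded above (LGL, GeoVirus and TR-News), extended further below with the CoNLL dataset, with which we will fine-tune a custom Llama-3 model. Each example asks the model to list the toponyms that appear in a text. The model will be tested on the News2024 dataset to assess the accuracy.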

In [None]:
# Instruction for the toponym-extraction task. This rebinds
# `geoparse_instruction` again, replacing the RAG instruction defined above.
# NOTE(review): "toponyhm" is a typo, and the text asks for a "python list"
# while the template is a dict with a "toponyms" key. Both are part of the
# emitted training text and are left unchanged here.
geoparse_instruction = """You will be given a piece of text which contains some place names (toponyms). Please extract each toponyhm from the text and place it in a python list.

Each toponym should only appear once in the list, even if they occur multiple times in the text. If multiple spellings of the same toponym appear in the text each spelling should be represented in the list.

You should not consider adjectives (e.g. "English", "Iranian") as toponyms. Some toponyms may span multiple words.

Please use the following template to structure your response:

{"toponyms":["toponym_1", "toponym_2", "toponym_3",...]}
"""

In [None]:
ft_data = []

for article in tqdm(ft_articles):

    # dict.fromkeys removes repeated toponyms while keeping first-seen order.
    # The previous list(set(...)) produced an arbitrary order that differs
    # between interpreter runs, so the targets were not reproducible.
    unique_toponyms = list(dict.fromkeys(str(t.phrase) for t in article.toponyms))
    response = {"toponyms":unique_toponyms}

    # `article.text` is used directly so the builtin `input` is not shadowed.
    # NOTE(review): `str(response)` stores the Python repr of the dict
    # (single-quoted strings), not JSON; kept unchanged on purpose.
    ft_data.append({"instruction":geoparse_instruction,
                    "input":article.text,
                    "response":str(response)})

In [None]:
# Persist the toponym-extraction records. A later cell writes to this same
# path again after the CoNLL records have been appended.
with open('../data/fine_tuning_data/llama3_toponym_extraction_ft.json', 'w') as f:
    json.dump(ft_data, f)

In [None]:
# Show the first record as a quick sanity check.
ft_data[0]

## Adding the CoNLL dataset

In [None]:
import json

def parse_conll2003_file(file_path):
    """Read ``file_path`` as text and return its lines as a list.

    Line endings are kept; no parsing is done here.
    """
    with open(file_path, 'r') as handle:
        return list(handle)

def process_conll2003_data(lines):
    """Rebuild article texts and location entities from CoNLL-2003 style lines.

    Parameters
    ----------
    lines : iterable of str
        Raw lines of the file. Token lines hold four whitespace-separated
        fields (token, POS tag, chunk tag, NER tag); blank lines separate
        sentences; lines starting with ``-DOCSTART-`` separate articles.

    Returns
    -------
    list[dict]
        One dict per article with the keys ``'text_id'`` (0-based count of
        ``-DOCSTART-`` markers seen before the article, ``-1`` if there was
        none), ``'text'`` (re-assembled text, one ``"\\n"`` per blank line) and
        ``'toponyms'`` (location phrases in order of appearance, duplicates
        kept).

    Raises
    ------
    ValueError
        If a token line does not have exactly four fields.

    Notes
    -----
    Changes compared with the previous implementation:

    * any line starting with ``-DOCSTART-`` is a document marker, so both the
      ``-DOCSTART- -X- -X- O`` and ``-DOCSTART- -X- O O`` spellings work;
    * an ``I-LOC`` tag with no open entity starts a new entity (IOB1 tagging)
      instead of being silently dropped, and an open entity is closed at every
      sentence boundary so entities cannot merge across sentences;
    * a punctuation token arriving before any text no longer raises
      ``IndexError``.

    Output for well-formed IOB2 input with ``-DOCSTART- -X- -X- O`` markers is
    unchanged.
    """
    # Tokens that are glued onto the preceding token without a space.
    attach_tokens = {'.', ',', '!', '?', ';', ':', '-', '/'}

    dataset = []
    article_text = []
    toponyms = []
    current_location = []
    text_id = -1

    def close_location():
        """Store the entity being collected, if any, and start afresh."""
        if current_location:
            toponyms.append(" ".join(current_location))
            current_location.clear()

    def close_article():
        """Append the article collected so far to ``dataset`` and reset."""
        close_location()
        dataset.append({
            'text_id': text_id,
            'text': "".join(article_text),
            'toponyms': list(toponyms)
        })
        article_text.clear()
        toponyms.clear()

    for line in lines:
        line = line.strip()

        if line.startswith("-DOCSTART-"):
            # A marker ends the previous article (if any) and starts a new id.
            if article_text:
                close_article()
            text_id += 1
            continue

        if line == "":
            # Sentence boundary: entities never span sentences.
            close_location()
            article_text.append("\n")
            continue

        token, _pos, _chunk, ner = line.split()

        # Clitics/hyphenated parts ("'s", "-based") and punctuation attach to
        # the previous token; everything else is separated by a space unless
        # it starts the article or follows a newline.
        attaches = token.startswith(("'", "-")) or token in attach_tokens
        if attaches and article_text:
            article_text[-1] += token
        else:
            if article_text and article_text[-1] != "\n":
                article_text.append(" ")
            article_text.append(token)

        # Track location entities.
        if ner == "B-LOC":
            close_location()
            current_location.append(token)
        elif ner == "I-LOC":
            # Continues the open entity, or opens one when none is open.
            current_location.append(token)
        else:
            close_location()

    # Flush the final article, which has no trailing document marker.
    if article_text:
        close_article()

    return dataset

def save_as_json(data, output_file):
    """Write ``data`` to ``output_file`` as JSON indented by four spaces."""
    with open(output_file, 'w') as sink:
        sink.write(json.dumps(data, indent=4))

# Path to the CoNLL-2003 English dataset
conll_file_path = '../data/fine_tuning_data/CoNLL_train.txt'

# Parsing the dataset
conll_lines = parse_conll2003_file(conll_file_path)

# Processing the data to extract text and location entities
processed_data = process_conll2003_data(conll_lines)

# Saving the output as a JSON file
# NOTE(review): this path has no directory component, so the file is written
# to the current working directory rather than next to the other outputs
# under ../data/fine_tuning_data.
output_file_path = 'conll2003_location_entities.json'
save_as_json(processed_data, output_file_path)

print(f"Processed data saved to {output_file_path}")

In [None]:
# Append one toponym-extraction record per CoNLL article to `ft_data`.
# NOTE(review): this cell extends the existing list, so running it twice adds
# every CoNLL record twice.
for article in tqdm(processed_data):

    # dict.fromkeys removes repeated toponyms while keeping first-seen order.
    # The previous list(set(...)) produced an arbitrary order that differs
    # between interpreter runs.
    unique_toponyms = list(dict.fromkeys(article['toponyms']))
    response = {"toponyms":unique_toponyms}

    # `article['text']` is used directly so the builtin `input` is not
    # shadowed.
    ft_data.append({"instruction":geoparse_instruction,
                    "input":article['text'],
                    "response":str(response)})

In [None]:
# Number of records after the CoNLL articles have been appended.
len(ft_data)

In [None]:
# Overwrites the toponym-extraction file written earlier, now including the
# CoNLL records.
with open('../data/fine_tuning_data/llama3_toponym_extraction_ft.json', 'w') as f:
    json.dump(ft_data, f)

In [None]:
# Print the input text of the 24th record from the end, to eyeball the
# re-assembled text.
print(ft_data[-24]['input'])