In [1]:
! pip install -U googlemaps

Collecting googlemaps
  Downloading googlemaps-4.10.0.tar.gz (33 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Installing collected packages: googlemaps
  Running setup.py install for googlemaps: started
  Running setup.py install for googlemaps: finished with status 'done'
Successfully installed googlemaps-4.10.0


  DEPRECATION: googlemaps is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559

[notice] A new release of pip available: 22.3.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import torch
from transformers import DistilBertTokenizer, DistilBertForTokenClassification
from transformers import pipeline
import csv
from fuzzywuzzy import fuzz

model_dir = 'DistilBert_conll03'
tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
model = DistilBertForTokenClassification.from_pretrained(model_dir)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

text = "Jordan is fram from north-korea"

ner_results = ner_pipeline(text)

def aggregate_subwords(current_entity, final_entities):
    if current_entity:
        entity_word = ''.join([word.replace('##', '') for word in current_entity["words"]])
        final_entities.append(entity_word)

current_entity = {}
final_entities = []
for entity in ner_results:
    if entity['entity'] in ['B-LOC', 'I-LOC']:
        word = entity['word'].replace('##', '')
        if entity['word'].startswith('##') or entity['word'].endswith('-') or (current_entity.get("words") and current_entity["words"][-1].endswith('-')):
            current_entity["words"].append(word)
        else:
            aggregate_subwords(current_entity, final_entities)
            current_entity = {"words": [word]}

aggregate_subwords(current_entity, final_entities)

similarity_threshold = 80

def load_dataset(file_path):
    dataset = []
    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            row['other-names'] = [name.strip() for name in row['other-names'].split(',')]
            dataset.append(row)
    return dataset

def perform_fuzzy_matching(extracted_name, dataset):
    best_match = None
    highest_similarity = 0
    for row in dataset:

        similarity = fuzz.ratio(extracted_name.lower(), row["canonical name"].lower())
        if similarity > similarity_threshold and similarity > highest_similarity:
            best_match = row
            highest_similarity = similarity
            
        for other_name in row['other-names']:
            if other_name:
                similarity = fuzz.ratio(extracted_name.lower(), other_name.lower())
                if similarity > similarity_threshold and similarity > highest_similarity:
                    best_match = row
                    highest_similarity = similarity
    return best_match

dataset = load_dataset('Datasets/place_name.csv')

for extracted_name in final_entities:
    best_match = perform_fuzzy_matching(extracted_name, dataset)
    if best_match:
        canonical_name = best_match["canonical name"]
        place_type = best_match["place-type"]
        other_names = ', '.join(best_match["other-names"])
        print(f"Token: {extracted_name}, Canonical name: {canonical_name}, Place Type: {place_type}")
    else:
        print(f"Token: {extracted_name}, No matching canonical name found")


Token: north, No matching canonical name found
Token: korea, No matching canonical name found


In [2]:
import googlemaps
from datetime import datetime

# Your Google Maps API key
google_maps_api_key = "YOUR_API_KEY_HERE"

# Initialize the Google Maps client
gmaps = googlemaps.Client(key=google_maps_api_key)

def get_geolocation(place_name):
    try:
        # Use the Geocoding API to get the geographic coordinates
        geocode_result = gmaps.geocode(place_name)

        if geocode_result:
            location = geocode_result[0]["geometry"]["location"]
            return location["lat"], location["lng"]
        else:
            return None
    except Exception as e:
        print("Error while fetching geolocation:", e)
        return None

for extracted_name in final_entities:
    best_match = perform_fuzzy_matching(extracted_name, dataset)
    if best_match:
        canonical_name = best_match["canonical name"]
        place_type = best_match["place-type"]
        other_names = ', '.join(best_match["other-names"])
        print(f"Token: {extracted_name}, Canonical name: {canonical_name}, Place Type: {place_type}")

        # Get geolocation for the canonical name
        coordinates = get_geolocation(canonical_name)

        if coordinates:
            latitude, longitude = coordinates
            print(f"Coordinates (Latitude, Longitude): {latitude}, {longitude}")

            # Now, you can use the latitude and longitude to display the location on Google Maps
            # You can open a web browser or use a web-based mapping library (e.g., Folium) to display the location.
            # Here's an example using the webbrowser library:

            import webbrowser

            map_url = f"https://www.google.com/maps/place/{latitude},{longitude}"
            webbrowser.open(map_url)

    else:
        print(f"Token: {extracted_name}, No matching canonical name found")

ValueError: Invalid API key provided.

In [10]:
import torch
from transformers import DistilBertTokenizer, DistilBertForTokenClassification
from transformers import pipeline
import csv
from fuzzywuzzy import fuzz

model_dir = 'DistilBert_conll03'
tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
model = DistilBertForTokenClassification.from_pretrained(model_dir)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

text = "Sydney is going to tamil nadu"

ner_results = ner_pipeline(text)

def aggregate_subwords(current_entity, final_entities):
    if current_entity:
        entity_word = ''.join([word.replace('##', '') for word in current_entity["words"]])
        if not final_entities or (final_entities and final_entities[-1] != entity_word):
            final_entities.append(entity_word)

current_entity = {}
final_entities = []
previous_entity = None

for entity in ner_results:
    if entity['entity'] in ['B-LOC', 'I-LOC']:
        word = entity['word'].replace('##', '')
        if entity['word'].startswith('##') or entity['word'].endswith('-') or (current_entity.get("words") and current_entity["words"][-1].endswith('-')):
            current_entity["words"].append(word)
        elif previous_entity == 'B-LOC' and entity['entity'] == 'I-LOC':
            current_entity["words"].append(word)
        else:
            aggregate_subwords(current_entity, final_entities)
            current_entity = {"words": [word]}
        previous_entity = entity['entity']
    else:
        previous_entity = None

aggregate_subwords(current_entity, final_entities)

similarity_threshold = 80

def load_dataset(file_path):
    dataset = []
    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            row['other-names'] = [name.strip() for name in row['other-names'].split(',')]
            dataset.append(row)
    return dataset

def perform_fuzzy_matching(extracted_name, dataset):
    best_match = None
    highest_similarity = 0
    for row in dataset:

        similarity = fuzz.ratio(extracted_name.lower(), row["canonical name"].lower())
        if similarity > similarity_threshold and similarity > highest_similarity:
            best_match = row
            highest_similarity = similarity
            
        for other_name in row['other-names']:
            if other_name:
                similarity = fuzz.ratio(extracted_name.lower(), other_name.lower())
                if similarity > similarity_threshold and similarity > highest_similarity:
                    best_match = row
                    highest_similarity = similarity
    return best_match

dataset = load_dataset('Datasets/place_name.csv')

for extracted_name in final_entities:
    best_match = perform_fuzzy_matching(extracted_name, dataset)
    if best_match:
        canonical_name = best_match["canonical name"]
        place_type = best_match["place-type"]
        other_names = ', '.join(best_match["other-names"])
        print(f"Token: {extracted_name}, Canonical name: {canonical_name}, Place Type: {place_type}")
    else:
        print(f"Token: {extracted_name}, No matching canonical name found")

Token: sydney, No matching canonical name found
Token: tamilnadu, Canonical name: Tamil Nadu, Place Type: State


Token: maharashtra, Canonical name: Maharashtra, Place Type: state
Token: ahmedabad, Canonical name: Ahmedabad, Place Type: city
Token: new, No matching canonical name found
Token: -, No matching canonical name found
Token: zealand, Canonical name: newzealand, Place Type: country


In [7]:
# import torch
# from transformers import DistilBertTokenizer, DistilBertForTokenClassification
# from transformers import pipeline
# import csv
# from fuzzywuzzy import fuzz

# model_dir = 'DistilBert_conll03'
# tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
# model = DistilBertForTokenClassification.from_pretrained(model_dir)
# ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# text = "Jordan is going to Jordan, his native is tamil nadu"

# ner_results = ner_pipeline(text)

# def aggregate_subwords(current_entity, final_entities):
#     if current_entity:
#         entity_word = ''.join([word.replace('##', '') for word in current_entity["words"]])
#         if not final_entities or (final_entities and final_entities[-1] != entity_word):
#             final_entities.append(entity_word)

# current_entity = {}
# final_entities = []
# previous_entity = None

# for entity in ner_results:
#     if entity['entity'] in ['B-LOC', 'I-LOC']:
#         word = entity['word'].replace('##', '')
#         if entity['word'].startswith('##') or entity['word'].endswith('-') or (current_entity.get("words") and current_entity["words"][-1].endswith('-')):
#             current_entity["words"].append(word)
#         elif previous_entity == 'B-LOC' and entity['entity'] == 'I-LOC':
#             current_entity["words"].append(word)
#         else:
#             aggregate_subwords(current_entity, final_entities)
#             current_entity = {"words": [word]}
#         previous_entity = entity['entity']
#     else:
#         previous_entity = None

# aggregate_subwords(current_entity, final_entities)

# similarity_threshold = 80

# def load_dataset(file_path):
#     dataset = []
#     with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
#         reader = csv.DictReader(csvfile)
#         for row in reader:
#             row['other-names'] = [name.strip() for name in row['other-names'].split(',')]
#             dataset.append(row)
#     return dataset

# def perform_fuzzy_matching(extracted_name, dataset):
#     best_match = None
#     highest_similarity = 0
#     for row in dataset:

#         similarity = fuzz.ratio(extracted_name.lower(), row["canonical name"].lower())
#         if similarity > similarity_threshold and similarity > highest_similarity:
#             best_match = row
#             highest_similarity = similarity
            
#         for other_name in row['other-names']:
#             if other_name:
#                 similarity = fuzz.ratio(extracted_name.lower(), other_name.lower())
#                 if similarity > similarity_threshold and similarity > highest_similarity:
#                     best_match = row
#                     highest_similarity = similarity
#     return best_match

# dataset = load_dataset('Datasets/place_name.csv')

# for extracted_name in final_entities:
#     best_match = perform_fuzzy_matching(extracted_name, dataset)
#     if best_match:
#         canonical_name = best_match["canonical name"]
#         place_type = best_match["place-type"]
#         other_names = ', '.join(best_match["other-names"])
#         print(f"Token: {extracted_name}, Canonical name: {canonical_name}, Place Type: {place_type}")
#     else:
#         print(f"Token: {extracted_name}, No matching canonical name found")

Token: jordan, No matching canonical name found
Token: tamilnadu, Canonical name: Tamil Nadu, Place Type: State
