## Named Entity Recognition

In [2]:
import pandas as pd
import requests
import pickle
import time
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS

In [4]:
# Apply the NER model to our dataset (list of abstracts)
# Load data
data = pd.read_csv(r'..\get data\alldata.csv')

# Missing abstracts are read by pandas as NaN (a float), which spaCy rejects.
# Replace them with empty strings instead of dropping them, so that every row
# still yields a Doc and the position of each Doc keeps matching the row
# position in `data`.
abstracts = data["Abstract"].fillna("").astype(str)

# Define NLP model; every component listed in `disable` is switched off
nlp = spacy.load('en_core_web_trf', disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

# List of processed Doc objects, one per abstract, in row order.
# nlp.pipe processes the texts in batches and yields the Docs in input order.
processed_abstracts = list(nlp.pipe(abstracts.tolist()))

In [None]:
# Get all locations (GPE and LOC) from the processed documents

# Dictionary mapping the position of each abstract to its list of locations
locations = {}

# Extract location entities from saved docs
for idx, doc in enumerate(processed_abstracts):
    location_set = set()
    for ent in doc.ents:
        if ent.label_ not in ('GPE', 'LOC'):
            continue
        # Drop stop words (e.g. "the") from the entity text
        filtered_location = " ".join(
            word for word in ent.text.split() if word.lower() not in STOP_WORDS
        )
        if filtered_location:
            location_set.add(filtered_location)

    # Sort so the column order written to data_loc.csv is reproducible;
    # plain set iteration order can change from one run to the next.
    locations[idx] = sorted(location_set)

# Combine unique locations from all abstracts into a single set
all_locations = set()
for loc_list in locations.values():
    all_locations.update(loc_list)

# Deterministic ordering for both the printed report and the saved CSV
sorted_locations = sorted(all_locations)

# Print all unique locations
print("Unique Locations:")
for location in sorted_locations:
    print(location)

print("Number of unique locations:", len(all_locations))

# Save the locations for each document (one row per abstract; rows with
# fewer locations are padded with empty cells)
data_loc = pd.DataFrame.from_dict(locations, orient='index')
data_loc.index.name = 'Index'
data_loc.to_csv(r'..\NER\data_loc.csv')

# Save all_locations as a CSV file using pandas
all_locations_df = pd.DataFrame({'Location': sorted_locations})
all_locations_df.to_csv(r'..\NER\all_locations.csv', index=False)

## Get coordinates for each location using the GeoNames API

In [None]:
# Load all locations.csv and get a list from it
all_locations_df = pd.read_csv(r'..\NER\all_locations.csv')
all_loc = all_locations_df['Location']

# Username obtained from GEONAMES
import helper
config = helper.read_config()

username = config['GEONAMESSettings']['username']

# Dictionary to store the location information
location_data = {}

# List to store error locations
error_locations = []

# Set delay (seconds) to conform with API limitations
delay = 5

GEONAMES_SEARCH_URL = "http://api.geonames.org/searchJSON"


def _geonames_first_match(name, username, timeout=30):
    """Query the GeoNames search service and return the first hit, or None.

    The query is passed through ``params`` so that spaces and characters such
    as ``&`` or ``#`` in ``name`` are URL-encoded instead of corrupting the
    query string.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error.
        RuntimeError: when the JSON body carries a ``status`` entry.
    """
    response = requests.get(
        GEONAMES_SEARCH_URL,
        params={"name": name, "fuzzy": 0.8, "maxRows": 50, "username": username},
        timeout=timeout,
    )
    response.raise_for_status()
    payload = response.json()

    # NOTE(review): GeoNames appears to report service errors (e.g. exceeded
    # credits, invalid user) in a "status" object in the body; treat that as a
    # failure rather than as "no match". Confirm against the GeoNames docs.
    if "status" in payload:
        message = payload["status"].get("message", "unknown GeoNames error")
        raise RuntimeError(f"GeoNames error: {message}")

    matches = payload.get("geonames") or []
    return matches[0] if matches else None


def _location_record(match, matched_as):
    """Build the stored record for a GeoNames hit.

    ``matched_as`` is "exact" for a full-name hit, otherwise the GeoNames
    name matched through a single word of the location.
    """
    return {
        "location": matched_as,
        "feature_class": match.get("fcl", "Unknown"),
        "latitude": match.get("lat", "Unknown"),
        "longitude": match.get("lng", "Unknown"),
    }


# Iterate over the location names
for location_name in all_loc:
    try:
        match = _geonames_first_match(location_name, username)

        if match is not None:
            location_data[location_name] = _location_record(match, "exact")
        else:
            # No hit for the full name: retry with each individual word and
            # keep the first one that matches.
            for word in location_name.split():
                # Every fallback query is also an API request, so it is
                # spaced out like the main ones.
                time.sleep(delay)
                match = _geonames_first_match(word, username)
                if match is not None:
                    location_data[location_name] = _location_record(
                        match, match.get("name", "Unknown")
                    )
                    break  # Stop searching for individual words if a match is found

            if location_name not in location_data:
                # Location not found
                error_locations.append(location_name)
                print(f"Location '{location_name}' not found in GeoNames database")

    except Exception as e:
        # Deliberately broad: one bad location must not abort the whole
        # batch. The failure is recorded and reported, not hidden.
        error_locations.append(location_name)  # Append location name to error list
        print(f"An error occurred for location '{location_name}': {str(e)}")

    # Delay between consecutive requests, also after a failure
    time.sleep(delay)

# Save location data (feature, lat, lon) for each unique location
location_df = pd.DataFrame.from_dict(location_data, orient='index')
location_df.index.name = 'Location'
location_df.to_csv(r'..\NER\fuzzy_location_data.csv')

df_error = pd.DataFrame({'Error Locations': error_locations})
df_error.to_csv(r'..\NER\fuzzy_error_locations.csv', index=False)

## Attribute each location to an ocean basin

In [None]:
# Calculate the distance to the nearest ocean/sea

# Import math functions
from math import radians, sin, cos, sqrt, atan2

# Load the geocoded locations and the ocean-basin reference points
location_df = pd.read_csv(r'..\NER\fuzzy_location_data.csv')
range_area = pd.read_csv(r'..\NER\range_area.csv')

# Rows whose latitude falls outside [-90, 90]
basin_lat = range_area['Latitude']
invalid_latitudes = range_area[basin_lat.lt(-90) | basin_lat.gt(90)]

# Rows whose longitude falls outside [-180, 180]
basin_lon = range_area['Longitude']
invalid_longitudes = range_area[basin_lon.lt(-180) | basin_lon.gt(180)]

# Index labels of every row that fails either check
invalid_indices = invalid_latitudes.index.union(invalid_longitudes.index)

# Copy of range_area without the out-of-range rows
range_area_cleaned = range_area.drop(index=invalid_indices)

# Show the cleaned table
print(range_area_cleaned)

# The haversine formula calculates the distance between two points on a sphere
"""As the Earth is nearly spherical, the haversine formula provides a good
    approximation of the distance between two points on the Earth's surface,
    with an error of less than 1% on average."""


def haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Return the great-circle distance between two points on a sphere.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
        radius: Sphere radius; the result is in the same unit. Defaults to
            6371, the Earth radius in kilometers.

    Returns:
        The distance along the sphere surface as a float. NaN inputs give a
        NaN result.
    """
    # Convert coordinates from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Mathematically a <= 1, but rounding can push it marginally above 1 for
    # (near-)antipodal points, which would make sqrt(1 - a) fail. A plain
    # comparison is used so that NaN passes through unchanged.
    if a > 1.0:
        a = 1.0

    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return radius * c

def find_closest_ocean_basin(df_locations, df_ocean_basins, distance_fn=None):
    """Find, for every location, the nearest ocean-basin reference point.

    Args:
        df_locations: DataFrame with 'latitude' and 'longitude' columns.
        df_ocean_basins: DataFrame with 'range_area' (basin label),
            'Latitude' and 'Longitude' columns.
        distance_fn: Optional callable ``(lat1, lon1, lat2, lon2) -> distance``.
            Defaults to the module-level ``haversine``.

    Returns:
        A list with one ``(basin_label, distance)`` tuple per location row, in
        row order. When no distance could be computed for a location (no
        basins, or missing/non-numeric coordinates) the tuple is
        ``(None, inf)``. On ties the first basin in table order wins.

    Raises:
        KeyError: if a required column is missing from either DataFrame.
    """
    if distance_fn is None:
        distance_fn = haversine

    # Materialise the basin table once instead of re-iterating the DataFrame
    # for every location. Coordinates are coerced to numbers so that
    # non-numeric placeholders become NaN rather than raising on every pair.
    basin_points = list(zip(
        df_ocean_basins['range_area'].tolist(),
        pd.to_numeric(df_ocean_basins['Latitude'], errors='coerce').tolist(),
        pd.to_numeric(df_ocean_basins['Longitude'], errors='coerce').tolist(),
    ))
    location_lats = pd.to_numeric(df_locations['latitude'], errors='coerce').tolist()
    location_lons = pd.to_numeric(df_locations['longitude'], errors='coerce').tolist()

    closest_ocean_basins = []

    for loc_lat, loc_lon in zip(location_lats, location_lons):
        min_distance = float('inf')
        closest_ocean_basin = None

        for basin_label, basin_lat, basin_lon in basin_points:
            try:
                distance = distance_fn(loc_lat, loc_lon, basin_lat, basin_lon)
            except (TypeError, ValueError) as e:
                # Report the pair that could not be measured and move on
                print(f"Error occurred: {e}")
                continue

            # A NaN distance compares False here, so it is never selected
            if distance < min_distance:
                min_distance = distance
                closest_ocean_basin = basin_label

        closest_ocean_basins.append((closest_ocean_basin, min_distance))

    return closest_ocean_basins


# Example usage
# Assuming you have two dataframes: location_df and ocean_basins

# location_df contains location names, latitude, and longitude
# location_df = pd.DataFrame({
#     'Location': ['Location A', 'Location B', 'Location C'],
#     'latitude': [40.7128, 34.0522, 51.5074],
#     'longitude': [-74.0060, -118.2437, -0.1278]
# })

# ocean_basins contains ocean basins or seas with their labels, latitude, and longitude
# ocean_basins = pd.DataFrame({
#     'range_area': ['Atlantic Ocean', 'Pacific Ocean', 'Mediterranean Sea'],
#     'Latitude': [37.0902, -8.7832, 35.8984],
#     'Longitude': [-95.7129, -124.0198, 14.5120]
# })

# Call the function to find the closest ocean basin or sea for each location.
# The cleaned table is used so that reference points with out-of-range
# coordinates cannot be selected as the nearest basin.
closest_ocean_basins = find_closest_ocean_basin(location_df, range_area_cleaned)

# Split the (label, distance) tuples into two parallel lists
# (comprehensions also work when there are no locations at all)
closest_ocean_labels = [label for label, _ in closest_ocean_basins]
distances = [distance for _, distance in closest_ocean_basins]

# Add the closest ocean basin and distance information to the location dataframe
location_df['Closest Ocean Basin'] = closest_ocean_labels
location_df['Distance to Nearest Ocean'] = distances

# index=False: location_df was read from this same file, so writing its
# row-number index would add an extra unnamed column every time it is saved.
location_df.to_csv(r'..\NER\fuzzy_location_data.csv', index=False)

# Print the updated dataframe
print(location_df)


In [None]:

# Check how many locations fall in each distance band from the nearest
# ocean reference point: [0, 50), [50, 100), [100, 500) and [500, 1000) km

# Initialize counters
count_less_than_50 = 0
count_50_to_100 = 0
count_100_to_500 = 0
count_500_to_1000 = 0

# Iterate over the distances; bands are half-open so that a distance of
# exactly 50, 100 or 500 km is counted once instead of being skipped
for distance in location_df['Distance to Nearest Ocean']:
    if distance < 50:
        count_less_than_50 += 1
    elif distance < 100:
        count_50_to_100 += 1
    elif distance < 500:
        count_100_to_500 += 1
    elif distance < 1000:
        count_500_to_1000 += 1

# Print the counts (labels describe the band each counter actually holds)
print("Number of instances less than 50 km: ", count_less_than_50)
print("Number of instances between 50 and 100 km: ", count_50_to_100)
print("Number of instances between 100 and 500 km: ", count_100_to_500)
print("Number of instances between 500 and 1000 km: ", count_500_to_1000)

## Classify each paper by ocean basin

In [None]:
# Reload the per-paper locations and the geocoded location table from disk
data_loc = pd.read_csv(r'..\NER\data_loc.csv')
location_df = pd.read_csv(r'..\NER\fuzzy_location_data.csv')

# Preview the first rows of each table
for preview in (data_loc.head(), location_df.head()):
    print(preview)

In [None]:
# Lookup table: location name -> closest ocean basin label
basin_by_location = location_df.set_index('Location')['Closest Ocean Basin']
location_dict = basin_by_location.to_dict()

# Start from a copy of data_loc and swap every location name for its label
labels_df = data_loc.copy()

# The first column holds the paper index, so only the remaining columns
# (the location slots) are translated
location_columns = labels_df.columns[1:]
for column_name in location_columns:
    translated = labels_df[column_name].map(location_dict)
    labels_df[column_name] = translated

# Show the resulting table of labels
print(labels_df)


In [None]:
# Get the most recurrent value per row in labels_df, excluding the index
def _most_common_label(row):
    """Return the most frequent non-missing label in a row, or None if there is none."""
    label_counts = row.drop(labels='Index').value_counts()
    if label_counts.empty:
        return None
    return label_counts.idxmax()


# One entry per paper, collected into a single-column dataframe
classes = pd.DataFrame(
    [_most_common_label(paper_row) for _, paper_row in labels_df.iterrows()],
    columns=['Most Recurrent Value'],
)

# Print the new dataframe
print(classes)


In [9]:
import pandas as pd

## Fix some issues with the basins dataset
range_area = pd.read_csv(r'..\NER\range_area.csv')

# Basin codes are handled as floats from here on
range_area['range_area'] = range_area['range_area'].astype(float)

# Rows whose latitude falls outside [-90, 90]
lat_values = range_area['Latitude']
invalid_latitudes = range_area[lat_values.lt(-90) | lat_values.gt(90)]

# Rows whose longitude falls outside [-180, 180]
lon_values = range_area['Longitude']
invalid_longitudes = range_area[lon_values.lt(-180) | lon_values.gt(180)]

# Drop every row that fails either check
invalid_indices = invalid_latitudes.index.union(invalid_longitudes.index)
range_area = range_area.drop(index=invalid_indices)

## Remove differentiation between coastal and oceanic basins (i.e. coastal north atlantic and north atlantic)
merge_classes = [[2, 3, 1], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13], [14, 15], [16, 17], [18, 19]]

# Every code in a group is mapped to the first code of that group
merge_mapping = {
    class_label: merged_class[0]
    for merged_class in merge_classes
    for class_label in merged_class
}

# Collapse the per-paper classes and count papers per merged basin
classes['Merged Class'] = classes['Most Recurrent Value'].replace(merge_mapping)
basin_counts = classes['Merged Class'].value_counts()

# Collapse the basin codes of the reference table the same way and attach
# the paper count of the merged basin to each reference row
range_area['Merged Class'] = range_area['range_area'].replace(merge_mapping)
range_area['basin_count'] = range_area['Merged Class'].map(basin_counts)

# Include ocean label besides code
range_basin_list = pd.read_csv(r'..\NER\range_basin_list.csv', names=['range_area', 'basin'])
range_basin_list['range_area'] = range_basin_list['range_area'].astype(float).round(1)
range_area = pd.merge(range_area, range_basin_list, on='range_area', how='left')
range_area = range_area.drop(columns=['range_area', 'Merged Class'])

# Lookup table: basin code -> basin name
label_mapping = range_basin_list.set_index('range_area')['basin'].to_dict()

# Translate each paper's merged class code into its basin name
classes['Basin'] = classes['Merged Class'].map(label_mapping)

# Persist both results
range_area.to_csv(r'..\NER\maps\range_area_counts.csv', index=False)
classes.to_csv(r'..\NER\merged_fuzzy_classes_labels.csv', index=False)

In [None]:
# Show the distinct basin names assigned to papers
distinct_basins = classes['Basin'].unique()
print(distinct_basins)

In [None]:
# Check some info on location matches
match_kinds = location_df['location']

# Locations whose full name was found in GeoNames
sum_all = int((match_kinds == 'exact').sum())
print('The number of exact location matches is:' + str(sum_all))

# Locations matched only through one of their individual words
sum_all = int((match_kinds != 'exact').sum())
print('The number of separated words location matches is:' + str(sum_all))

# Papers with a class assigned. Counted with notna() rather than by
# overwriting missing values with the string "NaN", so that `classes`
# is left unmodified.
sum_all = int(classes['Most Recurrent Value'].notna().sum())
print('The number of papers with locations is:' + str(sum_all))