## Named Entity Recognition

In [1]:
import pandas as pd
import requests
import pickle
import time
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# Apply the NER model to our dataset (list of abstracts)
# Load data
data = pd.read_csv(r'..\get_data\alldata.csv')
abstracts = data["Abstract"]

# Define NLP model
nlp = spacy.load('en_core_web_trf', disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

# List to store processed Doc objects
processed_abstracts = []

for abstract in abstracts:
    doc = nlp(abstract)  # Process the abstract
    processed_abstracts.append(doc)  # Save the processed doc


# Save the processed docs using pickle
processed_data_path = r'..\NER\alldata_processed.pkl'
with open(processed_data_path, 'wb') as file:
    pickle.dump(processed_abstracts, file)


MemoryError: 

In [None]:
processed_data_path = r'..\NER\alldata_processed.pkl'

chunk_size = 1000  # Adjust based on your memory constraints
chunks = [processed_abstracts[i:i + chunk_size] for i in range(0, len(processed_abstracts), chunk_size)]

with open(processed_data_path, 'wb') as file:
    for chunk in chunks:
        pickle.dump(chunk, file)

In [3]:
processed_abstracts

[The COVID-19 pandemic was caused by the SARS-CoV-2 virus, marking one of the most catastrophic global health crises of the 21st century. Throughout this period, widespread use and improper disposal of personal protective equipment (PPE) emerged as a pressing environmental issue, significantly impacting various life forms. During the COVID-19 pandemic, there was a high rate of PEP disposal. An alarming 1.6 × 106 tons of plastic waste each day has been generated since the onset of the outbreak, predominantly from the inadequate disposal of PPE. The mismanagement and subsequent degradation of discarded PPE significantly contribute to increased non-biodegradable micro(nano)plastic (MNP) waste. This pollution has had profound adverse effects on terrestrial, marine, and aquatic ecosystems, which have been extensively of concern recently. Accumulated MNPs within aquatic organisms could serve as a potential route for human exposure when consuming seafood. This review presents a novel aspect c

In [4]:
# Get all locations (GPE and LOC) from the processed documents 

# Dictionary to store location entities for each abstract
locations = {}  

# Extract location entities from saved docs
for idx, doc in enumerate(processed_abstracts):
    location_set = set()
    for ent in doc.ents:
        if ent.label_ == 'GPE' or ent.label_ == 'LOC':
            words = ent.text.split()
            filtered_location = " ".join(word for word in words if word.lower() not in STOP_WORDS)
            if filtered_location:
                location_set.add(filtered_location)

    locations[idx] = list(location_set)

# Combine unique locations from all abstracts into a single set
all_locations = set()
for loc_list in locations.values():
    all_locations.update(loc_list)

# Print all unique locations
print("Unique Locations:")
for location in all_locations:
    print(location)

print("Number of unique locations:", len(all_locations))

# Save the locations for each document
data_loc = pd.DataFrame.from_dict(locations, orient='index')
data_loc.index.name = 'Index'
data_loc.to_csv(r'..\NER\data_loc.csv')

# Save all_locations as a CSV file using pandas
all_locations_df = pd.DataFrame({'Location': list(all_locations)})
all_locations_df.to_csv(r'..\NER\all_locations.csv', index=False)

Unique Locations:
Liao River
Corsica
Mazandaran
Tambakrejo
Samut Prakan Province
Cilento
Costa del Azahar
Lagos States
GOMMBR
Meishe River
São João da Madeira
Nuup Kangerlua
Palmyra Atoll
Morača
Sengarang Besar
Western Cape
Auroville
Mississippi coast
Beijiang River
Oceania
Alue Naga
Pulau Pangkor
Basins
Lake Kazichene
Fog Bay
Punta Roca
Western Sorkhrood
Al Qurum's
Bryde
Venice Lagoon
Tibet
Kara
North-Central Adriatic Sea
Galerazamba
Karwar Coast
Central Java
Mazara del Vallo
Beaufort Sea
Tha Pra Chan
Negros Oriental
South Java Sea
Massachusetts
marina
El Socorro
Recife
Sanya City, Hainan Province
Southeastern Arabian Sea
Cilincing
Azores Current
Nazaré Canyon
Shuangtaizi River
Australasia
Saronicos Gulf
Urabá Gulf
Yancheng
Songshan Lake
Johor Strait
Palar River
Latium
Pacific Islands
Malacca Strait
Riau
east coast
Georgia Strait
NW Africa
Scilly Island
Kuala Selangor
River Tame
Dessau
Pakistan
Sicilian
North Artlantic
Great Pacific Garbage Patch
Jambi City
Paraíba River
Waitemata Har

## Get coordinates for each location using the GeoNames API

In [4]:
import pandas as pd
import requests
import time

# Load all locations.csv and get a list from it
all_locations_df = pd.read_csv(r'..\NER\all_locations.csv')
all_loc = all_locations_df['Location']

username = "henrique.mers"

# Dictionary to store the location information
location_data = {}

# List to store error locations
error_locations = []

# Set delay to conform with API limitations
delay = 2

def search_location(location_name, fuzzy=False):
    # Construct the search API URL
    url = f"http://api.geonames.org/searchJSON?name={location_name}&maxRows=50&username={username}"
    if fuzzy:
        url += "&fuzzy=0.8"
    # Make the search request
    response = requests.get(url)
    return response.json()

# Iterate over the location names
for location_name in all_loc:
    try:
        # Attempt exact match first
        data = search_location(location_name)

        # Extract the relevant information for the location
        if "geonames" in data and len(data["geonames"]) > 0:
            location = data["geonames"][0]
            feature_class = location.get("fcl", "Unknown")
            latitude = location.get("lat", "Unknown")
            longitude = location.get("lng", "Unknown")

            # Store the information in the location_data dictionary
            location_data[location_name] = {
                "location": "exact",
                "feature_class": feature_class,
                "latitude": latitude,
                "longitude": longitude,
            }
        else:
            # Attempt fuzzy match if exact match not found
            data = search_location(location_name, fuzzy=True)

            if "geonames" in data and len(data["geonames"]) > 0:
                location = data["geonames"][0]
                feature_class = location.get("fcl", "Unknown")
                latitude = location.get("lat", "Unknown")
                longitude = location.get("lng", "Unknown")

                # Store the information in the location_data dictionary
                location_data[location_name] = {
                    "location": "fuzzy",
                    "feature_class": feature_class,
                    "latitude": latitude,
                    "longitude": longitude,
                }
            else:
                # Extract individual words from the location
                words = location_name.split()
                matched = False

                for word in words:
                    data = search_location(word, fuzzy=True)

                    if "geonames" in data and len(data["geonames"]) > 0:
                        matched_location = data["geonames"][0]
                        matched_feature_class = matched_location.get("fcl", "Unknown")
                        matched_latitude = matched_location.get("lat", "Unknown")
                        matched_longitude = matched_location.get("lng", "Unknown")

                        # Store the information in the location_data dictionary
                        location_data[location_name] = {
                            "location": matched_location.get("name", "Unknown"),
                            "feature_class": matched_feature_class,
                            "latitude": matched_latitude,
                            "longitude": matched_longitude,
                        }
                        matched = True
                        break  # Stop searching for individual words if a match is found

                if not matched:
                    # Location not found
                    error_locations.append(location_name)
                    print(f"Location '{location_name}' not found in GeoNames database")

    except Exception as e:
        error_locations.append(location_name)  # Append location name to error list
        print(f"An error occurred for location '{location_name}': {str(e)}")
        continue  # Continue to the next location
    
    # Delay between consecutive requests
    time.sleep(delay)

# Save location data (feature, lat, lon) for each unique location
location_df = pd.DataFrame.from_dict(location_data, orient='index')
location_df.index.name = 'Location'
location_df.to_csv(r'..\NER\fuzzy_location_data.csv')

df_error = pd.DataFrame({'Error Locations': error_locations})
df_error.to_csv(r'..\NER\fuzzy_error_locations.csv', index=False)


Location 'GofM.' not found in GeoNames database
Location 'WCI' not found in GeoNames database
Location 'Ctalamochita' not found in GeoNames database
Location 'Northeast Florida' not found in GeoNames database
Location 'Conceição' not found in GeoNames database
Location 'Samos Island' not found in GeoNames database
Location 'Southwest coast' not found in GeoNames database
Location 'Swat River' not found in GeoNames database
Location 'Tianjin' not found in GeoNames database
Location 'Bali Province' not found in GeoNames database
Location 'Mediterranean sea' not found in GeoNames database
Location 'Gulf Naples' not found in GeoNames database
Location 'Lingao' not found in GeoNames database
Location 'Xincun Bay' not found in GeoNames database
Location 'Sandspit' not found in GeoNames database
Location 'Liaodong Bay' not found in GeoNames database
Location 'Vieste' not found in GeoNames database
Location 'Kaikoura' not found in GeoNames database
Location 'Northern Chile' not found in GeoNam

In [9]:
import pandas as pd
import requests
import time

# Load error locations
error_locations_df = pd.read_csv(r'..\NER\fuzzy_error_locations.csv')
error_locations = error_locations_df['Error Locations'].tolist()

# Load existing location data
existing_location_data_df = pd.read_csv(r'..\NER\fuzzy_location_data.csv')
location_data = existing_location_data_df.set_index('Location').to_dict(orient='index')

# Username for GeoNames API
username = "henrique.mers"

# Delay between requests to conform with API limitations
delay = 5

# Iterate over the location names
for location_name in error_locations:
    try:
        # Attempt exact match first
        data = search_location(location_name)

        # Extract the relevant information for the location
        if "geonames" in data and len(data["geonames"]) > 0:
            location = data["geonames"][0]
            feature_class = location.get("fcl", "Unknown")
            latitude = location.get("lat", "Unknown")
            longitude = location.get("lng", "Unknown")

            # Store the information in the location_data dictionary
            location_data[location_name] = {
                "location": "exact",
                "feature_class": feature_class,
                "latitude": latitude,
                "longitude": longitude,
            }
        else:
            # Attempt fuzzy match if exact match not found
            data = search_location(location_name, fuzzy=True)

            if "geonames" in data and len(data["geonames"]) > 0:
                location = data["geonames"][0]
                feature_class = location.get("fcl", "Unknown")
                latitude = location.get("lat", "Unknown")
                longitude = location.get("lng", "Unknown")

                # Store the information in the location_data dictionary
                location_data[location_name] = {
                    "location": "fuzzy",
                    "feature_class": feature_class,
                    "latitude": latitude,
                    "longitude": longitude,
                }
            else:
                # Extract individual words from the location
                words = location_name.split()
                matched = False

                for word in words:
                    data = search_location(word, fuzzy=True)

                    if "geonames" in data and len(data["geonames"]) > 0:
                        matched_location = data["geonames"][0]
                        matched_feature_class = matched_location.get("fcl", "Unknown")
                        matched_latitude = matched_location.get("lat", "Unknown")
                        matched_longitude = matched_location.get("lng", "Unknown")

                        # Store the information in the location_data dictionary
                        location_data[location_name] = {
                            "location": matched_location.get("name", "Unknown"),
                            "feature_class": matched_feature_class,
                            "latitude": matched_latitude,
                            "longitude": matched_longitude,
                        }
                        matched = True
                        break  # Stop searching for individual words if a match is found

                if not matched:
                    # Location not found
                    error_locations.append(location_name)
                    print(f"Location '{location_name}' not found in GeoNames database")

    except Exception as e:
        error_locations.append(location_name)  # Append location name to error list
        print(f"An error occurred for location '{location_name}': {str(e)}")
        continue  # Continue to the next location
    
    # Delay between consecutive requests
    time.sleep(delay)

# Save updated location data
location_df = pd.DataFrame.from_dict(location_data, orient='index')
location_df.index.name = 'Location'
location_df.to_csv(r'..\NER\fuzzy_location_data.csv')

# Check if there are still any error locations
remaining_error_locations = [loc for loc in error_locations if loc not in location_data]
if remaining_error_locations:
    df_remaining_error = pd.DataFrame({'Error Locations': remaining_error_locations})
    df_remaining_error.to_csv(r'..\NER\fuzzy_error_locations.csv', index=False)
else:
    print("All locations processed successfully.")


Location 'GofM.' not found in GeoNames database
Location 'WCI' not found in GeoNames database
Location 'Baltoniodus variabilis Subzone' not found in GeoNames database
Location 'Arafura-Timor Seas' not found in GeoNames database
Location 'SWNS' not found in GeoNames database
Location 'Punnakayal' not found in GeoNames database
Location 'CGSM' not found in GeoNames database
Location 'Gorsozan' not found in GeoNames database
Location 'LZP' not found in GeoNames database
Location 'AEEZ' not found in GeoNames database
Location 'Aguincheira' not found in GeoNames database
Location 'Rhine-Ruhr' not found in GeoNames database
Location 'Kizhskie' not found in GeoNames database
Location 'Qinghai-Tibet Plateau，the' not found in GeoNames database
Location 'Fukutoku-Okanoba' not found in GeoNames database
Location 'Odaw' not found in GeoNames database
Location 'Chellanam' not found in GeoNames database
Location 'HRZs' not found in GeoNames database
Location 'NWHI' not found in GeoNames database
Loc

KeyboardInterrupt: 

In [11]:
# Save updated location data
location_df = pd.DataFrame.from_dict(location_data, orient='index')
location_df.index.name = 'Location'
location_df.to_csv(r'..\NER\fuzzy_location_data.csv')

# Check if there are still any error locations
remaining_error_locations = [loc for loc in error_locations if loc not in location_data]
if remaining_error_locations:
    df_remaining_error = pd.DataFrame({'Error Locations': remaining_error_locations})
    df_remaining_error.to_csv(r'..\NER\fuzzy_error_locations.csv', index=False)
else:
    print("All locations processed successfully.")


In [12]:
df_remaining_error

Unnamed: 0,Error Locations
0,GofM.
1,WCI
2,Baltoniodus variabilis Subzone
3,Arafura-Timor Seas
4,SWNS
...,...
426,YDXE
427,NSCS
428,NASG
429,GofM.


In [8]:
error_dict = {"Arafura_Timor Sea":[-9.5, 134.5], "Punnakayal":[8.632222, 78.111950],
              "Aguincheira":[40.835621, -8.380748], "Rhine-Ruhr":[51.451770, 7.018720],
              "Qinghai-Tibet Plateau":[35.751230, 94.988449], "Fukutoku-Okanoba": [34.004161, 134.563027],
              "Odaw":[5.548512, -0.224942], "Chellanam":[9.807211, 76.277420], "South Malang":[-7.979470, 112.634222],
              "Cabo-Delgado":[-12.750000, 39.500000], "Rhine-Meuse":[51.730431, 4.715880],
              "Ho-Chi-Minh city": [10.668271, 106.482262], "Solimões":[-2.601662, -55.143238],
              "Santos Bay":[-23.959779, -46.344984], "Perú":[-11.185900, -76.841576], "North Maluku Province":[1.348434, 127.668530],
              "Patok-Fushëkuqe-Ishëm":[41.628820, 19.619820], "High-Arctic":[79.138640, -3.238970], "Sanbanze":[35.672459, 139.967468],
              "Apulo-Lucanian":[40.371390, 16.811220], "Lohabarrack":[11.603486, 92.609825], "Alto-Tiete":[-23.060740, -47.723861],
              "Kinmen-Xiamen":[24.479720, 118.081900], "Kodiyaghat":[-2.780561, -78.854599]}

## Attribute each location to an ocean basin

In [13]:
from math import radians
import pandas as pd
from sklearn.neighbors import BallTree
import numpy as np

# Load the data
location_df = pd.read_csv(r'..\NER\fuzzy_location_data.csv')
range_area = pd.read_csv(r'..\NER\range_area.csv')

# Filter instances with latitude outside the valid range
range_area = range_area[(range_area['Latitude'] >= -90) & (range_area['Latitude'] <= 90)]
range_area = range_area[(range_area['Longitude'] >= -180) & (range_area['Longitude'] <= 180)]

# Convert coordinates to radians
location_coords = np.radians(location_df[['latitude', 'longitude']].values)
ocean_coords = np.radians(range_area[['Latitude', 'Longitude']].values)

# Build the BallTree for fast nearest neighbor search
tree = BallTree(ocean_coords, metric='haversine')

# Query the tree for the closest ocean basin for each location
distances, indices = tree.query(location_coords, k=500)

# Convert distances from radians to kilometers (Earth radius = 6371 km)
distances = distances[:, 0] * 6371

# Get the closest ocean basin names
closest_ocean_labels = range_area.iloc[indices[:, 0]]['range_area'].values

# Add the closest ocean basin and distance information to the df_locations dataframe
location_df['Closest Ocean Basin'] = closest_ocean_labels
location_df['Distance to Nearest Ocean'] = distances

# Save the updated dataframe
location_df.to_csv(r'..\NER\fuzzy_location_data.csv', index=False)

# Print the updated dataframe
print(location_df)


                   Location      location feature_class  latitude  longitude  \
0                Liao River         exact             H  40.68407  122.14896   
1                   Corsica         exact             P  43.42527  -98.40730   
2                Mazandaran         exact             L  36.00000   52.00000   
3                Tambakrejo         exact             P  -7.27060  111.62160   
4     Samut Prakan Province         exact             A  13.60442  100.70531   
...                     ...           ...           ...       ...        ...   
6468    Strait Juan de Fuca         exact             H  48.29674 -124.01225   
6469           Ouvéa Island         exact             T -20.62250  166.56166   
6470                  Lumut         exact             P   4.23230  100.62980   
6471   Buenaventura estuary  Buenaventura             P   3.88010  -77.03116   
6472                Buffalo         exact             L  43.03787  -78.77538   

      Closest Ocean Basin  Distance to 

In [14]:

# Check how many locations are less than 50, 100, 500, 1000 km from the coast

# Initialize counters
count_less_than_50 = 0
count_50_to_100 = 0
count_100_to_500 = 0
count_500_to_1000 = 0

# Iterate over the distances list
for distance in location_df['Distance to Nearest Ocean']:
    if distance < 50:
        count_less_than_50 += 1
    if 50 < distance < 100:
        count_50_to_100 += 1
    if 100 < distance < 500:
        count_100_to_500 += 1
    if 500 < distance < 1000:
        count_500_to_1000 += 1
# Print the counts
print("Number of instances less than 50 km: ", count_less_than_50)
print("Number of instances less than 100 km: ", count_50_to_100)
print("Number of instances less than 500 km: ", count_100_to_500)
print("Number of instances less than 1000 km: ", count_500_to_1000)

Number of instances less than 50 km:  4246
Number of instances less than 100 km:  2151
Number of instances less than 500 km:  76
Number of instances less than 1000 km:  0


## Classify each paper by ocean basin

In [4]:
data_loc = pd.read_csv(r'..\NER\data_loc.csv')
location_df = pd.read_csv(r'..\NER\fuzzy_location_data.csv')
print(data_loc.head())
print(location_df.head())

   Index        0    1    2    3    4    5    6    7    8  ...   12   13   14  \
0      0      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
1      1      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
2      2      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
3      3      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
4      4  Pacific  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   

    15   16   17   18   19   20   21  
0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  
1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  
2  NaN  NaN  NaN  NaN  NaN  NaN  NaN  
3  NaN  NaN  NaN  NaN  NaN  NaN  NaN  
4  NaN  NaN  NaN  NaN  NaN  NaN  NaN  

[5 rows x 23 columns]
                Location location feature_class  latitude  longitude  \
0             Liao River    exact             H  40.68407  122.14896   
1                Corsica    exact             P  43.42527  -98.40730   
2             Mazandaran    exact       

In [25]:
import pandas as pd

# Create a dictionary mapping locations to labels
location_dict = location_df.set_index('Location')['Closest Ocean Basin'].to_dict()

# Create a new dataframe with only labels for each location in data_loc
labels_df = data_loc.copy()  # Make a copy of data_loc

# Function to map values
def map_location(val):
    if pd.isna(val):
        return val
    else:
        return location_dict.get(val, val)

# Apply the function to each element in the DataFrame
labels_df = labels_df.applymap(map_location)

# Print the new dataframe
print(labels_df)


  labels_df = labels_df.applymap(map_location)


         0    1    2    3    4    5    6    7    8    9  ...  12   13   14  \
Index                                                    ...                 
0      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ... NaN  NaN  NaN   
1      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ... NaN  NaN  NaN   
2      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ... NaN  NaN  NaN   
3      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ... NaN  NaN  NaN   
4       10  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ... NaN  NaN  NaN   
...    ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ..  ...  ...   
12603  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ... NaN  NaN  NaN   
12604    7   19    7  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ... NaN  NaN  NaN   
12607    8    8   10    9  NaN  NaN  NaN  NaN  NaN  NaN  ... NaN  NaN  NaN   
12608    8    3    8  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ... NaN  NaN  NaN   
12609  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ... NaN

In [27]:
import pandas as pd

# Get the most recurrent value per row in labels_df, excluding the index
classes = []

for _, row in labels_df.iterrows():
    try:
        # Drop 'Index' and NaN values before calculating value counts
        most_recurrent_value = row.dropna().value_counts().idxmax()
        classes.append(most_recurrent_value)
    except ValueError:
        classes.append(None)

# Create a new dataframe with only the most recurrent value per row
classes_df = pd.DataFrame(classes, columns=['Most Recurrent Value'])

# Print the new dataframe
print(classes_df)


      Most Recurrent Value
0                     None
1                     None
2                     None
3                     None
4                       10
...                    ...
12393                 None
12394                    7
12395                    8
12396                    8
12397                 None

[12398 rows x 1 columns]


In [28]:
import pandas as pd

## Fix some issues with the basins dataset
range_area = pd.read_csv(r'..\NER\range_area.csv')

# Convert the 'range_area' column to float
range_area['range_area'] = range_area['range_area'].astype(float)

# Filter instances with latitude outside the valid range
invalid_latitudes = range_area[(range_area['Latitude'] < -90) | (range_area['Latitude'] > 90)]

# Filter instances with longitude outside the valid range
invalid_longitudes = range_area[(range_area['Longitude'] < -180) | (range_area['Longitude'] > 180)]

# Get the indices of instances with invalid latitude or longitude
invalid_indices = invalid_latitudes.index.union(invalid_longitudes.index)

# Remove instances with invalid latitude or longitude from range_area
range_area = range_area.drop(invalid_indices)

## Remove differentiation between coastal and oceanic basins (i.e. coastal north atlantic and north atlantic)
merge_classes = [[2,3,1], [4,5], [6,7],[8,9], [10,11], [12,13], [14,15], [16,17], [18,19]]

# Create the mapping dictionary for merging
merge_mapping = {}
for merged_class in merge_classes:
    merged_label = merged_class[0]
    for class_label in merged_class:
        merge_mapping[class_label] = merged_label

# Merge the classes based on the mapping
classes_df = pd.DataFrame(classes_df, columns=['Most Recurrent Value'])
classes_df['Merged Class'] = classes_df['Most Recurrent Value'].replace(merge_mapping)

# Calculate the basin counts
basin_counts = classes_df['Merged Class'].value_counts()

# Merge the classes based on the mapping in range_area DataFrame
range_area['Merged Class'] = range_area['range_area'].replace(merge_mapping)

# Map the basin counts to the 'range_area' column
range_area['basin_count'] = range_area['Merged Class'].map(basin_counts)

# Include ocean label besides code
range_basin_list = pd.read_csv(r'..\NER\range_basin_list.csv', names=['range_area', 'basin'])
range_basin_list['range_area'] = range_basin_list['range_area'].astype(float).round(1)
range_area = range_area.merge(range_basin_list, on='range_area', how='left')
range_area.drop(['range_area','Merged Class'], axis=1, inplace=True)

# Create a dictionary to map range_area to basin
label_mapping = dict(zip(range_basin_list['range_area'], range_basin_list['basin']))

# Map the values in 'merged_classes' to labels
classes_df['Basin'] = classes_df['Merged Class'].map(label_mapping)

range_area.to_csv(r'..\NER\maps\range_area_counts.csv', index=False)
classes_df.to_csv(r'..\NER\merged_fuzzy_classes_labels.csv', index=False)

In [36]:
classes_df['Basin'].dropna().unique()

array(['Equatorial Pac', 'North Atlantic', 'North Pacific', 'Arctic',
       'Antarctic', 'Mediteranean', 'North Indian', 'Equatorial Indian',
       'Baltic Sea', 'South Pacific', 'Equatorial Atlant',
       'South Atlantic', 'South Indian', 'Persian Gulf', 'Red Sea',
       'Black Sea', 'Sulu Sea'], dtype=object)