In [None]:
import json
import pandas as pd
from datetime import datetime
from geopy.geocoders import Nominatim
import time
import country_converter as coco

In [None]:
# Load the source documents; the validate_* functions below search their
# "text" fields. The encoding is pinned so the read does not depend on the
# platform's locale default (JSON interchange is UTF-8).
with open("input.json", "r", encoding="utf-8") as f:
    inputs = json.load(f)

In [None]:
def filter_uppercase_words(text):
    """Keep only the whitespace-separated words that start with an uppercase character.

    Args:
        text: The string to filter. Non-string values (e.g. ``None`` or a
            float NaN standing in for a missing cell) are accepted.

    Returns:
        The retained words joined by single spaces (``""`` when none
        qualify). Non-string input is returned unchanged so that missing
        values stay missing instead of raising ``AttributeError``.
    """
    if not isinstance(text, str):
        # Missing cells have no .split(); hand them back untouched.
        return text

    # str.split() never yields empty strings, so word[0] is always safe.
    return ' '.join(word for word in text.split() if word[0].isupper())

# Evaluated once, when this cell runs; used as the upper bound for a plausible year.
CURRENT_YEAR = datetime.now().year


def is_valid_year(value):
    """Report whether ``value`` converts to an integer year in [1000, CURRENT_YEAR].

    Args:
        value: Anything ``int()`` might accept (int, float, numeric string, ...).

    Returns:
        ``True`` if ``int(value)`` succeeds and lies in the inclusive range
        1000..CURRENT_YEAR, otherwise ``False``.
    """
    try:
        year = int(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported types; ValueError: non-numeric
        # strings and NaN; OverflowError: infinite floats.
        # Only conversion failures are treated as "not a year", so
        # KeyboardInterrupt and friends still propagate.
        return False
    return 1000 <= year <= CURRENT_YEAR

def process_year_column(df, col_name):
    """Normalise one column of ``df`` into year values, in place.

    Each cell is mapped as follows:
      * passes ``is_valid_year`` -> the value converted with ``int()``
      * converts with ``int()`` but fails ``is_valid_year`` -> ``0``
      * cannot be converted with ``int()`` -> the string ``"Not a year"``

    Args:
        df: DataFrame holding the column; it is modified in place.
        col_name: Name of the column to normalise.

    Returns:
        The same ``df`` object, for call chaining.
    """
    def process_year(x):
        if is_valid_year(x):
            return int(x)
        try:
            int(x)
        except (ValueError, TypeError, OverflowError):
            # OverflowError covers infinite floats, which int() rejects
            # with a different exception type than NaN or bad strings.
            return "Not a year"
        # An integer, but outside the accepted year range.
        return 0

    df[col_name] = df[col_name].apply(process_year)
    return df

def is_valid_country(name):
    """Report whether ``name`` is recognised as a country by country_converter.

    Args:
        name: Candidate country name. Non-strings and blank strings are
            rejected without calling the converter.

    Returns:
        ``True`` if ``coco.convert`` resolves the name, ``False`` otherwise.
    """
    if not isinstance(name, str) or not name.strip():
        return False

    # With not_found=None the converter echoes the input back for unknown
    # names, so the result is never None. An explicit sentinel is the only
    # reliable way to detect a miss.
    not_found_marker = "not found"
    result = coco.convert(names=name, to='name_official', not_found=not_found_marker)
    return result != not_found_marker

def get_coordinates(df):
    """Geocode every distinct value of ``df["location"]`` with Nominatim.

    Progress is printed for each location. Failures never abort the run:
    a missing location, an empty geocoder result, or an exception all
    produce an entry whose fields are ``None``.

    Args:
        df: DataFrame with a ``"location"`` column.

    Returns:
        dict mapping each unique location value to a dict with the keys
        ``'address'``, ``'latitude'`` and ``'longitude'``.
    """
    def _no_match():
        # Fresh dict per call so entries never share one mutable object.
        return {'address': None, 'latitude': None, 'longitude': None}

    loc = Nominatim(user_agent="geoapi")

    location_coords = {}

    # Computed once; the original re-ran unique() for every progress line.
    unique_locations = df["location"].unique()
    total = len(unique_locations)

    print("=== STARTING GEOCODING WITH NOMINATIM ===")
    print(f"Processing {total} unique locations...\n")

    for i, location in enumerate(unique_locations, 1):
        if pd.isna(location):
            print(f"[{i}/{total}] Skipped: Empty location")
            location_coords[location] = _no_match()
            continue

        print(f"[{i}/{total}] Processing: '{location}'")

        try:
            getLoc = loc.geocode(location)
            time.sleep(1)  # Required delay between requests

            if getLoc:
                location_coords[location] = {
                    'address': getLoc.address,
                    'latitude': getLoc.latitude,
                    'longitude': getLoc.longitude
                }
            else:
                location_coords[location] = _no_match()
                print("No results found\n")

        except Exception as e:
            # Best effort: record the miss and back off before the next request.
            location_coords[location] = _no_match()
            print(f"! Error: {str(e)}")
            # Announce the pause before sleeping so the user sees why it stalls.
            print("  Waiting 5 seconds before continuing...\n")
            time.sleep(5)  # Longer delay if error

    return location_coords

def _record_is_mentioned(row, species, sources):
    """Shared check behind validate_aegypti / validate_albopictus.

    Returns ``True`` if at least one item of ``sources`` whose
    ``"source_type"`` equals ``str(row["source_type"])`` has a ``"text"``
    containing ``str(row["country"])``, ``str(row["year"])`` and ``species``
    as plain, case-sensitive substrings.
    """
    source_type = str(row["source_type"])
    year = str(row["year"])
    country = str(row["country"])
    return any(
        (country in item["text"]) and (year in item["text"]) and (species in item["text"])
        for item in sources
        if item["source_type"] == source_type
    )


def validate_aegypti(row, sources=inputs):
    """Check that a source text supports an 'aegypti' record.

    Args:
        row: Mapping/Series with ``"source_type"``, ``"year"`` and ``"country"``.
        sources: Iterable of dicts with ``"source_type"`` and ``"text"`` keys.
            Defaults to ``inputs`` as it was bound when this cell ran.

    Returns:
        ``True`` if a source of the same type mentions the country, the year
        and ``'aegypti'``; ``False`` otherwise.
    """
    return _record_is_mentioned(row, 'aegypti', sources)


def validate_albopictus(row, sources=inputs):
    """Check that a source text supports an 'albopictus' record.

    Args:
        row: Mapping/Series with ``"source_type"``, ``"year"`` and ``"country"``.
        sources: Iterable of dicts with ``"source_type"`` and ``"text"`` keys.
            Defaults to ``inputs`` as it was bound when this cell ran.

    Returns:
        ``True`` if a source of the same type mentions the country, the year
        and ``'albopictus'``; ``False`` otherwise.
    """
    return _record_is_mentioned(row, 'albopictus', sources)

In [None]:
# Load the extracted records and keep only the two Aedes species of interest.
df_output = pd.read_csv("output_dataset.csv")
# .copy() detaches the filtered rows from the frame that was read, so the
# column assignments in later cells act on an independent DataFrame rather
# than on a slice (avoids SettingWithCopyWarning).
df_output = df_output[df_output["vector"].isin(["Aedes aegypti","Aedes albopictus"])].copy()

In [None]:
# Work on an independent frame so the column assignments below never target a slice.
df_output = df_output.copy()

# Locations: keep capitalised words only, then drop entries that are still too long.
MAX_LOCATION_WORDS = 9
df_output['location'] = df_output['location'].apply(filter_uppercase_words)
location_word_counts = df_output['location'].str.replace(',', '').str.split().str.len()
df_output = df_output[location_word_counts <= MAX_LOCATION_WORDS].copy()

# Years: normalise the column, then drop rows whose value is not numeric at all.
df_output = process_year_column(df_output, "year")
df_output = df_output[df_output["year"]!="Not a year"].copy()

# Countries: keep the rows whose country IS recognised. The previous
# `!= True` comparison was inverted and kept only the unrecognised ones.
df_output['is_valid_country'] = df_output['country'].apply(is_valid_country)
df_output = df_output[df_output["is_valid_country"].eq(True)].copy()

# Geocode each distinct location once; y holds latitude and x holds longitude.
location_coords = get_coordinates(df_output)
df_output['y'] = df_output['location'].map(lambda x: location_coords.get(x, {}).get('latitude'))
df_output['x'] = df_output['location'].map(lambda x: location_coords.get(x, {}).get('longitude'))

In [None]:
# Remove exact duplicate rows, then every row that has a missing value in any
# column (this includes rows whose location could not be geocoded, since
# their x/y were filled with None).
df_output = df_output.drop_duplicates().dropna()

In [None]:
# One frame per species. .copy() makes each an independent DataFrame, so the
# "valid" column added in the next cell is not written onto a slice of df_output.
output_aegypti = df_output[df_output["vector"]=="Aedes aegypti"].copy()
output_albopictus = df_output[df_output["vector"]=="Aedes albopictus"].copy()

In [None]:
# Flag each record by whether a source text of the same type mentions its
# country, year and species, then keep only the confirmed records.
# assign() builds a new frame, so "valid" is never set on a filtered slice.
output_aegypti = (
    output_aegypti
    .assign(valid=lambda frame: frame.apply(validate_aegypti, axis=1))
    .loc[lambda frame: frame["valid"] == True]
)
output_albopictus = (
    output_albopictus
    .assign(valid=lambda frame: frame.apply(validate_albopictus, axis=1))
    .loc[lambda frame: frame["valid"] == True]
)

In [None]:
# Write the validated records, one CSV per species, restricted to the
# published columns and without the DataFrame index.
output_aegypti.to_csv(
    "updated_aegypti.csv",
    columns=["vector","source_type","country","year","y","x"],
    index=False,
)
output_albopictus.to_csv(
    "updated_albopictus.csv",
    columns=["vector","source_type","country","year","y","x"],
    index=False,
)