In [1]:
import sys
import os

# Add the src directory to the Python path
#sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src")))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../src")))

from utils.paths import get_path  # Import the get_path function from paths.py
from utils.functions import a
from utils.functions import print_smthg

import utils.constants


import pandas as pd
import geopandas as gpd

def main():
    """Smoke-test the project setup and preview the two raw datasets.

    Prints the imported ``a`` value, calls ``print_smthg()``, then for the
    EM-DAT and GAUL 1 datasets prints the configured path followed by either
    the first rows of the loaded table or a "File not found" message.
    """

    def _show_head(data_path, reader):
        # Print the first rows of the file read by `reader`, or report that
        # the file is missing; a missing file is not treated as an error.
        if not os.path.exists(data_path):
            print(f"File not found at {data_path}")
            return
        preview = reader(data_path)
        print(preview.head())

    # Confirm the helpers imported from utils.functions are usable.
    print(a)
    print_smthg()

    # EM-DAT is an Excel workbook, so it is read with pandas.
    emdat_file = get_path("emdat_path")
    print(f"EM-DAT Data Path: {emdat_file}")
    _show_head(emdat_file, pd.read_excel)

    # GAUL 1 is read with geopandas (not pandas).
    gaul1_file = get_path("gaul1_path")
    print(f"GAUL 1 Data Path: {gaul1_file}")
    _show_head(gaul1_file, gpd.read_file)
    
# Entry-point guard: runs main() when this code executes as the top-level
# module, which includes execution as a notebook cell (the captured output
# below this cell shows that main() ran).
if __name__ == "__main__":
    main()



import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


1
Something
EM-DAT Data Path: /net/projects/xaida/raw_data/emdat_data/public_emdat_1990_2023.xlsx
          DisNo. Historic Classification Key Disaster Group Disaster Subgroup  \
0  1990-0001-LKA      Yes    nat-hyd-flo-riv        Natural      Hydrological   
1  1990-0002-TUN      Yes    nat-hyd-flo-riv        Natural      Hydrological   
2  1990-0003-WSM      Yes    nat-met-sto-tro        Natural    Meteorological   
3  1990-0004-FRA      Yes    nat-hyd-mmw-ava        Natural      Hydrological   
4  1990-0005-IDN      Yes    nat-hyd-flo-riv        Natural      Hydrological   

         Disaster Type  Disaster Subtype External IDs Event Name  ISO  ...  \
0                Flood    Riverine flood          NaN        NaN  LKA  ...   
1                Flood    Riverine flood          NaN        NaN  TUN  ...   
2                Storm  Tropical cyclone          NaN        Ofa  WSM  ...   
3  Mass movement (wet)   Avalanche (wet)          NaN        NaN  FRA  ...   
4                Flood   

In [2]:
import sys
import os

# Add the src directory to the Python path
#sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src")))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../src")))

from utils.paths import get_path  # Import the get_path function from paths.py
import utils.constants as constants
import utils.functions as functions

import pandas as pd
import geopandas as gpd
import numpy as np
import re

def _load_emdat(emdat_data_path):
    """Read the EM-DAT workbook and print its first rows.

    Raises:
        FileNotFoundError: if ``emdat_data_path`` does not exist. Every later
            step needs the table, so continuing would only fail further down
            with a less helpful error.
    """
    if not os.path.exists(emdat_data_path):
        raise FileNotFoundError(f"File not found at {emdat_data_path}")
    emdat = pd.read_excel(emdat_data_path)
    print(emdat.head())
    return emdat


def _apply_country_corrections(emdat):
    """Return a copy of ``emdat`` with lower-cased, country-corrected 'Location'.

    The per-country replacement mappings come from ``utils.constants`` and are
    applied as regular expressions, only to rows of the matching ISO code.
    """
    corrected = emdat.copy()
    corrected["Location"] = corrected["Location"].str.lower()

    country_replacements = {
        "PHL": constants.rep_philippines,
        "BFA": constants.rep_burkina,
        "HTI": constants.rep_haiti,
        "TCD": constants.rep_chad,
    }
    for iso, replacements in country_replacements.items():
        in_country = corrected["ISO"] == iso
        corrected.loc[in_country, "Location"] = (
            corrected.loc[in_country, "Location"].replace(replacements, regex=True)
        )
    return corrected


def _select_location_only_events(emdata):
    """Select natural events that have a 'Location' but no 'Admin Units'.

    Connector words in 'Location' are turned into commas so that multiple
    places in one string can be split later. Returns an independent copy so
    the assignments below do not write into a slice of ``emdata``.
    """
    keep = (
        emdata['Admin Units'].isna()
        & emdata['Location'].notna()
        & (emdata["Disaster Group"] == 'Natural')
    )
    loconly = emdata[keep].copy()

    loconly['Location'] = loconly['Location'].str.replace(r'\) and\b', '),', regex=True)
    # NOTE(review): 'Location' was lower-cased earlier, so the capitalised
    # 'Between' alternative cannot match here, and `\b&\b` / `\b\+\b` only
    # match when the symbol sits directly between two word characters.
    # The pattern is kept unchanged to preserve existing results - confirm intent.
    loconly['Location'] = loconly['Location'].str.replace(r'\b(and|Between|&|\+)\b', ',', regex=True)
    return loconly


def _expand_locations(loconly):
    """Build one row per individual location string.

    Returns a DataFrame with columns 'DisNo.', 'Full_Location_List' and
    'Individual_Location'. Splitting is delegated to
    ``functions.split_and_clean_locations`` and ``functions.split_text``;
    the terms in ``constants.replace_terms`` are blanked out in between.
    """
    expanded_rows = [
        [dis_no, location, loc]
        for dis_no, location in zip(loconly['DisNo.'], loconly['Location'])
        for loc in functions.split_and_clean_locations(location)
    ]
    expanded_df = pd.DataFrame(
        expanded_rows, columns=['DisNo.', 'Full_Location_List', 'Individual_Location']
    )

    # Literal (non-regex), case-insensitive removal of each configured term.
    for term in constants.replace_terms:
        expanded_df['Individual_Location'] = expanded_df['Individual_Location'].str.replace(
            term, " ", case=False, regex=False
        )

    # split_text may return several pieces per entry; explode gives each its own row.
    expanded_df['Individual_Location'] = expanded_df['Individual_Location'].apply(functions.split_text)
    expanded_df = expanded_df.explode('Individual_Location', ignore_index=True)
    expanded_df['Individual_Location'] = expanded_df['Individual_Location'].str.replace(
        r'\b(and| & |Between| \+ | \) and)\b', ',', regex=True
    )
    return expanded_df


def _extract_location_parts(expanded_df):
    """Split each individual location into its 'before' and 'bracketed' parts.

    Uses ``functions.extract_locations``, taking element 0 as 'Location_Before'
    and element 1 as 'Bracketed', and adds an 'Appended' column of the form
    ``"<before>, <bracketed>"`` (or ``"<before>,"`` when 'Bracketed' is falsy).
    """
    new_data = [
        [dis_no, individual, loc[0], loc[1]]
        for dis_no, individual in zip(expanded_df['DisNo.'], expanded_df['Individual_Location'])
        for loc in functions.extract_locations(individual)
    ]
    new_df = pd.DataFrame(
        new_data, columns=['DisNo.', 'Individual_Location', 'Location_Before', 'Bracketed']
    )
    new_df['Appended'] = (
        new_df['Location_Before'] + ',' + new_df['Bracketed'].apply(lambda x: f" {x}" if x else "")
    )
    return new_df


def _fix_iso_codes(new_df):
    """Add an 'ISO' column derived from 'DisNo.' and normalise outdated codes.

    The ISO code is the last three characters of the disaster number.
    AZO, DFR and SCG are mapped to PRT, DEU and SRB; SRB rows whose bracketed
    part is "montenegro" become MNE; rows coded YUG, SUN or ANT are dropped.
    """
    # Assign the result instead of calling replace(inplace=True) on a column
    # selection, which is chained assignment and may not update the frame.
    new_df['ISO'] = new_df['DisNo.'].str[-3:].replace({'AZO': 'PRT', 'DFR': 'DEU', 'SCG': 'SRB'})

    # Event in Montenegro that belonged to Serbia in the past.
    new_df.loc[(new_df["ISO"] == "SRB") & (new_df["Bracketed"] == "montenegro"), "ISO"] = "MNE"

    # Spot check kept from the original notebook (its comments mention
    # "1346 by MNE"). Guarded so a missing row label cannot raise KeyError.
    if 1346 in new_df.index:
        print(new_df.loc[1346, "ISO"])

    # Events recorded for countries that no longer exist are removed.
    return new_df[~new_df["ISO"].isin(["YUG", "SUN", "ANT"])]


def main():
    """Clean EM-DAT location strings and export them for geolocation.

    Steps:
      1. Read EM-DAT and apply the per-country location corrections.
      2. Keep natural events with a location but no admin units, and
         standardise the separators between multiple locations.
      3. Split every location string into individual locations and into
         their "before" / "bracketed" parts.
      4. Drop numeric-only locations, expand two-character USA entries via
         ``constants.rep_us_states``, fix ISO codes, and write
         ``event_locations_to_geolocate.csv`` into the intermediate data folder.

    Raises:
        FileNotFoundError: if the EM-DAT file does not exist.
    """
    # 1 Read and Apply Location Corrections
    ######################################
    emdat_data_path = get_path("emdat_path")
    print(f"EM-DAT Data Path: {emdat_data_path}")
    emdat = _load_emdat(emdat_data_path)

    Emdata = _apply_country_corrections(emdat)
    print("EMDATA print line")
    print(Emdata.head())

    # 2 Standardize and Clean Location Data
    ######################################
    Emdata_loconly = _select_location_only_events(Emdata)

    # 3 Split and Parse Location Strings
    ######################################
    expanded_df = _expand_locations(Emdata_loconly)
    new_df = _extract_location_parts(expanded_df)
    print(new_df)

    new_df['Appended'] = new_df['Appended'].apply(functions.remove_str_if_last)

    # Remove locations whose "before" part consists only of digits.
    # .copy() keeps the later column assignments off a slice of the old frame.
    is_numeric = new_df["Location_Before"].map(
        lambda x: isinstance(x, str) and x.isdigit()
    ).astype(bool)
    new_df = new_df[~is_numeric].copy()

    # Two-character entries on USA events get the rep_us_states replacements.
    # As in the original, the replacement is applied to every column of the
    # matching rows.
    is_two_chars = new_df["Individual_Location"].map(
        lambda x: isinstance(x, str) and len(x) == 2
    ).astype(bool)
    is_usa_abbrev = is_two_chars & new_df["DisNo."].str.contains("USA", na=False)
    new_df.loc[is_usa_abbrev] = new_df.loc[is_usa_abbrev].replace(constants.rep_us_states, regex=True)

    new_df = _fix_iso_codes(new_df)

    # 4 Apply Final Corrections and Export
    ######################################
    locations = new_df[["DisNo.", "ISO", "Appended"]].rename(columns={"Appended": "Location"})

    # USA events first, then the rest of the world (same row order as before).
    is_usa = locations["ISO"] == "USA"
    df_locations = pd.concat([locations[is_usa], locations[~is_usa]])

    # Drop entries whose location is the literal string 'nan'.
    df_locations = df_locations[df_locations["Location"] != 'nan']
    df_locations = df_locations.reset_index(drop=True)

    print(df_locations)

    intermediate_data_path = get_path("intermediate_data_path")
    print(intermediate_data_path)

    # os.path.join works whether or not the folder path ends with a separator.
    df_locations.to_csv(os.path.join(intermediate_data_path, 'event_locations_to_geolocate.csv'))
    
# Entry-point guard: runs the location-cleaning pipeline when this code
# executes as the top-level module, which includes execution as a notebook
# cell (the captured output below this cell shows that main() ran).
if __name__ == "__main__":
    main()


EM-DAT Data Path: /net/projects/xaida/raw_data/emdat_data/public_emdat_1990_2023.xlsx
          DisNo. Historic Classification Key Disaster Group Disaster Subgroup  \
0  1990-0001-LKA      Yes    nat-hyd-flo-riv        Natural      Hydrological   
1  1990-0002-TUN      Yes    nat-hyd-flo-riv        Natural      Hydrological   
2  1990-0003-WSM      Yes    nat-met-sto-tro        Natural    Meteorological   
3  1990-0004-FRA      Yes    nat-hyd-mmw-ava        Natural      Hydrological   
4  1990-0005-IDN      Yes    nat-hyd-flo-riv        Natural      Hydrological   

         Disaster Type  Disaster Subtype External IDs Event Name  ISO  ...  \
0                Flood    Riverine flood          NaN        NaN  LKA  ...   
1                Flood    Riverine flood          NaN        NaN  TUN  ...   
2                Storm  Tropical cyclone          NaN        Ofa  WSM  ...   
3  Mass movement (wet)   Avalanche (wet)          NaN        NaN  FRA  ...   
4                Flood    Riverine fl