In [12]:
import pandas as pd
import re


In [37]:
# Step 1: Load the text file
with open("drowned_all.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [38]:
# Split entries (each starts with a number and a dot)
entries = re.split(r'\n?\d{1,3}\.\s+', raw_text)
entries = [e.strip() for e in entries if e.strip()]


# Step 3: Define a function to extract all fields
def extract_fields(entry):
    date = re.search(r'(\d{1,2}/\d{2}/\d{2,4})', entry)
    platja = re.search(r'(platja|cala|caleta|zona)[^.,\n:)]*', entry, re.IGNORECASE)
    municipi = re.search(r'\(([^)]+)\)', entry)
    sexe = re.search(r'\b(home|dona|nen|nena|noi|noia)\b', entry, re.IGNORECASE)
    age = re.search(r'(\d{2,3})\s+anys', entry)
    nationality = re.search(r'nacionalitat\s+([a-zà-ÿA-ZÀ-ß]+)|indocumentad[oa]', entry, re.IGNORECASE)

    if 'no vigilad' in entry.lower() or 'servei de vigilància no havia començat' in entry.lower() or 'sense vigilància' in entry.lower():
        surveillance = 'No'
    elif 'vigilad' in entry.lower():
        surveillance = 'Sí'
    else:
        surveillance = 'Desconegut'

    if 'bandera verda' in entry.lower():
        flag = 'Verda'
    elif 'bandera groga' in entry.lower():
        flag = 'Groga'
    elif 'bandera vermella' in entry.lower():
        flag = 'Vermella'
    else:
        flag = 'Desconeguda'

    return {
        "Date": date.group(1) if date else None,
        "Platja": platja.group(0).strip() if platja else None,
        "Municipi o comarca": municipi.group(1) if municipi else None,
        "Sexe": sexe.group(1).lower() if sexe else None,
        "Age": age.group(1) if age else None,
        "Nationality or undocumented": nationality.group(1) if nationality and nationality.group(1) else "Indocumentat/da" if 'indocumentad' in entry else None,
        "Baywatch (Surveillance)": surveillance,
        "Flag": flag
    }




def extract_platja(entry):
    # Match "platja", "cala", or "caleta" followed by the name,
    # stopping before words like "vigilada", "bandera", or sentence end
    match = re.search(r'\b(platja|cala|caleta)\s+([^.,:\n()]*?)(?=\s+(vigilad|no\s*vigilad|bandera|servei|amb\s+bandera|i\s+bandera)|[.,:\n)])',
                      entry, re.IGNORECASE)
    if match:
        prefix = match.group(1).capitalize()
        name = match.group(2).strip()
        return f"{prefix} {name}"
    return None


In [44]:
def extract_platja(entry):
    # Match beach name directly after "platja", "cala", or "caleta"
    matches = re.findall(r'\b(platja|cala|caleta)\s+([^.,\n:;)]{3,50})', entry, re.IGNORECASE)
    for prefix, name in matches:
        full_name = f"{prefix.capitalize()} {name.strip()}"
        if not re.search(r'vigilad|servei de vigilància|no vigilad|sense vigilància', full_name, re.IGNORECASE):
            return full_name
    return None


In [45]:
def extract_fields(entry):
    date = re.search(r'(\d{1,2}/\d{2}/\d{2,4})', entry)
    platja = extract_platja(entry)
    municipi = re.search(r'\(([^)]+)\)', entry)
    sexe = re.search(r'\b(home|dona)\b', entry, re.IGNORECASE)
    age = re.search(r'(\d{2,3})\s+anys', entry)
    nationality = re.search(r'nacionalitat\s+([a-zà-ÿA-ZÀ-ÿ]+)|indocumentad[oa]', entry, re.IGNORECASE)

    if 'no vigilad' in entry.lower() or 'servei de vigilància no havia començat' in entry.lower() or 'sense vigilància' in entry.lower():
        surveillance = 'No'
    elif 'vigilad' in entry.lower():
        surveillance = 'Sí'
    else:
        surveillance = 'Desconegut'

    if 'bandera verda' in entry.lower():
        flag = 'Verda'
    elif 'bandera groga' in entry.lower():
        flag = 'Groga'
    elif 'bandera vermella' in entry.lower():
        flag = 'Vermella'
    else:
        flag = 'Desconeguda'

    return {
        "Date": date.group(1) if date else None,
        "Platja": platja,
        "Municipi o comarca": municipi.group(1) if municipi else None,
        "Sexe": sexe.group(1).capitalize() if sexe else None,
        "Age": age.group(1) if age else None,
        "Nationality or undocumented": nationality.group(1) if nationality and nationality.group(1) else ("Indocumentat/da" if 'indocumentad' in entry.lower() else None),
        "Baywatch (Surveillance)": surveillance,
        "Flag": flag,
        "Extra info": entry  # include original text for review
    }

In [46]:
# --- Apply to all entries ---
data = [extract_fields(entry) for entry in entries]


In [50]:
# --- Create DataFrame ---
df = pd.DataFrame(data)
df

Unnamed: 0,Date,Platja,Municipi o comarca,Sexe,Age,Nationality or undocumented,Baywatch (Surveillance),Flag,Extra info
0,16/06/2024,Platja Santa Margarida de Roses,,Home,65,espanyola,Desconegut,Groga,16/06/2024: platja Santa Margarida de Roses. H...
1,21/06/2024,Platja de Creixell (Tarragonès,Tarragonès,Home,77,espanyola,Desconegut,Desconeguda,21/06/2024: Platja de Creixell (Tarragonès). H...
2,24/06/2024,Platja de la Ribera de Sitges (Garraf,Garraf,Home,55,,Desconegut,Desconeguda,24/06/2024: platja de la Ribera de Sitges (Gar...
3,25/06/2024,Platja de la Gola del Ter,Baix Empordà,Home,74,espanyola,Sí,Verda,"25/06/2024: Platja de la Gola del Ter, a Torro..."
4,27/06/0202,Platja de Torre Valentina a Calonge i Sant Ant...,Baix Empordà,Home,54,russa,Desconegut,Desconeguda,27/06/02024: platja de Torre Valentina a Calon...
...,...,...,...,...,...,...,...,...,...
229,01/09/2015,,Roses,Home,74,alemanya,No,Desconeguda,01/09/2015: Santa Margarida (Roses) (Alt Empor...
230,01/09/2015,,Gavà,Home,68,espanyola,Sí,Verda,01/09/2015: Gavà (Gavà) (Baix Llobregat) Home ...
231,01/09/2015,,Tarragona,Home,54,espanyola,Sí,Desconeguda,01/09/2015: Arrabassada (Tarragona) (Tarragonè...
232,02/09/2015,Cala Roca Plana (Tarragona,Tarragona,Dona,48,espanyola,No,Desconeguda,02/09/2015: Cala Roca Plana (Tarragona) (Tarra...


In [48]:
df['Platja']

0                        Platja Santa Margarida de Roses
1                         Platja de Creixell (Tarragonès
2                  Platja de la Ribera de Sitges (Garraf
3                              Platja de la Gola del Ter
4      Platja de Torre Valentina a Calonge i Sant Ant...
                             ...                        
229                                                 None
230                                                 None
231                                                 None
232                           Cala Roca Plana (Tarragona
233                                                 None
Name: Platja, Length: 234, dtype: object

In [49]:
# Optional: export or explore
# df.to_csv("drowned_cases_cleaned.csv", index=False)
# df.head(10)

In [51]:
# Save to CSV
df.to_csv("drowned_cases_cleaned.csv", index=False)

In [11]:
from geopy.geocoders import Nominatim
import pandas as pd
# Sample data
df = pd.read_csv('drowned_cases_cleaned.csv')
geolocator = Nominatim(user_agent="ananya-geopy-lookup")
def geocode_address(address):
    try:
        location = geolocator.geocode(address)
        return location
    except:
        return None
# Geocode each address
df['location'] = df['Platja'].apply(geocode_address)
df['latitude'] = df['location'].apply(lambda loc: loc.latitude if loc else None)
df['longitude'] = df['location'].apply(lambda loc: loc.longitude if loc else None)
df



Unnamed: 0,Date,Platja,Municipi o comarca,Sexe,Age,Nationality or undocumented,Baywatch (Surveillance),Flag,location,latitude,longitude
0,16/06/2024,platja Santa Margarida de Roses,,home,65.0,espanyola,Desconegut,Groga,"(Platja de Sant Margarida, Santa Margarida, Ro...",42.261098,3.152984
1,21/06/2024,Platja de Creixell (Tarragonès,Tarragonès,home,77.0,espanyola,Desconegut,Desconeguda,"(Platja de Creixell, el Racó del Cèsar, Creixe...",41.160646,1.454203
2,24/06/2024,platja de la Ribera de Sitges (Garraf,Garraf,home,55.0,,Desconegut,Desconeguda,"(Platja de la Ribera, Sitges, Garraf, Barcelon...",41.234154,1.806889
3,25/06/2024,Platja de la Gola del Ter,Baix Empordà,home,74.0,espanyola,Sí,Verda,,,
4,27/06/0202,platja de Torre Valentina a Calonge i Sant Ant...,Baix Empordà,home,54.0,russa,Desconegut,Desconeguda,"(Via verda Platja d'Aro - Sant Antoni, Forn Ro...",41.833046,3.085585
...,...,...,...,...,...,...,...,...,...,...,...
229,01/09/2015,Platja no vigilada,Roses,home,74.0,alemanya,No,Desconeguda,,,
230,01/09/2015,Platja vigilada,Gavà,home,68.0,espanyola,Sí,Verda,,,
231,01/09/2015,Platja vigilada,Tarragona,home,54.0,espanyola,Sí,Desconeguda,,,
232,02/09/2015,Cala Roca Plana (Tarragona,Tarragona,dona,48.0,espanyola,No,Desconeguda,,,


In [None]:

# Save to new CSV
# df.to_csv("drowned_cases_geolocated.csv", index=False)