# En este notebook reproducimos el dataframe limpio para hacer hipótesis

In [None]:
# Este es todo el código testeado en `notebook_limpio.ipynb`

import pandas as pd

# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# Remote spreadsheet that every later step works on.
url = "https://www.sharkattackfile.net/spreadsheets/GSAF5.xls"

df_shark_attacks = pd.read_excel(url)

# Lower-case and trim the headers so later steps can rely on one spelling.
# str() keeps this from failing on a header cell that is not a string.
df_shark_attacks.columns = [str(x).lower().strip() for x in df_shark_attacks.columns]

# Columns that the rest of the notebook does not use.
columnas_a_borrar = [
    'unnamed: 21',
    'unnamed: 22',
    'pdf',
    'href',
    'href formula',
    'case number',
    'case number.1',
    'original order',
    'time',
    'source',
    'year',
]

# Drop only the columns that are actually present: the spreadsheet is fetched
# live, and a listed column that is missing would otherwise raise KeyError.
df_shark_attacks.drop(
    columns=[col for col in columnas_a_borrar if col in df_shark_attacks.columns],
    inplace=True,
)

df_shark_attacks.rename(columns={'fatal y/n': 'fatal'}, inplace=True)

# Remove exact duplicate rows, then renumber the index from 0.
df_shark_attacks.drop_duplicates(inplace=True)

df_shark_attacks.reset_index(drop=True, inplace=True)

def clean_fatal(valor):
    """Normalise a raw fatality flag to "Y", "N" or "UNKNOWN".

    Args:
        valor: Raw cell value. May be a string, NaN, a number or None.

    Returns:
        "Y" or "N" when the value is that single letter, ignoring
        surrounding whitespace and letter case; "UNKNOWN" for anything else.
    """
    if isinstance(valor, str):
        # Variants such as " N", "N " or "y" are the same answer typed
        # differently, so normalise before comparing.
        flag = valor.strip().upper()
        if flag in ("Y", "N"):
            return flag
    return "UNKNOWN"

# Reduce every fatality flag to one of the labels returned by clean_fatal.
df_shark_attacks['fatal'] = df_shark_attacks['fatal'].apply(clean_fatal)

# Missing species become 'Unknown' and every value is cast to str, so
# clean_species can call string methods on each cell.
df_shark_attacks['species'] = df_shark_attacks['species'].fillna('Unknown').astype(str)

def clean_species(valor):
    """Bucket a free-text species description into a coarse label.

    The text is trimmed and lower-cased, then tested against a list of
    keywords in priority order; the first keyword contained in the text
    decides the label. Matching is by plain substring, so for example
    "whitetip" contains "white" and is labelled "White Shark".

    Args:
        valor: Species description as a string.

    Returns:
        "White Shark", "Tiger Shark", "Bull Shark", "Hammerhead Shark",
        "Other Shark" (the text only mentions "shark") or "Unknown".
    """
    text = valor.strip().lower()

    # Order matters: specific species are tested before the generic "shark".
    keyword_labels = (
        ("white", "White Shark"),
        ("tiger", "Tiger Shark"),
        ("bull", "Bull Shark"),
        ("hammer", "Hammerhead Shark"),
        ("shark", "Other Shark"),
    )
    for keyword, label in keyword_labels:
        if keyword in text:
            return label
    return "Unknown"

# Replace each species description with the coarse label from clean_species.
df_shark_attacks["species"] = df_shark_attacks["species"].apply(clean_species)

# Raw 'type' spelling -> canonical incident type. Written as canonical label
# followed by every raw spelling that should collapse into it.
type_mapping = {
    raw: canonical
    for canonical, raw_values in {
        "Unprovoked": ("Unprovoked", "unprovoked"),
        "Provoked": ("Provoked", " Provoked"),
        "Invalid": ("Invalid",),
        "Watercraft": ("Watercraft", "Boat"),
        "Sea Disaster": ("Sea Disaster",),
        "Questionable": (
            "Questionable",
            "?",
            "Unconfirmed",
            "Unverified",
            "Under investigation",
        ),
    }.items()
    for raw in raw_values
}

# Any value that is not a key of type_mapping comes out of map() as NaN.
df_shark_attacks['type'] = df_shark_attacks['type'].map(type_mapping)

# This fill runs over the whole frame, not only 'type': every remaining NaN
# in any column is replaced by the string "Questionable".
df_shark_attacks.fillna("Questionable", inplace=True)

# Trim surrounding whitespace. The .str accessor leaves a non-string cell as
# NaN, where calling .strip() on the cell directly would raise AttributeError.
df_shark_attacks["sex"] = df_shark_attacks["sex"].str.strip()

# Raw 'sex' value -> "M", "F" or "Unknown".
sex_mapping = {
    "M": "M",
    "F": "F",
    "Questionable": "Unknown",
    "N": "Unknown",
    "m": "M",
    "lli": "Unknown",
    "M x 2": "Unknown",
    ".": "Unknown",
}

# map() yields NaN for any value that is not a key above; label those
# "Unknown" as well so the column never carries NaN.
df_shark_attacks["sex"] = df_shark_attacks["sex"].map(sex_mapping).fillna("Unknown")

# Parse 'age' as numbers; anything that cannot be parsed becomes NaN.
df_shark_attacks['age_clean'] = pd.to_numeric(df_shark_attacks['age'], errors='coerce')

# Fill the gaps with the median of the parsed ages, then cast to int.
age_mediana = df_shark_attacks['age_clean'].median()
df_shark_attacks['age_clean'] = (
    df_shark_attacks['age_clean'].fillna(age_mediana).astype(int)
)

# Snapshot taken while the raw 'age' column is still present.
df_backup_age = df_shark_attacks.copy()

df_shark_attacks.drop(columns='age', inplace=True)

# Cast every name to str and trim surrounding whitespace.
df_shark_attacks['name'] = df_shark_attacks['name'].astype(str).str.strip()

# Names that, compared case-insensitively, are one of these placeholders
# are replaced by 'Anonymous'.
df_shark_attacks['name'] = df_shark_attacks['name'].mask(
    df_shark_attacks['name'].str.lower().isin([
        'male',
        'female',
        'questionable',
        'boy',
        '2 males',
        'a sailor',
        'males',
        'boat',
        'child',
        'unknown',
        'girl',
    ]),
    'Anonymous',
)

def mes_map(date):
    """Return the three-letter English month abbreviation for *date*.

    Args:
        date: A date cell. Either free text such as "15-Dec-2023", or a
            date-like object with an integer ``month`` attribute
            (``datetime.date``, ``datetime.datetime``, ``pandas.Timestamp``).

    Returns:
        One of "Jan" ... "Dec", or None when no month can be determined.
        For text, the months are tested in calendar order and the first one
        contained in the title-cased text is returned.
    """
    meses = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

    # A date-like object prints as e.g. "2023-12-15 00:00:00", which holds no
    # month name, so take the month number from the object itself. The int
    # check skips objects whose month is not a real number (pandas.NaT).
    month = getattr(date, "month", None)
    if isinstance(month, int) and 1 <= month <= 12:
        return meses[month - 1]

    texto = str(date).title()
    for mes in meses:
        if mes in texto:
            return mes
    return None

# Month abbreviation for each row, as returned by mes_map.
df_shark_attacks["mes"] = df_shark_attacks["date"].apply(mes_map)

# Snapshot taken while the raw 'date' column is still present.
df_backup_mes = df_shark_attacks.copy()

df_shark_attacks.drop(columns=['date'], inplace=True)

# Keep only the rows whose country is exactly "USA"; .copy() makes the result
# an independent frame, so the column assignments below do not write into a
# slice of the full frame.
df_shark_attacks = df_shark_attacks.loc[df_shark_attacks["country"] == "USA"].copy()

def activity_maping(activity):
    """Reduce a free-text activity to a canonical lowercase label.

    The text is trimmed and lower-cased, then tested against a list of
    keywords in priority order; the first keyword contained in the text
    decides the label. Text matching no keyword is returned in its
    trimmed, lower-cased form.

    Args:
        activity: Activity description as a string.

    Returns:
        "surfing", "swimming", "fishing", "diving", "snorkel", "hunting",
        "questionable", or the normalised input text.
    """
    normalized = activity.strip().lower()

    # Order matters: "surf fishing" is labelled "surfing" because "surf"
    # is tested before "fishing".
    keyword_labels = (
        ("surf", "surfing"),
        ("swim", "swimming"),
        ("fishing", "fishing"),
        ("diving", "diving"),
        ("snorkel", "snorkel"),
        ("hunt", "hunting"),
        ("question", "questionable"),
    )
    return next(
        (label for keyword, label in keyword_labels if keyword in normalized),
        normalized,
    )
    
# Replace each activity description with the label from activity_maping.
df_shark_attacks["activity"] = df_shark_attacks["activity"].apply(activity_maping)

def state_format(state):
    """Normalise a state name.

    The text is trimmed and lower-cased. If it contains one of a few known
    fragments, the matching canonical name is returned; otherwise the
    cleaned text is returned in title case.

    Args:
        state: State name as a string.

    Returns:
        "Florida", "Hawaii", "California", "Bahamas", or the trimmed input
        in title case.
    """
    cleaned = state.strip().lower()

    # Fragments are tested in this order; the first one found wins.
    fragment_names = (
        ("flor", "Florida"),
        ("hawai", "Hawaii"),
        ("cali", "California"),
        ("baha", "Bahamas"),
    )
    for fragment, canonical in fragment_names:
        if fragment in cleaned:
            return canonical
    return cleaned.title()
    
# Replace each state with the normalised name from state_format.
df_shark_attacks["state"] = df_shark_attacks["state"].apply(state_format)

# Last expression of the cell: the notebook renders the resulting frame.
df_shark_attacks


Unnamed: 0,type,country,state,location,activity,name,sex,injury,fatal,species,age_clean,mes
3,Unprovoked,USA,California,Lovers Point Pacific Grove,swimming,Erica Fox,F,Taken by shark body recovered with multiple in...,Y,White Shark,55,Dec
4,Unprovoked,USA,California,Salmon Creek,surfing,Anonymous,M,Hand Injury,N,White Shark,24,Dec
5,Provoked,USA,Hawaii,"Ka'alu""alu Beach",freeing trapped shark,Josiah Kaimani Ventura,M,Bite wounds to thigh,N,Other Shark,24,Dec
10,Unprovoked,USA,Hawaii,Pine Trees Hanalei Bay Kaui,swimming,Chance Swanson,M,Injuries to legs,N,Unknown,24,Nov
11,Unprovoked,USA,Texas,Matagorda Beach Matagorda,fishing,Chuck Bledsoe,M,Laceration on top and undermeath right foot,N,Unknown,24,Nov
...,...,...,...,...,...,...,...,...,...,...,...,...
7009,Unprovoked,USA,Florida,"Palm Beach, Palm Beach County",standing,Horton Chase,M,Abrasions & bruises hip to ankle,N,Unknown,24,
7043,Unprovoked,USA,Florida,"Gadsden Point, Tampa Bay",fishing,James Kelley,M,2-inch lacerations,N,Unknown,24,
7048,Unprovoked,USA,North Carolina,Somewhere between Hatteras and Beaufort,swimming,"""youthful male""",M,"""Lost leg""",N,Unknown,24,Jul
7052,Unprovoked,USA,Hawaii,Puna,questionable,"A ""chiefess""",F,Ankle bitten,N,Unknown,24,
