# En este notebook reproducimos el dataframe limpio para hacer hipótesis

In [None]:
# Este es todo el código testeado en `notebook_limpio.ipynb`

import pandas as pd

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# Spreadsheet that is downloaded and parsed on every run.
url = "https://www.sharkattackfile.net/spreadsheets/GSAF5.xls"

df_shark_attacks = pd.read_excel(url)

# Normalise the headers: lower case, no surrounding blanks. str() keeps this
# step from failing if a header cell is read as a non-string value.
df_shark_attacks.columns = [str(col).lower().strip() for col in df_shark_attacks.columns]

# Columns that are removed before the analysis.
columnas_a_borrar = [
    'unnamed: 21',
    'unnamed: 22',
    'pdf',
    'href',
    'href formula',
    'case number',
    'case number.1',
    'original order',
    'time',
    'source',
    'year',
]
# errors='ignore': the sheet comes from a remote URL, so a listed column
# (e.g. one of the auto-generated 'unnamed: N' headers) may be absent; skip
# missing labels instead of aborting with a KeyError.
df_shark_attacks.drop(columns=columnas_a_borrar, errors='ignore', inplace=True)

df_shark_attacks.rename(columns={'fatal y/n': 'fatal'}, inplace=True)

# Remove fully duplicated rows and renumber the index from 0.
df_shark_attacks.drop_duplicates(inplace=True)

df_shark_attacks.reset_index(drop=True, inplace=True)

def clean_fatal(valor):
    """Normalise a raw fatality flag to "Y", "N" or "UNKNOWN".

    Args:
        valor: Raw cell value; may be a string or any other object
            (for example NaN or a number).

    Returns:
        "Y" or "N" when the value, ignoring surrounding whitespace and
        letter case, is exactly that letter; "UNKNOWN" for anything else.
    """
    if isinstance(valor, str):
        # Tolerate entries such as " N" or "y", which an exact comparison
        # would otherwise discard as "UNKNOWN".
        valor = valor.strip().upper()
        if valor in ("Y", "N"):
            return valor
    return "UNKNOWN"

# Replace every raw fatality flag with the value returned by clean_fatal.
df_shark_attacks['fatal'] = df_shark_attacks['fatal'].apply(clean_fatal)

# Missing species become the text 'Unknown'; astype(str) then makes every
# cell a string so that string methods can be called on it afterwards.
df_shark_attacks['species'] = df_shark_attacks['species'].fillna('Unknown').astype(str)

def clean_species(valor):
    """Collapse a free-text species description into a fixed label.

    The text is trimmed and lower-cased, then checked against the keywords
    below in priority order; the first keyword contained in the text
    decides the label.

    Args:
        valor: Species description as a string.

    Returns:
        "White Shark", "Tiger Shark", "Bull Shark", "Hammerhead Shark",
        "Other Shark" (the text mentions "shark" but none of the previous
        keywords) or "Unknown" (no keyword found).
    """
    texto = valor.strip().lower()

    # Order matters: the generic "shark" keyword must be tested last.
    keyword_labels = (
        ("white", "White Shark"),
        ("tiger", "Tiger Shark"),
        ("bull", "Bull Shark"),
        ("hammer", "Hammerhead Shark"),
        ("shark", "Other Shark"),
    )
    for keyword, label in keyword_labels:
        if keyword in texto:
            return label
    return "Unknown"

# Replace every species description with the label returned by clean_species.
df_shark_attacks["species"] = df_shark_attacks["species"].apply(clean_species)

# Raw `type` value -> canonical incident type.
type_mapping = {
    # Values that are already canonical map to themselves.
    **{
        etiqueta: etiqueta
        for etiqueta in (
            "Unprovoked",
            "Provoked",
            "Invalid",
            "Watercraft",
            "Sea Disaster",
            "Questionable",
        )
    },
    # Spelling variants and synonyms.
    "Boat": "Watercraft",
    " Provoked": "Provoked",
    "unprovoked": "Unprovoked",
    "?": "Questionable",
    "Unconfirmed": "Questionable",
    "Unverified": "Questionable",
    "Under investigation": "Questionable",
}

# Values that are not keys of type_mapping come back from .map() as NaN.
df_shark_attacks['type'] = df_shark_attacks['type'].map(type_mapping)

# NOTE: this fills the missing values of EVERY column, not only `type`, with
# the text "Questionable".
df_shark_attacks.fillna("Questionable", inplace=True)

# Trim surrounding whitespace. astype(str) first, so that a cell holding a
# non-string value cannot raise AttributeError on .strip().
df_shark_attacks["sex"] = df_shark_attacks["sex"].astype(str).str.strip()

# Raw `sex` value -> "M", "F" or "Unknown".
sex_mapping = {
    "M": "M",
    "F": "F",
    "Questionable": "Unknown",
    "N": "Unknown",
    "m": "M",
    "lli": "Unknown",
    "M x 2": "Unknown",
    ".": "Unknown"
}

# .map() returns NaN for any value that is not a key of sex_mapping; report
# those as "Unknown" too, like the other unusable entries listed above.
df_shark_attacks["sex"] = df_shark_attacks["sex"].map(sex_mapping).fillna("Unknown")

# Parse the ages as numbers; anything that cannot be parsed becomes NaN.
df_shark_attacks['age_clean'] = pd.to_numeric(df_shark_attacks['age'], errors='coerce')

# Fill the gaps with the median of the parsed ages and cast to integers.
age_mediana = df_shark_attacks['age_clean'].median()
df_shark_attacks['age_clean'] = (
    df_shark_attacks['age_clean']
    .fillna(age_mediana)
    .astype(int)
)

# Snapshot taken while the raw `age` column is still present.
df_backup_age = df_shark_attacks.copy()

df_shark_attacks.drop(columns='age', inplace=True)

# Make every name a string without surrounding whitespace.
df_shark_attacks['name'] = df_shark_attacks['name'].astype(str).str.strip()

# Names that are only a generic descriptor (compared case-insensitively)
# are replaced with 'Anonymous'.
df_shark_attacks.loc[
    df_shark_attacks['name'].str.lower().isin([
        'male',
        'female',
        'questionable',
        'boy',
        '2 males',
        'a sailor',
        'males',
        'boat',
        'child',
        'unknown',
        'girl',
    ]),
    'name'
] = 'Anonymous'

def mes_map(date):
    """Return the three-letter English month abbreviation for a date value.

    Args:
        date: Either a date-like object exposing an integer ``month``
            attribute (e.g. ``datetime.date`` / ``datetime.datetime``), or
            any value whose text form may contain a month name.

    Returns:
        One of "Jan" ... "Dec", or None when no month can be determined.
        For text, the months are tested in calendar order, so a text that
        mentions several months yields the earliest one in the calendar.
    """
    meses = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

    # Date-like objects stringify as e.g. "2020-03-05 00:00:00", which holds
    # no month name, so read the month number directly instead.
    numero_mes = getattr(date, "month", None)
    if isinstance(numero_mes, int) and 1 <= numero_mes <= 12:
        return meses[numero_mes - 1]

    # title() makes the match case-insensitive ("MAR", "mar" -> "Mar").
    texto = str(date).title()
    for mes in meses:
        if mes in texto:
            return mes
    return None

# Derive the month column from the raw date values.
df_shark_attacks["mes"] = df_shark_attacks["date"].apply(mes_map)

# Snapshot taken while the raw `date` column is still present.
df_backup_mes = df_shark_attacks.copy()

df_shark_attacks.drop(columns='date', inplace=True)

# Keep only the rows whose country is exactly "USA", as an independent copy.
df_shark_attacks = df_shark_attacks[df_shark_attacks["country"].eq("USA")].copy()

def activity_maping(activity):
    """Map a free-text activity description to a normalised category.

    The text is trimmed and lower-cased, then checked against the keywords
    below in priority order; the first keyword contained in the text
    decides the category.

    Args:
        activity: Activity description as a string.

    Returns:
        The category of the first matching keyword, or the trimmed,
        lower-cased text itself when no keyword matches.
    """
    texto = activity.strip().lower()

    # (keyword, category) pairs, tested from top to bottom.
    reglas = (
        ("surf", "surfing"),
        ("swim", "swimming"),
        ("fishing", "fishing"),
        ("diving", "diving"),
        ("snorkel", "snorkel"),
        ("hunt", "hunting"),
        ("question", "questionable"),
    )
    for clave, categoria in reglas:
        if clave in texto:
            return categoria
    return texto
    
# Replace every raw activity description with the value returned by activity_maping.
df_shark_attacks["activity"] = df_shark_attacks["activity"].apply(activity_maping)

def state_format(state):
    state = state.strip().lower()
    if "flor" in state:
        return "Florida"
    if "hawai" in state:
        return "Hawaii"
    if "cali" in state:
        return "California"
    if "baha" in state:
        return "Bahamas"
    else:
        return state.strip().title()
    
# Replace every raw state name with the value returned by state_format.
df_shark_attacks["state"] = df_shark_attacks["state"].apply(state_format)

# Bare expression: the notebook shows the cleaned DataFrame as the cell output.
df_shark_attacks


Unnamed: 0,type,country,state,location,activity,name,sex,injury,fatal,species,age_clean,mes
3,Unprovoked,USA,California,Lovers Point Pacific Grove,swimming,Erica Fox,F,Taken by shark body recovered with multiple in...,Y,White Shark,55,Dec
4,Unprovoked,USA,California,Salmon Creek,surfing,Anonymous,M,Hand Injury,N,White Shark,24,Dec
5,Provoked,USA,Hawaii,"Ka'alu""alu Beach",freeing trapped shark,Josiah Kaimani Ventura,M,Bite wounds to thigh,N,Other Shark,24,Dec
10,Unprovoked,USA,Hawaii,Pine Trees Hanalei Bay Kaui,swimming,Chance Swanson,M,Injuries to legs,N,Unknown,24,Nov
11,Unprovoked,USA,Texas,Matagorda Beach Matagorda,fishing,Chuck Bledsoe,M,Laceration on top and undermeath right foot,N,Unknown,24,Nov
...,...,...,...,...,...,...,...,...,...,...,...,...
7009,Unprovoked,USA,Florida,"Palm Beach, Palm Beach County",standing,Horton Chase,M,Abrasions & bruises hip to ankle,N,Unknown,24,
7043,Unprovoked,USA,Florida,"Gadsden Point, Tampa Bay",fishing,James Kelley,M,2-inch lacerations,N,Unknown,24,
7048,Unprovoked,USA,North Carolina,Somewhere between Hatteras and Beaufort,swimming,"""youthful male""",M,"""Lost leg""",N,Unknown,24,Jul
7052,Unprovoked,USA,Hawaii,Puna,questionable,"A ""chiefess""",F,Ankle bitten,N,Unknown,24,


## Hipótesis de riesgos, letalidad y ubicación

Vamos a filtrar por riesgo para comprobar cuál es el mejor lugar para un negocio de exploración de tiburones con el menor riesgo posible según nuestro análisis de los datos.

In [None]:
# Count the rows of each (`state`, `injury`, `mes`) combination, to see in
# which month and location encounters with sharks are most frequent.
# NOTE(review): groupby drops rows with a missing key by default, so rows
# whose `mes` is None are presumably left out of this table; confirm.

fatality_state = df_shark_attacks.groupby(['state', 'injury','mes'], as_index=False).size()

# Check the result

print(fatality_state.head())

# NOTE(review): this counts the rows of the grouped table, i.e. the number
# of distinct (`injury`, `mes`) combinations per state, not the number of
# individual incidents per state.
fatality_state['state'].value_counts()

state
Florida              1003
Hawaii                307
California            299
South Carolina        156
North Carolina        111
Texas                  71
New Jersey             54
New York               47
Oregon                 30
Massachusetts          18
Louisiana              18
Alabama                17
Georgia                17
Puerto Rico            16
Virginia               14
Maryland                9
Rhode Island            8
Delaware                8
Mississippi             7
Questionable            6
Connecticut             6
Us Virgin Islands       5
Palmyra Atoll           2
Maui                    2
Washington              2
Guam                    2
Bahamas                 2
Maine                   2
East Coast              2
Wake Island             1
Virgin Islands          1
Carolina Coast          1
Cayman Islands          1
Cuba                    1
Samoa                   1
Midway Atoll            1
Missouri                1
Pennsylvania            1
Kentuc

## Creación de nuevo dataframe

Después de la limpieza hemos observado que los `state` con mayor número de casos son `Florida`, `Hawaii` y `California`. Basaremos nuestro análisis en `California` y `Hawaii`; para ello, crearemos un dataframe que contenga únicamente los registros de estos dos estados.

In [26]:
# Build the working DataFrame: only the rows whose state is California or
# Hawaii, as an independent copy with the index renumbered from 0.

df_california_hawaii = (
    df_shark_attacks[df_shark_attacks['state'].isin(['California', 'Hawaii'])]
    .copy()
    .reset_index(drop=True)
)

# Check: number of non-missing values in each column

print(df_california_hawaii.count())

type         673
country      673
state        673
location     673
activity     673
name         673
sex          673
injury       673
fatal        673
species      673
age_clean    673
mes          641
dtype: int64


In [27]:
# Checks: first rows, summary of all columns (transposed) and summary of the
# numeric columns, printed one after another.

print(
    df_california_hawaii.head(),
    df_california_hawaii.describe(include="all").T,
    df_california_hawaii.describe(),
    sep="\n",
)

         type country       state                     location  \
0  Unprovoked     USA  California   Lovers Point Pacific Grove   
1  Unprovoked     USA  California                 Salmon Creek   
2    Provoked     USA      Hawaii             Ka'alu"alu Beach   
3  Unprovoked     USA      Hawaii  Pine Trees Hanalei Bay Kaui   
4  Unprovoked     USA  California              Catalina Island   

                activity                    name sex  \
0               swimming               Erica Fox   F   
1                surfing               Anonymous   M   
2  freeing trapped shark  Josiah Kaimani Ventura   M   
3               swimming          Chance Swanson   M   
4               swimming      Christopher Murray   M   

                                              injury fatal      species  \
0  Taken by shark body recovered with multiple in...     Y  White Shark   
1                                        Hand Injury     N  White Shark   
2                               Bite woun

In [None]:
# Display the DataFrame as the cell output

df_california_hawaii

Unnamed: 0,type,country,state,location,activity,name,sex,injury,fatal,species,age_clean,mes
0,Unprovoked,USA,California,Lovers Point Pacific Grove,swimming,Erica Fox,F,Taken by shark body recovered with multiple in...,Y,White Shark,55,Dec
1,Unprovoked,USA,California,Salmon Creek,surfing,Anonymous,M,Hand Injury,N,White Shark,24,Dec
2,Provoked,USA,Hawaii,"Ka'alu""alu Beach",freeing trapped shark,Josiah Kaimani Ventura,M,Bite wounds to thigh,N,Other Shark,24,Dec
3,Unprovoked,USA,Hawaii,Pine Trees Hanalei Bay Kaui,swimming,Chance Swanson,M,Injuries to legs,N,Unknown,24,Nov
4,Unprovoked,USA,California,Catalina Island,swimming,Christopher Murray,M,Leg and foot injury,N,Other Shark,54,Sep
...,...,...,...,...,...,...,...,...,...,...,...,...
668,Unprovoked,USA,California,"Monterey, Montery County",fishing,Anonymous,M,FATAL PROVOKED INCIDENTS,Y,Other Shark,24,
669,Unprovoked,USA,California,"LaJolla, San Diego County",diving,Charles Fleming,M,Calf bitten,N,Unknown,24,
670,Invalid,USA,Hawaii,"Portlock, Oahu",diving,Val Valentine,M,A 4.3 m [14'] shark made threat display. No in...,UNKNOWN,Unknown,24,
671,Unprovoked,USA,California,"Capistrano, Orange County",questionable,Anonymous,F,Leg injured,N,White Shark,24,Jun


## Hipótesis

### Pregunta: ¿En qué época del año se producen más ataques, con qué especies ocurren y qué letalidad tienen?

In [51]:
# Row counts broken down by state, month, species and fatality flag

ranking = (
    df_california_hawaii
    .groupby(['state', 'mes','species', 'fatal'], as_index=False)
    .size()
)

# When there are more sharks: rows per state and month

print(df_california_hawaii.groupby(["state", "mes"]).size())

# When they are less fatal: non-fatal rows per state and month

print(df_california_hawaii[df_california_hawaii["fatal"].eq("N")].groupby(["state", "mes"]).size())

# Descriptive statistics per state: fatal rows first, then non-fatal rows

print(df_california_hawaii[df_california_hawaii["fatal"].eq("Y")].groupby(["state"]).describe())
print(df_california_hawaii[df_california_hawaii["fatal"].eq("N")].groupby(["state"]).describe())

# Time of year: descriptive statistics of the non-fatal rows per state and
# month, shown as the cell output

df_california_hawaii[df_california_hawaii["fatal"].eq("N")].groupby(["state", "mes"]).describe()

state       mes
California  Apr    12
            Aug    48
            Dec    16
            Feb     7
            Jan    11
            Jul    57
            Jun    27
            Mar     8
            May    21
            Nov    24
            Oct    44
            Sep    44
Hawaii      Apr    33
            Aug    23
            Dec    31
            Feb    18
            Jan    18
            Jul    23
            Jun    24
            Mar    27
            May    22
            Nov    29
            Oct    48
            Sep    26
dtype: int64
state       mes
California  Apr     8
            Aug    40
            Dec    11
            Feb     5
            Jan     8
            Jul    50
            Jun    21
            Mar     7
            May    15
            Nov    23
            Oct    39
            Sep    42
Hawaii      Apr    21
            Aug    16
            Dec    22
            Feb    12
            Jan    13
            Jul    16
            Jun    19
         

Unnamed: 0_level_0,Unnamed: 1_level_0,age_clean,age_clean,age_clean,age_clean,age_clean,age_clean,age_clean,age_clean
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
state,mes,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
California,Apr,8.0,26.0,9.319718,16.0,24.0,24.0,24.0,48.0
California,Aug,40.0,27.525,10.155174,10.0,22.0,24.0,32.25,57.0
California,Dec,11.0,31.363636,8.800826,24.0,24.5,27.0,36.0,50.0
California,Feb,5.0,24.0,3.535534,19.0,24.0,24.0,24.0,29.0
California,Jan,8.0,24.25,3.453776,19.0,23.25,24.0,25.25,29.0
California,Jul,50.0,28.84,11.138021,13.0,24.0,24.0,33.5,61.0
California,Jun,21.0,35.428571,13.238472,15.0,24.0,31.0,44.0,62.0
California,Mar,7.0,34.285714,13.852969,24.0,24.0,27.0,43.0,55.0
California,May,15.0,32.066667,12.969561,15.0,24.0,28.0,43.0,57.0
California,Nov,23.0,30.0,9.371136,21.0,24.0,25.0,34.5,50.0


#### Conclusión de hipótesis 1

- Los avistamientos son igualmente numerosos en `California` y `Hawaii`, pero el riesgo es significativamente menor en `California`. Por eso, decidimos centrarnos en `California` para realizar el estudio completo de viabilidad de un negocio de avistamiento de tiburones.
- El avistamiento de tiburones con mayor tasa de éxito ocurre en los meses de junio, julio, agosto, septiembre y octubre.