In [1]:
import pandas as pd
from functions import leer_datos

# Location of the source spreadsheet (GSAF5.xls hosted on sharkattackfile.net).
url = "https://www.sharkattackfile.net/spreadsheets/GSAF5.xls"
# leer_datos is a project helper imported from the local `functions` module.
# NOTE(review): it presumably downloads the spreadsheet and returns a pandas
# DataFrame (the following cells call DataFrame methods on `df`) — confirm.
df = leer_datos(url)

Estandarizamos los nombres de las columnas (minúsculas, sin espacios sobrantes y con guiones bajos) y configuramos pandas para mostrar todas las columnas disponibles

In [2]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# Normalise the column labels: trim surrounding whitespace, lower-case them
# and turn the remaining inner spaces into underscores.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

Eliminamos las filas con valores nulos en la columna "date"

In [3]:
# Keep only the rows whose "date" value is present.
df = df.loc[df["date"].notna()]

Eliminamos las filas cuyo valor de "year" sea 0, ya que no aportan información

In [4]:
# Discard rows whose "year" is exactly 0. Rows with a missing year compare as
# "not equal" and are therefore kept by this step.
df = df.loc[df["year"].ne(0.0)]

Cambiamos el tipo de la columna "year" de float a entero (Int64, que admite valores nulos)

In [5]:
# Inspect the column dtypes before the conversion; "year" is float64 at this point.
df.dtypes

date               object
year              float64
type               object
country            object
state              object
location           object
activity           object
name               object
sex                object
age                object
injury             object
unnamed:_11        object
time               object
species            object
source             object
pdf                object
href_formula       object
href               object
case_number        object
case_number.1      object
original_order    float64
unnamed:_21        object
unnamed:_22        object
dtype: object

In [6]:
# Convert "year" to pandas' nullable integer dtype so that missing years stay
# as <NA> instead of forcing the column back to float.
# `df` is the result of earlier boolean filtering, so assigning into it with
# df["year"] = ... can trigger SettingWithCopyWarning; assign() builds a new
# frame instead, which also keeps this cell safe to re-run.
df = df.assign(year=df["year"].astype("Int64"))

In [7]:
# Check the dtypes again to confirm that "year" is now Int64.
df.dtypes

date               object
year                Int64
type               object
country            object
state              object
location           object
activity           object
name               object
sex                object
age                object
injury             object
unnamed:_11        object
time               object
species            object
source             object
pdf                object
href_formula       object
href               object
case_number        object
case_number.1      object
original_order    float64
unnamed:_21        object
unnamed:_22        object
dtype: object

Filtramos los registros a los últimos 50 años (de 1974 a 2024, ambos incluidos)

In [8]:
# Analysis window: calendar years 1974 through 2024, both ends included.
año_inicio, año_final = 1974, 2024

# between() is inclusive on both bounds by default, so this matches
# (year >= año_inicio) & (year <= año_final).
df = df[df["year"].between(año_inicio, año_final)]

Eliminamos columnas vacías

In [9]:
# Remove the columns that are not used in the rest of the notebook.
df = df.drop(
    labels=[
        "pdf",
        "href_formula",
        "href",
        "case_number",
        "case_number.1",
        "original_order",
        "unnamed:_21",
        "unnamed:_22",
    ],
    axis="columns",
)

Renombramos la columna "unnamed:_11" como "fatal" (fallecidos)

In [10]:
# Give the unnamed spreadsheet column a meaningful label.
df = df.rename({"unnamed:_11": "fatal"}, axis="columns")

Sustituimos los valores nulos de determinadas columnas por "Unknown" o "U"

In [11]:
# Replace missing values with explicit placeholders: "Unknown" for free-text
# columns and "U" for the single-letter code columns.
# `df` comes from earlier filtering steps, so column-by-column item assignment
# can trigger SettingWithCopyWarning; assign() returns a new frame with all
# five columns replaced at once and still raises KeyError if one is missing.
df = df.assign(
    activity=df["activity"].fillna("Unknown"),
    name=df["name"].fillna("Unknown"),
    sex=df["sex"].fillna("U"),
    species=df["species"].fillna("Unknown"),
    fatal=df["fatal"].fillna("U"),
)

In [16]:
# Display the cleaned DataFrame; the rendered output elides the middle rows.
df

Unnamed: 0,date,year,type,country,state,location,activity,name,sex,age,injury,fatal,time,species,source
0,15 Mar 2024,2024,Unprovoked,AUSTRALIA,Queensland,Bargara Beach,Swimming,Brooklyn Sauer,F,13,"Minor injuries to back, abdomen and legs",U,16h00,Tiger shark,"Yahoo News, 3/15/2024"
1,04 Mar 2024,2024,Unprovoked,USA,Hawaii,"Old Man's, Waikiki",Surfing,Matthew White,M,,"No injury, shark bit surfboard",N,,Tiger shark 8',"Surfer, 3/6/2024F"
2,02 Mar-2024,2024,Unprovoked,USA,Hawaii,"Rainbows, Oahu",Swimming,Unknown,F,11,Lacerations to left foot,N,13h30,3' to 4' shark,"Hawaii News Now, 3/4/2024"
3,25 Feb-2024,2024,Unprovoked,AUSTRALIA,Western Australia,"Sandlnd Island, Jurian Bay",Unknown,female,F,46,Leg bitten,N,11h30,Tiger shark,"WA Today, 2/26/2024"
4,14 Feb-2024,2024,Unprovoked,INDIA,Maharashtra,"Vaitarna River, Palghar District",Fishing,Vicky Suresh Govari,M,32,Calf of lower left leg injured,N,,"Bull shark, 7'","Times of India, 2/14/2024"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3904,07-Jan-1974,1974,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Amanzimtoti,Swimming,Cornelius “Les” Pyper,M,33,Knee & calf lacerated,N,14h10,2 m to 2.5 m [6.75' to 8.25'] shark,"L. Pyper, J. Bass, G. Charter; B. Davis, M. Le..."
3905,07-Jan-1974,1974,Unprovoked,MOZAMBIQUE,Gaza,Xai Xai,Swimming,Oaulkurt-Pape,M,,FATAL,Y,,Unknown,"Johannesburg Star, 1/8/1974"
3906,07-Jan-1974,1974,Unprovoked,MOZAMBIQUE,Gaza,Xai Xai,Swimming,male,M,32,FATAL,Y,,Unknown,"P. Logan, GSAF"
3907,Summer 1974,1974,Unprovoked,AUSTRALIA,Western Australia,Emu Channel,Spearfishing,Glen Tunbridge,M,,8 to 10 puncture marks around knee,N,,"Bronze whaler shark, 4'",G. Tunbridge
