In [1]:
import numpy as np
import pandas as pd

In [20]:
# Load the raw incident sheet and normalise the headers to lower snake_case.
shark_df = pd.read_excel('./shark-dataset.xls')
shark_df.columns = shark_df.columns.str.lower().str.strip().str.replace(" ", "_")

# Discard the columns the risk model does not use, then give the fatality
# flag (which arrives as 'unnamed:_11' after normalisation) a readable name.
shark_df = (
    shark_df
    .drop(columns=[
        'type', 'state', 'name', 'species', 'source', 'pdf',
        'href_formula', 'href', 'case_number', 'case_number.1',
        'original_order', 'unnamed:_21', 'unnamed:_22', 'time',
    ])
    .rename(columns={'unnamed:_11': 'fatal'})
)
shark_df

Unnamed: 0,date,year,country,location,activity,sex,age,injury,fatal
0,2024-10-14 00:00:00,2024.0,Honduras,Atlantida,Swimming,M,38,Back and thigh extensive tissue loss,N
1,2024-10-11 00:00:00,2024.0,USA,Brevard County Orlando,Surfing,M,16,Bite to left arm,N
2,2024-09-17 00:00:00,2024.0,Honduras,Puerto Castillo,Diving for shellfish,M,33,Bite to left leg,N
3,2024-09-16 00:00:00,2024.0,Morocco,West of Dakhla,Swimming - jumped off yacht,F,30,Leg severed,Y
4,2024-08-26 00:00:00,2024.0,Jamaica,Falmouth,Spearfishing,M,16,Head right arm and leg severed,Y
...,...,...,...,...,...,...,...,...,...
6968,Before 1903,0.0,AUSTRALIA,Roebuck Bay,Diving,M,,FATAL,Y
6969,Before 1903,0.0,AUSTRALIA,,Pearl diving,M,,FATAL,Y
6970,1900-1905,0.0,USA,Ocracoke Inlet,Swimming,M,,FATAL,Y
6971,1883-1889,0.0,PANAMA,"Panama Bay 8ºN, 79ºW",,M,,FATAL,Y


In [68]:
# Normalise country spelling BEFORE ranking, so that variants of the same
# country (e.g. 'AUSTRALIA' / 'Australia', or stray surrounding whitespace)
# are counted together instead of competing for separate top-20 slots.
# Every country is title-cased except 'USA', which stays upper-case.
shark_df = shark_df.assign(
    country=lambda df: df.country.str.strip().str.title().mask(
        df.country.str.strip().str.upper() == 'USA', 'USA'
    )
)

# Stick to the 20 countries with the most recorded attacks; the long tail is
# made up of unsure values and very small countries. Every country kept here
# needs an entry in `coastline_lengths` for `risk_country` to work.
top_20_countries = shark_df.country.value_counts().head(20)

# .copy() yields an independent frame, so later column assignments cannot
# raise SettingWithCopyWarning. Re-running this cell gives the same result.
shark_df = shark_df[shark_df.country.isin(top_20_countries.index)].copy()
shark_df.head()

Unnamed: 0,date,year,country,location,activity,sex,age,injury,fatal
1,2024-10-11 00:00:00,2024.0,USA,Brevard County Orlando,Surfing,M,16,Bite to left arm,N
8,2024-07-08 00:00:00,2024.0,USA,Ponce de Leon Inlet Volusia County,Diving into Water,M,14,Lower left leg injury,N
9,2024-07-05 00:00:00,2024.0,USA,New Smyrna Beach,Wading,M,26,Minor injury to left foot,N
10,2024-07-04 00:00:00,2024.0,USA,South Padre Island,Swimming,F,Middle age,Bite to left leg calf muscle removed,N
11,2024-07-04 00:00:00,2026.0,USA,South Padre Island,Swimming,F,18,Minor cuts to calf,N


# Function Block (Risk Calculation)

In [74]:
# Coastline length per country, used by `risk_country` as the denominator of
# the attack rate (recorded attacks / coastline length).
# NOTE(review): the unit is not stated anywhere in this notebook - presumably
# kilometres; confirm against the source the figures were taken from.
#
# Keys must match the cleaned `shark_df.country` values exactly (title case,
# with 'USA' upper-case). This table is maintained by hand and must be updated
# relative to the cleaned dataframe: any country kept in the data but missing
# here makes `risk_country` raise KeyError.
coastline_lengths = {
    'USA': 19924,
    'Australia': 25760,
    'Bahamas': 3542,
    'South Africa': 2798,
    'Mexico': 9330,
    'New Zealand': 15134,
    'Egypt': 2450,
    'Spain': 4964,
    'Ecuador': 2237,
    'French Polynesia': 2525,
    'New Caledonia': 2254,
    'Cuba': 5746,
    'Brazil': 7491,
    'Fiji': 1129,
    'Japan': 29751,
    'Thailand': 3219,
    'Costa Rica': 1290,
    'Réunion': 207}

def risk_country(client_country):
    """Return the country-level risk score for ``client_country``.

    The score is ``attack_risk * fatality_risk`` where

    * ``attack_risk``   = recorded attacks in the country / its coastline
      length from ``coastline_lengths``;
    * ``fatality_risk`` = attacks in the country with ``fatal == 'Y'`` /
      recorded attacks in the country.

    Reads the notebook-level ``shark_df`` and ``coastline_lengths``.

    Parameters
    ----------
    client_country : str
        Country name, spelled exactly as in ``shark_df.country`` and as the
        keys of ``coastline_lengths``.

    Returns
    -------
    float
        The unrounded product of the two rates.

    Raises
    ------
    KeyError
        If the country has no coastline entry or no recorded attacks.
    """
    if client_country not in coastline_lengths:
        raise KeyError(f"no coastline length on record for country {client_country!r}")

    in_country = shark_df.country == client_country
    total_attacks = int(in_country.sum())
    if total_attacks == 0:
        raise KeyError(f"no recorded attacks for country {client_country!r}")

    fatal_attacks = int((in_country & (shark_df.fatal == 'Y')).sum())

    # Keep full precision here: rounding the per-coastline rate to three
    # decimals would turn small rates into 0 and wipe out the whole score.
    attack_risk = total_attacks / coastline_lengths[client_country]
    fatality_risk = fatal_attacks / total_attacks

    return fatality_risk * attack_risk

def risk_activity(client_activity):
    """Return the share of attacks during ``client_activity`` that were fatal.

    Only the ten most frequent activities in the notebook-level ``shark_df``
    are supported; any other activity raises ``KeyError``. An attack counts
    as fatal when its ``fatal`` value equals ``'Y'``.
    """
    activity_counts = shark_df.activity.value_counts().head(10)

    same_activity = shark_df.activity == client_activity
    was_fatal = shark_df.fatal == 'Y'
    fatal_count = int((same_activity & was_fatal).sum())

    # Indexing the top-10 table is what rejects unsupported activities.
    return fatal_count / activity_counts[client_activity]

def risk_age(client_age):
    """Return the share of attacks on victims of ``client_age`` that were fatal.

    Reads the notebook-level ``shark_df``. The ``age`` column can hold numbers,
    numeric text and free text (e.g. 'Middle age') side by side, and the age
    typed by the user arrives as a string. A numeric ``client_age`` is
    therefore compared numerically, so '34', 34 and 34.0 all match the same
    records. A non-numeric ``client_age`` falls back to an exact match.

    Parameters
    ----------
    client_age : str or number
        The client's age, e.g. ``'34'`` or ``34``.

    Returns
    -------
    float
        Attacks with ``fatal == 'Y'`` divided by all attacks for that age.

    Raises
    ------
    KeyError
        If no attack is recorded for that age.
    """
    try:
        numeric_age = float(client_age)
    except (TypeError, ValueError):
        numeric_age = None

    if numeric_age is None:
        same_age = shark_df.age == client_age
    else:
        # Going through str first lets to_numeric treat 34, 34.0, '34' and
        # ' 34 ' alike; anything unparseable becomes NaN and never matches.
        recorded_ages = pd.to_numeric(shark_df.age.astype(str).str.strip(), errors='coerce')
        same_age = recorded_ages == numeric_age

    total_attacks = int(same_age.sum())
    if total_attacks == 0:
        raise KeyError(f"no recorded attacks for age {client_age!r}")

    number_of_fatal_attacks = int((same_age & (shark_df.fatal == 'Y')).sum())

    return number_of_fatal_attacks / total_attacks

def risk_season(client_season):
    """Return the share of attacks in ``client_season`` that were fatal.

    Reads the notebook-level ``shark_df`` and needs it to have a ``season``
    column. An attack counts as fatal when its ``fatal`` value equals ``'Y'``.
    NOTE(review): no cell visible in this part of the notebook creates a
    ``season`` column - confirm it is added before this function is used.

    Raises
    ------
    KeyError
        If ``client_season`` does not occur in ``shark_df.season``.
    """
    # Count attacks per season (not per activity): the denominator has to
    # come from the same column as the numerator below.
    seasons = shark_df.season.value_counts()

    number_of_fatal_attacks = len(shark_df[(shark_df.season == client_season) & (shark_df.fatal == 'Y')])
    fatality_risk = number_of_fatal_attacks / seasons[client_season]

    return fatality_risk

def assign_insurance(client_country, client_activity, client_age):
    """Classify a client into an insurance tier: 'high', 'medium' or 'low'.

    The combined score is the product of the country, activity and age
    risks. A score of at least 0.1 is 'high', a score above 0.01 is
    'medium', and anything else is 'low'.
    """
    country_factor = risk_country(client_country)
    activity_factor = risk_activity(client_activity)
    age_factor = risk_age(client_age)
    combined = country_factor * activity_factor * age_factor

    if combined >= 0.1:
        return 'high'
    if combined > 0.01:
        return 'medium'
    return 'low'

In [85]:
def client_entries():
    """Prompt for the client's trip details.

    Asks for country, activity and age (in that order) and returns them as a
    tuple of strings with surrounding whitespace removed.
    """
    # TODO: sex, season and year are not collected yet.
    prompts = (
        'Please enter the country you travel to: ',
        'Please enter your activity: ',
        'Please enter your age: ',
    )
    return tuple(input(prompt).strip() for prompt in prompts)

In [86]:
# Prompt for country, activity and age, then classify the client; the returned
# tier ('high' / 'medium' / 'low') is shown as this cell's output.
assign_insurance(*client_entries())

Please enter the country you travel to: Australia
Please enter your activity: Swimming
Please enter your age: 34


'low'